├── ReadMe.md ├── analyzer.py ├── case_studies ├── comments.md └── strength_weekness.md ├── code_analyzer ├── ReadMe.md ├── __init__.py ├── config.py ├── definition_collector.py ├── macro_expand.py ├── preprocessor │ ├── __init__.py │ └── node_processor.py ├── schemas │ ├── __init__.py │ ├── ast_node.py │ ├── enums.py │ └── function_info.py ├── utils │ ├── __init__.py │ ├── addr_taken_sites_util.py │ └── func_key_collector.py ├── visit_utils │ ├── __init__.py │ ├── base_util.py │ ├── decl_util.py │ ├── func_type.py │ └── type_util.py └── visitors │ ├── __init__.py │ ├── base_func_visitor.py │ ├── base_visitor.py │ ├── func_body_visitors.py │ ├── global_visitor.py │ ├── macro_visitor.py │ └── util_visitor.py ├── config.py ├── evaluation_analyzer.py ├── experimental_logs ├── benchmark.md ├── cost.md ├── coverage.md ├── error_cases.md ├── filter_test.md ├── intermediate_res │ ├── Qwen72-0.5.csv │ ├── failed_callsites.txt │ └── global_failed_cases.txt ├── manual_analysis │ ├── callsite_bind9_1 │ │ └── log1.txt │ ├── callsite_cyclonedds_13 │ │ ├── log1.txt │ │ ├── log2.txt │ │ └── log3.txt │ ├── callsite_cyclonedds_3 │ │ └── log1.txt │ ├── callsite_hdf5_3 │ │ └── log1.txt │ ├── callsite_libjpeg-turbo_361 │ │ └── log1.txt │ └── callsite_lxc_1 │ │ └── log1.txt ├── results.md ├── results │ └── SimpleStrategy.md ├── step2.md └── updated_results.md ├── icall_solvers ├── __init__.py ├── base_solvers │ ├── ReadMe.md │ ├── __init__.py │ ├── base_matcher.py │ ├── flta │ │ ├── ReadMe.md │ │ ├── __init__.py │ │ ├── matcher.py │ │ ├── matching_result.py │ │ └── prompt.py │ ├── kelp │ │ ├── __init__.py │ │ ├── confine_func_analyzer.py │ │ └── matcher.py │ └── mlta │ │ ├── __init__.py │ │ ├── matcher.py │ │ └── type_confine_analyzer.py ├── dir_util.py └── llm_solvers │ ├── __init__.py │ ├── base_llm_solver.py │ ├── base_utils │ ├── __init__.py │ ├── func_summarizer.py │ └── prompts.py │ ├── sea │ ├── __init__.py │ ├── matcher.py │ └── prompts.py │ └── single │ ├── ReadMe.md │ ├── __init__.py │ ├── matcher.py │ └── prompt.py ├── llm_utils ├── __init__.py ├── base_analyzer.py ├── common_prompt.py ├── google_analyzer.py ├── openai_analyzer.py ├── tongyi_analyzer.py └── zhipu_analyzer.py ├── resources ├── benchmarks.txt ├── build.py └── filtered_keyword.txt ├── scope_strategy ├── ReadMe.md ├── __init__.py └── base_strategy.py ├── statistic_res.py └── tests ├── __init__.py ├── filter_test ├── __init__.py ├── simple_filter_test.py ├── test_codebert.py ├── test_varclr.py └── testcase1.py ├── llm_tests ├── __init__.py ├── codechat_test.py ├── gemini_test.py ├── glm_test.py ├── prompt_test.py ├── qwen_test.py └── vllm_test.py ├── test_data.py ├── test_data1.py ├── testcases ├── enum_decl.c ├── error_file_vlc.c ├── error_global_var_decl.c ├── error_snd_global.c ├── error_struct_decl.c ├── func_test2.c ├── macro_def_decl.c ├── macro_expand │ ├── macro_test.c │ ├── macro_test1.c │ ├── macro_test10.c │ ├── macro_test11.c │ ├── macro_test12.c │ ├── macro_test13.c │ ├── macro_test14.c │ ├── macro_test15.c │ ├── macro_test16.c │ ├── macro_test17.c │ ├── macro_test18.c │ ├── macro_test19.c │ ├── macro_test2.c │ ├── macro_test3.c │ ├── macro_test4.c │ ├── macro_test5.c │ ├── macro_test6.c │ ├── macro_test7.c │ ├── macro_test8.c │ └── macro_test9.c ├── ngx_func_decl_case1.c ├── ngx_resolver_case1.c ├── ngx_struct_specifier.c ├── struct_case1.c └── type_def_test.c └── visitor_test ├── __init__.py ├── declaration_visitor_test.py ├── file_test.py ├── funcdef_visitor_test.py ├── global_visitor_test.py ├── macro_visitor_test.py └── typedecl_visitor.py /ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | case studies refer to [case_studies](case_studies) 4 | 5 | 6 | symbol类型包括: 7 | 8 | - Struct 9 | 10 | - TypeAlias 11 | 12 | - Field 13 | 14 | - Variable 15 | 16 | - Function 17 | 18 | 该[tree-sitter commit](https://github.com/tree-sitter/py-tree-sitter/tree/4e2e765c5d8cf946b886bc757aef5cbf907c82b8)添加了visitor机制,我这里直接引用了。 19 | 20 | 21 | # 1.实现功能 22 | 23 | - 1.收集全局信息,包括project下能收集到的函数定义、类型别名、结构体定义信息。 24 | 25 | - 2.继续收集全局信息,搜寻global scope下的函数引用 26 | 27 | - 3.遍历每个函数,获取每个函数中函数引用处 28 | 29 | - 4.针对存在indirect-call的函数,收集该函数下的局部变量定义,对于icall,尝试基于参数类型匹配潜在的callee。 30 | 31 | 传统静态分析部分参考[code-analyzer](code_analyzer/ReadMe.md) 32 | 33 | - 如果要使用openai的API,请先运行`pip install openai` 34 | 35 | - 如果要使用google gemini,请运行 `pip install google-generativeai`。 36 | 37 | - 如果调用ChatGLM的API,请先运行 `pip install zhipuai`。 38 | 39 | - 如果调用通义千问的API,请运行 `pip install dashscope`。 40 | 41 | 42 | 使用: 43 | 44 | - 纯类型分析,不用LLM: 45 | 46 | ``` 47 | python evaluation_analyzer.py --only_count_scope --disable_llm_for_uncertain --llm_strategy=none --base_analyzer=kelp --evaluate_uncertain --root_path=$PATH_TO_PROJECT --scope_strategy=base --num_worker=1 --projects=$PROJECT_NAME 48 | ``` 49 | 50 | - 用LLM分析: 51 | 52 | ``` 53 | python evaluation_analyzer.py --only_count_scope --log_llm_output --log_res_to_file --disable_llm_for_uncertain --llm_strategy=sea --base_analyzer=flta --evaluate_uncertain --root_path=$PATH_TO_PROJECT --projects=$PROJECT --num_worker=12 --temperature=$TEMPERATURE --running_epoch=$EPOCH openai_local --model_type=$SELECTED_MODEL --address=$LOCAL_ADDRESS 54 | ``` 55 | 56 | 57 | # 2.LLM的部署 58 | 59 | ## 2.1.server 60 | 61 | 该项目目前支持调用[openai](https://platform.openai.com/), [智谱](https://www.zhipuai.cn/), [google gemini](https://ai.google.dev/), [阿里通义系列](https://dashscope.console.aliyun.com/)的API。本地部署的模型尝试过用3种方式部署: 62 | 63 | - [huggingface text-generation-inference](https://github.com/huggingface/text-generation-inference) 64 | 65 | - [vllm](https://github.com/vllm-project/vllm) 66 | 67 | - [sglang](https://github.com/sgl-project/sglang/) 68 | 69 | 目前以上部署方式都支持openai的api访问server。不过使用时发现了一些问题 70 | 71 | - vllm部署时通过openai api访问时不需要添加 `max_tokens` 参数,但是sglang部署时需要手动指定这些 `max` 参数,容易降低效率。 72 | 73 | - vllm和sglang部署只需要传递context长度参数 (vllm的 `--max-model-len` 以及sglang的 `--context-length`),但是text-generation-inference需要指定 `--max-total-tokens`、`--max-input-length`,感觉不是很灵活。 74 | 75 | - vllm单gpu部署时效率感觉很高,但是多gpu部署时容易出现[同步错误](https://github.com/vllm-project/vllm/issues/3839),这个错误貌似到0.4.0还没解决。 76 | 77 | 这里建议大家通过vllm或者sglang部署,如果用vllm,用 `openai_local` 调用本地模型时可以不传入 `max_tokens` 参数,但是sglang得传入,可以传个大点的比如 `3072`。 78 | 79 | chat模板加载方式: 80 | 81 | - sglang 82 | 83 | * 对于用 `launch_server` 的方式如果没有手动指定chat_template,则会用tokenizer[默认的chat_template](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/openai_api_adapter.py#L258)。 84 | 85 | * 如果用python代码调用API,chat_template加载方式为硬编码在py文件中,参考[chat_template.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/lang/chat_template.py),sglang会在把modelpath lower后比对qwen等关键词查找对应模板。如果手动指定模版参数,其处理过程参考[load_chat_template_for_openai_api](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/openai_api_adapter.py#L38),模版文件必须为json格式。 86 | 87 | - vllm的模板加载相对灵活,会去model的tokenizer文件中找chat template,比如qwen1.5-14B-Chat的[tokenizer_config.json](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat/file/view/master?fileName=tokenizer_config.json&status=1)中有 `chat_template` 字段定义了该模型的chat template。 88 | 89 | - swift的模型-模板对应表参考[model.py](https://github.com/modelscope/swift/blob/main/swift/llm/utils/model.py#L38),定义的全部模版参考[template.py](https://github.com/modelscope/swift/blob/main/swift/llm/utils/template.py#L23),同义硬编码。不过相比sglang,硬编码的是真多,需要在参数用 `template_type` 手动指定使用的模板。 90 | 91 | 在我们tool下,当用sglang部署model时,请添加 `max_tokens` 参数,否则sglang会用默认最大生成token数。用swift部署时,记得添加 `server_type` 参数,将 `model_name` 做一次映射。 92 | 93 | ## 2.2.models 94 | 95 | llama3存在一个eos token问题,参考[llama3 end token](https://github.com/huggingface/text-generation-inference/issues/1781),需要user手动设置eos token。 96 | 不过TGI貌似2.0.2版本后修复了这个问题,不需要手动设置eos token。 97 | 98 | # 3.others 99 | 100 | 插桩用到的pass:[LLVM Instrumentation Pass](https://github.com/for-just-we/TraceLogPass) 101 | 102 | 其它LLM模型:rebuttal的时候怕了两组code特定LLM作为baseline,当时(24年6月)算比较SOTA的code LLM,不过效果看起来并不是很好。 103 | 一个可能原因是code LLM的自然语言推理能力不如general LLM。 104 | 105 | | model | model_type | precision | recall | F1 | 106 | | ---- | ---- | ---- | ---- | ---- | 107 | | deepseek-coder-instruct | code-LLM | 27.6 | 92.1 | 36.3 | 108 | | CodeQwen-1.5-Chat |code-LLM| 27.3 | 35.4 | 26.9 | 109 | | Qwen1.5-72B-Chat | general-LLM | 49.1 | 97.3 | 59.4 | 110 | 111 | 为了减少LLM的query次数,我们起初试图用CodeBert计算callee的declaration和caller的文本相似度进行些简单的filter操作。 112 | 这部分理论上不应该引入false negative。 113 | 下表展示了分别用**余弦**和**欧氏**相似度筛选top k%的caller-callee pair时的recall, 114 | 可以看到CodeBert会不可避免的引入false negative,top-80%的recall只有74.1%。 115 | 因此CodeBert不适合用来进行pre-filter。 116 | 117 | 118 | | similarity | top 20 | 40 | 60 | 80 | 100 | 119 | | ---- | ---- | ---- | ---- | ---- | ---- | 120 | | cosine-similarity | 12.8 | 36.9 | 55.7 | 74.1 | 97.9 | 121 | | Euclidean-similarity | 12.3 | 36.5 | 51.3 | 71.1 | 97.9 | 122 | 123 | 124 | # 4.Citation 125 | 126 | ``` 127 | @inproceedings{10.1145/3691620.3695016, 128 | author = {Cheng, Baijun and Zhang, Cen and Wang, Kailong and Shi, Ling and Liu, Yang and Wang, Haoyu and Guo, Yao and Li, Ding and Chen, Xiangqun}, 129 | title = {Semantic-Enhanced Indirect Call Analysis with Large Language Models}, 130 | year = {2024}, 131 | isbn = {9798400712487}, 132 | publisher = {Association for Computing Machinery}, 133 | address = {New York, NY, USA}, 134 | url = {https://doi.org/10.1145/3691620.3695016}, 135 | doi = {10.1145/3691620.3695016}, 136 | booktitle = {Proceedings of the 39th IEEE/ACM International Conference on Automated Software Engineering}, 137 | pages = {430–442}, 138 | numpages = {13}, 139 | keywords = {indirect-call analysis, semantic analysis, LLM}, 140 | location = {Sacramento, CA, USA}, 141 | series = {ASE '24} 142 | } 143 | ``` -------------------------------------------------------------------------------- /case_studies/comments.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | The comment is quite different to use: 4 | 5 | Here comment is below the type definition, making it hard to use it. 6 | 7 | ```cpp 8 | typedef void (*isc_nm_recv_cb_t)(isc_nmhandle_t *handle, isc_result_t eresult, 9 | isc_region_t *region, void *cbarg); 10 | /*%< 11 | * Callback function to be used when receiving a packet. 12 | * 13 | * 'handle' the handle that can be used to send back the answer. 14 | * 'eresult' the result of the event. 15 | * 'region' contains the received data, if any. It will be freed 16 | * after return by caller. 17 | * 'cbarg' the callback argument passed to isc_nm_listenudp(), 18 | * isc_nm_listenstreamdns(), or isc_nm_read(). 19 | */ 20 | ``` -------------------------------------------------------------------------------- /code_analyzer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/code_analyzer/__init__.py -------------------------------------------------------------------------------- /code_analyzer/config.py: -------------------------------------------------------------------------------- 1 | from tree_sitter import Language, Parser 2 | import platform 3 | import os 4 | 5 | system_name = platform.system() 6 | root_path = os.path.dirname(os.path.dirname(__file__)) 7 | # set path to tree-sitter dynamic lib 8 | # linux 9 | if system_name == 'Linux': 10 | libpath = f"{root_path}/resources/my-languages.so" 11 | # macos 12 | elif system_name == 'Darwin': 13 | libpath = f"{root_path}/resources/my-languages.dylib" 14 | # windows 15 | elif system_name == 'Windows': 16 | libpath = f"{root_path}/resources/my-languages.dll" 17 | else: 18 | raise RuntimeError("unsupported system: ", system_name) 19 | 20 | language = Language(libpath, "c") 21 | parser = Parser() 22 | parser.set_language(language) -------------------------------------------------------------------------------- /code_analyzer/macro_expand.py: -------------------------------------------------------------------------------- 1 | from code_analyzer.visitors.macro_visitor import MacroCallExpandVisitor, ExpandCodeConcatVisitor, \ 2 | MacroCallsiteCollectVisitor 3 | from code_analyzer.config import parser 4 | from code_analyzer.schemas.ast_node import ASTNode 5 | from code_analyzer.preprocessor.node_processor import NodeProcessor, processor 6 | 7 | from tree_sitter import Tree 8 | from typing import List, Dict, Set 9 | 10 | class MacroCallExpandUtil: 11 | def __init__(self, macro_func_bodies: Dict[str, str], macro_func_args: Dict[str, List[str]], 12 | var_arg_macro_funcs: Set[str]): 13 | self.macro_funcs: Set[str] = set(macro_func_bodies.keys()) 14 | self.macro_func_bodies: Dict[str, str] = macro_func_bodies 15 | self.macro_func_args: Dict[str, List[str]] = macro_func_args 16 | self.var_arg_macro_funcs: Set[str] = var_arg_macro_funcs 17 | self.processor = NodeProcessor(unwanted_node_type=set()) 18 | 19 | self.expanded_macros: Set[str] = set() 20 | 21 | def expand_macro_call(self, call_expr: ASTNode): 22 | code_text: str = self.expand_single_macro(call_expr) 23 | code_text = self.expand_code_text(code_text) 24 | return code_text 25 | 26 | def expand_single_macro(self, call_expr: ASTNode): 27 | macro_in_use: str = call_expr.children[0].node_text 28 | macro_body_text: str = self.macro_func_bodies[macro_in_use] 29 | macro_arg_list: List[str] = self.macro_func_args[macro_in_use] 30 | macro_tree: Tree = parser.parse(macro_body_text.encode("utf8")) 31 | macro_ast: ASTNode = self.processor.visit(macro_tree.root_node) 32 | args: List[str] = [arg_node.node_text for arg_node in call_expr.argument_list.children 33 | if arg_node.node_type not in {'(', ')', ','}] 34 | macro_call_expand_visitor = MacroCallExpandVisitor(args, macro_arg_list) 35 | macro_call_expand_visitor.traverse_node(macro_ast) 36 | 37 | expand_visitor = ExpandCodeConcatVisitor() 38 | expand_visitor.traverse_node(macro_ast) 39 | code_text: str = expand_visitor.code 40 | return code_text 41 | 42 | def expand_code_text(self, code_text: str): 43 | while True: 44 | tree: Tree = parser.parse(code_text.encode("utf-8")) 45 | macro_node: ASTNode = self.processor.visit(tree.root_node) 46 | macro_callsite_visitor = MacroCallsiteCollectVisitor(self.macro_func_args, 47 | self.var_arg_macro_funcs, 48 | self.expanded_macros) 49 | macro_callsite_visitor.traverse_node(macro_node) 50 | 51 | if len(macro_callsite_visitor.macro_callsites) == 0: 52 | break 53 | 54 | for macro_call_node in macro_callsite_visitor.macro_callsites: 55 | expand_code_text: str = self.expand_single_macro(macro_call_node) 56 | new_node = ASTNode("identifier", expand_code_text, (0, 0), (0, 0)) 57 | if macro_call_node.parent is not None: 58 | child_idx = macro_call_node.parent.children.index(macro_call_node) 59 | new_node.parent = macro_call_node.parent 60 | macro_call_node.parent.children[child_idx] = new_node 61 | else: 62 | macro_node = new_node 63 | 64 | expand_visitor = ExpandCodeConcatVisitor() 65 | expand_visitor.traverse_node(macro_node) 66 | code_text: str = expand_visitor.code 67 | 68 | return code_text -------------------------------------------------------------------------------- /code_analyzer/preprocessor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/code_analyzer/preprocessor/__init__.py -------------------------------------------------------------------------------- /code_analyzer/preprocessor/node_processor.py: -------------------------------------------------------------------------------- 1 | from tree_sitter import Node 2 | from typing import Set 3 | from code_analyzer.schemas.ast_node import ASTNode 4 | import os 5 | from typing import Dict 6 | 7 | def get_node_text(node: Node) -> str: 8 | try: 9 | text = node.text.decode('utf8') 10 | except UnicodeDecodeError: 11 | text = node.text.decode('ISO-8859-1') 12 | return text 13 | 14 | class NodeProcessor: 15 | def __init__(self, unwanted_node_type: Set[str] = {}, max_depth: int = 500, 16 | comment_func_dict: Dict[ASTNode, str] = None, 17 | comment_struct_dict: Dict[ASTNode, str] = None, 18 | comment_type_dict: Dict[ASTNode, str] = None): 19 | self.unwanted_node_type: Set[str] = unwanted_node_type 20 | self.max_depth = max_depth 21 | self.cur_file = "" 22 | self.comment_func_dict = comment_func_dict 23 | self.comment_struct_dict = comment_struct_dict 24 | self.comment_type_dict = comment_type_dict 25 | 26 | # 处理类型定义 27 | def visit(self, node: Node, depth: int = 0) -> ASTNode: 28 | # 递归深度限制 29 | if depth >= self.max_depth: 30 | return None 31 | if node.type in self.unwanted_node_type: 32 | return None 33 | ast_node: ASTNode = ASTNode(node.type, 34 | get_node_text(node), 35 | node.start_point, node.end_point, 36 | file=self.cur_file) 37 | if node.prev_sibling is not None and node.prev_sibling.type == "comment" and \ 38 | node.prev_sibling.end_point[0] + 1 == node.start_point[0]: 39 | comment = get_node_text(node.prev_sibling) 40 | if node.type == "function_definition" and self.comment_func_dict is not None: 41 | self.comment_func_dict[ast_node] = comment 42 | elif node.type == "struct_specifier" and self.comment_struct_dict is not None: 43 | self.comment_struct_dict[ast_node] = comment 44 | elif node.type == "type_definition" and self.comment_type_dict is not None: 45 | self.comment_type_dict[ast_node] = comment 46 | 47 | for child in node.children: 48 | child_type: str = child.type 49 | if child_type in self.unwanted_node_type: 50 | continue 51 | child_node: ASTNode = self.visit(child, depth + 1) 52 | if child_node is None: 53 | continue 54 | child_node.parent = ast_node 55 | ast_node.children.append(child_node) 56 | # 如果node已有对应属性 57 | if hasattr(ast_node, child_type): 58 | attr = getattr(ast_node, child_type) 59 | # 如果attr是Node 60 | if isinstance(attr, ASTNode): 61 | setattr(ast_node, child_type, [attr, child_node]) 62 | # 如果attr是List 63 | else: 64 | assert isinstance(attr, list) 65 | attr.append(child) 66 | # 如果node没有对应属性 67 | else: 68 | setattr(ast_node, child_type, child_node) 69 | return ast_node 70 | 71 | root_path = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) 72 | f = open(f"{root_path}/resources/filtered_keyword.txt", 'r', encoding='utf-8') 73 | line = f.read() 74 | keywords: set = set(line.split(' ')) 75 | keywords.add('\n') 76 | processor = NodeProcessor(keywords, comment_func_dict=dict(), comment_struct_dict=dict(), 77 | comment_type_dict=dict()) -------------------------------------------------------------------------------- /code_analyzer/schemas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/code_analyzer/schemas/__init__.py -------------------------------------------------------------------------------- /code_analyzer/schemas/ast_node.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List, Tuple 3 | 4 | @dataclass 5 | class ASTNode: 6 | node_type: str 7 | node_text: str 8 | start_point: Tuple[int, int] 9 | end_point: Tuple[int, int] 10 | children: List['ASTNode'] = field(default_factory=list) 11 | parent: 'ASTNode' = None 12 | file: str = "" 13 | 14 | @property 15 | def child_count(self) -> int: 16 | return len(self.children) 17 | 18 | def __hash__(self): 19 | # Custom hash function that includes the relevant fields 20 | return hash((self.file, self.start_point, self.end_point)) 21 | 22 | def __str__(self): 23 | return self.node_text -------------------------------------------------------------------------------- /code_analyzer/schemas/enums.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | class TypeEnum(str, Enum): 4 | FunctionType = "function_type" 5 | StructType = "struct_type" 6 | UnionType = "union_type" 7 | UnknownType = "unknown_type" 8 | EnumType = "enum_type" -------------------------------------------------------------------------------- /code_analyzer/schemas/function_info.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Dict, Set 2 | from code_analyzer.schemas.ast_node import ASTNode 3 | 4 | class FuncInfo: 5 | def __init__(self, parameter_types: List[Tuple[str, str]], 6 | name_2_declarator_text: Dict[str, str], 7 | declarator_texts: List[str], 8 | var_arg: bool, 9 | raw_declarator_text: str, func_body: ASTNode, file: str, 10 | func_name: str, 11 | return_type: Tuple[str, int], 12 | comment: str = ""): 13 | self.parameter_types: List[Tuple[str, str]] = parameter_types 14 | self.var_arg: bool = var_arg 15 | self.name_2_declarator_text: Dict[str, str] = name_2_declarator_text 16 | self.declarator_texts: List[str] = declarator_texts 17 | self.return_type: Tuple[str, int] = return_type 18 | self.comment = comment 19 | 20 | # 可变参数 21 | if len(self.parameter_types) > 0 and self.parameter_types[-1][0] == "va_list": 22 | self.parameter_types.pop() 23 | self.var_arg = True 24 | self.raw_declarator_text: str = raw_declarator_text 25 | self.func_body: ASTNode = func_body 26 | self.file: str = file # 相对project根目录的相对路径 27 | self.func_name: str = func_name 28 | 29 | 30 | def set_local_var_info(self, local_var: Dict[str, str]): 31 | self.local_var: Dict[str, str] = local_var 32 | 33 | # 如果局部变量包含函数指针,将函数指针变量映射到参数类型 34 | def set_func_var2param_types(self, func_var2param_types: Dict[str, List[str]]): 35 | self.func_var2param_types: Dict[str, List[str]] = func_var2param_types 36 | 37 | # 如果形参包含函数指针,将函数指针变量映射到参数类型 38 | def set_func_param2param_types(self, func_param2param_types: Dict[str, List[str]]): 39 | self.func_param2param_types: Dict[str, List[str]] = func_param2param_types 40 | 41 | # 如果包含支持函数指针的param或者local var 42 | def set_var_arg_func_param(self, var_arg_func_param: Set[str]): 43 | self.var_arg_func_param: Set[str] = var_arg_func_param 44 | 45 | def set_var_arg_func_var(self, var_arg_func_var: Set[str]): 46 | self.var_arg_func_var: Set[str] = var_arg_func_var 47 | 48 | def set_local_var2declarator(self, local_var2declarator: Dict[str, str]): 49 | self.local_var2declarator: Dict[str, str] = local_var2declarator 50 | 51 | @property 52 | def func_def_text(self) -> str: 53 | return self.raw_declarator_text + self.func_body.node_text -------------------------------------------------------------------------------- /code_analyzer/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/code_analyzer/utils/__init__.py -------------------------------------------------------------------------------- /code_analyzer/utils/func_key_collector.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, DefaultDict, Set 2 | 3 | def get_all_func_keys(callees: Dict[str, Set[str]], 4 | llm_declarator_analysis: DefaultDict[str, Set[str]]) -> Set[str]: 5 | total_func_keys: Set[str] = set() 6 | if len(callees) > 0: 7 | total_func_keys |= set.union(*callees.values()) 8 | if len(llm_declarator_analysis) > 0: 9 | total_func_keys |= set.union(*llm_declarator_analysis.values()) 10 | return total_func_keys -------------------------------------------------------------------------------- /code_analyzer/visit_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/code_analyzer/visit_utils/__init__.py -------------------------------------------------------------------------------- /code_analyzer/visit_utils/base_util.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | def loc_inside(loc: Tuple[int, int], start_loc: Tuple[int, int], 4 | end_loc: Tuple[int, int]) -> bool: 5 | # 如果区间在同一行 6 | if start_loc[0] == end_loc[0]: 7 | if loc[0] == start_loc[0]: 8 | return start_loc[1] <= loc[1] <= end_loc[1] 9 | else: 10 | return False 11 | # 如果区间在不同行,判断给定位置是否在区间内 12 | elif start_loc[0] < end_loc[0]: 13 | if start_loc[0] == loc[0]: 14 | return start_loc[1] <= loc[1] 15 | elif loc[0] == end_loc[0]: 16 | return loc[1] <= end_loc[1] 17 | elif start_loc[0] < loc[0] < end_loc[0]: 18 | return True 19 | 20 | return False -------------------------------------------------------------------------------- /code_analyzer/visit_utils/decl_util.py: -------------------------------------------------------------------------------- 1 | from code_analyzer.schemas.ast_node import ASTNode 2 | from typing import Tuple, List, Dict, Set 3 | 4 | from code_analyzer.schemas.enums import TypeEnum 5 | from code_analyzer.visitors.util_visitor import IdentifierExtractor, FieldIdentifierExtractor 6 | 7 | # 解析param decl出现错误 8 | class DeclareTypeException(Exception): 9 | pass 10 | 11 | # 在形参定义中以及类型定义中用到,因此&是引用不是取地址运算 12 | def process_declarator(declarator: ASTNode, find_var_name: bool=True) -> Tuple[str, str, ASTNode, bool]: 13 | from code_analyzer.visitors.util_visitor import DeclaratorExtractor 14 | extractor = DeclaratorExtractor(find_var_name) 15 | extractor.traverse_node(declarator) 16 | error_flag = extractor.detected_error_node 17 | if extractor.key_node is None: 18 | raise DeclareTypeException("Exception happen when processing declarator: {}".format(declarator.node_text)) 19 | return extractor.suffix, extractor.key_node.node_text, extractor.key_node, error_flag 20 | 21 | 22 | # 处理一个declaration语句定义了多个变量的情况 23 | # 第二个参数表明是否是在处理struct/union field定义 24 | def process_multi_var_declaration(node: ASTNode, is_field_decl: bool = False, 25 | global_visitor = None)\ 26 | -> Tuple[List[Tuple[str, str]], Dict[str, List[str]], Set[str]]: 27 | var_list: List[Tuple[str, str]] = list() # 定义的变量类型以及名称 28 | unknown_var_type_list: List[Tuple[str, str]] = list() # 未知变量名以及prefix 29 | # 当前处理的变量 30 | cur_var_decl_idx = 1 31 | # 将函数指针变量映射到对应的参数类型 32 | varname2param_types: Dict[str, List[str]] = dict() 33 | # 函数指针变量中支持可变参数 34 | var_param_func_vars: Set[str] = set() 35 | 36 | cls = FieldIdentifierExtractor if is_field_decl else IdentifierExtractor 37 | 38 | # 当前处理的依旧是变量定义部分 39 | while cur_var_decl_idx < node.child_count: 40 | var_name_extractor = cls() 41 | var_node: ASTNode = node.children[cur_var_decl_idx] 42 | # 有初始化参数 43 | if var_node.node_type == "init_declarator": 44 | var_name_extractor.traverse_node(var_node.children[0]) 45 | else: 46 | var_name_extractor.traverse_node(var_node) 47 | 48 | # 是函数声明 49 | if var_name_extractor.is_function: 50 | # 如果是函数指针,那么只能定义一个函数指针变量 51 | if isinstance(var_name_extractor, FieldIdentifierExtractor): 52 | return ([("", "")], dict(), set()) 53 | else: 54 | return () 55 | 56 | # 如果是函数指针变量 57 | if var_name_extractor.is_function_type: 58 | from code_analyzer.visitors.base_func_visitor import extract_param_types 59 | type_name = TypeEnum.FunctionType.value 60 | infos = extract_param_types(node) 61 | param_types: List[str] = infos[0] 62 | var_arg: bool = infos[1] 63 | varname2param_types[var_name_extractor.var_name] = param_types 64 | var_list.append((type_name, var_name_extractor.var_name)) 65 | # 如果支持可变参数 66 | if var_arg: 67 | var_param_func_vars.add(var_name_extractor.var_name) 68 | else: 69 | unknown_var_type_list.append((var_name_extractor.suffix, 70 | var_name_extractor.var_name)) 71 | cur_var_decl_idx += 1 72 | type_node: ASTNode = node.children[0] 73 | 74 | if type_node.node_type in {"struct_specifier", "union_specifier"}: 75 | # 如果声明变量的时候同时出现匿名结构体定义 76 | if hasattr(type_node, "field_declaration_list"): 77 | assert global_visitor is not None 78 | root_type_name, anno_num = global_visitor.process_complex_specifier(type_node, 79 | TypeEnum.StructType.value, global_visitor.anonymous_struct_num) 80 | global_visitor.anonymous_struct_num = anno_num 81 | global_visitor.process_struct_specifier(type_node, root_type_name) 82 | else: 83 | assert hasattr(type_node, "type_identifier") 84 | root_type_name = type_node.type_identifier.node_text 85 | if type_node.node_type == "struct_specifier": 86 | global_visitor.struct_names.add(root_type_name) 87 | # 枚举 88 | elif type_node.node_type == "enum_specifier": 89 | # 如果声明变量或者field的时候出现匿名枚举的定义 90 | if hasattr(type_node, "enumerator_list"): 91 | root_type_name, anno_num = global_visitor.process_complex_specifier(type_node, 92 | TypeEnum.EnumType.value, 93 | global_visitor.anoymous_enum_num) 94 | global_visitor.anoymous_enum_num = anno_num 95 | else: 96 | assert hasattr(type_node, "type_identifier") 97 | root_type_name = type_node.type_identifier.node_text 98 | global_visitor.enum_infos.add(root_type_name) 99 | else: 100 | root_type_name = type_node.node_text 101 | 102 | for var_info in unknown_var_type_list: 103 | type_name = root_type_name 104 | # 指针类型 105 | if var_info[0] != "": 106 | type_name += " " + var_info[0] 107 | var_list.append((type_name, var_info[1])) 108 | 109 | return var_list, varname2param_types, var_param_func_vars 110 | -------------------------------------------------------------------------------- /code_analyzer/visit_utils/func_type.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from code_analyzer.schemas.ast_node import ASTNode 3 | 4 | def get_func_pointer_name(declarator: ASTNode, node: ASTNode): 5 | if declarator.child_count != 2: 6 | logging.getLogger("CodeAnalyzer").debug("error parsing function pointer: ", node.start_point, node.end_point, 7 | node.node_text) 8 | return False 9 | if declarator.children[0].node_type == "parenthesized_declarator": 10 | from code_analyzer.visitors.util_visitor import FuncNameExtractor 11 | func_name_extractor = FuncNameExtractor() 12 | func_name_extractor.traverse_node(declarator.children[0]) 13 | name_node: ASTNode = func_name_extractor.key_node 14 | elif declarator.children[0].node_type == "type_identifier": 15 | name_node = declarator.children[0] 16 | else: 17 | logging.getLogger("CodeAnalyzer").debug("error parsing function pointer: ", node.start_point, node.end_point, 18 | node.node_text) 19 | return False 20 | if name_node is None or name_node.node_type == "ERROR": 21 | logging.getLogger("CodeAnalyzer").debug("error parsing function pointer: ", node.start_point, node.end_point, 22 | node.node_text) 23 | return False 24 | # name_node: Node = declarator.children[0].children[1] 25 | while name_node.node_type == "pointer_declarator": 26 | name_node = name_node.children[1] 27 | # type_identifier表示类型定义,identifier表示函数指针变量声明 28 | if name_node.node_type not in {"type_identifier", "identifier"}: 29 | logging.getLogger("CodeAnalyzer").debug("error parsing function pointer: ", node.start_point, node.end_point, 30 | node.node_text) 31 | return False 32 | # assert name_node.type == "type_identifier" 33 | src_type = name_node.node_text 34 | return src_type -------------------------------------------------------------------------------- /code_analyzer/visit_utils/type_util.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Tuple, Dict 3 | from code_analyzer.schemas.enums import TypeEnum 4 | 5 | # 处理指针类型,如果不是指针类型,那相当于没处理 6 | # 需要考虑的是unsigned char这种类型由2个token组成 7 | def parsing_type(cur_type: Tuple[str, int]) -> Tuple[str, int]: 8 | src_type = cur_type[0] 9 | pointer_level = cur_type[1] 10 | # 如果是指针类型 11 | if src_type.endswith("*"): 12 | res = src_type.split(" ") 13 | src_type = ' '.join(res[:-1]) 14 | pointer_level += len(res[-1]) 15 | return (src_type, pointer_level) 16 | 17 | # c语言存在下面语法: 18 | # typedef struct DDS_Security_Serializer *DDS_Security_Serializer; 19 | # 即dst_type和src_type名字一致,但是pointer_level不一样 20 | def get_original_type(src_type: Tuple[str, int], type_alias_infos: Dict[str, str])\ 21 | -> Tuple[str, int]: 22 | cur_type_name = src_type[0] 23 | cur_pointer_level = src_type[1] 24 | previous_names = set() 25 | while cur_type_name in type_alias_infos.keys(): 26 | # 出现环路,返回unknown 27 | if cur_type_name in previous_names: 28 | return (TypeEnum.UnknownType.value, 0) 29 | previous_type_name = cur_type_name 30 | previous_names.add(previous_type_name) 31 | cur_type_name = type_alias_infos.get(cur_type_name) 32 | # 处理指针类型 33 | cur_type_name, cur_pointer_level = parsing_type((cur_type_name, cur_pointer_level)) 34 | if cur_type_name == previous_type_name: 35 | break 36 | return (cur_type_name, cur_pointer_level) 37 | 38 | def get_original_type_with_name(src_type_name: str, type_alias_infos: Dict[str, str])\ 39 | -> Tuple[str, int]: 40 | if src_type_name.endswith("*"): 41 | res = src_type_name.split(" ") 42 | cur_type_name = " ".join(res[:-1]) 43 | cur_pointer_level = len(res[-1]) 44 | else: 45 | cur_type_name = src_type_name 46 | cur_pointer_level = 0 47 | return get_original_type((cur_type_name, cur_pointer_level), type_alias_infos) -------------------------------------------------------------------------------- /code_analyzer/visitors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/code_analyzer/visitors/__init__.py -------------------------------------------------------------------------------- /code_analyzer/visitors/base_visitor.py: -------------------------------------------------------------------------------- 1 | """This module implements the visitor design pattern for Tree-sitter""" 2 | from code_analyzer.schemas.ast_node import ASTNode 3 | 4 | class ASTVisitor: 5 | # 如果不是感兴趣的结点则继续访问子节点 6 | def visit(self, node: ASTNode): 7 | """Default handler that captures all nodes not already handled""" 8 | return True 9 | 10 | # 返回False表示对子节点不感兴趣,为True则继续访问子节点 11 | # Traversing ---------------------------------------------------------------- 12 | def on_visit(self, node: ASTNode): 13 | """ 14 | Handles all nodes visted in AST and calls the underlying vistor methods. 15 | 16 | This method is called for all discovered AST nodes first. 17 | Override this to handle all nodes regardless of the defined visitor methods. 18 | 19 | Returning False stops the traversal of the subtree rooted at the given node. 20 | """ 21 | visitor_fn = getattr(self, f"visit_{node.node_type}", self.visit) 22 | return visitor_fn(node) 23 | 24 | # 前序遍历 25 | def traverse_node(self, node: ASTNode): 26 | if self.on_visit(node): 27 | for child in node.children: 28 | self.traverse_node(child) -------------------------------------------------------------------------------- /code_analyzer/visitors/macro_visitor.py: -------------------------------------------------------------------------------- 1 | from code_analyzer.schemas.ast_node import ASTNode 2 | from code_analyzer.visitors.base_visitor import ASTVisitor 3 | 4 | from typing import List, Set, Dict 5 | 6 | # 将宏函数参数的结点替换成形参 7 | class MacroCallExpandVisitor(ASTVisitor): 8 | def __init__(self, args: List[str], params: List[str]): 9 | # 实参文本 10 | self.args: List[str] = args 11 | # 形参文本 12 | self.params: List[str] = params 13 | 14 | def visit_identifier(self, node: ASTNode): 15 | if node.node_text in self.params: 16 | idx = self.params.index(node.node_text) 17 | node.node_text = self.args[idx] 18 | 19 | 20 | class ExpandCodeConcatVisitor(ASTVisitor): 21 | def __init__(self): 22 | self.terminal_nodes: List[ASTNode] = list() 23 | 24 | def visit(self, node: ASTNode): 25 | if node.child_count == 0: 26 | self.terminal_nodes.append(node) 27 | return False 28 | elif node.node_type == "string_literal": 29 | self.terminal_nodes.append(node) 30 | return False 31 | return super().visit(node) 32 | 33 | @property 34 | def code(self): 35 | return " ".join([node.node_text for node in self.terminal_nodes]) 36 | 37 | 38 | class MacroCallsiteCollectVisitor(ASTVisitor): 39 | def __init__(self, macro_funcs_args: Dict[str, List[str]], 40 | var_arg_macro_funcs: Set[str], 41 | expanded_macros: Set[str]): 42 | self.macro_funcs_args: Dict[str, List[str]] = macro_funcs_args 43 | self.var_arg_macro_funcs: Set[str] = var_arg_macro_funcs 44 | self.macro_callsites: List[ASTNode] = list() 45 | self.expanded_macros: Set[str] = expanded_macros 46 | 47 | def visit_call_expression(self, node: ASTNode): 48 | callee_expr: ASTNode = node.children[0] 49 | macro: str = callee_expr.node_text 50 | if callee_expr.node_type == "identifier" and \ 51 | macro in self.macro_funcs_args.keys(): 52 | macro_args_num = len(self.macro_funcs_args[macro]) 53 | args = node.argument_list.children 54 | args = list(filter(lambda n: n.node_text not in {"(", ")", ","}, args)) 55 | args_num = len(args) 56 | # 如果不是可变参数宏调用 57 | if macro_args_num == args_num and macro not in self.var_arg_macro_funcs\ 58 | and macro not in self.expanded_macros: 59 | self.macro_callsites.append(node) 60 | self.expanded_macros.add(macro) 61 | return False 62 | # 可变参数宏调用,那么实参 >= 形参 63 | elif macro in self.var_arg_macro_funcs and args_num >= macro_args_num\ 64 | and macro not in self.expanded_macros: 65 | self.macro_callsites.append(node) 66 | self.expanded_macros.add(macro) 67 | return False 68 | return super().visit(node) 69 | 70 | 71 | class ICallVisitor(ASTVisitor): 72 | def __init__(self, global_vars: Set[str], local_vars: Set[str], 73 | args: Set[str]): 74 | self.global_vars: Set[str] = global_vars 75 | self.local_vars: Set[str] = local_vars 76 | self.args: Set[str] = args 77 | self.call_expr: ASTNode = None 78 | 79 | def visit(self, node: ASTNode): 80 | if self.call_expr is not None: 81 | return False 82 | else: 83 | return super().visit(node) 84 | 85 | def visit_call_expression(self, node: ASTNode): 86 | if self.call_expr is not None: 87 | return False 88 | callee_expr: ASTNode = node.children[0] 89 | # 不是间接调用 90 | if callee_expr.node_type == "identifier" and \ 91 | callee_expr.node_text not in (self.global_vars | self.local_vars | self.args): 92 | return super().visit(node) 93 | # 是间接调用 94 | else: 95 | self.call_expr = node 96 | return False -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | 2 | supported_model_list = ['Qwen1.5-14B-Chat', 'Qwen1.5-32B-Chat', 'Qwen1.5-72B-Chat', 'Qwen2-72B-Instruct', 3 | 'Yi-1.5-34B-Chat', 'CodeQwen1.5-7B-Chat', 4 | 'llama-3-70b-instruct', 'llama-3-8b-instruct', 5 | 'Phi-3-mini-128k-instruct', 'Phi-3-medium-128k-instruct', 6 | 'codegemma-1.1-7b-it', 'Mixtral-8x7B-Instruct-v0.1', 7 | 'DeepSeek-Coder-V2-Lite-Instruct', 'DeepSeek-Coder-V2-Instruct', 8 | 'deepseek-coder-33b-instruct'] 9 | 10 | suffix = ["", "wo_caller_local_", "wo_caller_global_", "wo_callee_local_", "wo_callee_global_", 11 | "wo_local_", "wo_global_", "wo_all_"] -------------------------------------------------------------------------------- /experimental_logs/cost.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | | model-temperature | total input token num (k token) | total output token num (k token) | 4 | | ---- | ---- | ---- | 5 | | Qwen-0 | 95231.692 | 4612.851 | 6 | | Qwen-0.25 | 130977.157 | 6170.082 | 7 | | Qwen-0.5 | 99535.695 | 5162.244 | 8 | | Qwen-0.75 | 135530.072 | 7984.589 | 9 | | Qwen-1.0 | 144969.967 | 8272.043 | 10 | | llama3-0 | 151039.922 | 16780.559 | 11 | | llama3-0.25 | 153512.726 | 18795.727 | 12 | | llama3-0.5 | 148168.028 | 18589.131 | 13 | | llama3-0.75 | 145889.942 | 18243.632 | 14 | | llama3-1 | 158219.496 | 19825.072 | -------------------------------------------------------------------------------- /experimental_logs/coverage.md: -------------------------------------------------------------------------------- 1 | We supplement: 1.coverage report 2.manual analysis of results here 2 | 3 | # 1.coverage report 4 | 5 | | project | line coverage(%) | function coverage(%) | region coverage(%) | 6 | | ---- | ---- | ---- | ---- | 7 | | bind9 | 16.88 | 20.70 | 16.88 | 8 | | bluez | 39.86 | 10.80 | 32.62 | 9 | | cairo | 20.18 | 30.08 | 19.40 | 10 | | cyclonedds | 28.57 | 36.56 | 25.43 | 11 | | dovecot | 25.60 | 28.15 | 25.58 | 12 | | fwupd | 14.64 | 18.82 | 13.55 | 13 | | gdbm | 35.38 | 50.19 | 25.08 | 14 | | gdk-pixbuf | 4.75 | 6.79 | 4.98 | 15 | | hdf5 | 13.78 | 19.93 | 6.66 | 16 | | igraph | 62.43 | 60.44 | 51.17 | 17 | | krb5 | 9.50 | 14.32 | 6.18 | 18 | | libdwarf | 49.28 | 69.49 | 44.07 | 19 | | libjpeg-turbo | 61.64 | 72.23 | 52.03 | 20 | | libpg_query | 7.39 | 4.81 | 4.55 | 21 | | libsndfile | 6.77 | 11.25 | 7.28 | 22 | | libssh | 30.66 | 41.00 | 29.94 | 23 | | librabbitmq | 26.60 | 44.68 | 21.84 | 24 | | lua | 24.14 | 25.27 | 35.87 | 25 | | lxc | 14.88 | 23.05 | 10.92 | 26 | | md4c | 97.38 | 100.00 | 92.25 | 27 | | mdbtools | 34.21 | 39.83 | 29.71 | 28 | | nginx | 18.38 | 30.05 | 20.77 | 29 | | oniguruma | 83.60 | 80.32 | 77.35 | 30 | | opensips | 11.82 | 10.29 | 10.85 | 31 | | pjsip | 26.33 | 31.46 | 24.18 | 32 | | postfix | 8.48 | 9.54 | 11.11 | 33 | | rtpproxy | 17.89 | 23.20 | 19.57 | 34 | | selinux | 42.11 | 58.63 | 38.99 | 35 | | sudo | 68.35 | 72.49 | 65.16 | 36 | | tmux | 12.78 | 22.43 | 12.14 | 37 | | vlc | 6.61 | 8.43 | 12.11 | 38 | 39 | 40 | # 2.manual analysis of results 41 | 42 | Due to the dynamic analysis approach we employ for generating ground truth, the issue of coverage inherently introduces false negatives within the ground truth, 43 | necessitating additional manual efforts. 44 | These efforts involve manually re-annotating certain samples. 45 | However, given the complexity of real-world projects, we cannot ensure that the manually annotated samples are free from false positives or false negatives. 46 | 47 | Thus, we adopt an alternative analysis method to supplement the results, as detailed in Section 4.2 (RQ1). 48 | Starting from FLTA-exclusive cases (i.e., those interprocedural calls identified only by FLTA in traditional analysis), we have analyzed the results of Qwen1.5 with a temperature of 0.5 across 294 FLTA-exclusive cases, as this configuration performs best. 49 | The analysis includes the number of ground-truth callees, the number of FLTA-analyzed callees, the number of SEA-analyzed callees, the F1 score of FLTA, and the F1 score of SEA. 50 | Detailed information is provided [here](intermediate_res/Qwen72-0.5.csv). 51 | 52 | ## 2.1.Overall Analysis 53 | 54 | Our paper describes that in 206 cases, SEA shows improvements, in 61 cases, there is no change, and in the remaining cases, there is a decrease. 55 | 56 | - Among 294 cases 57 | 58 | * the average size of callee_set decrease from 33.05 to 6.18 after SEA filter. 59 | 60 | * the largest size of callee_set decrease from 303 to 100. 61 | 62 | * the average ratio of callee_set / ground_truth_num decrease from 26.68 to 4.03 63 | 64 | * the largest ratio of callee_set / ground_truth_num decrease from 303 to 49 65 | 66 | - And before SEA filter, there are 160 callers who owns callee set larger than 10, after SEA filter, the number decrease to 47. 67 | 68 | 69 | We continue analyze false negative and false positive problem. 70 | 71 | ## 2.2.False Negative Analysis 72 | 73 | We have discussed in the last part of 4.2 (RQ1) that ambiguous semantic similarity between the caller and callee may introduce false negatives. 74 | The VLC example we mention in the paper is as follows: 75 | 76 | ```cpp 77 | // caller 78 | probe(cb, i < strict_total, ap) 79 | 80 | // callee 81 | static int generic_start(void *func, bool forced, va_list ap) 82 | { 83 | vlc_object_t *obj = va_arg(ap, vlc_object_t *); 84 | int (*activate)(vlc_object_t *) = func; 85 | int ret; 86 | 87 | obj->force = forced; 88 | ret = activate(obj); 89 | if (ret != VLC_SUCCESS) 90 | vlc_objres_clear(obj); 91 | return ret; 92 | } 93 | ``` 94 | 95 | LLM's analysis is: 96 | 97 | ``` 98 | Based on the provided summaries, it seems that the `generic_start` function is not directly related to the functionality of the `probe` function. 99 | The `probe` function is used to test the suitability of candidate modules in the `vlc_module_load` function by calling a control function (`cb`) provided by each module. 100 | On the other hand, `generic_start` is a helper function that activates a given function pointer and handles error cleanup. 101 | ``` 102 | 103 | The semantic similarity between `probe` and `generic_start` is ambiguous. Which decrease the performance of SEA. 104 | 105 | 106 | ## 2.3.False Positive Analysis 107 | 108 | Identifying false positive analysis results poses more challenges because the false negatives in ground-truth. 109 | And analyze all cases one by one is time-consuming. We start we cases with higher ratio of callee_set / ground_truth_num. 110 | 111 | 112 | ```cpp 113 | // caller 114 | io->callback(io->context); 115 | 116 | // ground-truth callee: 117 | static void stream_send_io(struct file_ostream *fstream) { ...} 118 | ``` 119 | 120 | Some false positives reported callees by SEA includes: `i_stream_nonuls_read`, `test_ioloop_fd_cb_left`, `iostream_pump_copy.` 121 | 122 | - `test_ioloop_fd_cb_left` is due to the hallucinations of LLM. Where the summaries generated by LLM indicates no similarity between the caller and callee, LLM still respond 'yes'. 123 | 124 | - `i_stream_nonuls_read` and `iostream_pump_copy` to a extent can be attributed to LLM. On the other hand, their similarities with caller is much stronger than `test_ioloop_fd_cb_left`, more likely to incur hallucinations. 125 | 126 | 127 | ## 2.4.Potential Solutions 128 | 129 | To further solve false positives and false negatives. One potential solution is to conduct semantic-enhanced data-flow analysis. 130 | By identifying their alias information more lightweight but not simply relying on semantic relations. 131 | 132 | Or we can study more in scnarios and functions of caller and callee. But this needs time-consuming human efforts and can increate false-negatives. -------------------------------------------------------------------------------- /experimental_logs/error_cases.md: -------------------------------------------------------------------------------- 1 | 2 | # source code层面类型匹配失误的地方 3 | 4 | `LOGIT(result);`,`LOGIT` 为宏定义,在该调用处只有一个参数被传入,但是其宏定义为: 5 | 6 | ```cpp 7 | #define LOGIT(result) \ 8 | if (result == ISC_R_NOMEMORY) \ 9 | (*callbacks->error)(callbacks, "dns_master_load: %s", \ 10 | isc_result_totext(result)); \ 11 | else \ 12 | (*callbacks->error)(callbacks, "%s: %s:%lu: %s", \ 13 | "dns_master_load", source, line, \ 14 | isc_result_totext(result)) 15 | ``` 16 | 17 | 也就是该函数调用需要至少支持3个参数的,最终调用的函数为,也就是至少2个参数。展开宏定义后匹配不上。 18 | 19 | 同时,有的function definition是宏定义扩展生成的,因为无法直接对预处理前的c文件解析得到 20 | 21 | ```cpp 22 | static void 23 | isclog_error_callback(dns_rdatacallbacks_t *callbacks, const char *fmt, ...) 24 | ``` 25 | 26 | 27 | 另一种情况就是调用的函数不在当前project范围内。 28 | 29 | 然后还有c文件解析错误导致没有正常解析callee或者indirect-call的情况 30 | 31 | 类型匹配漏报情况统计: 32 | 33 | | project | 漏报情况 | 34 | |----|----| 35 | | nginx| 1外部调用 | 36 | | bind9 | 1宏定义展开参数+6宏定义展开函数 | 37 | | cyclonedds | 2解析错误 | 38 | | dovecot | 4宏定义+1复合call | 39 | | hdf5 | 4文件解析错误+2宏定义 | 40 | | igraph | 3文件解析错误+13外部调用 | 41 | | libdwarf | 47宏定义 | 42 | | lxc | 1宏定义展开函数 | 43 | | md4c | 34宏定义展开 | 44 | 45 | 46 | 47 | bind9 48 | 49 | ``` 50 | lib/dns/master.c:2113:2-宏定义展开后参数变多 51 | lib/dns/dst_api.c:2327:9-调用func为宏定义展开函数 52 | lib/dns/dst_api.c:747:10-调用func为宏定义展开函数 53 | ``` 54 | 55 | cyclonedds 56 | 57 | ``` 58 | src/ddsrt/src/xmlparser.c:641:14-调用project之外的func 59 | ``` 60 | 61 | dovecot 62 | 63 | ``` 64 | src/lib/connection.c:591:2-宏定义导致icall位置发生变化从而让ground-truth失效 65 | src/lib-smtp/smtp-server-command.c:223:2-宏定义导致icall位置发生变化从而让ground-truth失效 66 | src/lib-smtp/smtp-server-command.c:351:3-宏定义导致icall位置发生变化从而让ground-truth失效 67 | src/lib/connection.c:813:32-宏定义导致icall位置发生变化从而让ground-truth失效 + 复合表达式调用造成ground-truth失误 68 | src/lib/connection.c:816:2-宏定义导致icall位置发生变化从而让ground-truth失效 69 | ``` 70 | 71 | hdf5 72 | 73 | ``` 74 | src/H5.c:268:17-c文件解析错误 75 | src/H5VLint.c:198:13-c文件解析错误 76 | src/H5SL.c:818:19-c文件解析错误 77 | src/H5SL.c:2089:26-c文件解析错误 78 | src/H5Omessage.c:487:5-宏定义展开后参数变多 79 | src/H5Omessage.c:1160:13-宏定义展开后参数变多 80 | ``` 81 | 82 | igraph 83 | 84 | ``` 85 | src/core/error.c:171:9-c文件未正确解析 86 | src/core/error.c:282:9-c文件未正确解析 87 | src/core/error.c:383:9-c文件未正确解析 88 | src/internal/qsort.c:135:27-外部调用+宏定义扩展参数 89 | src/internal/qsort.c:158:36-外部调用+宏定义扩展参数 90 | src/internal/qsort.c:166:36-外部调用+宏定义扩展参数 91 | src/internal/qsort.c:184:27-外部调用+宏定义扩展参数 92 | src/isomorphism/bliss.cc:97:12-调用外部函数 93 | src/isomorphism/bliss/graph.cc:625:26-调用外部函数 94 | src/isomorphism/bliss/graph.cc:634:3-调用外部函数 95 | src/isomorphism/bliss/graph.cc:675:3-调用外部函数 96 | src/isomorphism/bliss/graph.cc:3973:37-调用外部函数 97 | src/isomorphism/bliss/graph.cc:183:30-调用外部函数 98 | src/isomorphism/bliss/graph.cc:189:30-调用外部函数 99 | src/isomorphism/bliss/graph.cc:729:3-调用外部函数 100 | src/isomorphism/bliss/graph.cc:420:26-调用外部函数 101 | ``` 102 | 103 | libdwarf 104 | 105 | ``` 106 | src/lib/libdwarf/dwarf_die_deliv.c:452:5-宏定义导致参数数量增加 107 | src/lib/libdwarf/dwarf_die_deliv.c:319:5-宏定义导致参数数量增加 108 | src/lib/libdwarf/dwarf_die_deliv.c:383:9-宏定义导致参数数量增加 109 | src/lib/libdwarf/dwarf_die_deliv.c:387:9-宏定义导致参数数量增加 110 | ``` -------------------------------------------------------------------------------- /experimental_logs/filter_test.md: -------------------------------------------------------------------------------- 1 | 2 | 尝试过用CodeBert或者VarCLR做简单过滤 3 | 4 | # 1.VarCLR效果 5 | 6 | ## 1.1.类型相似度预测 7 | 8 | 因为变量名随机因素太多,而类型名往往语义相似性会强些,因此这里先用VarCLR进行类型名预测。 9 | 10 | group1: 11 | 12 | src: `char`, target: `{u_char, ssize_t, int, short, intptr, unsigned char}` 13 | 14 | 结果为: `0.64, 0.45, 0.90, 0.66, 0.79, 0.66`,可以看到 `u_char`,`unsigned char` 和 `char` 的相似度低于 `int` 和 `intptr` 15 | 16 | 17 | group2: 18 | 19 | src: `ngx_conf_t`, target: `{ngx_conf_s, ngx_module_t, ngx_module_s, ngx_log_t, ngx_log_s, ngx_event_t, ngx_event_s}` 20 | 21 | 结果为 `0.52, 0.70, 0.43, 0.54, 0.53, 0.86, 0.64`,VarCLR这里依旧产生了误报,`ngx_conf_s` 理应权重最高。 22 | 23 | group 3: 24 | 25 | src: `ngx_uint_t`, target: `{ngx_log_t, ngx_err_t, uintptr_t, ngx_int_t, unsigned long, unsigned int, int, u_char, ngx_conf_t, ngx_command_t}` 26 | 27 | 结果为 `0.95, 0.63, 0.64, 0.59, 0.54, 0.80, 0.58, 0.64, 0.64, 0.63`,可以看到VarCLR依旧有误报,分数最高的 `ngx_log_t` 毫无关系,而同名类型 `uintptr_t`, `unsigned long` 分数仅仅为0.64和0.54。而文本且语义相似的 `ngx_int_t` 分数也只有0.59。 28 | 29 | 因此可以得出结论用VarCLR来替代字符串匹配做type match可行性较低。 30 | 31 | ## 1.2.函数名分析 32 | 33 | group1: 34 | 35 | src: `handler`, target: `{ngx_load_module, ngx_resolver_log_error, ngx_http_fastcgi_lowat_check, ngx_http_upstream_rewrite_location, ngx_http_fastcgi_split_path_info}` 36 | 37 | 结果为 `0.32, 0.37, 0.45, 0.31, 0.32`,基于函数名过滤还是有些难度的。 38 | 39 | 40 | group2: 41 | 42 | src: `writer`, target: `{ngx_log_error,ngx_ssl_get_cached_session,ngx_http_ssl_npn_advertised,ngx_http_v2_filter_get_shadow,ngx_http_xslt_sax_external_subset,ngx_http_xslt_sax_error,ngx_log_memory_writer,ngx_http_log_error_handler,ngx_syslog_writer,ngx_ssl_password_callback}` 43 | 44 | 结果为: `0.30, 0.69, 0.71, 0.15, 0.38, 0.30, 0.34, 0.32, 0.17, 0.43`,最可能成为true positive的2个函数分数只有 `0.34`、`0.17`,分数最高的同样也是false positive。 45 | 46 | # 2.CodeBert测试 47 | 48 | callsite code: `log->handler(log, p, last - p)` 49 | 50 | 函数声明包括: 51 | 52 | ```c 53 | char * 54 | ngx_conf_set_sec_slot(ngx_conf_t *cf, ngx_command_t *cmd, void *conf) 55 | 56 | static u_char * 57 | ngx_resolver_log_error(ngx_log_t *log, u_char *buf, size_t len) 58 | 59 | static u_char * 60 | ngx_http_log_error(ngx_log_t *log, u_char *buf, size_t len) 61 | 62 | static char * 63 | ngx_http_xslt_entities(ngx_conf_t *cf, ngx_command_t *cmd, void *conf) 64 | 65 | static char * 66 | ngx_http_try_files(ngx_conf_t *cf, ngx_command_t *cmd, void *conf) 67 | 68 | static u_char * 69 | ngx_http_v2_state_priority(ngx_http_v2_connection_t *h2c, u_char *pos, 70 | u_char *end) 71 | 72 | static u_char * 73 | ngx_http_v2_state_proxy_protocol(ngx_http_v2_connection_t *h2c, u_char *pos, 74 | u_char *end) 75 | 76 | static ngx_int_t 77 | ngx_http_geoip_country_variable(ngx_http_request_t *r, 78 | ngx_http_variable_value_t *v, uintptr_t data) 79 | 80 | static ngx_int_t 81 | ngx_http_variable_sent_last_modified(ngx_http_request_t *r, 82 | ngx_http_variable_value_t *v, uintptr_t data) 83 | 84 | static char * 85 | ngx_mail_smtp_merge_srv_conf(ngx_conf_t *cf, void *parent, void *child) 86 | ``` 87 | 88 | 分析结果为: `0.96, 0.99, 0.94, 0.96, 0.96, 0.97, 0.95, 0.97, 0.96, 0.96`,可以看到分数基本一致,很难区分true/false positive。 89 | 90 | 往icallsite添加一些上下文信息: 91 | 92 | ```c 93 | ngx_log_t *log 94 | u_char *p, *last, *msg; 95 | log->handler(log, p, last - p) 96 | ``` 97 | 98 | 结果变为: `0.99, 0.99, 0.99, 0.99, 0.98, 0.99, 0.98, 0.99, 0.99, 0.99`,更难区分true/false positive。 -------------------------------------------------------------------------------- /experimental_logs/intermediate_res/failed_callsites.txt: -------------------------------------------------------------------------------- 1 | src/packet.c:1518:14, target function declarator为宏展开,匹配失误 2 | src/lxc/confile.c:3225:10, target为宏展开 3 | src/md4c-html.c:76:5, target在scope外 4 | src/core/ngx_conf_file.c:304:18, target在scope外 5 | src/st.c:468:15, unsigned long和char*, void*类型的类型转换 6 | pjlib/include/pj/pool_i.h:85:12, target在scope外 7 | pjlib/include/pj/pool_i.h:102:9, target在scope外 8 | pjlib/src/pj/os_core_unix.c:701:27, target在scope外 9 | src/rtpp_module_if.c:454:21, target在scope外 10 | plugins/sudoers/pwutil.c:307:12, target function declarator为宏,在address-taken时miss掉了 11 | plugins/sudoers/pwutil.c:1030:12, target function declarator为宏,在address-taken时miss掉了 12 | lib/util/fatal.c:335:2, target在scope外 13 | lib/util/fatal.c:341:2, target在scope外 14 | lib/util/fatal.c:190:2, target在scope外 15 | lib/util/fatal.c:251:2, target在scope外 16 | plugins/sudoers/pwutil.c:1088:14, target function declarator为宏,在address-taken时miss掉了 17 | plugins/sudoers/strvec_join.c:60:6, target在scope外 18 | lib/util/lbuf.c:455:2, target在scope外 19 | plugins/sudoers/pwutil.c:891:12, target function declarator为宏,在address-taken时miss掉了 20 | lib/util/digest.c:169:5, target function的address-taken site被宏替代 21 | src/modules/entry.c:492:9, target在scope外 22 | src/misc/messages.c:370:19, target在scope外 23 | src/input/demux.c:111:15, target在scope外 24 | include/vlc_es_out.h:182:5, target在scope外 25 | src/input/demux.c:213:16, target在scope外 26 | src/modules/modules.c:305:9, target在scope外 27 | src/modules/modules.c:275:11, target在scope外 28 | include/vlc_es_out.h:150:12, target在scope外 29 | include/vlc_es_out.h:166:12, target在scope外 30 | include/vlc_es_out.h:161:12, target在scope外 31 | include/vlc_es_out.h:155:5, target在scope外 -------------------------------------------------------------------------------- /experimental_logs/manual_analysis/callsite_bind9_1/log1.txt: -------------------------------------------------------------------------------- 1 | callsite_key: src/core/ddsi/src/ddsi_config.c:1626:11 2 | 3 | 1.Query for ChatGPT: 4 | 5 | You're a code analyzer tasked with assessing whether an indirect call potentially invoke a target function, given relative information. 6 | 7 | The indirect-call expression is: (rdataset->methods->setownercase)(rdataset, name). 8 | 9 | The subsequent text provides the summary of the indirect-call and the corresponding function: 10 | 11 | ## 1.summary of indirect-call: 12 | 13 | ## 1.1.summary of function dns_rdataset_setownercase containing indirect-call and summary of indirect-call itself: 14 | 1. The dns_rdataset_setownercase function is designed to set the owner case for a given DNS resource record dataset. It checks if the provided rdataset is valid and if it has associated methods. If the method for setting the owner case exists and the dataset's attributes do not indicate to keep the case, it invokes the method to set the owner case. 15 | 16 | 2. The code surrounding the indirect call (rdataset->methods->setownercase)(rdataset, name) first checks if the method for setting the owner case exists and if the dataset attributes allow for changing the case. If both conditions are met, it invokes the method with the provided rdataset and name as arguments. This ensures that the owner case is set appropriately for the given dataset, based on the method implementation. 17 | 18 | ## 1.2.summary of the function pointer declaration for the indirect-call: 19 | The setownercase function pointer, residing within the struct dns_rdatasetmethods, is designed to handle the modification of the owner name case within a DNS resource record dataset (dns_rdataset_t). Its purpose is to set the owner name case to match a specified DNS name (dns_name_t). This pointer facilitates operations related to adjusting the case of owner names within DNS record datasets. 20 | 21 | ## 2.summary of target function rdataset_setownercase: 22 | 23 | ## 2.1.summary of the target function: 24 | This function sets a flag in a DNS resource record list to indicate uppercase letters in the owner name. 25 | 26 | ## 2.2.summary of the target function's address-taken site: 27 | The function pointer assigned by the address of dns_rdatalist_setownercase within the dns_rdatasetmethods struct is intended to set the owner case for a given dns_rdataset_t. This function pointer facilitates modifying the case of the owner name within the context of DNS resource datasets, allowing for manipulation and standardization of owner name casing within the dataset. 28 | 29 | # Question: 30 | Assess if rdataset_setownercase could be one of the target function that indirect call (rdataset->methods->setownercase)(rdataset, name) potentially invoke based solely on their respective functionalities. Please disregard additional context like detailed implementation, control- & data-flow, or types and class hierarchy for now; we'll verify that separately. 31 | 32 | If the indirect-call potentially invoke the target function, answer 'yes'; otherwise, answer 'no'. 33 | 34 | 2.Answer of ChatGPT: 35 | 36 | # Answer1: 37 | 38 | Yes 39 | 40 | # Answer2: 41 | 42 | Yes. 43 | 44 | # Answer3: 45 | 46 | Yes. 47 | 48 | # Answer4: 49 | 50 | Yes, based solely on their respective functionalities, it seems that rdataset_setownercase could indeed be one of the target functions that the indirect call (rdataset->methods->setownercase)(rdataset, name) potentially invokes. Both functions are involved in setting the owner case for a DNS resource record dataset, indicating a similarity in their purpose and functionality. 51 | 52 | # Answer5: 53 | 54 | Yes. -------------------------------------------------------------------------------- /experimental_logs/manual_analysis/callsite_cyclonedds_13/log1.txt: -------------------------------------------------------------------------------- 1 | callsite_key: src/core/ddsi/src/ddsi_config.c:1701:9 2 | 3 | 1.Query for ChatGPT: 4 | 5 | You're a code analyzer tasked with assessing whether an indirect call potentially invoke a target function, given relative information. 6 | 7 | The indirect-call expression is: ce->print (cfgst, parent, ce, sources). 8 | 9 | The subsequent text provides the summary of the indirect-call and the corresponding function: 10 | 11 | ## 1.summary of indirect-call: 12 | 13 | ## 1.1.summary of function print_configitems containing indirect-call and summary of indirect-call itself: 14 | 1.Functionality of print_configitems function: 15 | The print_configitems function iterates over a collection of configuration elements (cfgelem) and prints their details. It handles both singular and multiple instances of configuration elements, recursively processing children and attributes. It maintains a stack (cfgst) to track the traversal and handles special cases for moved or deprecated elements. 16 | 2.Use of indirect call ce->print (cfgst, parent, ce, sources): 17 | This indirect call is utilized to print the details of a configuration element. It dynamically dispatches the appropriate print function based on the type of configuration element (ce). The function pointer ce->print allows for polymorphic behavior, enabling different types of configuration elements to be printed correctly. The function call is conditional on whether the ce->print function pointer is not NULL, ensuring that only elements with defined printing behavior are processed. 18 | 19 | ## 1.2.summary of the function pointer declaration for the indirect-call: 20 | The function pointer print within the struct cfgelem serves as a callback mechanism for printing the configuration elements. It points to a function conforming to the print_fun_t type, which takes parameters related to configuration settings (struct ddsi_cfgst, void *parent, struct cfgelem const * const, and uint32_t sources) and performs printing operations. This facilitates modular and customizable printing of configuration data within the context of the provided structures and types. 21 | 22 | ## 2.summary of target function pf_memsize16: 23 | 24 | ## 2.1.summary of the target function: 25 | The pf_memsize16 function retrieves a 16-bit memory size value from a configuration element and converts it to bytes using a lookup table, passing the result to pf_int64_unit along with specified sources and units. 26 | 27 | ## 2.2.summary of the target function's address-taken site: 28 | The cfgelem struct represents elements of a configuration system, storing information about configuration options and their attributes. The function pointer assigned by the address of pf_memsize16 within this struct is likely intended to handle printing the memory size associated with a configuration option, allowing for the formatted display of memory size information within the configuration system's output. 29 | 30 | # Question: 31 | Assess if pf_memsize16 could be one of the target function that indirect call ce->print (cfgst, parent, ce, sources) potentially invoke based solely on their respective functionalities. Please disregard additional context like detailed implementation, control- & data-flow, or types and class hierarchy for now; we'll verify that separately. 32 | 33 | If the indirect-call potentially invoke the target function, answer 'yes'; otherwise, answer 'no'. 34 | 35 | 2.Answer of ChatGPT: 36 | 37 | # Answer1: 38 | 39 | Based solely on the provided summaries: 40 | 41 | print_configitems function utilizes an indirect call ce->print(cfgst, parent, ce, sources) to print configuration elements dynamically based on their type. 42 | pf_memsize16 function retrieves a 16-bit memory size value from a configuration element and converts it to bytes using a lookup table, passing the result to another function along with specified sources and units. 43 | 44 | Since pf_memsize16 doesn't seem to have any direct connection with printing configuration elements or their details, and it focuses on memory size retrieval and conversion, it's unlikely to be the target function invoked by the indirect call ce->print(cfgst, parent, ce, sources). Therefore, the answer is 'no'. 45 | 46 | # Answer2: 47 | 48 | No. 49 | 50 | # Answer3: 51 | 52 | No. 53 | 54 | # Answer4: 55 | 56 | Based solely on the functionalities described: 57 | 58 | The indirect call ce->print(cfgst, parent, ce, sources) is utilized to print details of a configuration element based on its type, dynamically dispatching the appropriate print function. 59 | 60 | The target function pf_memsize16 retrieves a 16-bit memory size value from a configuration element and converts it to bytes, then passes the result to pf_int64_unit along with specified sources and units. 61 | 62 | Given these descriptions, it seems unlikely that pf_memsize16 is the target function invoked by the indirect call ce->print(cfgst, parent, ce, sources). The purpose of pf_memsize16 is specifically related to memory size retrieval and conversion, while the indirect call is intended for printing configuration element details based on their types. Therefore, the answer is 'no'. 63 | 64 | # Answer5: 65 | 66 | That reasoning sounds solid. It seems like pf_memsize16 is more specialized for memory size retrieval and conversion, whereas the indirect call you described is aimed at printing configuration element details dynamically based on their types. So, you're correct in concluding that pf_memsize16 is unlikely to be the target function invoked by the indirect call. 67 | 68 | -------------------------------------------------------------------------------- /experimental_logs/manual_analysis/callsite_cyclonedds_13/log2.txt: -------------------------------------------------------------------------------- 1 | callsite_key: src/core/ddsi/src/ddsi_config.c:1701:9 2 | 3 | 1.Query for ChatGPT: 4 | 5 | You're a code analyzer tasked with assessing whether an indirect call potentially invoke a target function, given relative information. 6 | 7 | The indirect-call expression is: ce->print (cfgst, parent, ce, sources). 8 | 9 | The subsequent text provides the summary of the indirect-call and the corresponding function: 10 | 11 | ## 1.summary of indirect-call: 12 | 13 | ## 1.1.summary of function print_configitems containing indirect-call and summary of indirect-call itself: 14 | 1.Functionality of print_configitems function: 15 | The print_configitems function iterates over a collection of configuration elements (cfgelem) and prints their details. It handles both singular and multiple instances of configuration elements, recursively processing children and attributes. It maintains a stack (cfgst) to track the traversal and handles special cases for moved or deprecated elements. 16 | 2.Use of indirect call ce->print (cfgst, parent, ce, sources): 17 | This indirect call is utilized to print the details of a configuration element. It dynamically dispatches the appropriate print function based on the type of configuration element (ce). The function pointer ce->print allows for polymorphic behavior, enabling different types of configuration elements to be printed correctly. The function call is conditional on whether the ce->print function pointer is not NULL, ensuring that only elements with defined printing behavior are processed. 18 | 19 | ## 1.2.summary of the function pointer declaration for the indirect-call: 20 | The function pointer print within the struct cfgelem serves as a callback mechanism for printing the configuration elements. It points to a function conforming to the print_fun_t type, which takes parameters related to configuration settings (struct ddsi_cfgst, void *parent, struct cfgelem const * const, and uint32_t sources) and performs printing operations. This facilitates modular and customizable printing of configuration data within the context of the provided structures and types. 21 | 22 | ## 2.summary of target function pf_nop: 23 | 24 | ## 2.1.summary of the target function: 25 | The pf_nop function is a static void function that takes in several unused arguments and does nothing, essentially serving as a placeholder or a no-operation function. 26 | 27 | ## 2.2.summary of the target function's address-taken site: 28 | The function pointer assigned by the address of function pf_nop within the cfgelem struct serves the purpose of providing a no-operation behavior for certain configuration elements. This function pointer likely facilitates operations related to initialization, updating, freeing, and printing of configuration elements, allowing for flexibility in handling different configurations while providing a default behavior of no-operation when needed. 29 | 30 | # Question: 31 | Assess if pf_nop could be one of the target function that indirect call ce->print (cfgst, parent, ce, sources) potentially invoke based solely on their respective functionalities. Please disregard additional context like detailed implementation, control- & data-flow, or types and class hierarchy for now; we'll verify that separately. 32 | 33 | If the indirect-call potentially invoke the target function, answer 'yes'; otherwise, answer 'no'. 34 | 35 | 2.Answer of ChatGPT: 36 | 37 | # Answer1: 38 | 39 | Based solely on their respective functionalities as described: 40 | 41 | The print_configitems function utilizes the ce->print indirect call to dynamically dispatch the appropriate print function based on the type of configuration element. 42 | The pf_nop function serves as a placeholder or a no-operation function. 43 | 44 | Given that the ce->print function pointer is conditional on whether it's not NULL, and considering that pf_nop could serve as a default behavior for certain configuration elements, it's plausible that pf_nop could be one of the target functions that the indirect call ce->print(cfgst, parent, ce, sources) potentially invokes. 45 | 46 | So, the answer is 'yes'. 47 | 48 | # Answer2: 49 | 50 | Yes. 51 | 52 | # Answer3: 53 | 54 | Yes. 55 | 56 | # Answer4: 57 | 58 | Yes 59 | 60 | # Answer5: 61 | 62 | Yes. -------------------------------------------------------------------------------- /experimental_logs/manual_analysis/callsite_cyclonedds_3/log1.txt: -------------------------------------------------------------------------------- 1 | callsite_key: src/core/ddsi/src/ddsi_config.c:1626:11 2 | 3 | 1.Query for ChatGPT: 4 | 5 | You're a code analyzer tasked with assessing whether an indirect call potentially invoke a target function, given relative information. 6 | 7 | The indirect-call expression is: upd (cfgst, parent, cfgelem, (n->count == n->failed), value). 8 | 9 | The subsequent text provides the summary of the indirect-call and the corresponding function: 10 | 11 | ## 1.summary of indirect-call: 12 | 13 | ## 1.1.summary of function do_update containing indirect-call and summary of indirect-call itself: 14 | The do_update function is responsible for updating configuration settings (cfgst) based on incoming updates. It creates or looks up configuration elements (cfgelem) associated with a parent entity (parent). It checks the multiplicity of the configuration element and handles errors accordingly. Additionally, it tracks the count and sources of updates for each element. 15 | 16 | The indirect call upd(cfgst, parent, cfgelem, (n->count == n->failed), value) within do_update is used to apply an update to a configuration element. The function upd is passed the configuration state, parent entity, configuration element, a condition based on the counts of successful and failed updates, and the new value. This call is conditioned on the count of the configuration element not exceeding its multiplicity and the result of the update. 17 | 18 | 19 | ## 1.2.summary of the function pointer declaration for the indirect-call: 20 | The function pointer upd of type update_fun_t is intended to update a configuration state (cfgst) based on the given parameters, including the parent element, configuration element (cfgelem), a condition (n->count == n->failed), and a value. Its purpose is to execute a function that modifies the configuration state according to these parameters. 21 | 22 | 23 | ## 2.summary of target function uf_random_seed: 24 | 25 | ## 2.1.summary of the target function: 26 | It looks like this code is a function uf_random_seed that updates a random seed based on some configuration. Let's break it down: 27 | 28 | - The function takes several arguments: a pointer to a struct ddsi_cfgst, a pointer to the parent object, a pointer to a struct cfgelem, an integer first (which seems to be unused), and a pointer to a string value. 29 | 30 | - It declares a pointer elem to ddsert_prng_seed_t, which is presumably the type of the random seed. 31 | 32 | - It checks if the value passed to the function is an empty string using strcmp. If it's empty, it generates a random seed using ddsert_prng_makeseed. 33 | 34 | - If the value is not empty, it computes an MD5 hash of the value using the ddsrt_md5_* functions and stores the result as the random seed. It appears to use the MD5 hash as a pseudo-random seed. 35 | 36 | - Finally, it returns URES_SUCCESS. 37 | 38 | A couple of things to note: 39 | 40 | - The UNUSED_ARG macro suggests that the first argument is not used in this function. 41 | - The memcpy calls copy the computed MD5 hash into elem, assuming it's at least 32 bytes long. 42 | 43 | Do you have any specific questions or concerns about this code? 44 | 45 | ## 2.2.summary of the target function's address-taken site: 46 | The variable entity_autonaming_attributes of type struct cfgelem appears to represent a configuration element related to autonaming attributes. The function pointer assigned by the address of uf_random_seed within this structure likely serves to initialize or update a configuration setting associated with providing an initial seed for entity naming, as specified in the description, enabling consistent generation of names across runs by hashing a provided string. 47 | 48 | # Question: 49 | Assess if uf_random_seed could be one of the target function that indirect call upd (cfgst, parent, cfgelem, (n->count == n->failed), value) potentially invoke based solely on their respective functionalities. Please disregard additional context like detailed implementation, control- & data-flow, or types and class hierarchy for now; we'll verify that separately. 50 | 51 | If the indirect-call potentially invoke the target function, answer 'yes'; otherwise, answer 'no'. 52 | 53 | 54 | 2.Answer of ChatGPT: 55 | 56 | # Answer1: 57 | 58 | Yes, based solely on their respective functionalities, it seems likely that uf_random_seed could be one of the target functions that the indirect call upd(cfgst, parent, cfgelem, (n->count == n->failed), value) potentially invokes. Both functions involve updating configuration settings (cfgst), potentially based on conditions ((n->count == n->failed)), and both take similar parameters related to configuration elements (cfgelem) and values. 59 | 60 | Therefore, the answer is 'yes'. 61 | 62 | # Answer2: 63 | 64 | Yes. 65 | 66 | # Answer3: 67 | 68 | Yes, based solely on their respective functionalities, uf_random_seed could potentially be the target function that the indirect call upd(cfgst, parent, cfgelem, (n->count == n->failed), value) invokes. Both functions involve updating configuration settings based on incoming updates, and uf_random_seed seems to handle configuration updates related to generating or updating random seeds, which aligns with the purpose described for the indirect call upd. Therefore, the answer is 'yes'. 69 | 70 | # Answer4: 71 | 72 | Yes. 73 | 74 | # Answer5: 75 | 76 | Yes. -------------------------------------------------------------------------------- /experimental_logs/manual_analysis/callsite_libjpeg-turbo_361/log1.txt: -------------------------------------------------------------------------------- 1 | callsite_key: jdmaster.c:805:9 2 | 3 | 1.Query for ChatGPT: 4 | 5 | You're a code analyzer tasked with assessing whether an indirect call potentially invoke a target function, given relative information. 6 | 7 | The indirect-call expression is: (*cinfo->cconvert->start_pass) (cinfo). 8 | 9 | The subsequent text provides the summary of the indirect-call and the corresponding function: 10 | 11 | ## 1.summary of indirect-call: 12 | 13 | ## 1.1.summary of function prepare_for_output_pass containing indirect-call and summary of indirect-call itself: 14 | 1.The prepare_for_output_pass function is responsible for preparing the decompressor for an output pass. It checks if it's the final pass of 2-pass quantization, selects quantization methods, sets up various processing stages, and updates progress monitoring information. 15 | 2.The indirect call (*cinfo->cconvert->start_pass) (cinfo) is used within the context to start the color conversion process if raw data output is not enabled and merged upsampling is not being used. This call is part of the preparation steps for output pass, ensuring necessary processing stages are initiated before further processing of the decompressed image data 16 | 17 | ## 1.2.summary of the function pointer declaration for the indirect-call: 18 | The function pointer start_pass within the struct jpeg_color_deconverter serves as a callback mechanism to execute specific operations at the beginning of each pass during JPEG image color conversion. Its purpose is to initiate necessary actions before processing begins, such as initialization or setup tasks, leveraging the context provided by the j_decompress_ptr cinfo parameter. 19 | 20 | ## 2.summary of target function start_pass_dcolor: 21 | 22 | ## 2.1.summary of the target function: 23 | The function start_pass_dcolor initializes a new pass for decompression in a JPEG decompression process with no additional work required. 24 | 25 | ## 2.2.summary of the target function's address-taken site: 26 | The function pointer start_pass within the jpeg_color_converter struct is designed to initiate a pass in color conversion during JPEG compression, taking a j_compress_ptr parameter. By assigning the address of the function start_pass_dcolor to this pointer in _jinit_color_deconverter, the code establishes a specific function to execute this initiation step, tailored for color conversion in JPEG compression. This allows for flexible customization of the color conversion process within the JPEG compression workflow. 27 | 28 | # Question: 29 | Assess if start_pass_dcolor could be one of the target function that indirect call (*cinfo->cconvert->start_pass) (cinfo) potentially invoke based solely on their respective functionalities. Please disregard additional context like detailed implementation, control- & data-flow, or types and class hierarchy for now; we'll verify that separately. 30 | 31 | If the indirect-call potentially invoke the target function, answer 'yes'; otherwise, answer 'no'. 32 | 33 | 2.Answer of ChatGPT: 34 | 35 | # Answer1: 36 | 37 | Yes. 38 | 39 | # Answer2: 40 | 41 | Yes 42 | 43 | # Answer3: 44 | 45 | Yes. 46 | 47 | # Answer4: 48 | 49 | Yes. 50 | 51 | # Answer5: 52 | 53 | Yes. -------------------------------------------------------------------------------- /experimental_logs/manual_analysis/callsite_lxc_1/log1.txt: -------------------------------------------------------------------------------- 1 | callsite_key: src/lxc/lxccontainer.c:3091:8 2 | 3 | 1.Query for ChatGPT: 4 | 5 | You're a code analyzer tasked with assessing whether an indirect call potentially invoke a target function, given relative information. 6 | 7 | The indirect-call expression is: config->set(key, v, conf, NULL). 8 | 9 | The subsequent text provides the summary of the indirect-call and the corresponding function: 10 | 11 | ## 1.summary of indirect-call: 12 | 13 | ## 1.1.summary of function lxc_set_config_item_locked containing indirect-call and summary of indirect-call itself: 14 | 1. The lxc_set_config_item_locked function is responsible for setting configuration items within a locked context. It retrieves a configuration structure based on the provided key, sets the configuration item using the retrieved configuration structure, and performs additional operations based on the result. 15 | 16 | 2. The indirect call config->set(key, v, conf, NULL) within lxc_set_config_item_locked is used to set a configuration item identified by the provided key and value. It is part of a broader process where the function retrieves a configuration structure based on the key, sets the configuration item using the retrieved structure, and then performs error handling and additional operations based on the outcome. 17 | 18 | 19 | 20 | ## 2.summary of target function set_config_init_cmd: 21 | 22 | ## 2.1.summary of the target function: 23 | The function set_config_init_cmd sets the initialization command in the LXC configuration structure based on the provided key and value. 24 | 25 | ## 2.2.summary of the target function's address-taken site: 26 | The function pointer assigned by the address of set_config_init_cmd in the config_jump_table array initializer is intended to enable the setting of the initialization command in the LXC configuration. This pointer facilitates the manipulation of initialization command configuration parameters within the broader context of LXC configuration management. 27 | 28 | # Question: 29 | Assess if set_config_init_cmd could be one of the target function that indirect call config->set(key, v, conf, NULL) potentially invoke based solely on their respective functionalities. Please disregard additional context like detailed implementation, control- & data-flow, or types and class hierarchy for now; we'll verify that separately. 30 | 31 | If the indirect-call potentially invoke the target function, answer 'yes'; otherwise, answer 'no'. 32 | 33 | 2.Answer of ChatGPT: 34 | 35 | # Answer1: 36 | 37 | Yes. The indirect call config->set(key, v, conf, NULL) potentially invokes the target function set_config_init_cmd based solely on their respective functionalities. Both functions involve setting configuration items based on a provided key and value within the broader context of configuration management. 38 | 39 | # Answer2: 40 | 41 | Yes. Based on the provided summaries, it seems plausible that the indirect call config->set(key, v, conf, NULL) within the function lxc_set_config_item_locked could potentially invoke the target function set_config_init_cmd. Both functions involve setting configuration items based on provided keys and values within the context of managing LXC configuration. Therefore, the indirect call could indeed be invoking set_config_init_cmd. 42 | 43 | # Answer3: 44 | 45 | Based solely on the provided summaries of the indirect call and the target function, it seems likely that the function set_config_init_cmd could indeed be one of the target functions that the indirect call config->set(key, v, conf, NULL) potentially invokes. Both functions involve setting configuration items based on a key and value within a broader context of configuration management. Therefore, I'd answer 'yes'. 46 | 47 | # Answer4: 48 | 49 | Yes, based solely on their respective functionalities, it seems plausible that the indirect call config->set(key, v, conf, NULL) could potentially invoke the target function set_config_init_cmd. Both functions involve setting configuration items based on a provided key and value within the context of managing configuration settings. However, without further analysis of the code structure, control flow, and data flow, it cannot be definitively confirmed. 50 | 51 | # Answer5: 52 | 53 | Yes. -------------------------------------------------------------------------------- /experimental_logs/updated_results.md: -------------------------------------------------------------------------------- 1 | 2 | We have updated results with following LLMs: Yi-1.5-34B-Chat, Mixtral-8x7B-Instruct-v0.1. 3 | 4 | # RQ1.Performance of SEA combined with traditional approach 5 | 6 | Full Evaluation 7 | 8 | | model-temperature | flta-P | flta-R | flta-F | mlta-P | mlta-R | mlta-F | kelp-P | kelp-R | kelp-F | 9 | | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | 10 | | traditional approach | 26.1 | 97.9 | 34.8 | 45.1 | 97.4 | 53.4 | 49.2 | 97.4 | 57.5 | 11 | | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | 12 | | Qwen1.5-0 | 48.7 | 96.9 | 59.0 | 51.8 | 96.6 | 61.3 | 54.8 | 96.8 | 63.7 | 13 | | Qwen1.5-0.25 | 48.8 | 97.3 | 59.1 | 51.8 | 96.9 | 61.3 | 54.7 | 97.0 | 63.6 | 14 | | Qwen1.5-0.5 | 49.1 | 97.3 | 59.4 | 51.9 | 96.9 | 61.3 | 54.9 | 97.1 | 63.8 | 15 | | Qwen1.5-0.75 | 49.3 | 97.2 | 59.4 | 52.2 | 96.9 | 61.5 | 54.8 | 97.0 | 63.7 | 16 | | Qwen1.5-1.0 | 48.5 | 96.4 | 58.6 | 51.3 | 96.0 | 60.5 | 54.9 | 96.8 | 63.6 | 17 | | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | 18 | | llama3-0.0 | 46.5 | 95.9 | 56.6 | 51.3 | 95.6 | 60.5 | 55.0 | 96.2 | 63.6 | 19 | | llama3-0.25 | 47.0 | 96.6 | 57.1 | 51.5 | 96.3 | 60.8 | 54.9 | 96.3 | 63.6 | 20 | | llama3-0.5 | 45.9 | 96.2 | 56.2 | 51.6 | 95.8 | 60.6 | 55.3 | 96.5 | 63.8 | 21 | | llama3-0.75 | 45.4 | 96.0 | 55.8 | 50.6 | 95.6 | 59.9 | 54.3 | 96.3 | 63.1 | 22 | | llama3-1.0 | 46.3 | 96.9 | 56.6 | 52.1 | 96.6 | 61.3 | 55.2 | 96.7 | 63.8 | 23 | | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | 24 | | Yi-0 | 44.4 | 96.4 | 54.4 | 50.3 | 96.0 | 59.2 | 54.6 | 96.8 | 63.2 | 25 | | Yi-0.25 | 44.2 | 96.1 | 54.4 | 50.1 | 95.7 | 59.1 | 54.1 | 96.1 | 62.7 | 26 | | Yi-0.5 | 44.8 | 96.9 | 55.0 | 50.5 | 96.5 | 59.6 | 53.9 | 96.6 | 62.5 | 27 | | Yi-0.75 | 44.5 | 96.4 | 54.7 | 49.9 | 96.0 | 59.1 | 53.5 | 96.5 | 62.3 | 28 | | Yi-1 | 43.6 | 96.2 | 53.5 | 49.3 | 95.7 | 58.4 | 52.8 | 95.8 | 61.5 | 29 | | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | 30 | | Mixtral-0 | 40.9 | 96.2 | 51.0 | 48.9 | 95.8 | 57.8 | 53.2 | 96.5 | 61.9 | 31 | | Mixtral-0.25 | 41.4 | 97.1 | 51.4 | 49.3 | 96.7 | 58.3 | 53.3 | 96.7 | 61.9 | 32 | | Mixtral-0.5 | 41.9 | 97.3 | 51.9 | 49.5 | 96.9 | 58.5 | 53.4 | 96.9 | 62.0 | 33 | | Mixtral-0.75 | 43.0 | 97.3 | 52.8 | 50.2 | 96.9 | 58.9 | 53.9 | 96.9 | 62.3 | 34 | | Mixtral-1 | 42.2 | 96.7 | 51.7 | 49.6 | 96.3 | 58.3 | 54.0 | 96.9 | 62.4 | 35 | 36 | 37 | Exclusive Evaluation 38 | 39 | | model-temperature | flta-P | flta-R | flta-F | mlta-P | mlta-R | mlta-F | 40 | | ---- | ---- | ---- | ---- | ---- | ---- | ---- | 41 | | traditional approach | 31.4 | 99.4 | 38.2 | 52.9 | 98.0 | 61.6 | 42 | | Qwen1.5-0 | 59.3 | 97.5 | 66.4 | 52.9 | 98.0 | 61.6 | 43 | | Qwen1.5-0.25 | 59.4 | 97.7 | 66.4 | 52.9 | 98.0 | 61.6 | 44 | | Qwen1.5-0.5 | 60.6 | 97.2 | 67.2 | 53.3 | 97.8 | 62.0 | 45 | | Qwen1.5-0.75 | 60.5 | 97.1 | 66.9 | 52.9 | 98.0 | 61.6 | 46 | | Qwen1.5-1 | 60.3 | 97.4 | 66.7 | 53.2 | 97.4 | 61.7 | 47 | | ---- | ---- | ---- | ---- | ---- | ---- | ---- | 48 | | llama3-0 | 57.4 | 96.9 | 63.7 | 52.9 | 98.0 | 61.6 | 49 | | llama3-0.25 | 56.8 | 96.9 | 63.4 | 53.0 | 97.7 | 61.7 | 50 | | llama3-0.5 | 57.2 | 97.2 | 63.9 | 52.9 | 98.0 | 61.6 | 51 | | llama3-0.75 | 55.8 | 96.6 | 62.6 | 52.9 | 97.9 | 61.7 | 52 | | llama3-1.0 | 56.8 | 97.6 | 63.6 | 52.8 | 97.5 | 61.5 | 53 | | ---- | ---- | ---- | ---- | ---- | ---- | ---- | 54 | | Yi-0 | 54.0 | 98.1 | 61.0 | 53.0 | 97.6 | 61.6 | 55 | | Yi-0.25 | 56.1 | 98.4 | 63.0 | 52.4 | 97.0 | 61.0 | 56 | | Yi-0.5 | 55.7 | 97.6 | 62.4 | 53.0 | 97.8 | 61.5 | 57 | | Yi-0.75 | 53.7 | 97.4 | 60.8 | 53.1 | 97.7 | 61.6 | 58 | | Yi-1 | 52.8 | 96.1 | 59.6 | 52.8 | 97.5 | 61.5 | 59 | | ---- | ---- | ---- | ---- | ---- | ---- | ---- | 60 | | Mixtral-0 | 51.0 | 97.5 | 57.6 | 52.9 | 97.7 | 61.6 | 61 | | Mixtral-0.25 | 50.7 | 96.9 | 57.2 | 53.1 | 97.9 | 61.8 | 62 | | Mixtral-0.5 | 50.3 | 98.2 | 57.1 | 52.8 | 97.4 | 61.4 | 63 | | Mixtral-0.75 | 50.0 | 97.4 | 56.7 | 52.8 | 97.1 | 61.3 | 64 | | Mixtral-1 | 50.7 | 97.6 | 57.4 | 52.9 | 97.9 | 61.6 | 65 | 66 | 67 | 68 | # RQ2.Ablation Study 69 | 70 | The study is conducted under temperature of 0.5 for Qwen1.5,llama3,Yi. For Mixtral, since it perform best under temperature 0.75. Its ablation study in under 0.75 temperature. 71 | 72 | | model-group | flta-P | flta-R | flta-F | mlta-P | mlta-R | mlta-F | kelp-P | kelp-R | kelp-F | 73 | | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | 74 | | traditional approach | 26.1 | 97.9 | 34.8 | 45.1 | 97.4 | 53.4 | 49.2 | 97.4 | 57.5 | 75 | | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | 76 | | Qwen1.5-origin | 49.1 | 97.3 | 59.4 | 51.9 | 96.9 | 61.4 | 55.0 | 97.0 | 63.9 | 77 | | Qwen1.5-wo-local | 47.7 | 95.5 | 57.5 | 51.3 | 95.1 | 60.1 | 54.1 | 95.4 | 62.6 | 78 | | Qwen1.5-wo-global | 44.7 | 87.3 | 53.4 | 48.4 | 86.9 | 56.1 | 52.2 | 88.8 | 59.8 | 79 | | Qwen1.5-wo-all | 38.8 | 58.8 | 41.9 | 42.2 | 58.4 | 44.2 | 46.9 | 68.4 | 50.5 | 80 | | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | 81 | | llama3-origin | 45.9 | 96.2 | 56.2 | 51.6 | 95.8 | 60.6 | 55.3 | 96.4 | 63.8 | 82 | | llama3-wo-local | 43.7 | 97.7 | 53.8 | 49.5 | 97.3 | 58.6 | 53.4 | 97.3 | 62.1 | 83 | | llama3-wo-global | 43.7 | 89.1 | 52.6 | 49.4 | 88.7 | 57.1 | 52.5 | 90.2 | 60.3 | 84 | | llama3-wo-all | 33.4 | 95.9 | 43.1 | 46.9 | 95.5 | 55.4 | 51.1 | 96.3 | 59.7 | 85 | | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | 86 | | Yi-origin | 44.8 | 96.9 | 55.0 | 50.5 | 96.5 | 59.6 | 53.9 | 96.6 | 62.5 | 87 | | Yi-wo-local | 42.5 | 96.7 | 52.6 | 48.6 | 96.3 | 57.6 | 53.0 | 96.3 | 61.5 | 88 | | Yi-wo-global | 40.4 | 90.6 | 49.3 | 48.2 | 90.1 | 56.0 | 52.8 | 92.6 | 60.7 | 89 | | Yi-wo-all | 32.5 | 91.2 | 41.5 | 45.7 | 90.7 | 53.5 | 51.5 | 94.6 | 59.7 | 90 | | ---- | ---- | ---- | ---- | 91 | | Mixtral-origin | 43.0 | 97.3 | 52.8 | 50.2 | 96.9 | 58.9 | 53.9 | 96.9 | 62.3 | 92 | | Mixtral-wo-local | 43.1 | 95.2 | 51.6 | 49.5 | 94.8 | 57.6 | 53.3 | 94.8 | 61.2 | 93 | | Mixtral-wo-global | 40.3 | 89.5 | 48.8 | 47.2 | 89.1 | 55.0 | 52.7 | 92.1 | 60.5 | 94 | | Mixtral-wo-all | 37.3 | 62.3 | 41.3 | 42.1 | 61.9 | 44.8 | 46.9 | 70.1 | 50.7 | 95 | -------------------------------------------------------------------------------- /icall_solvers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/icall_solvers/__init__.py -------------------------------------------------------------------------------- /icall_solvers/base_solvers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/icall_solvers/base_solvers/__init__.py -------------------------------------------------------------------------------- /icall_solvers/base_solvers/base_matcher.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from tqdm import tqdm 3 | from typing import Dict, DefaultDict, Set, List, Tuple 4 | from collections import defaultdict 5 | 6 | from code_analyzer.visitors.util_visitor import get_top_level_expr, get_local_top_level_expr 7 | from code_analyzer.schemas.ast_node import ASTNode 8 | from code_analyzer.definition_collector import BaseInfoCollector 9 | 10 | class BaseStaticMatcher: 11 | def __init__(self, args): 12 | self.args = args 13 | # 分析过的callsite 14 | self.analyzed_callsites: Set[str] = set() 15 | # 在function内没有被成功分析的case 16 | self.local_failed_callsites: Set[str] = set() 17 | 18 | # 保存匹配上的函数名 19 | self.callees: DefaultDict[str, Set[str]] = defaultdict(set) 20 | # 如果uncertain 21 | self.uncertain_callees: DefaultDict[str, Set[str]] = defaultdict(set) 22 | self.uncertain_idxs: DefaultDict[str, Dict[str, Set[int]]] = defaultdict(dict) 23 | self.llm_declarator_analysis: DefaultDict[str, Set[str]] = defaultdict(set) 24 | # 通过cast分析后得到的matching result 25 | self.cast_callees: DefaultDict[str, Set[str]] = defaultdict(set) 26 | 27 | self.expanded_macros: Dict[str, str] = dict() 28 | self.macro_call_exprs: Dict[str, str] = dict() 29 | 30 | self.icall_2_decl_text: Dict[str, str] = dict() 31 | self.icall_2_decl_type_text: Dict[str, str] = dict() 32 | self.icall_2_struct_name: Dict[str, str] = dict() 33 | 34 | self.icall_2_type_comment: Dict[str, str] = dict() 35 | self.icall_2_struct_comment: Dict[str, str] = dict() 36 | 37 | self.macro_callsites: Set[str] = set() 38 | # 保存每个indirect-callsite所在的function 39 | self.icall_2_func: Dict[str, str] = dict() 40 | # 保存每个indirect-callsite的ASTNode 41 | self.icall_nodes: Dict[str, ASTNode] = dict() 42 | 43 | @abc.abstractmethod 44 | def process_all(self): 45 | pass 46 | 47 | 48 | class BaseInfoAnalyzer: 49 | def __init__(self, collector: BaseInfoCollector, 50 | raw_global_addr_sites: Dict[str, List[ASTNode]], 51 | raw_local_addr_sites: Dict[str, Dict[str, List[ASTNode]]]): 52 | self.collector: BaseInfoCollector = collector 53 | 54 | # global scope的address-taken site只需要考虑init_declarator 55 | self.global_addr_sites: Dict[str, List[Tuple[ASTNode, int, ASTNode]]] = dict() 56 | # local scope的address-taken site考虑init_declarator, assignment_expression, argument_list, 57 | # conditional_expression 58 | self.local_declarators: DefaultDict[str, DefaultDict[str, List[Tuple[ASTNode, int, ASTNode]]]] = defaultdict( 59 | lambda: defaultdict(list)) 60 | self.local_assignment_exprs: DefaultDict[ 61 | str, DefaultDict[str, List[Tuple[ASTNode, int, ASTNode]]]] = defaultdict( 62 | lambda: defaultdict(list)) 63 | self.local_call_expr: DefaultDict[str, List[Tuple[ASTNode, int]]] = defaultdict(list) 64 | self.call_expr_arg_idx: DefaultDict[str, List[Tuple[str, int]]] = defaultdict(list) 65 | 66 | self.pre_analyze(raw_global_addr_sites, raw_local_addr_sites) 67 | 68 | 69 | def pre_analyze(self, raw_global_addr_sites: Dict[str, List[ASTNode]], 70 | raw_local_addr_sites: Dict[str, Dict[str, List[ASTNode]]]): 71 | # 全局declarator分析 72 | for func_name, nodes in tqdm(raw_global_addr_sites.items(), desc="collecting raw declarators"): 73 | decl_nodes: List[Tuple[ASTNode, int, ASTNode]] = list() 74 | for node in nodes: 75 | top_level_node, initializer_level = get_top_level_expr(node) 76 | if top_level_node is None: 77 | continue 78 | if top_level_node.node_type == "init_declarator": 79 | decl_nodes.append((top_level_node, initializer_level, node)) 80 | 81 | decl_nodes: List[Tuple[ASTNode, int, ASTNode]] = \ 82 | list(map(lambda x: (x[0], x[1], x[2]), decl_nodes)) 83 | 84 | if len(decl_nodes) > 0: 85 | self.global_addr_sites[func_name] = decl_nodes 86 | 87 | # local declarator分析 88 | for func_name, node_in_func in tqdm(raw_local_addr_sites.items(), desc="collecting local declarators"): 89 | for func_key, nodes in node_in_func.items(): 90 | for node in nodes: 91 | top_level_node, initializer_level = get_local_top_level_expr(node) 92 | if top_level_node is None: 93 | continue 94 | if top_level_node.node_type == "init_declarator": 95 | self.local_declarators[func_name][func_key].append((top_level_node, 96 | initializer_level, node)) 97 | 98 | elif top_level_node.node_type == "assignment_expression" or \ 99 | (top_level_node.node_type == "conditional_expression" 100 | and hasattr(top_level_node, "assignment_expression")): 101 | self.local_assignment_exprs[func_name][func_key].append((top_level_node, 102 | initializer_level, node)) 103 | 104 | 105 | elif top_level_node.node_type == "call_expression": 106 | self.local_call_expr[func_name].append((top_level_node, initializer_level)) 107 | 108 | 109 | @abc.abstractmethod 110 | def analyze(self): 111 | pass -------------------------------------------------------------------------------- /icall_solvers/base_solvers/flta/ReadMe.md: -------------------------------------------------------------------------------- 1 | # 1.类型分析 2 | 3 | 下面步骤(1)-(3)依旧基于传统分析,(4)开始涉及LLM。同时(1)-(4)属于一个参数一个参数的比较类型,(5)属于让LLM对类型相关的文本进行整体性比对。 4 | 5 | - (1).首先比较调用点的各个参数以及call target各个参数的类型,如果能够比较出结果,那么直接返回 `true`,比较不出,则尝试 (2) 6 | 7 | - (2).找到调用点对应的函数指针声明出,比较函数指针声明和call target匹不匹配,如果非常匹配,则返回 `true`,比较不出,则尝试 (3)。 8 | 9 | - (3).开始进行 `cast` 分析,当前步骤依旧采用规则分析:如果比较的类型有结构体类型,那么进一步查看当前两个类型是否存在包含关系,如果存在包含关系,那么则返回 `true`,否则尝试 (4)。 10 | 11 | - (4).通过LLM分析函数指针声明的类型和call target的类型是否有相似处,选取能够匹配的target。 12 | 13 | ## 1.1.传统类型比对 14 | 15 | 1.用传统严格类型匹配方法比较调用点和call target的参数 16 | 17 | 找出indirect-call对应的call expression各个实参的声明,提取类型。下面示例中参数类型为 `ngx_cycle_t*`,则所有支持一个参数的且类型为 `ngx_cycle_t*` 的函数为潜在调用目标。 18 | 19 | ```cpp 20 | ngx_cycle_t *cycle; 21 | cycle->modules[i]->init_module(cycle) 22 | ``` 23 | 24 | 对于参数类型求解,采用以下方法(通过递归求解): 25 | 26 | - 如果参数是一个 `identifer` 类型表达式,表示引用了一个变量,那么直接在局部、全局变量表搜索对应变量声明,查询类型。 27 | 28 | - 如果参数是一个 `field_expression` 类型表达式,表示访问了结构体,那么首先求解 `base` 变量的类型,随后查询对应 `field` 的类型。比如变量声明 `ngx_module_t *module;` 和 `module->ctx_index`,求解类型的时候先查出 `module` 类型为 `ngx_module*`,随后在 `ngx_module` 类型定义下找出 `ctx_index` 类型为 `ngx_uint_t`。 29 | 30 | - 如果是 `subscript_expression` 或者 `pointer_expression`,表示进行了数组或者指针访问,只需将 `base` 变量类型pointer_level减1即可,比如访问 `int` 数组,那么对应类型就是 `int`。 31 | 32 | - 其它类型,返回 `unknown_type`。 33 | 34 | 2.找出对应函数指针的声明,求解类型 35 | 36 | (a)在求解参数类型时容易出错,比如 `(*cf->handler)(cf, NULL, cf->handler_conf);`,`NULL` 可能是一个指针类型,但是不知道是什么指针,因此第2个参数类型求解不出来,这个时候就需要知道 `cf->handler` 的函数指针声明时候的类型。这里 `handler` 声明为 `ngx_conf_handler_pt handler;`, `typedef char *(*ngx_conf_handler_pt)(ngx_conf_t *cf, ngx_command_t *dummy, void *conf);`,可知,函数指针参数类型为 `(ngx_conf_t*, ngx_command_t*, void* )`。 37 | 38 | 39 | 可能会出现函数指针声明的位置:`typedef`,`struct`定义,全局变量定义,局部变量和函数参数定义。 40 | 41 | 42 | ## 1.2.cast分析 43 | 44 | 涉及到cast的例子的类型比对其实我没有在benchmark中找到,但为了sound还是抛出了一个类型分析策略 45 | 46 | cast分析主要在(3)-(4)步,这里的分析和之前类似,依旧是比对参数类型。当比对两个参数类型的时候,如果无法用严格类型匹配推断出当前比对两个类型,那么则进行cast分析。 47 | 48 | 当前步骤依旧是逐个参数逐个参数比对类型,同时在当前cast分析,只分析涉及到结构体的cast,对于 `long`, `int` 这些原生类型转换不在当前步骤处理。 49 | 50 | insight:结构体之间或者结构体和其它类型的转换通常涉及两种情况(**Note:** 通常cast都是指针类型,如果不是指针类型由于每个结构体类型占用内存空间几乎都不一样非常容易发生错误): 51 | 52 | - 类型包含 53 | 54 | - 父类子类转换 55 | 56 | 1.类型包含:下面示例中 `ngx_str_node_t` 的第一个field的类型是 `ngx_rbtree_node_t`,同时在程序中发现了从 `ngx_str_node_t*` 指针cast到 `ngx_rbtree_node_t*` 的情况,这属于典型的类型包含。这种转换通常合法,推测主要的用途就是用另一种方式访问结构体的某些field。而且几乎所有这类cast中,**被包含类型**是包含类型的**第一个field**。 57 | 58 | solution:步骤(3)用来处理这类case,通常无需LLM,如果有结构体类型,分析该结构体第1个field是不是另外一个类型就行。如果是则返回 `true`,否则继续分析是否是父类子类转换情况。 59 | 60 | ```cpp 61 | typedef struct { 62 | ngx_rbtree_node_t node; 63 | ngx_str_t str; 64 | } ngx_str_node_t; 65 | 66 | typedef struct ngx_rbtree_node_s ngx_rbtree_node_t; 67 | 68 | struct ngx_rbtree_node_s { 69 | ngx_rbtree_key_t key; 70 | ngx_rbtree_node_t *left; 71 | ngx_rbtree_node_t *right; 72 | ngx_rbtree_node_t *parent; 73 | u_char color; 74 | u_char data; 75 | }; 76 | ``` 77 | 78 | 2.父类子类转换:这类case更加复杂,通常涉及到的两个 `struct` 指针类型。下面示例中 `generic_operation` 是 `accept_operation` 和 `read_operation` 的父类。但是无法通过简单的规则比较判断,而通过LLM分析可以大致判断: 79 | 80 | - `generic_operation` 看起来像是 `accept_operation` 和 `read_operation` 的一种抽象。 81 | 82 | - field都包含 `PJ_DECL_LIST_MEMBER(struct generic_operation);` 和 `pj_ioqueue_operation_e op;`,存在相似性。这一步中 `PJ_DECL_LIST_MEMBER` 是个宏定义,展开分析非常复杂,最好的方式就是通过LLM判断这两个类型是否存在相似性。 83 | 84 | solution: 对于两个结构体指针类型,把两个 `struct` 类型定义喂给大模型,让其判断是否是父类子类关系。 85 | 86 | ```cpp 87 | struct generic_operation 88 | { 89 | PJ_DECL_LIST_MEMBER(struct generic_operation); 90 | pj_ioqueue_operation_e op; 91 | }; 92 | 93 | struct accept_operation 94 | { 95 | PJ_DECL_LIST_MEMBER(struct accept_operation); 96 | pj_ioqueue_operation_e op; 97 | 98 | pj_sock_t *accept_fd; 99 | pj_sockaddr_t *local_addr; 100 | pj_sockaddr_t *rmt_addr; 101 | int *addrlen; 102 | }; 103 | 104 | struct read_operation 105 | { 106 | PJ_DECL_LIST_MEMBER(struct read_operation); 107 | pj_ioqueue_operation_e ; 108 | 109 | void *buf; 110 | pj_size_t size; 111 | unsigned flags; 112 | pj_sockaddr_t *rmt_addr; 113 | int *rmt_addrlen; 114 | }; 115 | ``` 116 | 117 | 118 | 父类子类的一个更加极端的case如下:`Node` 是 `BagNode` 的父类,但是很明显 `Node` 的定义比 `BagNode` 复杂的多。 119 | 120 | 121 | ```cpp 122 | typedef struct _Node { 123 | union { 124 | struct { 125 | NodeType node_type; 126 | int status; 127 | struct _Node* parent; 128 | struct _Node* body; 129 | } base; 130 | 131 | StrNode str; 132 | CClassNode cclass; 133 | QuantNode quant; 134 | BagNode bag; 135 | BackRefNode backref; 136 | AnchorNode anchor; 137 | ConsAltNode cons; 138 | CtypeNode ctype; 139 | #ifdef USE_CALL 140 | CallNode call; 141 | #endif 142 | GimmickNode gimmick; 143 | } u; 144 | } Node; 145 | 146 | 147 | typedef struct { 148 | NodeType node_type; 149 | int status; 150 | struct _Node* parent; 151 | struct _Node* body; 152 | 153 | enum BagType type; 154 | union { 155 | struct { 156 | int regnum; 157 | AbsAddrType called_addr; 158 | int entry_count; 159 | int called_state; 160 | } m; 161 | struct { 162 | OnigOptionType options; 163 | } o; 164 | struct { 165 | /* body is condition */ 166 | struct _Node* Then; 167 | struct _Node* Else; 168 | } te; 169 | }; 170 | 171 | /* for multiple call reference */ 172 | OnigLen min_len; /* min length (byte) */ 173 | OnigLen max_len; /* max length (byte) */ 174 | OnigLen min_char_len; 175 | OnigLen max_char_len; 176 | int opt_count; /* referenced count in optimize_nodes() */ 177 | } BagNode; 178 | ``` 179 | 180 | 181 | ## 1.3.文本分析 182 | 183 | 如果当前declarator无法通过(1)-(4)步逐个的比对类型判断,那么 -------------------------------------------------------------------------------- /icall_solvers/base_solvers/flta/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/icall_solvers/base_solvers/flta/__init__.py -------------------------------------------------------------------------------- /icall_solvers/base_solvers/flta/matching_result.py: -------------------------------------------------------------------------------- 1 | from enum import IntEnum 2 | 3 | class MatchingResult(IntEnum): 4 | """Matching result of a signature.""" 5 | NO = 0 6 | YES = 1 7 | UNCERTAIN = 2 -------------------------------------------------------------------------------- /icall_solvers/base_solvers/flta/prompt.py: -------------------------------------------------------------------------------- 1 | 2 | system_prompt = """ 3 | You task is to determine the potential parent-child relationship between two C struct types.""" 4 | 5 | user_prompt = """ 6 | Given the struct type {struct_type1} and {struct_type2} as reference, assess whether there is parent-child relationship between {struct_type2} and {struct_type1}. Consider the following criteria: 7 | 8 | 1.Common Initial Fields: verify if the initial fields of both struct types share similar names and types. 9 | 10 | 2.Type Naming Convention: check whether the struct names {struct_type1} and {struct_type2} reflect a parent-child relationship. 11 | 12 | The type definition of {struct_type1} is: 13 | 14 | {struct_type1_definition} 15 | 16 | The type definition of {struct_type2} is: 17 | 18 | {struct_type2_definition} 19 | 20 | Answer me whether there is parent-child relationship between struct {struct_type2} and {struct_type1} with "yes" or "no" 21 | """ 22 | 23 | system_prompt_declarator = """You are a text analyzer tasked with analyzing the similarity between two declarators. 24 | """ 25 | 26 | user_prompt_declarator = """ 27 | Given a function pointer declarator and a function declarator, your task is to evaluate whether the parameter types of function pointer can match that of the function in following steps: 28 | 29 | - 1.Extract the parameter list separately from both the function pointer declarator and the function declarator. 30 | 31 | - 2.Compare each parameter's type individually for a match, ensuring identical names and pointer hierarchies for types to match. 32 | 33 | Note that: 34 | 35 | - 1.Certain parameter declarations may be wrapped or followed by macros like UNUSED_PARAM or unused, which do not impact the parameter type. For example, UNUSED_PARAM(int var) matches the type of int var. 36 | 37 | - 2.Types like int, long, size_t could be considered as compatible due to implicit cast. 38 | 39 | The function pointer declarator is 40 | 41 | {} 42 | 43 | The function declarator is 44 | 45 | {} 46 | 47 | {} 48 | """ 49 | 50 | system_prompt_context = """You are a text analyzer tasked with analyzing whether argument types 51 | are compatible with parameter types of a function parameter.""" 52 | 53 | user_prompt_context = """ 54 | Given an indirect-call, a function declarator. I'm not sure whether their {idx} arguments and parameters types are compatible. 55 | You need to help me determine. 56 | 57 | The indirect-call text is: 58 | 59 | {icall_text} 60 | 61 | Corresponding argument list is: {arg_text} 62 | 63 | The function declarator text is: 64 | 65 | {func_decl_text} 66 | 67 | {contexts} 68 | 69 | Analyze whether the {idx} argument types are compatible with the {idx} parameter types in two steps: 70 | 71 | - 1.Extract the expressions of {idx} arguments from argument list and analyze their types with corresponding variable declaration if provided. 72 | 73 | - 2.Extract the {idx} parameter declarator, ensuring exact match of its type with corresponding arguments if provided. Note that two types match only with identical names and pointer hierarchies. 74 | 75 | Note that: 76 | 77 | - 1.Macros like UNUSED_PARAM could appear in declarations. 78 | 79 | - 2.Types like int, long, size_t could be considered as compatible due to implicit cast. 80 | 81 | - 3.If declarators of {idx} arguments are missing, you may infer their compatibility with {idx} parameters using the argument text and parameter names of {idx}. 82 | 83 | """ 84 | 85 | 86 | supplement_prompts = { 87 | "user_prompt_declarator": "If function pointer parameters match function parameters, answer 'yes'; otherwise, answer 'no'.", 88 | "user_prompt_context": "If all {idx} argument types potentially match their respective parameters, respond with 'yes'. Otherwise, respond with 'no'." 89 | } -------------------------------------------------------------------------------- /icall_solvers/base_solvers/kelp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/icall_solvers/base_solvers/kelp/__init__.py -------------------------------------------------------------------------------- /icall_solvers/base_solvers/mlta/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/icall_solvers/base_solvers/mlta/__init__.py -------------------------------------------------------------------------------- /icall_solvers/base_solvers/mlta/matcher.py: -------------------------------------------------------------------------------- 1 | from code_analyzer.definition_collector import BaseInfoCollector 2 | 3 | from icall_solvers.base_solvers.base_matcher import BaseStaticMatcher 4 | from icall_solvers.base_solvers.flta.matcher import TypeAnalyzer 5 | from icall_solvers.base_solvers.mlta.type_confine_analyzer import TypeConfineAnalyzer 6 | 7 | from typing import Dict, Set, DefaultDict 8 | from tqdm import tqdm 9 | 10 | class StructTypeMatcher(BaseStaticMatcher): 11 | def __init__(self, collector: BaseInfoCollector, 12 | args, 13 | type_analyzer: TypeAnalyzer, 14 | confine_analyzer: TypeConfineAnalyzer, 15 | callsite_idxs: Dict[str, int] = None, 16 | escaped_types: DefaultDict[str, Set[str]] = None): 17 | super().__init__(args) 18 | self.collector: BaseInfoCollector = collector 19 | self.confine_analyzer: TypeConfineAnalyzer = confine_analyzer 20 | self.escaped_types = escaped_types 21 | 22 | # 保存类型匹配的callsite 23 | self.type_analyzer = type_analyzer 24 | self.icall_2_func = type_analyzer.icall_2_func 25 | self.uncertain_idxs.update(type_analyzer.uncertain_idxs) 26 | self.analyzed_callsites.update(type_analyzer.analyzed_callsites) 27 | self.local_failed_callsites.update(type_analyzer.local_failed_callsites) 28 | self.icall_2_struct_comment = type_analyzer.icall_2_struct_comment 29 | self.icall_2_type_comment = type_analyzer.icall_2_type_comment 30 | 31 | self.icall_2_field_name: Dict[str, str] = type_analyzer.icall_2_field_name 32 | self.callsite_idxs: Dict[str, int] = callsite_idxs 33 | 34 | # 如果icall引用了结构体的field,找到对应的结构体名称 35 | self.icall_2_struct_name: Dict[str, str] = type_analyzer.icall_2_struct_name 36 | # 如果icall引用了结构体的field,找到对应的field_name 37 | self.icall_2_field_name: Dict[str, str] = type_analyzer.icall_2_field_name 38 | self.macro_callsites: Set[str] = type_analyzer.macro_callsites 39 | self.icall_nodes = type_analyzer.icall_nodes 40 | 41 | if hasattr(type_analyzer, "log_dir"): 42 | self.log_dir = type_analyzer.log_dir 43 | 44 | self.flta_cases: Set[str] = set() 45 | self.mlta_cases: Set[str] = set() 46 | 47 | def process_all(self): 48 | # 遍历callsite 49 | for callsite_key, i in self.callsite_idxs.items(): 50 | if callsite_key not in self.icall_2_func.keys(): 51 | continue 52 | # 如果不分析macro call 53 | if callsite_key in self.macro_callsites and not self.args.enable_analysis_for_macro: 54 | continue 55 | # 如果不分析正常call 56 | elif callsite_key not in self.macro_callsites and self.args.disable_analysis_for_normal: 57 | continue 58 | 59 | self.process_callsite(callsite_key, i) 60 | 61 | 62 | def process_callsite(self, callsite_key: str, i: int): 63 | struct_name = self.icall_2_struct_name.get(callsite_key, "") 64 | field_name = self.icall_2_field_name.get(callsite_key, "") 65 | 66 | strict_type_targets: Set[str] = self.type_analyzer.callees[callsite_key].copy() 67 | uncertain_targets: Set[str] = self.type_analyzer.uncertain_callees[callsite_key].copy() 68 | llm_decl_targets: Set[str] = self.type_analyzer.llm_declarator_analysis[callsite_key].copy() 69 | 70 | if struct_name != "" and field_name != "": 71 | # 是escape type 72 | if field_name in self.escaped_types[struct_name]: 73 | self.update_default_values(callsite_key, strict_type_targets, uncertain_targets, llm_decl_targets) 74 | self.flta_cases.add(callsite_key) 75 | return 76 | func_names: Set[str] = self.confine_analyzer.struct_name_2_field_4_type[struct_name][field_name] 77 | 78 | for field_name, target_set in [("callees", strict_type_targets), 79 | ("uncertain_callees", uncertain_targets), 80 | ("llm_declarator_analysis", llm_decl_targets)]: 81 | for func_key in tqdm(target_set, desc=f"match for {field_name} callsite key: {i}"): 82 | cur_func_name = self.collector.func_info_dict[func_key].func_name 83 | if cur_func_name in func_names: 84 | getattr(self, field_name)[callsite_key].add(func_key) 85 | 86 | # 出现escape type 87 | if len((self.callees[callsite_key] | self.uncertain_callees[callsite_key])) == 0: 88 | self.update_default_values(callsite_key, strict_type_targets, uncertain_targets, llm_decl_targets) 89 | self.flta_cases.add(callsite_key) 90 | else: 91 | self.mlta_cases.add(callsite_key) 92 | 93 | else: 94 | self.update_default_values(callsite_key, strict_type_targets, uncertain_targets, llm_decl_targets) 95 | self.flta_cases.add(callsite_key) 96 | 97 | # Add a helper method to simplify code further 98 | 99 | def update_default_values(self, callsite_key: str, strict_type_targets: Set[str], 100 | uncertain_targets: Set[str], llm_decl_targets: Set[str]): 101 | self.callees[callsite_key] = strict_type_targets 102 | self.uncertain_callees[callsite_key] = uncertain_targets 103 | self.llm_declarator_analysis[callsite_key] = llm_decl_targets -------------------------------------------------------------------------------- /icall_solvers/dir_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def get_parent_directory(file, levels=4): 4 | path = os.path.realpath(file) 5 | for _ in range(levels): 6 | path = os.path.dirname(path) 7 | return path -------------------------------------------------------------------------------- /icall_solvers/llm_solvers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/icall_solvers/llm_solvers/__init__.py -------------------------------------------------------------------------------- /icall_solvers/llm_solvers/base_llm_solver.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | from icall_solvers.base_solvers.base_matcher import BaseStaticMatcher 4 | from llm_utils.base_analyzer import BaseLLMAnalyzer 5 | from code_analyzer.definition_collector import BaseInfoCollector 6 | from code_analyzer.schemas.ast_node import ASTNode 7 | 8 | from typing import Dict, Set, DefaultDict 9 | from collections import defaultdict 10 | 11 | class BaseLLMSolver: 12 | def __init__(self, collector: BaseInfoCollector, 13 | args, 14 | base_analyzer: BaseStaticMatcher, 15 | llm_analyzer: BaseLLMAnalyzer = None, 16 | callsite_idxs: Dict[str, int] = None, 17 | func_key_2_name: Dict[str, str] = None): 18 | self.collector: BaseInfoCollector = collector 19 | self.args = args 20 | # 保存类型匹配的callsite 21 | self.type_matched_callsites: Dict[str, Set[str]] = base_analyzer.callees.copy() 22 | additional_callsite_infos: DefaultDict[str, Set[str]] = defaultdict(set) 23 | if self.args.evaluate_uncertain: 24 | additional_callsite_infos = base_analyzer.uncertain_callees 25 | elif self.args.evaluate_soly_for_llm: 26 | additional_callsite_infos = base_analyzer.llm_declarator_analysis 27 | for key, values in additional_callsite_infos.items(): 28 | self.type_matched_callsites[key] = self.type_matched_callsites.get(key, set()) | values 29 | 30 | # 保存语义匹配的callsite 31 | self.matched_callsites: DefaultDict[str, Set[str]] = defaultdict(set) 32 | self.macro_callsites: Set[str] = base_analyzer.macro_callsites 33 | 34 | self.llm_analyzer: BaseLLMAnalyzer = llm_analyzer 35 | self.callsite_idxs: Dict[str, int] = callsite_idxs 36 | self.func_key_2_name: Dict[str, str] = func_key_2_name 37 | 38 | self.icall_2_func: Dict[str, str] = base_analyzer.icall_2_func 39 | self.icall_nodes: Dict[str, ASTNode] = base_analyzer.icall_nodes 40 | 41 | self.expanded_macros: Dict[str, str] = base_analyzer.expanded_macros 42 | self.macro_call_exprs: Dict[str, str] = base_analyzer.macro_call_exprs 43 | 44 | if hasattr(base_analyzer, "kelp_cases"): 45 | self.kelp_cases = base_analyzer.kelp_cases 46 | if hasattr(base_analyzer, "mlta_cases"): 47 | self.mlta_cases = base_analyzer.mlta_cases 48 | 49 | @abc.abstractmethod 50 | def process_all(self): 51 | pass -------------------------------------------------------------------------------- /icall_solvers/llm_solvers/base_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/icall_solvers/llm_solvers/base_utils/__init__.py -------------------------------------------------------------------------------- /icall_solvers/llm_solvers/base_utils/func_summarizer.py: -------------------------------------------------------------------------------- 1 | from typing import Set, Dict 2 | from code_analyzer.schemas.function_info import FuncInfo 3 | 4 | from llm_utils.base_analyzer import BaseLLMAnalyzer 5 | from icall_solvers.llm_solvers.base_utils.prompts import System_Func_Summary, User_Func_Summary 6 | 7 | from concurrent.futures import ThreadPoolExecutor, as_completed 8 | import threading 9 | from tqdm import tqdm 10 | import logging 11 | 12 | class FunctionSummarizer: 13 | def __init__(self, func_keys: Set[str], 14 | func_info_dict: Dict[str, FuncInfo], 15 | args, 16 | llm_analyzer: BaseLLMAnalyzer = None): 17 | self.func_keys: Set[str] = func_keys 18 | self.func_info_dict: Dict[str, FuncInfo] = func_info_dict 19 | # 将func_key映射为func_summary 20 | self.func_key2summary: Dict[str, str] = dict() 21 | self.llm_analyzer: BaseLLMAnalyzer = llm_analyzer 22 | self.args = args 23 | 24 | def analyze(self): 25 | lock = threading.Lock() 26 | 27 | def analyze_func(func_key: str): 28 | func_info: FuncInfo = self.func_info_dict[func_key] 29 | func_name: str = func_info.func_name 30 | func_def_text: str = func_info.func_def_text 31 | 32 | system_prompt_func: str = System_Func_Summary.format(func_name=func_name) 33 | user_prompt_func: str = User_Func_Summary.format(func_name=func_name, 34 | func_body=func_def_text) 35 | 36 | func_summary: str = self.llm_analyzer.get_response([system_prompt_func, 37 | user_prompt_func]) 38 | 39 | with lock: 40 | self.func_key2summary[func_key] = func_summary 41 | 42 | executor = ThreadPoolExecutor(max_workers=self.args.num_worker) 43 | pbar = tqdm(total=len(self.func_keys), desc="summarizing address-taken function") 44 | 45 | def update_progress(future): 46 | pbar.update(1) 47 | 48 | futures = [] 49 | 50 | for func_key in self.func_keys: 51 | future = executor.submit(analyze_func, func_key) 52 | future.add_done_callback(update_progress) 53 | futures.append(future) 54 | 55 | for future in as_completed(futures): 56 | future.result() 57 | 58 | logging.getLogger("CodeAnalyzer").info("summarizied {} functions, should summarize {} functions" 59 | .format(len(self.func_key2summary), len(self.func_keys))) 60 | assert len(self.func_key2summary) == len(self.func_keys) -------------------------------------------------------------------------------- /icall_solvers/llm_solvers/base_utils/prompts.py: -------------------------------------------------------------------------------- 1 | 2 | # prompt大模型分析一个函数的功能并生成summary 3 | System_Func_Summary = """You are a code summarizer tasked with encapsulating the functionality of the {func_name} function in a concise summary.""" 4 | 5 | User_Func_Summary = """The definition of function {func_name} is as follows, and you should summarize it in one sentence, describing the function's functionality and purpose. 6 | 7 | ```c 8 | {func_body} 9 | ``` 10 | """ 11 | 12 | User_Func_Summary_With_Comment = """The definition and comment of function {func_name} is as follows, and you should summarize it in one sentence, describing the function's functionality and purpose. 13 | 14 | definition: 15 | ```c 16 | {func_body} 17 | ``` 18 | 19 | comment: 20 | {comment} 21 | """ 22 | 23 | 24 | Comment_Text = """ 25 | 26 | The comment for function {func_name} is: 27 | {comment_text} 28 | 29 | """ 30 | 31 | # prompt大模型分析indirect-call所在function的功能并分析该indirect-call所需要实现的功能。 32 | System_ICall_Summary = """You are a code analyzer, given an indirect-call and the function it lies in. Your task is to analyze the functionality of the indirect call.""" 33 | 34 | User_ICall_Summary = """The expression of the indirect-call is: `{icall_expr}` 35 | 36 | It is located within function {func_name}, whose definition is as follows: 37 | 38 | ```c 39 | {func_body} 40 | ``` 41 | {comment_placeholder} 42 | To analyze the functionality of the indirect call `{icall_expr}`, follow these two steps: 43 | 44 | - 1.Summarize the functionality of the {func_name} function to understand the function's purpose. 45 | 46 | - 2.Examine the code surrounding the indirect call `{icall_expr}` and determine the specific use of the indirect call within the context. 47 | 48 | You should only response above each of the two points with a concise summary.""" 49 | 50 | 51 | User_ICall_Summary_Macro = """The expression of the indirect-call is: {icall_expr} 52 | 53 | It is located within function {func_name}, whose definition is as follows: 54 | 55 | {func_body} 56 | 57 | Also, the indirect-call may not be seen util expand macro call {macro_call_expr}, the expanded macro text is: {expanded_macro} 58 | 59 | To analyze the functionality of the indirect call {icall_expr}, follow these two steps: 60 | 61 | - 1.Summarize the functionality of the {func_name} function to understand its purpose. 62 | 63 | - 2.Examine the code surrounding the indirect call {icall_expr} in the expanded macro text and macro call {macro_call_expr} in function {func_name} to determine the specific use of the indirect-call within the context. 64 | 65 | You should only response with a concise summary of the indirect call functionality.""" 66 | 67 | supplement_prompts = { 68 | "user_prompt_match": "If the indirect-call potentially invoke the target function, answer 'yes'; otherwise, answer 'no'.", 69 | } -------------------------------------------------------------------------------- /icall_solvers/llm_solvers/sea/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/icall_solvers/llm_solvers/sea/__init__.py -------------------------------------------------------------------------------- /icall_solvers/llm_solvers/sea/prompts.py: -------------------------------------------------------------------------------- 1 | 2 | # prompt大模型分析indirect-call所在function的功能并分析该indirect-call所需要实现的功能。 3 | System_func_pointer_Summary = """You are a code summarizer and are provided with declarator-related information of a function pointer. 4 | Your objective is to succinctly summarize the general intent of the function pointer.""" 5 | 6 | # prompt大模型分析target function pointer的address-taken site 7 | System_addr_taken_site_Summary = """You're tasked with summarizing the purpose of a function based on its address-taken site.""" 8 | 9 | # prompt大模型将多个address-taken site的摘要 10 | System_multi_summary = """Your task is to consolidate multiple summaries of address-taken sites for a target function into one concise summary.""" 11 | 12 | end_multi_summary = """Summarize the purpose of the function {func_name} using provided summaries of each address-taken site.""" 13 | 14 | 15 | # prompt大模型分析一个target function和indirect-call的summary并判定它们是否匹配 16 | System_Match = """You're a code analyzer tasked with assessing whether an indirect call potentially invoke a target function, given relative information.""" 17 | 18 | User_Func_Pointer = """## 1.2.summary of the function pointer declaration for the indirect-call: 19 | {}""" 20 | 21 | User_Func_Addr = """## 2.2.summary of the target function's address-taken site: 22 | {}""" 23 | 24 | User_ICall_local = """## 1.1.summary of function {parent_func_name} containing indirect-call and summary of indirect-call itself: 25 | {icall_summary}""" 26 | 27 | User_Func_Local = """## 2.1.summary of the target function: 28 | {func_summary}""" 29 | 30 | User_Match = """The indirect-call expression is: {icall_expr}. 31 | 32 | The subsequent text provides the summary of the indirect-call and the corresponding function: 33 | 34 | {icall_title} 35 | 36 | {icall_summary} 37 | 38 | {icall_additional} 39 | 40 | {func_title} 41 | 42 | {func_summary} 43 | 44 | {target_additional_information} 45 | 46 | # Question: 47 | Assess if {func_name} could be one of the target function that indirect call {icall_expr} potentially invoke based solely on their respective functionalities. Please disregard additional context like detailed implementation, control- & data-flow, or types and class hierarchy for now; we'll verify that separately.""" 48 | 49 | User_Match_ = """The indirect-call expression is: {icall_expr} 50 | 51 | The name of target function is: {func_name} 52 | 53 | # Question: 54 | Assess if {func_name} could be one of the target function that indirect call {icall_expr} potentially invoke based solely on their respective functionalities, you may guess their functionalities with information given above. Please disregard additional context like detailed implementation, control- & data-flow, or types and class hierarchy for now; we'll verify that separately.""" 55 | -------------------------------------------------------------------------------- /icall_solvers/llm_solvers/single/ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | This is single step prompt with only input local messages like the function indirect-call lies in and the target function. -------------------------------------------------------------------------------- /icall_solvers/llm_solvers/single/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/icall_solvers/llm_solvers/single/__init__.py -------------------------------------------------------------------------------- /icall_solvers/llm_solvers/single/prompt.py: -------------------------------------------------------------------------------- 1 | # prompt大模型分析一个target function和indirect-call的summary并判定它们是否匹配 2 | System_Match = """You're a code analyzer.""" 3 | 4 | # 提供indirect-call text, 相关函数指针或者参数的declarator 5 | # 如果是uncertain type,需要type matcing 6 | # 如果存在type cast,语义分析需要更加仔细点 7 | # Chain-of-thought 8 | # - 1.如果类型比对为uncertain,首先比对类型 9 | # - 2.进行语义分析,首先结合indirect-call的所在的source function以及function pointer declarator,推测indirect-call的功能 10 | # - 3.分析target function的功能,结合indirect-call的功能判断是不是一个caller-callee pair 11 | # Note: 12 | # - 1.只需要结合语义判断是否有可能是一个caller-callee pair,不需要考虑data-flow、control-flow 13 | User_Match = """You are tasked with assessing whether an indirect call potentially invoke a target function, given respective contexts. 14 | The indirect-call expression is: {icall_expr}. 15 | 16 | # 1.context related to indirect-call 17 | 18 | ## 1.1.source function 19 | 20 | It is located within function {src_func_name}, whose definition is as follows: 21 | 22 | {source_function_text} 23 | 24 | {icall_additional} 25 | 26 | # 2.context related to target function 27 | 28 | ## 2.1.target function definition 29 | 30 | The target function is named: {target_func_name}, with the following definition: 31 | 32 | {target_function_text} 33 | 34 | {target_additional_information} 35 | 36 | Assess if {target_func_name} could be one of the target function that indirect call {icall_expr} potentially invoke in following steps: 37 | 38 | - 1.Analyze the purpose of indirect-call: 39 | 40 | * 1.1.Summarize the functionality of the source function {src_func_name} to understand the function's purpose. 41 | 42 | * 1.2.Examine the code in source function surrounding the indirect call `{icall_expr}` and determine the specific use of the indirect call within the context. 43 | 44 | * 1.3.If global context is provided, you may summarize the function pointer's purpose with global context provided before. 45 | 46 | * 1.4.Understand the purpose of the indirect-call following step 1.1, step 1.2, and step 1.3. 47 | 48 | - 2.Analyze the functionality of the target function {target_func_name} to understand its purpose. 49 | 50 | * 2.1.Understand the functionality of target function {target_func_name} with text of its function body. 51 | 52 | * 2.2.If the code of address-taken sites of target function is provided, understand the purpose of the function pointers it assign to. 53 | 54 | - 3.Determine whether the indirect-call potentially invoke the target function based on their functionality. 55 | 56 | Please disregard additional context like detailed implementation, control- & data-flow, or types and class hierarchy for now; we'll verify that separately.""" 57 | 58 | 59 | User_Match_No_COT = """You are tasked with assessing whether an indirect call potentially invoke a target function, given respective contexts. 60 | The indirect-call expression is: {icall_expr}. 61 | 62 | # 1.context related to indirect-call 63 | 64 | ## 1.1.source function 65 | 66 | It is located within function {src_func_name}, whose definition is as follows: 67 | 68 | {source_function_text} 69 | 70 | {icall_additional} 71 | 72 | # 2.context related to target function 73 | 74 | ## 2.1.target function definition 75 | 76 | The target function is named: {target_func_name}, with the following definition: 77 | 78 | {target_function_text} 79 | 80 | {target_additional_information} 81 | 82 | Assess if {target_func_name} could be one of the target function that indirect call {icall_expr} potentially invoke, respond with 'yes' or 'no'.""" 83 | 84 | 85 | User_Match_macro = """The indirect-call expression is: {icall_expr}. 86 | 87 | It is located within function {src_func_name}, whose definition is as follows: 88 | 89 | {source_function_text} 90 | 91 | Where the indirect-call expression can be seen after expand macro call: {macro_call_expr}, the expanded macro text is: {macro_text}. 92 | 93 | The target function is named: {target_func_name}, with the following definition: 94 | 95 | {target_function_text} 96 | 97 | Assess if {target_func_name} could be one of the target function that indirect call {icall_expr} potentially invoke in following steps: 98 | 99 | - 1.Analyze the purpose of indirect-call: Examine the code surrounding the indirect call {icall_expr} and determine its specific use within function {src_func_name}. 100 | 101 | - 2.Analyze the functionality of the target function {target_func_name} to understand its purpose. 102 | 103 | - 3.Determine whether the indirect-call potentially invoke the target function based on their functionality. You don't need to consider type match, data-flow, control-flow. 104 | 105 | Please disregard additional context like detailed implementation, control- & data-flow, or types and class hierarchy for now; we'll verify that separately. 106 | """ 107 | 108 | supplement_prompts = { 109 | "user_prompt_match": "If the indirect-call potentially invoke the target function, answer 'yes'; otherwise, answer 'no'.", 110 | } -------------------------------------------------------------------------------- /llm_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/llm_utils/__init__.py -------------------------------------------------------------------------------- /llm_utils/base_analyzer.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | from typing import List, Dict 4 | 5 | class BaseLLMAnalyzer: 6 | def __init__(self, model_type: str, temperature: float): 7 | self.model_type: str = model_type 8 | self.temperature = temperature 9 | 10 | # 只是用来记录输入和输出的token数 11 | self.input_token_num: int = 0 12 | self.output_token_num: int = 0 13 | 14 | # 记录输入和输出的最大token数 15 | self.max_input_token_num: int = 0 16 | self.max_output_token_num: int = 0 17 | self.max_total_token_num: int = 0 18 | 19 | @abc.abstractmethod 20 | def generate_response(self, diaglog: List[Dict[str, str]]) -> str: 21 | pass 22 | 23 | def generate_diaglog(self, contents: List[str]) -> List[Dict[str, str]]: 24 | assert len(contents) in {1, 2} 25 | if len(contents) == 1: 26 | return [{"role": "user", "content": contents[0]}] 27 | else: 28 | # codegemma chat template refer to https://huggingface.co/google/codegemma-7b-it 29 | if "codegemma" in self.model_type: 30 | return [{"role": "user", "content": "\n\n".join(contents)}] 31 | else: 32 | return [{"role": "system", "content": contents[0]}, 33 | {"role": "user", "content": contents[1]}] 34 | 35 | @abc.abstractmethod 36 | def get_response(self, contents: List[str], add_suffix: bool=False) -> str: 37 | pass 38 | 39 | @property 40 | @abc.abstractmethod 41 | def model_name(self): 42 | pass 43 | 44 | 45 | -------------------------------------------------------------------------------- /llm_utils/common_prompt.py: -------------------------------------------------------------------------------- 1 | 2 | summarizing_prompt = """If the following text provides a positive response, answer with only 'yes'; else if it provides a negative response, answer with only 'no'. 3 | 4 | {} 5 | """ 6 | 7 | qwen_1_5_template = """The text below answer whether an indirect call can invoke a target function; summarize and answer with just 'yes' or 'no'. 8 | 9 | {} 10 | """ 11 | 12 | qwen_1_5_template_type = """The text below answer whether argument types of an indirect call match parameters of a function; summarize and answer with just 'yes' or 'no'. 13 | 14 | {} 15 | """ 16 | 17 | summarizing_prompt_4_model = { 18 | "Qwen1.5-14B-Chat": qwen_1_5_template, 19 | "Qwen1.5-32B-Chat": qwen_1_5_template, 20 | "Qwen1.5-72B-Chat": qwen_1_5_template, 21 | "Qwen2-72B-Instruct": qwen_1_5_template, 22 | "CodeQwen1.5-7B-Chat": qwen_1_5_template, 23 | "Yi-1.5-34B-Chat": qwen_1_5_template, 24 | "llama-3-70b-instruct": qwen_1_5_template, 25 | "llama-3-8b-instruct": qwen_1_5_template, 26 | "Phi-3-mini-128k-instruct": qwen_1_5_template, 27 | "Phi-3-medium-128k-instruct'": qwen_1_5_template, 28 | "codegemma-1.1-7b-it": qwen_1_5_template, 29 | "Mixtral-8x7B-Instruct-v0.1": qwen_1_5_template, 30 | "DeepSeek-Coder-V2-Lite-Instruct": qwen_1_5_template, 31 | "DeepSeek-Coder-V2-Instruct": qwen_1_5_template 32 | } 33 | 34 | summarizing_prompt_4_model_type = { 35 | "Qwen1.5-14B-Chat": qwen_1_5_template_type, 36 | "Qwen1.5-32B-Chat": qwen_1_5_template_type, 37 | "Qwen1.5-72B-Chat": qwen_1_5_template_type, 38 | "Qwen2-72B-Instruct": qwen_1_5_template_type, 39 | "CodeQwen1.5-7B-Chat": qwen_1_5_template_type, 40 | "Yi-1.5-34B-Chat": qwen_1_5_template_type, 41 | "llama-3-70b-instruct": qwen_1_5_template_type, 42 | "llama-3-8b-instruct": qwen_1_5_template_type, 43 | "Phi-3-mini-128k-instruct": qwen_1_5_template_type, 44 | "Phi-3-medium-128k-instruct'": qwen_1_5_template_type, 45 | "codegemma-1.1-7b-it": qwen_1_5_template_type, 46 | "Mixtral-8x7B-Instruct-v0.1": qwen_1_5_template_type, 47 | "DeepSeek-Coder-V2-Lite-Instruct": qwen_1_5_template_type, 48 | "DeepSeek-Coder-V2-Instruct": qwen_1_5_template_type 49 | } 50 | 51 | -------------------------------------------------------------------------------- /llm_utils/google_analyzer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | from typing import List, Tuple 4 | import tiktoken 5 | 6 | import google.generativeai as genai 7 | from google.generativeai import GenerativeModel 8 | from google.generativeai.types.generation_types import GenerateContentResponse, GenerationConfig 9 | from google.generativeai.text import Completion 10 | from google.generativeai.discuss import ChatResponse 11 | from google.api_core.exceptions import ResourceExhausted, GoogleAPIError 12 | 13 | from llm_utils.base_analyzer import BaseLLMAnalyzer 14 | 15 | ENCODING = "cl100k_base" 16 | 17 | def num_tokens_from_string(string: str) -> int: 18 | """Returns the number of tokens in a text string.""" 19 | encoding = tiktoken.get_encoding(ENCODING) 20 | num_tokens = len(encoding.encode(string)) 21 | return num_tokens 22 | 23 | class GoogleAnalyzer(BaseLLMAnalyzer): 24 | def __init__(self, model_type: str, api_key: str, temperature: float=0): 25 | super().__init__(model_type, temperature) 26 | genai.configure(api_key=api_key) 27 | config = GenerationConfig(temperature=temperature) 28 | if model_type == "gemini-pro": 29 | self.model: GenerativeModel = GenerativeModel(model_type, generation_config=config) 30 | 31 | def send_text_to_llm(self, prompt: str) -> str: 32 | if self.model_type == "gemini-pro": 33 | response: GenerateContentResponse = self.model.generate_content(prompt) 34 | resp_text: str = response.text 35 | elif self.model_type == "text-bison-001": 36 | response: Completion = genai.generate_text( 37 | model='models/text-bison-001', prompt=prompt, 38 | temperature=self.temperature, max_output_tokens=1024) 39 | resp_text: str = response.result 40 | elif self.model_type == "chat-bison-001": 41 | chat: ChatResponse = genai.chat(model="models/chat-bison-001", 42 | messages=[prompt], temperature=0.8) 43 | resp_text: str = chat.last 44 | else: 45 | msg = "unsupported model type {}".format(self.model_type) 46 | raise RuntimeError(msg) 47 | return resp_text 48 | 49 | # 向google发送一次请求,返回一个response,可能会触发异常 50 | def get_gemini_response(self, prompt: str, times: int) -> Tuple[str, bool, int]: 51 | """ 52 | prompt: system_prompt + user_prompt 53 | """ 54 | def handle_error(exception, sleep_time): 55 | error_message = f"{exception.__class__.__name__} in request, message is: {exception}" 56 | logging.getLogger("CodeAnalyzer").debug(error_message) 57 | time.sleep(sleep_time) 58 | return str(exception), False, times 59 | 60 | try: 61 | input_num = num_tokens_from_string(prompt) 62 | self.input_token_num += input_num 63 | self.max_input_token_num = max(self.max_input_token_num, input_num) 64 | resp_text: str = self.send_text_to_llm(prompt) 65 | if resp_text is None or resp_text == "": 66 | return "empty response", False, times + 1 67 | output_num = num_tokens_from_string(resp_text) 68 | self.output_token_num += output_num 69 | self.max_output_token_num = max(self.max_output_token_num, output_num) 70 | 71 | self.max_total_token_num = max(self.max_total_token_num, input_num + output_num) 72 | return resp_text, True, times 73 | # 达到rate limit 74 | except ResourceExhausted as e: 75 | return handle_error(e, 60) 76 | # 其他GoogleAPIError或者没有返回text 77 | except (GoogleAPIError, ValueError, IndexError) as e: 78 | times += 1 79 | return handle_error(e, 30) 80 | 81 | def get_response(self, contents: List[str], add_suffix: bool=False) -> str: 82 | assert len(contents) in {1, 2} 83 | prompt = "\n\n".join(contents) 84 | resp: Tuple[str, bool, int] = self.get_gemini_response(prompt, 0) 85 | # 没有成功解析出来,就最多重试3次 86 | while not resp[1]: 87 | if resp[2] >= 3: 88 | return "empty response" 89 | resp: Tuple[str, bool, int] = self.get_gemini_response(prompt, resp[2]) 90 | content: str = resp[0] 91 | return content 92 | 93 | @property 94 | def model_name(self): 95 | return f"{self.model_type}-{self.temperature}" -------------------------------------------------------------------------------- /llm_utils/openai_analyzer.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | from openai import APIError, APIConnectionError, RateLimitError, Timeout, BadRequestError, AuthenticationError, OpenAIError 3 | import logging 4 | import time 5 | from typing import List, Dict, Tuple 6 | from llm_utils.base_analyzer import BaseLLMAnalyzer 7 | 8 | openai_error_messages = { 9 | APIError: "OpenAI API returned an API Error: {}", 10 | APIConnectionError: "Failed to connect to OpenAI API: {}", 11 | RateLimitError: "OpenAI TimeLimt: {}", 12 | Timeout: "OpenAI API request timed out: {}", 13 | BadRequestError: "Invalid request to OpenAI API: {}", 14 | AuthenticationError: "Authentication error with OpenAI API: {}", 15 | } 16 | 17 | model_name_map: Dict[str, Dict[str, str]] = { 18 | "swift": { 19 | "Qwen1.5-72B-Chat": "qwen1half-72b-chat", 20 | "Qwen1.5-32B-Chat": "qwen1half-32b-chat", 21 | "Qwen1.5-14B-Chat": "qwen1half-14b-chat", 22 | 23 | "llama-3-70b-instruct": "llama3-70b-instruct", 24 | "llama-3-8b-instruct": "llama3-8b-instruct" 25 | } 26 | } 27 | 28 | class OpenAIAnalyzer(BaseLLMAnalyzer): 29 | def __init__(self, model_type: str, api_key: str, address: str, temperature: float = 0, 30 | max_tokens: int = 0, server_type = "other", add_llama3_stop: bool = False, 31 | disable_system_prompt: bool = False): 32 | super().__init__(model_type, temperature) 33 | # 必须有一个有效,如果访问远程openai服务器那么api-key不为空,如果访问本地模型那么base_url不为空 34 | assert not (api_key == "" and address == "") 35 | self.max_tokens = max_tokens 36 | self.add_llama3_stop = add_llama3_stop 37 | 38 | self.request_model_name = model_name_map.get(server_type, 39 | dict()).get(model_type, model_type) 40 | self.disable_system_prompt = disable_system_prompt 41 | 42 | # 远程访问openai模型 43 | if api_key != "": 44 | self.client = OpenAI(api_key=api_key) 45 | # 本地vllm部署的模型 46 | else: 47 | url = "http://" + address + "/v1" 48 | self.client = OpenAI(api_key="EMPTY", base_url=url) 49 | 50 | # 向openai发送一次请求,返回一个response,可能会触发异常 51 | def get_openai_response(self, dialog: List[Dict[str, str]], times: int) -> Tuple[str, bool, int]: 52 | """ 53 | :param dialog: prompt sent to openai, its format is like: 54 | [{"role": "system", "content": "SYSTEM_PROMPT"}, 55 | {"role": "user", "content": "USER_PROMPT"}] 56 | :return: first str is the response from openai or error message, 57 | second bool is whether the response is valid, True means valid, False means error occur 58 | third int is the times of retry 59 | """ 60 | try: 61 | params = { 62 | "model": self.request_model_name, 63 | "messages": dialog, 64 | "temperature": self.temperature 65 | } 66 | # 如果max_tokens不为零,添加到参数中 67 | if self.max_tokens != 0: 68 | params["max_tokens"] = self.max_tokens 69 | if self.add_llama3_stop: 70 | params["stop"] = ["<|start_header_id|>", "<|end_header_id|>", "<|eot_id|>", "<|reserved_special_token"] 71 | # 调用completions.create()方法 72 | response = self.client.chat.completions.create(**params) 73 | self.input_token_num += response.usage.prompt_tokens 74 | self.max_input_token_num = max(self.max_input_token_num, response.usage.prompt_tokens) 75 | 76 | resp_text = response.choices[0].message.content 77 | resp = (resp_text, True, times) 78 | 79 | self.output_token_num += response.usage.completion_tokens 80 | self.max_output_token_num = max(self.max_output_token_num, 81 | response.usage.completion_tokens) 82 | 83 | self.max_total_token_num = max(self.max_total_token_num, 84 | response.usage.prompt_tokens + response.usage.completion_tokens) 85 | if resp_text.strip() == "": 86 | resp = ("empty response", False, times) 87 | 88 | except OpenAIError as e: 89 | error_type = type(e) 90 | error_message: str = openai_error_messages.get(error_type, 91 | "An unknown error occurred: {}") 92 | # 如果达到了rate limit 93 | if error_type is not RateLimitError: 94 | times += 1 95 | time.sleep(10) 96 | logging.getLogger("CodeAnalyzer").debug("{}, sleeping 10s".format(error_message.format(e))) 97 | else: 98 | logging.getLogger("CodeAnalyzer").debug("{}, sleeping 60s".format(error_message.format(e))) 99 | time.sleep(60) 100 | resp = (error_message.format(e), False, times) 101 | 102 | except Exception as e: 103 | times += 1 104 | time.sleep(10) 105 | logging.getLogger("CodeAnalyzer").debug("{}, sleeping 10s".format("An unknown error occurred: {}".format(e))) 106 | resp = ("An unknown error occurred: {}".format(e), False, times) 107 | 108 | return resp 109 | 110 | def generate_response(self, diaglog: List[Dict[str, str]]) -> str: 111 | resp: Tuple[str, bool, int] = self.get_openai_response(diaglog, 0) 112 | # 没有成功解析出来,就最多重试3次 113 | while not resp[1]: 114 | if resp[2] >= 3: 115 | return "empty response" 116 | resp: Tuple[str, bool, int] = self.get_openai_response(diaglog, resp[2]) 117 | content: str = resp[0] 118 | return content 119 | 120 | def get_response(self, contents: List[str], add_suffix: bool=False) -> str: 121 | if len(contents) == 2 and self.disable_system_prompt: 122 | contents = ["\n\n".join(contents)] 123 | dialog: List[Dict[str, str]] = self.generate_diaglog(contents) 124 | return self.generate_response(dialog) 125 | 126 | @property 127 | def model_name(self): 128 | return f"{self.model_type}-{self.temperature}" -------------------------------------------------------------------------------- /llm_utils/tongyi_analyzer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | from typing import List, Tuple, Dict 4 | from http import HTTPStatus 5 | import dashscope 6 | from dashscope.api_entities.dashscope_response import GenerationResponse 7 | 8 | from llm_utils.base_analyzer import BaseLLMAnalyzer 9 | 10 | class TongyiAnalyzer(BaseLLMAnalyzer): 11 | def __init__(self, model_type: str, api_key: str, temperature: float=0): 12 | super().__init__(model_type, temperature) 13 | self.api_key = api_key 14 | 15 | # 向通义千问发送一次请求,返回一个response,可能会触发异常 16 | def get_tongyi_response(self, dialog: List[Dict[str, str]], times: int) -> Tuple[str, bool, int]: 17 | """ 18 | :param dialog: prompt sent to openai, its format is like: 19 | [{"role": "system", "content": "SYSTEM_PROMPT"}, 20 | {"role": "user", "content": "USER_PROMPT"}] 21 | :return: first str is the response from openai or error message, 22 | second bool is whether the response is valid, True means valid, False means error occur 23 | third int is the times of retry 24 | error code refer to: https://help.aliyun.com/zh/dashscope/developer-reference/return-status-code-description 25 | temperature refer to: https://help.aliyun.com/zh/dashscope/developer-reference/api-details 26 | """ 27 | try: 28 | response: GenerationResponse = dashscope.Generation.call( 29 | self.model_type, 30 | messages=dialog, 31 | api_key=self.api_key, 32 | result_format='message', # set the result to be "message" format. 33 | temperature=self.temperature 34 | ) 35 | except Exception as e: 36 | logging.getLogger("CodeAnalyzer").debug("encounter error: {}".format(e)) 37 | return (str(e), False, times + 1) 38 | if response.status_code == HTTPStatus.OK: 39 | resp_text: str = response["output"]["choices"][0]["message"]["content"] 40 | input_token_num: int = response["usage"]["input_tokens"] 41 | output_token_num: int = response["usage"]["output_tokens"] 42 | self.input_token_num += input_token_num 43 | self.output_token_num += output_token_num 44 | 45 | self.max_input_token_num = max(self.max_input_token_num, input_token_num) 46 | self.max_output_token_num = max(self.max_output_token_num, output_token_num) 47 | self.max_total_token_num = max(self.max_total_token_num, 48 | input_token_num + output_token_num) 49 | if resp_text.strip() == "": 50 | resp = ("empty response", False, times) 51 | else: 52 | resp = (resp_text, True, times) 53 | # 403表示api key不能访问 54 | elif response.status_code in {401, 403}: 55 | logging.getLogger("CodeAnalyzer").info(response["message"]) 56 | logging.getLogger("CodeAnalyzer").info("api key error") 57 | exit(-1) 58 | # rate limit 59 | elif response.status_code == 429: 60 | error_message = response["message"] 61 | if error_message == "Free allocated quota exceeded.": 62 | logging.getLogger("CodeAnalyzer").info("quota running out") 63 | exit(-1) 64 | logging.getLogger("CodeAnalyzer").debug("sleeping 60s due to rate limit") 65 | time.sleep(60) 66 | resp = (error_message, False, times) 67 | # 其它error 68 | else: 69 | error_message = response["message"] 70 | logging.getLogger("CodeAnalyzer").debug(error_message) 71 | resp = (error_message, False, times + 1) 72 | return resp 73 | 74 | def generate_response(self, diaglog: List[Dict[str, str]]) -> str: 75 | resp: Tuple[str, bool, int] = self.get_tongyi_response(diaglog, 0) 76 | # 没有成功解析出来,就最多重试3次 77 | while not resp[1]: 78 | if resp[2] >= 3: 79 | return "empty response" 80 | resp: Tuple[str, bool, int] = self.get_tongyi_response(diaglog, resp[2]) 81 | content: str = resp[0] 82 | return content 83 | 84 | def get_response(self, contents: List[str], add_suffix: bool=False) -> str: 85 | dialog: List[Dict[str, str]] = self.generate_diaglog(contents) 86 | return self.generate_response(dialog) 87 | 88 | @property 89 | def model_name(self): 90 | return f"{self.model_type}-{self.temperature}" -------------------------------------------------------------------------------- /llm_utils/zhipu_analyzer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | from typing import List, Dict, Tuple 4 | from llm_utils.base_analyzer import BaseLLMAnalyzer 5 | from zhipuai import ZhipuAI, APIReachLimitError, APIStatusError 6 | from zhipuai.api_resource.chat.completions import Completion 7 | 8 | class ZhipuAnalyzer(BaseLLMAnalyzer): 9 | def __init__(self, model_type: str, api_key: str, address: str, temperature: float=0): 10 | super().__init__(model_type, temperature) 11 | if api_key != "": 12 | self.client = ZhipuAI(api_key=api_key) 13 | else: 14 | base_url = "http://" + address + "/v1/" 15 | self.client = ZhipuAI(api_key="EMP.TY", base_url=base_url) 16 | 17 | # 向zhipu发送一次请求,返回一个response,可能会触发异常 18 | def get_glm_response(self, dialog: List[Dict[str, str]], times: int) -> Tuple[str, bool, int]: 19 | """ 20 | :param dialog: prompt sent to openai, its format is like: 21 | [{"role": "system", "content": "SYSTEM_PROMPT"}, 22 | {"role": "user", "content": "USER_PROMPT"}] 23 | :return: first str is the response from openai or error message, 24 | second bool is whether the response is valid, True means valid, False means error occur 25 | third int is the times of retry 26 | """ 27 | try: 28 | response: Completion = self.client.chat.completions.create( 29 | model=self.model_type, 30 | messages=dialog, 31 | temperature=self.temperature 32 | ) 33 | resp_text: str = response.choices[0].message.content 34 | if resp_text.strip() != "": 35 | resp = (resp_text, True, times) 36 | else: 37 | resp = ("empty response", False, times) 38 | self.input_token_num += response.usage.prompt_tokens 39 | self.output_token_num += response.usage.completion_tokens 40 | 41 | self.max_input_token_num = max(self.max_input_token_num, 42 | response.usage.prompt_tokens) 43 | self.max_output_token_num = max(self.max_output_token_num, 44 | response.usage.completion_tokens) 45 | 46 | self.max_total_token_num = max(self.max_total_token_num, 47 | response.usage.prompt_tokens + response.usage.completion_tokens) 48 | 49 | 50 | except APIReachLimitError as e: 51 | # 如果达到了rate limit 52 | time.sleep(60) 53 | logging.getLogger("CodeAnalyzer").debug("{}, sleeping 60s".format(e)) 54 | resp = (str(e), False, times) 55 | except APIStatusError as e: 56 | # 如果达到了rate limit 57 | time.sleep(10) 58 | times += 1 59 | logging.getLogger("CodeAnalyzer").debug("{}, sleeping 10s".format(e)) 60 | resp = (str(e), False, times) 61 | except Exception as e: 62 | time.sleep(20) 63 | times += 1 64 | logging.getLogger("CodeAnalyzer").debug("{}, sleeping 20s".format(e)) 65 | resp = (str(e), False, times) 66 | 67 | return resp 68 | 69 | 70 | def generate_response(self, diaglog: List[Dict[str, str]]) -> str: 71 | resp: Tuple[str, bool, int] = self.get_glm_response(diaglog, 0) 72 | # 没有成功解析出来,就最多重试3次 73 | while not resp[1]: 74 | if resp[2] >= 3: 75 | return "empty response" 76 | resp: Tuple[str, bool, int] = self.get_glm_response(diaglog, resp[2]) 77 | content: str = resp[0] 78 | return content 79 | 80 | 81 | def get_response(self, contents: List[str], add_suffix: bool=False) -> str: 82 | dialog: List[Dict[str, str]] = self.generate_diaglog(contents) 83 | return self.generate_response(dialog) 84 | 85 | 86 | @property 87 | def model_name(self): 88 | return f"{self.model_type}-{self.temperature}" -------------------------------------------------------------------------------- /resources/benchmarks.txt: -------------------------------------------------------------------------------- 1 | nginx 2 | bind9 3 | cyclonedds 4 | dovecot 5 | hdf5 6 | igraph 7 | lxc 8 | md4c 9 | rtpproxy 10 | pjsip 11 | selinux 12 | sudo 13 | vlc 14 | tmux 15 | libdwarf 16 | libjpeg-turbo 17 | bluez 18 | gdbm 19 | cairo 20 | fwupd 21 | gdk-pixbuf 22 | libbpf 23 | libfuse 24 | libpg_query 25 | krb5 26 | libsndfile 27 | libucl 28 | libssh 29 | mdbtools 30 | oniguruma 31 | lua 32 | open5gs 33 | opensips 34 | postfix 35 | librabbitmq -------------------------------------------------------------------------------- /resources/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | from tree_sitter import Language 4 | 5 | def main(): 6 | system_name = platform.system() 7 | if system_name == 'Linux': 8 | file_name = "my-languages.so" 9 | # macos 10 | elif system_name == 'Darwin': 11 | file_name = "my-languages.dylib" 12 | # windows 13 | elif system_name == 'Windows': 14 | file_name = "my-languages.dll" 15 | else: 16 | raise RuntimeError("unsupported system: ", system_name) 17 | 18 | if not (os.path.exists('tree-sitter-c') and os.path.exists('tree-sitter-cpp')): 19 | raise RuntimeError("please download tree-sitter-c and tree-sitter-cpp from github first") 20 | 21 | Language.build_library( 22 | # Store the library in the `build` directory 23 | file_name, 24 | # Include one or more languages 25 | [ 26 | 'tree-sitter-c', 27 | 'tree-sitter-cpp' 28 | ] 29 | ) 30 | 31 | # Before run this script, please download tree-sitter-c and tree-sitter-cpp from https://github.com/tree-sitter/tree-sitter-c and https://github.com/tree-sitter/tree-sitter-cpp 32 | # You can run git clone from `/resources` dir or download zip and unzip to resources dir. 33 | if __name__ == '__main__': 34 | main() -------------------------------------------------------------------------------- /resources/filtered_keyword.txt: -------------------------------------------------------------------------------- 1 | attribute_specifier storage_class_specifier type_qualifier , ; #define static ( ) comment typedef [ ] { } -------------------------------------------------------------------------------- /scope_strategy/ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | The scope strategy is used to determine whether indirect-call in one file may call function in another file. 3 | 4 | Default: 5 | 6 | - BaseStrategy: Only functions in files located in the same directory as 'indirect-call' or in subdirectories of the same directory are considered. 7 | 8 | -------------------------------------------------------------------------------- /scope_strategy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/scope_strategy/__init__.py -------------------------------------------------------------------------------- /scope_strategy/base_strategy.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | class BaseStrategy: 4 | def analyze_key(self, callsite_key: str, func_key: str) -> bool: 5 | callsite_file = callsite_key.split(':')[0] 6 | func_file = func_key.split(':')[0] 7 | callsite_path = os.path.dirname(callsite_file) 8 | func_path = os.path.dirname(func_file) 9 | return self.analyze(callsite_path, func_path) 10 | 11 | # 基础scope分析策略,如果潜在被调用函数在callsite同级目录或者同级目录子目录下,那么返回true 12 | def analyze(self, callsite_path: str, func_path: str) -> bool: 13 | callsite_sub_paths = callsite_path.split('/') 14 | func_sub_paths = func_path.split('/') 15 | if len(callsite_sub_paths) > len(func_sub_paths): 16 | return False 17 | for i in range(len(callsite_sub_paths)): 18 | if callsite_sub_paths[i] != func_sub_paths[i]: 19 | return False 20 | return True -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/tests/__init__.py -------------------------------------------------------------------------------- /tests/filter_test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/tests/filter_test/__init__.py -------------------------------------------------------------------------------- /tests/filter_test/simple_filter_test.py: -------------------------------------------------------------------------------- 1 | from llm_analyzer.simple_filter import SimpleFilter 2 | from typing import Dict, Set, List 3 | from llm_analyzer.llm_prompts import PREFIX, QUESTION_PROMPT, FORMAT_INSTRUCTIONS 4 | 5 | def test_func(callsite_key: str, func_set: List[str], simple_filter: SimpleFilter): 6 | decl_contexts: List[str] = simple_filter.extract_decl_context(callsite_key) 7 | func_declarators: List[str] = [simple_filter.func_name_2_declarator.get(func, "") 8 | for func in func_set] 9 | 10 | print(PREFIX) 11 | print("====================") 12 | print(QUESTION_PROMPT.format(decl_contexts[-1], "\n".join(decl_contexts), 13 | "\n\n".join(func_declarators))) 14 | print("=========================") 15 | print(FORMAT_INSTRUCTIONS) -------------------------------------------------------------------------------- /tests/filter_test/test_codebert.py: -------------------------------------------------------------------------------- 1 | from typing import Set, List 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from transformers import RobertaTokenizer, RobertaModel 6 | 7 | from varclr.config import codebert_model_path 8 | 9 | class TestSim: 10 | def __init__(self, model_path: str): 11 | self.tokenizer = RobertaTokenizer.from_pretrained(model_path) 12 | self.model = RobertaModel.from_pretrained(model_path) 13 | self.model = self.model.eval() 14 | 15 | def vectorize(self, code_text: str): 16 | code_tokens: List[str] = [self.tokenizer.cls_token] + \ 17 | self.tokenizer.tokenize(code_text) + [self.tokenizer.eos_token] 18 | token_ids: List[int] = self.tokenizer.convert_tokens_to_ids(code_tokens) 19 | vectorized_ids: torch.Tensor = torch.tensor(token_ids).unsqueeze(0) 20 | output = self.model(vectorized_ids) 21 | last_embedding = output.pooler_output # shape = [1, 768] 22 | 23 | return last_embedding[0] 24 | 25 | 26 | def test_callsite_sim(icallsite: str, func_names: Set[str], test_sim: TestSim): 27 | callsite_embedding = test_sim.vectorize(icallsite) 28 | func_embeddings = [test_sim.vectorize(func_name) for func_name in func_names] 29 | sims = [F.cosine_similarity(callsite_embedding, func_embedding, dim=0).item() 30 | for func_embedding in func_embeddings] 31 | 32 | for sim in sims: 33 | print(f"{sim:.2f}", end=", ") 34 | print() 35 | 36 | if __name__ == '__main__': 37 | test_sim = TestSim(codebert_model_path) 38 | icallsite = "ngx_log_t *log\n u_char *p, *last, *msg;\n log->handler(log, p, last - p)" 39 | from tests.filter_test.testcase1 import * 40 | func_names: Set[str]= {func_declarator1, func_declarator2, func_declarator3, func_declarator4, 41 | func_declarator5, func_declarator6, func_declarator7, func_declarator8, 42 | func_declarator9, func_declarator10} 43 | test_callsite_sim(icallsite, func_names, test_sim) -------------------------------------------------------------------------------- /tests/filter_test/test_varclr.py: -------------------------------------------------------------------------------- 1 | from varclr.encoders import BERT_Encoder 2 | 3 | if __name__ == '__main__': 4 | model_path = "../../models/varclr_bert" 5 | encoder = BERT_Encoder.load(model_path) 6 | 7 | arg_name: str = "writer" 8 | param_names: set[str] = {"ngx_log_error","ngx_ssl_get_cached_session", 9 | "ngx_http_ssl_npn_advertised","ngx_http_v2_filter_get_shadow", 10 | "ngx_http_xslt_sax_external_subset","ngx_http_xslt_sax_error", 11 | "ngx_log_memory_writer","ngx_http_log_error_handler","ngx_syslog_writer", 12 | "ngx_ssl_password_callback"} 13 | for param_name in param_names: 14 | print(f"{encoder.score(arg_name, param_name):.2f}", end=", ") 15 | print() -------------------------------------------------------------------------------- /tests/filter_test/testcase1.py: -------------------------------------------------------------------------------- 1 | func_declarator1 = """ 2 | char * 3 | ngx_conf_set_sec_slot(ngx_conf_t *cf, ngx_command_t *cmd, void *conf) 4 | """ 5 | 6 | func_declarator2 = """ 7 | static u_char * 8 | ngx_resolver_log_error(ngx_log_t *log, u_char *buf, size_t len) 9 | """ 10 | 11 | func_declarator3 = """ 12 | static u_char * 13 | ngx_http_log_error(ngx_log_t *log, u_char *buf, size_t len) 14 | """ 15 | 16 | func_declarator4 = """ 17 | static char * 18 | ngx_http_xslt_entities(ngx_conf_t *cf, ngx_command_t *cmd, void *conf) 19 | """ 20 | 21 | func_declarator5 = """ 22 | static char * 23 | ngx_http_try_files(ngx_conf_t *cf, ngx_command_t *cmd, void *conf) 24 | """ 25 | 26 | func_declarator6 = """ 27 | static u_char * 28 | ngx_http_v2_state_priority(ngx_http_v2_connection_t *h2c, u_char *pos, 29 | u_char *end) 30 | """ 31 | 32 | func_declarator7 = """ 33 | static u_char * 34 | ngx_http_v2_state_proxy_protocol(ngx_http_v2_connection_t *h2c, u_char *pos, 35 | u_char *end) 36 | """ 37 | 38 | func_declarator8 = """ 39 | static ngx_int_t 40 | ngx_http_geoip_country_variable(ngx_http_request_t *r, 41 | ngx_http_variable_value_t *v, uintptr_t data) 42 | """ 43 | 44 | func_declarator9 = """ 45 | static ngx_int_t 46 | ngx_http_variable_sent_last_modified(ngx_http_request_t *r, 47 | ngx_http_variable_value_t *v, uintptr_t data) 48 | """ 49 | 50 | func_declarator10 = """ 51 | static char * 52 | ngx_mail_smtp_merge_srv_conf(ngx_conf_t *cf, void *parent, void *child) 53 | """ -------------------------------------------------------------------------------- /tests/llm_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/tests/llm_tests/__init__.py -------------------------------------------------------------------------------- /tests/llm_tests/codechat_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import google.generativeai as genai 3 | from google.generativeai.text import Completion 4 | from google.generativeai.discuss import ChatResponse 5 | 6 | # currently the default temperature of gemini is 0.4 7 | def list_models(): 8 | for m in genai.list_models(): 9 | # if 'generateContent' in m.supported_generation_methods: 10 | print(m) 11 | print(m.name) 12 | 13 | 14 | def test_icall_text_decl(): 15 | from ..test_data import context 16 | response: Completion = genai.generate_text( 17 | model='models/text-bison-001', prompt=context, temperature=0.8, max_output_tokens=1024) 18 | print(response.result) 19 | 20 | 21 | def test_icall_chat_decl(): 22 | from ..test_data import context 23 | chat: ChatResponse = genai.chat(model="models/chat-bison-001", messages=[context], temperature=0.8) 24 | print(chat.last) 25 | 26 | 27 | if __name__ == '__main__': 28 | api_key = sys.argv[1] 29 | genai.configure(api_key=api_key) 30 | list_models() 31 | 32 | print("==================") 33 | test_icall_chat_decl() -------------------------------------------------------------------------------- /tests/llm_tests/gemini_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import google.generativeai as genai 3 | from google.generativeai import GenerativeModel 4 | from google.generativeai.types.generation_types import GenerateContentResponse, GenerationConfig 5 | from google.api_core.exceptions import ResourceExhausted 6 | from concurrent.futures import ThreadPoolExecutor, as_completed 7 | import time 8 | 9 | # currently the default temperature of gemini is 0.4 10 | def list_models(): 11 | for m in genai.list_models(): 12 | if 'generateContent' in m.supported_generation_methods: 13 | print(m) 14 | print(m.name) 15 | 16 | def simple_multi_thread_test(model: GenerativeModel): 17 | # response是一个genai.types.generation_types.GenerateContentResponse对象 18 | executor = ThreadPoolExecutor(max_workers=60) 19 | 20 | def worker(id: int): 21 | while True: 22 | try: 23 | response: GenerateContentResponse = model.generate_content( 24 | "What is the meaning of life?") 25 | print("The text response of {}-th request is:\n\n" 26 | "{}\n================================\n\n".format(id + 1, response.text)) 27 | break 28 | except ResourceExhausted as e: 29 | print("ResourceExhausted in {}-th request".format(id + 1)) 30 | time.sleep(60) 31 | continue 32 | 33 | futures = [] 34 | for i in range(60): 35 | future = executor.submit(worker, i) 36 | futures.append(future) 37 | 38 | for future in as_completed(futures): 39 | future.result() 40 | 41 | def test_icall_decl(model: GenerativeModel): 42 | from ..test_data import context 43 | response: GenerateContentResponse = model.generate_content( 44 | context) 45 | print(response.text) 46 | 47 | # summarizing = """If the following text provides a positive response, answer with only 'yes'; else if it provides a negative response, answer with only 'no'. 48 | # 49 | # {} 50 | # """ 51 | # summary_resp: GenerateContentResponse = model.generate_content(summarizing.format(response.text)) 52 | # print("==================") 53 | # print(summary_resp.text) 54 | 55 | def main(): 56 | api_key = sys.argv[1] 57 | genai.configure(api_key=api_key) 58 | 59 | list_models() 60 | 61 | config = GenerationConfig(temperature=0.8) 62 | model: GenerativeModel = GenerativeModel('gemini-pro', generation_config=config) 63 | print("==================") 64 | print(model) 65 | test_icall_decl(model) 66 | 67 | 68 | if __name__ == '__main__': 69 | main() -------------------------------------------------------------------------------- /tests/llm_tests/glm_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from zhipuai import ZhipuAI 3 | from zhipuai.api_resource.chat.completions import Completion 4 | 5 | system_prompt = """You are a text analyzer tasked with analyzing the similarity between two declarators.""" 6 | 7 | user_prompt = """Given a function pointer declarator and a function declarator, your task is to evaluate whether the parameter types of function pointer can match that of the function in following steps: 8 | 9 | - 1.Extract the parameter list separately from both the function pointer declarator and the function declarator. 10 | 11 | - 2.Compare each parameter's type individually for a match, ensuring identical names and pointer hierarchies for types to match. 12 | 13 | Note that: 14 | 15 | - 1.Certain parameter declarations may be wrapped or followed by macros like UNUSED_PARAM or unused, which do not impact the parameter type. For example, UNUSED_PARAM(int var) matches the type of int var. 16 | 17 | - 2.Types like int, long, size_t could be considered as compatible due to implicit cast. 18 | 19 | The function pointer declarator is 20 | 21 | typedef int (*ssh_packet_callback) (ssh_session session, uint8_t type, ssh_buffer packet, void *user); 22 | 23 | The function declarator is 24 | 25 | int 26 | sftp_channel_default_subsystem_request(UNUSED_PARAM(ssh_session session), 27 | UNUSED_PARAM(ssh_channel channel), 28 | UNUSED_PARAM(const char *subsystem), 29 | UNUSED_PARAM(void *userdata)) 30 | """ 31 | 32 | def test_icall_decl(client: ZhipuAI, model_type: str): 33 | response: Completion = client.chat.completions.create( 34 | model=model_type, # 填写需要调用的模型名称 35 | messages=[ 36 | {"role": "system", "content": system_prompt}, 37 | {"role": "user", "content": user_prompt} 38 | ] 39 | ) 40 | print(response.choices[0].message.content) 41 | print(type(response.choices[0].message)) 42 | print("=================") 43 | print("input token: {}".format(response.usage.prompt_tokens)) 44 | print("output token: {}".format(response.usage.completion_tokens)) 45 | print("total token: {}".format(response.usage.total_tokens)) 46 | 47 | 48 | def main(): 49 | api_key = sys.argv[1] 50 | model_type = sys.argv[2] 51 | if not api_key.startswith("http://"): 52 | client = ZhipuAI(api_key=api_key) 53 | else: 54 | client = ZhipuAI(api_key="EMP.TY", base_url=api_key) 55 | test_icall_decl(client, model_type) 56 | 57 | if __name__ == '__main__': 58 | main() -------------------------------------------------------------------------------- /tests/llm_tests/prompt_test.py: -------------------------------------------------------------------------------- 1 | from llm_analyzer.llm_prompts.gpt_prompt import SystemPrompt1, UserPrompt1, \ 2 | UserPrompt2, SystemPrompt1_, UserPrompt1_ 3 | import openai 4 | import sys 5 | from typing import List 6 | 7 | icall_context: List[str] = [ 8 | "ngx_log_t *log", 9 | "u_char *p, *last, *msg;", 10 | "p = log->handler(log, p, last - p);" 11 | ] 12 | 13 | func_declarator = """char * 14 | ngx_conf_set_sec_slot(ngx_conf_t *cf, ngx_command_t *cmd, void *conf)""" 15 | 16 | if __name__ == '__main__': 17 | # api_key = sys.argv[1] 18 | # openai.api_key = api_key 19 | # 20 | # dialog1 = [{"role": "system", "content": SystemPrompt1}, 21 | # {"role": "user", 22 | # "content": UserPrompt1.format(icall_context[-1], "\n".join(icall_context) 23 | # , "\n\n".join(func_declarator))}] 24 | # print(UserPrompt1.format(icall_context[-1], "\n".join(icall_context) 25 | # , func_declarator)) 26 | # response = openai.ChatCompletion.create( 27 | # model="gpt-3.5-turbo", 28 | # messages=dialog1 29 | # ) 30 | # content1 = response.choices[0]["message"]["content"] 31 | # print(content1) 32 | # 33 | # dialog2 = [{"role": "user", 34 | # "content": UserPrompt2.format(content1)}] 35 | # response2 = openai.ChatCompletion.create( 36 | # model="gpt-3.5-turbo", 37 | # messages=dialog2 38 | # ) 39 | # content2 = response2.choices[0]["message"]["content"] 40 | # print(content2) 41 | 42 | from dataclasses import dataclass 43 | 44 | @dataclass 45 | class ASTNode: 46 | node_type: str 47 | node_text: str 48 | start_point: tuple 49 | end_point: tuple 50 | children: list 51 | 52 | node = ASTNode("primitive_type", "unsigned int", (1, 0), (2, 0), []) 53 | print(node.node_type) 54 | setattr(node, "child_count", 4) 55 | print(getattr(node, "child_count")) 56 | print(node) -------------------------------------------------------------------------------- /tests/llm_tests/qwen_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from http import HTTPStatus 3 | import dashscope 4 | 5 | system_prompt = """You are a text analyzer tasked with analyzing the similarity between two declarators.""" 6 | 7 | user_prompt = """Given a function pointer declarator and a function declarator, your task is to evaluate whether the parameter types of function pointer can match that of the function in following steps: 8 | 9 | - 1.Extract the parameter list separately from both the function pointer declarator and the function declarator. 10 | 11 | - 2.Compare each parameter's type individually for a match, ensuring identical names and pointer hierarchies for types to match. 12 | 13 | Note that: 14 | 15 | - 1.Certain parameter declarations may be wrapped or followed by macros like UNUSED_PARAM or unused, which do not impact the parameter type. For example, UNUSED_PARAM(int var) matches the type of int var. 16 | 17 | - 2.Types like int, long, size_t could be considered as compatible due to implicit cast. 18 | 19 | The function pointer declarator is 20 | 21 | typedef int (*ssh_packet_callback) (ssh_session session, uint8_t type, ssh_buffer packet, void *user); 22 | 23 | The function declarator is 24 | 25 | int 26 | sftp_channel_default_subsystem_request(UNUSED_PARAM(ssh_session session), 27 | UNUSED_PARAM(ssh_channel channel), 28 | UNUSED_PARAM(const char *subsystem), 29 | UNUSED_PARAM(void *userdata)) 30 | """ 31 | 32 | from dashscope.common.error import RequestFailure 33 | 34 | def call_with_messages(api_key): 35 | messages = [{'role': 'system', 'content': system_prompt}, 36 | {'role': 'user', 'content': user_prompt}] 37 | try: 38 | response = dashscope.Generation.call( 39 | dashscope.Generation.Models.qwen_max, 40 | messages=messages, 41 | api_key = api_key, 42 | result_format='message', # set the result to be "message" format. 43 | temperature=0.5 44 | ) 45 | except RequestFailure as e: 46 | if int(e.http_code) == 429: 47 | print("catch rate limit") 48 | print(e) 49 | else: 50 | print("catch other error") 51 | except Exception as e: 52 | print("catch error") 53 | 54 | if response.status_code == HTTPStatus.OK: 55 | print(response) 56 | resp: str = response["output"]["choices"][0]["message"]["content"] 57 | input_token_num: int = response["usage"]["input_tokens"] 58 | output_token_num: int = response["usage"]["output_tokens"] 59 | 60 | print("response is: {}".format(resp)) 61 | print("input token num: {} , output token num: {}".format(input_token_num, output_token_num)) 62 | elif response.status_code == 429: 63 | print("rate limit") 64 | print(response.message) 65 | print(response) 66 | else: 67 | print('Request id: %s, Status code: %s, error code: %s, error message: %s' % ( 68 | response.request_id, response.status_code, 69 | response.code, response.message 70 | )) 71 | 72 | 73 | def multi_thread_test(api_key): 74 | from concurrent.futures import ThreadPoolExecutor, as_completed 75 | from tqdm import tqdm 76 | 77 | pbar = tqdm(total=120, desc="test") 78 | 79 | def update_progress(future): 80 | pbar.update(1) 81 | 82 | def worker(): 83 | call_with_messages(api_key) 84 | 85 | executor = ThreadPoolExecutor(max_workers=120) 86 | futures = [] 87 | for i in range(120): 88 | future = executor.submit(worker) 89 | future.add_done_callback(update_progress) 90 | futures.append(future) 91 | 92 | if __name__ == '__main__': 93 | call_with_messages(sys.argv[1]) 94 | -------------------------------------------------------------------------------- /tests/llm_tests/vllm_test.py: -------------------------------------------------------------------------------- 1 | system_prompt = """You are a text analyzer tasked with analyzing the similarity between two declarators.""" 2 | 3 | user_prompt = """Given a function pointer declarator and a function declarator, your task is to evaluate whether the parameter types of function pointer can match that of the function in following steps: 4 | 5 | - 1.Extract the parameter list separately from both the function pointer declarator and the function declarator. 6 | 7 | - 2.Compare each parameter's type individually for a match, ensuring identical names and pointer hierarchies for types to match. 8 | 9 | Note that: 10 | 11 | - 1.Certain parameter declarations may be wrapped or followed by macros like UNUSED_PARAM or unused, which do not impact the parameter type. For example, UNUSED_PARAM(int var) matches the type of int var. 12 | 13 | - 2.Types like int, long, size_t could be considered as compatible due to implicit cast. 14 | 15 | The function pointer declarator is 16 | 17 | typedef int (*ssh_packet_callback) (ssh_session session, uint8_t type, ssh_buffer packet, void *user); 18 | 19 | The function declarator is 20 | 21 | int 22 | sftp_channel_default_subsystem_request(UNUSED_PARAM(ssh_session session), 23 | UNUSED_PARAM(ssh_channel channel), 24 | UNUSED_PARAM(const char *subsystem), 25 | UNUSED_PARAM(void *userdata)) 26 | """ 27 | 28 | import sys 29 | from openai import OpenAI 30 | 31 | def test_icall_decl(client: OpenAI, model_type: str, max_tokens: int = None): 32 | if "codegemma" in model_type: 33 | dialog = [{"role": "user", "content": system_prompt + "\n\n" +user_prompt}] 34 | else: 35 | dialog = [{"role": "system", "content": system_prompt}, 36 | {"role": "user", "content": user_prompt}] 37 | params = { 38 | "model": model_type, 39 | "messages": dialog 40 | } 41 | if max_tokens is not None: 42 | params["max_tokens"] = max_tokens 43 | if "llama" in model_type.lower() and "3" in model_type.lower(): 44 | params["stop"] = ["<|start_header_id|>", "<|end_header_id|>", "<|eot_id|>", "<|reserved_special_token"] 45 | response = client.chat.completions.create(**params) 46 | print(response.choices[0].message.content) 47 | print(type(response.choices[0].message)) 48 | print("=================") 49 | print("input token: {}".format(response.usage.prompt_tokens)) 50 | print("output token: {}".format(response.usage.completion_tokens)) 51 | print("total token: {}".format(response.usage.total_tokens)) 52 | 53 | def main(): 54 | model_id = sys.argv[1] 55 | address = sys.argv[2] 56 | url = "http://{}/v1".format(address) 57 | max_tokens = None if len(sys.argv) <= 3 else int(sys.argv[3]) 58 | client = OpenAI(api_key="EMPTY", base_url=url) 59 | test_icall_decl(client, model_id, max_tokens) 60 | 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /tests/test_data.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | context = """ 4 | You are a text analyzer tasked with analyzing the similarity between two declarators. 5 | 6 | 7 | 8 | Given a function pointer declarator and a function declarator, your task is to evaluate whether the parameter types of function pointer can match that of the function in following steps: 9 | 10 | - 1.Extract the parameter list separately from both the function pointer declarator and the function declarator. 11 | 12 | - 2.Compare each parameter's type individually for a match, ensuring identical names and pointer hierarchies for types to match. 13 | 14 | Note that: 15 | 16 | - 1.Certain parameter declarations may be wrapped or followed by macros like UNUSED_PARAM or unused, which do not impact the parameter type. For example, UNUSED_PARAM(int var) matches the type of int var. 17 | 18 | - 2.Types like int, long, size_t could be considered as compatible due to implicit cast. 19 | 20 | The function pointer declarator is 21 | 22 | typedef int (*ssh_packet_callback) (ssh_session session, uint8_t type, ssh_buffer packet, void *user); 23 | 24 | The function declarator is 25 | 26 | int 27 | sftp_channel_default_subsystem_request(UNUSED_PARAM(ssh_session session), 28 | UNUSED_PARAM(ssh_channel channel), 29 | UNUSED_PARAM(const char *subsystem), 30 | UNUSED_PARAM(void *userdata)) 31 | 32 | """ 33 | 34 | summ = """If function pointer parameters match function parameters, answer 'yes'; otherwise, answer 'no'. 35 | """ -------------------------------------------------------------------------------- /tests/test_data1.py: -------------------------------------------------------------------------------- 1 | 2 | system_prompt = """You are a text analyzer tasked with analyzing the similarity between two declarators.""" 3 | 4 | user_prompt = """Given a function pointer declarator and a function declarator, your task is to evaluate whether the parameter types of function pointer can match that of the function in following steps: 5 | 6 | - 1.Extract the parameter list separately from both the function pointer declarator and the function declarator. 7 | 8 | - 2.Compare each parameter's type individually for a match, ensuring identical names and pointer hierarchies for types to match. 9 | 10 | Note that: 11 | 12 | - 1.Certain parameter declarations may be wrapped or followed by macros like UNUSED_PARAM or unused, which do not impact the parameter type. For example, UNUSED_PARAM(int var) matches the type of int var. 13 | 14 | - 2.Types like int, long, size_t could be considered as compatible due to implicit cast. 15 | 16 | The function pointer declarator is 17 | 18 | typedef int (*ssh_packet_callback) (ssh_session session, uint8_t type, ssh_buffer packet, void *user); 19 | 20 | The function declarator is 21 | 22 | int 23 | sftp_channel_default_subsystem_request(UNUSED_PARAM(ssh_session session), 24 | UNUSED_PARAM(ssh_channel channel), 25 | UNUSED_PARAM(const char *subsystem), 26 | UNUSED_PARAM(void *userdata)) 27 | """ -------------------------------------------------------------------------------- /tests/testcases/enum_decl.c: -------------------------------------------------------------------------------- 1 | 2 | 3 | typedef enum { 4 | CAIRO_BOILERPLATE_MODE_TEST, 5 | CAIRO_BOILERPLATE_MODE_PERF, 6 | 7 | /* This will allow running performance test with threads. The 8 | * GL backend is very slow on some drivers when run with thread 9 | * awareness turned on. */ 10 | CAIRO_BOILERPLATE_MODE_PERF_THREADS, 11 | } cairo_boilerplate_mode_t; 12 | 13 | 14 | typedef enum _cairo_backend_type { 15 | CAIRO_TYPE_DEFAULT, 16 | CAIRO_TYPE_SKIA, 17 | } cairo_backend_type_t; 18 | 19 | enum Status { 20 | ABNORMAL, 21 | NORMAL 22 | }; 23 | 24 | enum { 25 | CAIRO_BOILERPLATE_MODE_TEST, 26 | CAIRO_BOILERPLATE_MODE_PERF, 27 | 28 | /* This will allow running performance test with threads. The 29 | * GL backend is very slow on some drivers when run with thread 30 | * awareness turned on. */ 31 | CAIRO_BOILERPLATE_MODE_PERF_THREADS, 32 | } cairo_boilerplate_mode_t; -------------------------------------------------------------------------------- /tests/testcases/error_file_vlc.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | static int Activate (vlc_object_t *); 10 | 11 | vlc_module_begin () 12 | set_subcategory (SUBCAT_AUDIO_AFILTER) 13 | set_description (N_("Integer audio volume")) 14 | set_capability ("audio volume", 9) 15 | set_callback(Activate) 16 | vlc_module_end () 17 | 18 | static void FilterS32N (audio_volume_t *vol, block_t *block, float volume) { 19 | 20 | } -------------------------------------------------------------------------------- /tests/testcases/error_global_var_decl.c: -------------------------------------------------------------------------------- 1 | const pg_wchar_tbl pg_wchar_table[] = { 2 | {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1}, /* PG_SQL_ASCII */ 3 | {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3}, /* PG_EUC_JP */ 4 | {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2}, /* PG_EUC_CN */ 5 | {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3}, /* PG_EUC_KR */ 6 | {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4}, /* PG_EUC_TW */ 7 | {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3}, /* PG_EUC_JIS_2004 */ 8 | {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4}, /* PG_UTF8 */ 9 | {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4}, /* PG_MULE_INTERNAL */ 10 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN1 */ 11 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN2 */ 12 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN3 */ 13 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN4 */ 14 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN5 */ 15 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN6 */ 16 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN7 */ 17 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN8 */ 18 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN9 */ 19 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN10 */ 20 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1256 */ 21 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1258 */ 22 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN866 */ 23 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN874 */ 24 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_KOI8R */ 25 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1251 */ 26 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1252 */ 27 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* ISO-8859-5 */ 28 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* ISO-8859-6 */ 29 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* ISO-8859-7 */ 30 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* ISO-8859-8 */ 31 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1250 */ 32 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1253 */ 33 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1254 */ 34 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1255 */ 35 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1257 */ 36 | {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_KOI8U */ 37 | {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2}, /* PG_SJIS */ 38 | {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2}, /* PG_BIG5 */ 39 | {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2}, /* PG_GBK */ 40 | {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2}, /* PG_UHC */ 41 | {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4}, /* PG_GB18030 */ 42 | {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3}, /* PG_JOHAB */ 43 | {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2} /* PG_SHIFT_JIS_2004 */ 44 | }; 45 | 46 | 47 | extern PGDLLIMPORT const pg_wchar_tbl pg_wchar_table[]; -------------------------------------------------------------------------------- /tests/testcases/error_snd_global.c: -------------------------------------------------------------------------------- 1 | /* 2 | 这个case中宏定义会被正确解析, 3 | 但是typedef SF_CART_INFO_VAR (256) SF_CART_INFO ; 4 | 引用了宏导致下面所有的类型定义语句都被错误解析 5 | */ 6 | 7 | #define SF_CART_INFO_VAR(p_tag_text_size) \ 8 | struct \ 9 | { char version [4] ; \ 10 | char title [64] ; \ 11 | char artist [64] ; \ 12 | char cut_id [64] ; \ 13 | char client_id [64] ; \ 14 | char category [64] ; \ 15 | char classification [64] ; \ 16 | char out_cue [64] ; \ 17 | char start_date [10] ; \ 18 | char start_time [8] ; \ 19 | char end_date [10] ; \ 20 | char end_time [8] ; \ 21 | char producer_app_id [64] ; \ 22 | char producer_app_version [64] ; \ 23 | char user_def [64] ; \ 24 | int32_t level_reference ; \ 25 | SF_CART_TIMER post_timers [8] ; \ 26 | char reserved [276] ; \ 27 | char url [1024] ; \ 28 | uint32_t tag_text_size ; \ 29 | char tag_text [p_tag_text_size] ; \ 30 | } 31 | 32 | typedef SF_CART_INFO_VAR (256) SF_CART_INFO ; 33 | 34 | /* Virtual I/O functionality. */ 35 | 36 | typedef sf_count_t (*sf_vio_get_filelen) (void *user_data) ; 37 | typedef sf_count_t (*sf_vio_seek) (sf_count_t offset, int whence, void *user_data) ; 38 | typedef sf_count_t (*sf_vio_read) (void *ptr, sf_count_t count, void *user_data) ; 39 | typedef sf_count_t (*sf_vio_write) (const void *ptr, sf_count_t count, void *user_data) ; 40 | typedef sf_count_t (*sf_vio_tell) (void *user_data) ; 41 | 42 | struct SF_VIRTUAL_IO 43 | { sf_vio_get_filelen get_filelen ; 44 | sf_vio_seek seek ; 45 | sf_vio_read read ; 46 | sf_vio_write write ; 47 | sf_vio_tell tell ; 48 | } ; 49 | 50 | typedef struct SF_VIRTUAL_IO SF_VIRTUAL_IO ; 51 | -------------------------------------------------------------------------------- /tests/testcases/error_struct_decl.c: -------------------------------------------------------------------------------- 1 | #ifndef CAIRO_PRIVATE_H 2 | #define CAIRO_PRIVATE_H 3 | 4 | #include "cairo-types-private.h" 5 | #include "cairo-reference-count-private.h" 6 | 7 | CAIRO_BEGIN_DECLS 8 | 9 | struct _cairo { 10 | cairo_reference_count_t ref_count; 11 | cairo_status_t status; 12 | cairo_user_data_array_t user_data; 13 | 14 | const cairo_backend_t *backend; 15 | }; -------------------------------------------------------------------------------- /tests/testcases/func_test2.c: -------------------------------------------------------------------------------- 1 | #ifndef DDSI_UNUSED_H 2 | #define DDSI_UNUSED_H 3 | 4 | #ifdef __GNUC__ 5 | #define UNUSED_ARG(x) x __attribute__ ((unused)) 6 | #else 7 | 8 | #define UNUSED_PARAM(param) param __unused__ 9 | 10 | 11 | // case from igraph,这个case中tree-sitter错误将这个全局变量定义解析为一个全局变量声明和一个变量定义。 12 | static IGRAPH_THREAD_LOCAL igraph_error_handler_t *igraph_i_error_handler = 0; 13 | 14 | static void 15 | status_prompt_menu_callback(__unused struct menu *menu, u_int idx, key_code key, 16 | void *data) { 17 | fp = (ngx_flag_t *) (p + cmd->offset); 18 | } 19 | 20 | static int 21 | ovpn_nl_cb_finish(struct nl_msg (*msg) __attribute__ ((unused)), void *arg) { 22 | } 23 | 24 | static cairo_test_status_t 25 | record_replay (cairo_t *cr, cairo_t *(*func)(cairo_t *), int width, int height) { 26 | } 27 | 28 | static void CAIRO_BOILERPLATE_PRINTF_FORMAT(2,3) 29 | _log (cairo_test_context_t *ctx, 30 | const char *fmt, 31 | ...) { 32 | } 33 | 34 | // __attribute__((__unused__))这种GCC扩展语法无法用tree-sitter处理 35 | static int 36 | setup_env(void **unused __attribute__((__unused__))) { 37 | char *env = getenv("ISC_BENCHMARK_LOOPS"); 38 | if (env != NULL) { 39 | loops = atoi(env); 40 | } 41 | assert_int_not_equal(loops, 0); 42 | 43 | env = getenv("ISC_BENCHMARK_DELAY"); 44 | if (env != NULL) { 45 | delay_loop = atoi(env); 46 | } 47 | assert_int_not_equal(delay_loop, 0); 48 | 49 | return (0); 50 | } 51 | 52 | 53 | static isc_result_t 54 | publish_key(dns_diff_t *diff, dns_dnsseckey_t *key, const dns_name_t *origin, 55 | dns_ttl_t ttl, isc_mem_t *mctx, 56 | void (*report)(const char *, ...) ISC_FORMAT_PRINTF(1, 2)) { 57 | dns_rdata_ #_t *# = source; 58 | } 59 | 60 | 61 | void 62 | dns_rdatalist_disassociate(dns_rdataset_t *rdataset DNS__DB_FLARG) { 63 | UNUSED(rdataset); 64 | } 65 | 66 | // 从hdf5收集的样本 67 | static herr_t 68 | H5P__ocrt_pipeline_copy(const char H5_ATTR_UNUSED *name, size_t H5_ATTR_UNUSED size, void *value) { 69 | ... 70 | } 71 | 72 | int a[] = {1, 2, 3, 4}; 73 | 74 | size_t 75 | sudo_strlcpy(char * restrict dst, const char * restrict src, size_t dsize) { 76 | 77 | } 78 | 79 | static void default_log_func( 80 | __attribute__(( unused )) enum fuse_log_level level, 81 | const char *fmt, va_list ap) 82 | { 83 | vfprintf(stderr, fmt, ap); 84 | } 85 | 86 | static OM_uint32 KRB5_CALLCONV 87 | krb5_gss_inquire_attrs_for_mech(OM_uint32 *minor_status, 88 | gss_const_OID mech, 89 | gss_OID_set *mech_attrs, 90 | gss_OID_set *known_mech_attrs){ 91 | } 92 | 93 | static void none_crypt(UNUSED_PARAM(struct ssh_cipher_struct *cipher), 94 | void *in, 95 | void *out, 96 | size_t len) 97 | { 98 | memcpy(out, in, len); 99 | } -------------------------------------------------------------------------------- /tests/testcases/macro_def_decl.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #ifdef HAVE_STRLCPY 8 | # define cpy_default strlcpy 9 | #else 10 | # define cpy_default sudo_strlcpy 11 | #endif 12 | 13 | #define H5FD_SEC2 (H5FDperform_init(H5FD_sec2_init)) -------------------------------------------------------------------------------- /tests/testcases/macro_expand/macro_test.c: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | #define LOGIT(result) \ 5 | if (result == ISC_R_NOMEMORY) \ 6 | (*callbacks->error)(callbacks, "dns_master_load: %s", \ 7 | isc_result_totext(result)); \ 8 | else \ 9 | (*callbacks->error)(callbacks, "%s: %s:%lu: %s", \ 10 | "dns_master_load", source, line, \ 11 | isc_result_totext(result)) 12 | 13 | struct dns_rdatacallbacks { 14 | unsigned int magic; 15 | 16 | /*% 17 | * dns_load_master calls this when it has rdatasets to commit. 18 | */ 19 | dns_addrdatasetfunc_t add; 20 | 21 | /*% 22 | * dns_master_load*() call this when loading a raw zonefile, 23 | * to pass back information obtained from the file header 24 | */ 25 | dns_rawdatafunc_t rawdata; 26 | dns_zone_t *zone; 27 | 28 | /*% 29 | * dns_load_master / dns_rdata_fromtext call this to issue a error. 30 | */ 31 | void (*error)(struct dns_rdatacallbacks *, const char *, ...); 32 | /*% 33 | * dns_load_master / dns_rdata_fromtext call this to issue a warning. 34 | */ 35 | void (*warn)(struct dns_rdatacallbacks *, const char *, ...); 36 | /*% 37 | * Private data handles for use by the above callback functions. 38 | */ 39 | void *add_private; 40 | void *error_private; 41 | void *warn_private; 42 | }; 43 | 44 | 45 | typedef struct dns_rdatacallbacks dns_rdatacallbacks_t; 46 | 47 | int main() { 48 | dns_rdatacallbacks_t *callbacks; 49 | isc_result_t result = ISC_R_UNEXPECTED; 50 | LOGIT(result); 51 | return 0; 52 | } -------------------------------------------------------------------------------- /tests/testcases/macro_expand/macro_test10.c: -------------------------------------------------------------------------------- 1 | 2 | #define MD_LOG(msg) \ 3 | do { \ 4 | if(ctx->parser.debug_log != NULL) \ 5 | ctx->parser.debug_log((msg), ctx->userdata); \ 6 | } while(0) 7 | 8 | 9 | #define MD_ENTER_SPAN(type, arg) \ 10 | do { \ 11 | ret = ctx->parser.enter_span((type), (arg), ctx->userdata); \ 12 | if(ret != 0) { \ 13 | MD_LOG("Aborted from enter_span() callback."); \ 14 | goto abort; \ 15 | } \ 16 | } while(0) 17 | 18 | static int 19 | md_enter_leave_span_a(MD_CTX* ctx, int enter, MD_SPANTYPE type, 20 | const CHAR* dest, SZ dest_size, int prohibit_escapes_in_dest, 21 | const CHAR* title, SZ title_size) { 22 | MD_ENTER_SPAN(type, &det); 23 | } -------------------------------------------------------------------------------- /tests/testcases/macro_expand/macro_test11.c: -------------------------------------------------------------------------------- 1 | 2 | #define STR(off) (ctx->text + (off)) 3 | 4 | 5 | #define MD_LOG(msg) \ 6 | do { \ 7 | if(ctx->parser.debug_log != NULL) \ 8 | ctx->parser.debug_log((msg), ctx->userdata); \ 9 | } while(0) 10 | 11 | 12 | #define MD_TEXT(type, str, size) \ 13 | do { \ 14 | if(size > 0) { \ 15 | ret = ctx->parser.text((type), (str), (size), ctx->userdata); \ 16 | if(ret != 0) { \ 17 | MD_LOG("Aborted from text() callback."); \ 18 | goto abort; \ 19 | } \ 20 | } \ 21 | } while(0) 22 | 23 | 24 | static int 25 | md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) { 26 | MD_TEXTTYPE text_type; 27 | OFF off = lines[0].beg; 28 | OFF tmp = (line->end < mark->beg ? line->end : mark->beg); 29 | MD_TEXT(text_type, STR(mark->beg+1), 1); 30 | } -------------------------------------------------------------------------------- /tests/testcases/macro_expand/macro_test12.c: -------------------------------------------------------------------------------- 1 | 2 | #define CALL_METHOD(obj, method, args...) (obj)->method(obj, ## args) 3 | 4 | static void 5 | init_config_bail(struct rtpp_cfg *cfsp, int rval, const char *msg, int memdeb) { 6 | CALL_METHOD(cfsp->bindaddrs_cf, dtor); 7 | } -------------------------------------------------------------------------------- /tests/testcases/macro_expand/macro_test13.c: -------------------------------------------------------------------------------- 1 | 2 | #define CALL_SMETHOD(obj, method, args...) GET_SMETHODS(obj)->method(obj, ## args) 3 | #define CALL_SMETHOD1(obj, method, ...) GET_SMETHODS(obj)->method(obj) 4 | 5 | #define GET_SMETHODS(obj) _Generic((obj), \ 6 | struct rtpp_refcnt *: rtpp_refcnt_smethods, \ 7 | struct rtpp_pearson_perfect *: rtpp_pearson_perfect_smethods, \ 8 | struct rtpp_netaddr *: rtpp_netaddr_smethods, \ 9 | struct rtpp_server *: rtpp_server_smethods, \ 10 | struct rtpp_stats *: rtpp_stats_smethods, \ 11 | struct rtpp_timed *: rtpp_timed_smethods, \ 12 | struct rtpp_stream *: rtpp_stream_smethods, \ 13 | struct rtpp_pcount *: rtpp_pcount_smethods, \ 14 | struct rtpp_record *: rtpp_record_smethods, \ 15 | struct rtpp_hash_table *: rtpp_hash_table_smethods, \ 16 | struct rtpp_weakref *: rtpp_weakref_smethods, \ 17 | struct rtpp_analyzer *: rtpp_analyzer_smethods, \ 18 | struct rtpp_pcnt_strm *: rtpp_pcnt_strm_smethods, \ 19 | struct rtpp_ttl *: rtpp_ttl_smethods, \ 20 | struct rtpp_pipe *: rtpp_pipe_smethods, \ 21 | struct rtpp_ringbuf *: rtpp_ringbuf_smethods, \ 22 | struct rtpp_sessinfo *: rtpp_sessinfo_smethods, \ 23 | struct rtpp_rw_lock *: rtpp_rw_lock_smethods, \ 24 | struct rtpp_proc_servers *: rtpp_proc_servers_smethods, \ 25 | struct rtpp_proc_wakeup *: rtpp_proc_wakeup_smethods, \ 26 | struct pproc_manager *: pproc_manager_smethods \ 27 | ) 28 | 29 | static void 30 | init_cstats(struct rtpp_stats *sobj, struct rtpp_command_stats *csp) 31 | { 32 | CALL_SMETHOD(sobj, getidxbyname, "ncmds_rcvd"); 33 | } -------------------------------------------------------------------------------- /tests/testcases/macro_expand/macro_test14.c: -------------------------------------------------------------------------------- 1 | 2 | #define CALL_METHOD(obj, method, args...) (obj)->method(obj, ## args) 3 | 4 | #define RTPP_LOG(log, args...) CALL_METHOD((log), genwrite, __FUNCTION__, \ 5 | __LINE__, ## args) 6 | 7 | struct ul_opts * 8 | rtpp_command_ul_opts_parse(const struct rtpp_cfg *cfsp, struct rtpp_command *cmd) { 9 | RTPP_LOG(cmd->glog, RTPP_LOG_ERR, "DELETE: unknown command modifier %c'", *cp); 10 | } -------------------------------------------------------------------------------- /tests/testcases/macro_expand/macro_test15.c: -------------------------------------------------------------------------------- 1 | #define ONIGENC_MBC_TO_CODE(enc,p,end) (enc)->mbc_to_code((p),(end)) 2 | 3 | typedef struct OnigEncodingTypeST { 4 | int (*mbc_enc_len)(const OnigUChar* p); 5 | const char* name; 6 | int max_enc_len; 7 | int min_enc_len; 8 | int (*is_mbc_newline)(const OnigUChar* p, const OnigUChar* end); 9 | OnigCodePoint (*mbc_to_code)(const OnigUChar* p, const OnigUChar* end); 10 | int (*code_to_mbclen)(OnigCodePoint code); 11 | int (*code_to_mbc)(OnigCodePoint code, OnigUChar *buf); 12 | int (*mbc_case_fold)(OnigCaseFoldType flag, const OnigUChar** pp, const OnigUChar* end, OnigUChar* to); 13 | int (*apply_all_case_fold)(OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void* arg); 14 | int (*get_case_fold_codes_by_str)(OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem acs[]); 15 | int (*property_name_to_ctype)(struct OnigEncodingTypeST* enc, OnigUChar* p, OnigUChar* end); 16 | int (*is_code_ctype)(OnigCodePoint code, OnigCtype ctype); 17 | int (*get_ctype_code_range)(OnigCtype ctype, OnigCodePoint* sb_out, const OnigCodePoint* ranges[]); 18 | OnigUChar* (*left_adjust_char_head)(const OnigUChar* start, const OnigUChar* p); 19 | int (*is_allowed_reverse_match)(const OnigUChar* p, const OnigUChar* end); 20 | int (*init)(void); 21 | int (*is_initialized)(void); 22 | int (*is_valid_mbc_string)(const OnigUChar* s, const OnigUChar* end); 23 | unsigned int flag; 24 | OnigCodePoint sb_range; 25 | int index; 26 | } OnigEncodingType; 27 | 28 | typedef OnigEncodingType* OnigEncoding; 29 | 30 | static void 31 | print_enc_string(FILE* fp, OnigEncoding enc, 32 | const UChar *s, const UChar *end) { 33 | const UChar *p; 34 | ONIGENC_MBC_TO_CODE(enc, p, end); 35 | } -------------------------------------------------------------------------------- /tests/testcases/macro_expand/macro_test16.c: -------------------------------------------------------------------------------- 1 | 2 | #define ONIGENC_MBC_ENC_LEN(enc,p) (enc)->mbc_enc_len(p) 3 | 4 | #define enclen(enc,p) ONIGENC_MBC_ENC_LEN(enc,p) 5 | 6 | static int 7 | compile_length_string_node(Node* node, regex_t* reg) { 8 | UChar *p; 9 | OnigEncoding enc = reg->enc; 10 | enclen(enc, p); 11 | } -------------------------------------------------------------------------------- /tests/testcases/macro_expand/macro_test17.c: -------------------------------------------------------------------------------- 1 | 2 | #define ERREXIT1(cinfo, code, p1) \ 3 | ((cinfo)->err->msg_code = (code), \ 4 | (cinfo)->err->msg_parm.i[0] = (p1), \ 5 | (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo))) 6 | 7 | static void 8 | start_pass_fdctmgr(j_compress_ptr cinfo) { 9 | int qtblno; 10 | ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno); 11 | } -------------------------------------------------------------------------------- /tests/testcases/macro_expand/macro_test18.c: -------------------------------------------------------------------------------- 1 | 2 | #define MAKESTMT(stuff) do { stuff } while (0) 3 | 4 | #define TRACEMS4(cinfo, lvl, code, p1, p2, p3, p4) \ 5 | MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \ 6 | _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \ 7 | (cinfo)->err->msg_code = (code); \ 8 | (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); ) 9 | 10 | 11 | int 12 | get_sof(j_decompress_ptr cinfo, boolean is_prog, boolean is_lossless, 13 | boolean is_arith) { 14 | TRACEMS4(cinfo, 1, JTRC_SOF, cinfo->unread_marker, 15 | (int)cinfo->image_width, (int)cinfo->image_height, 16 | cinfo->num_components); 17 | } -------------------------------------------------------------------------------- /tests/testcases/macro_expand/macro_test19.c: -------------------------------------------------------------------------------- 1 | #define INPUT_RELOAD(cinfo) \ 2 | ( next_input_byte = datasrc->next_input_byte, \ 3 | bytes_in_buffer = datasrc->bytes_in_buffer ) 4 | 5 | #define MAKE_BYTE_AVAIL(cinfo, action) \ 6 | if (bytes_in_buffer == 0) { \ 7 | if (!(*datasrc->fill_input_buffer) (cinfo)) \ 8 | { action; } \ 9 | INPUT_RELOAD(cinfo); \ 10 | } 11 | 12 | #define MAKESTMT(stuff) do { stuff } while (0) 13 | 14 | #define INPUT_BYTE(cinfo, V, action) \ 15 | MAKESTMT( MAKE_BYTE_AVAIL(cinfo, action); \ 16 | bytes_in_buffer--; \ 17 | V = *next_input_byte++; ) 18 | 19 | int 20 | get_sof(j_decompress_ptr cinfo, boolean is_prog, boolean is_lossless, 21 | boolean is_arith) { 22 | INPUT_BYTE(cinfo, cinfo->data_precision, return FALSE); 23 | } -------------------------------------------------------------------------------- /tests/testcases/macro_expand/macro_test2.c: -------------------------------------------------------------------------------- 1 | 2 | 3 | #define gmp_alloc(size) ((*gmp_allocate_func)((size))) 4 | 5 | static mp_ptr 6 | gmp_alloc_limbs (mp_size_t size) 7 | { 8 | gmp_alloc (size * sizeof (mp_limb_t)); 9 | } -------------------------------------------------------------------------------- /tests/testcases/macro_expand/macro_test3.c: -------------------------------------------------------------------------------- 1 | 2 | #define LOADPTR(PTR, PTRINFO) \ 3 | (assert((PTRINFO)->loadptr != NULL), (PTRINFO)->loadptr(PTR)) 4 | 5 | struct ptr_info { 6 | void *(*loadptr)(const void *); 7 | void (*storeptr)(void *, void *); 8 | const struct atype_info *basetype; 9 | }; 10 | 11 | static void 12 | free_atype(const struct atype_info *a, void *val) { 13 | const struct ptr_info *ptrinfo = a->tinfo; 14 | LOADPTR(val, ptrinfo); 15 | } -------------------------------------------------------------------------------- /tests/testcases/macro_expand/macro_test4.c: -------------------------------------------------------------------------------- 1 | 2 | #define callfrealloc(g,block,os,ns) ((*g->frealloc)(g->ud, block, os, ns)) 3 | 4 | #define firsttry(g,block,os,ns) callfrealloc(g, block, os, ns) 5 | 6 | typedef void * (*lua_Alloc) (void *ud, void *ptr, size_t osize, size_t nsize); 7 | 8 | typedef struct global_State { 9 | lua_Alloc frealloc; /* function to reallocate memory */ 10 | void *ud; /* auxiliary data to 'frealloc' */ 11 | l_mem totalbytes; /* number of bytes currently allocated - GCdebt */ 12 | l_mem GCdebt; /* bytes allocated not yet compensated by the collector */ 13 | lu_mem GCestimate; /* an estimate of the non-garbage memory in use */ 14 | lu_mem lastatomic; /* see function 'genstep' in file 'lgc.c' */ 15 | stringtable strt; /* hash table for strings */ 16 | TValue l_registry; 17 | TValue nilvalue; /* a nil value */ 18 | unsigned int seed; /* randomized seed for hashes */ 19 | lu_byte currentwhite; 20 | lu_byte gcstate; /* state of garbage collector */ 21 | lu_byte gckind; /* kind of GC running */ 22 | lu_byte gcstopem; /* stops emergency collections */ 23 | lu_byte genminormul; /* control for minor generational collections */ 24 | lu_byte genmajormul; /* control for major generational collections */ 25 | lu_byte gcstp; /* control whether GC is running */ 26 | lu_byte gcemergency; /* true if this is an emergency collection */ 27 | lu_byte gcpause; /* size of pause between successive GCs */ 28 | lu_byte gcstepmul; /* GC "speed" */ 29 | lu_byte gcstepsize; /* (log2 of) GC granularity */ 30 | GCObject *allgc; /* list of all collectable objects */ 31 | GCObject **sweepgc; /* current position of sweep in list */ 32 | GCObject *finobj; /* list of collectable objects with finalizers */ 33 | GCObject *gray; /* list of gray objects */ 34 | GCObject *grayagain; /* list of objects to be traversed atomically */ 35 | GCObject *weak; /* list of tables with weak values */ 36 | GCObject *ephemeron; /* list of ephemeron tables (weak keys) */ 37 | GCObject *allweak; /* list of all-weak tables */ 38 | GCObject *tobefnz; /* list of userdata to be GC */ 39 | GCObject *fixedgc; /* list of objects not to be collected */ 40 | /* fields for generational collector */ 41 | GCObject *survival; /* start of objects that survived one GC cycle */ 42 | GCObject *old1; /* start of old1 objects */ 43 | GCObject *reallyold; /* objects more than one cycle old ("really old") */ 44 | GCObject *firstold1; /* first OLD1 object in the list (if any) */ 45 | GCObject *finobjsur; /* list of survival objects with finalizers */ 46 | GCObject *finobjold1; /* list of old1 objects with finalizers */ 47 | GCObject *finobjrold; /* list of really old objects with finalizers */ 48 | struct lua_State *twups; /* list of threads with open upvalues */ 49 | lua_CFunction panic; /* to be called in unprotected errors */ 50 | struct lua_State *mainthread; 51 | TString *memerrmsg; /* message for memory-allocation errors */ 52 | TString *tmname[TM_N]; /* array with tag-method names */ 53 | struct Table *mt[LUA_NUMTYPES]; /* metatables for basic types */ 54 | TString *strcache[STRCACHE_N][STRCACHE_M]; /* cache for strings in API */ 55 | lua_WarnFunction warnf; /* warning function */ 56 | void *ud_warn; /* auxiliary data to 'warnf' */ 57 | } global_State; 58 | 59 | 60 | void *luaM_realloc_ (lua_State *L, void *block, size_t osize, size_t nsize) { 61 | global_State *g = G(L); 62 | firsttry(g, block, osize, nsize); 63 | } -------------------------------------------------------------------------------- /tests/testcases/macro_expand/macro_test5.c: -------------------------------------------------------------------------------- 1 | 2 | #define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a } 3 | 4 | typedef void (*Pfunc) (lua_State *L, void *ud); 5 | 6 | int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud) { 7 | l_uint32 oldnCcalls = L->nCcalls; 8 | struct lua_longjmp lj; 9 | lj.status = LUA_OK; 10 | lj.previous = L->errorJmp; /* chain new error handler */ 11 | L->errorJmp = &lj; 12 | LUAI_TRY(L, &lj, 13 | (*f)(L, ud); 14 | ); 15 | L->errorJmp = lj.previous; /* restore old error handler */ 16 | L->nCcalls = oldnCcalls; 17 | return lj.status; 18 | } -------------------------------------------------------------------------------- /tests/testcases/macro_expand/macro_test6.c: -------------------------------------------------------------------------------- 1 | 2 | #define cast(t, exp) ((t)(exp)) 3 | 4 | typedef void * (*lua_Alloc) (void *ud, void *ptr, size_t osize, size_t nsize); 5 | 6 | lua_State *lua_newstate (lua_Alloc f, void *ud) { 7 | int i; 8 | lua_State *L; 9 | global_State *g; 10 | cast(LG*, (*f)(ud, NULL, LUA_TTHREAD, sizeof(LG))); 11 | } -------------------------------------------------------------------------------- /tests/testcases/macro_expand/macro_test7.c: -------------------------------------------------------------------------------- 1 | 2 | #define sudo_debug_exit_bool(_func, _file, _line, _sys, _ret) \ 3 | do { \ 4 | sudo_debug_printf2(NULL, NULL, 0, (_sys) | SUDO_DEBUG_TRACE, \ 5 | "<- %s @ %s:%d := %s", (_func), (_file), (_line), (_ret) ? "true": "false");\ 6 | } while (0) 7 | 8 | #define debug_return_bool(ret) \ 9 | do { \ 10 | bool sudo_debug_ret = (ret); \ 11 | sudo_debug_exit_bool(__func__, __FILE__, __LINE__, sudo_debug_subsys, \ 12 | sudo_debug_ret); \ 13 | return sudo_debug_ret; \ 14 | } while (0) 15 | 16 | 17 | struct sudo_defs_types { 18 | const char *name; 19 | int type; 20 | const char *desc; 21 | struct def_values *values; 22 | bool (*callback)(struct sudoers_context *ctx, const char *file, int line, int column, const union sudo_defs_val *, int op); 23 | union sudo_defs_val sd_un; 24 | }; 25 | 26 | 27 | static bool 28 | run_callback(struct sudoers_context *ctx, const char *file, int line, 29 | int column, struct sudo_defs_types *def, int op) 30 | { 31 | debug_decl(run_callback, SUDOERS_DEBUG_DEFAULTS); 32 | 33 | if (def->callback == NULL) 34 | debug_return_bool(true); 35 | debug_return_bool(def->callback(ctx, file, line, column, &def->sd_un, op)); 36 | } -------------------------------------------------------------------------------- /tests/testcases/macro_expand/macro_test8.c: -------------------------------------------------------------------------------- 1 | 2 | #define VBUF_SPACE(v,n) ((v)->space((v),(n))) 3 | 4 | #define VBUF_SKIP(bp) do { \ 5 | while ((bp)->cnt > 0 && *(bp)->ptr) \ 6 | (bp)->ptr++, (bp)->cnt--; \ 7 | } while (0) 8 | 9 | #define VBUF_SNPRINTF(bp, sz, fmt, arg) do { \ 10 | if (VBUF_SPACE((bp), (sz)) != 0) \ 11 | return (bp); \ 12 | sprintf((char *) (bp)->ptr, (fmt), (arg)); \ 13 | VBUF_SKIP(bp); \ 14 | } while (0) 15 | 16 | #define vstring_str(vp) ((char *) (vp)->vbuf.data) 17 | 18 | struct VBUF { 19 | int flags; /* status, see below */ 20 | unsigned char *data; /* variable-length buffer */ 21 | ssize_t len; /* buffer length */ 22 | ssize_t cnt; /* bytes left to read/write */ 23 | unsigned char *ptr; /* read/write position */ 24 | VBUF_GET_READY_FN get_ready; /* read buffer empty action */ 25 | VBUF_PUT_READY_FN put_ready; /* write buffer full action */ 26 | VBUF_SPACE_FN space; /* request for buffer space */ 27 | }; 28 | 29 | typedef int (*VBUF_SPACE_FN) (VBUF *, ssize_t); 30 | 31 | typedef struct VBUF VBUF; 32 | 33 | VBUF *vbuf_print(VBUF *bp, const char *format, va_list ap) { 34 | static VSTRING *fmt; /* format specifier */ 35 | int width; /* width and numerical precision */ 36 | int prec; /* are signed for overflow defense */ 37 | char *s; 38 | VBUF_SNPRINTF(bp, (width > prec ? width : prec) + INT_SPACE, 39 | vstring_str(fmt), s); 40 | } -------------------------------------------------------------------------------- /tests/testcases/macro_expand/macro_test9.c: -------------------------------------------------------------------------------- 1 | 2 | #define CHECK(expr) do { \ 3 | status=expr; if (status!=PJ_SUCCESS) return status; } \ 4 | while (0) 5 | 6 | typedef pj_status_t (*pj_json_writer)(const char *s, 7 | unsigned size, 8 | void *user_data); 9 | 10 | struct write_state 11 | { 12 | pj_json_writer writer; 13 | void *user_data; 14 | char indent_buf[MAX_INDENT]; 15 | int indent; 16 | char space[PJ_JSON_NAME_MIN_LEN]; 17 | }; 18 | 19 | static pj_status_t write_children(const pj_json_list *list, 20 | const char quotes[2], 21 | struct write_state *st) { 22 | CHECK( st->writer( "\n", 1, st->user_data) ); 23 | } -------------------------------------------------------------------------------- /tests/testcases/ngx_struct_specifier.c: -------------------------------------------------------------------------------- 1 | typedef BOOL(WINAPI* LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); 2 | struct ngx_buf_s { 3 | u_char *pos; 4 | u_char *last; 5 | off_t file_pos; 6 | off_t file_last; 7 | 8 | u_char *start; /* start of buffer */ 9 | u_char *end; /* end of buffer */ 10 | ngx_buf_tag_t tag; 11 | ngx_file_t *file; 12 | ngx_buf_t *shadow; 13 | 14 | 15 | /* the buf's content could be changed */ 16 | unsigned temporary:1; 17 | 18 | /* 19 | * the buf's content is in a memory cache or in a read only memory 20 | * and must not be changed 21 | */ 22 | unsigned memory:1; 23 | 24 | /* the buf's content is mmap()ed and must not be changed */ 25 | unsigned mmap:1; 26 | 27 | unsigned recycled:1; 28 | unsigned in_file:1; 29 | unsigned flush:1; 30 | unsigned sync:1; 31 | unsigned last_buf:1; 32 | unsigned last_in_chain:1; 33 | 34 | unsigned last_shadow:1; 35 | unsigned temp_file:1; 36 | 37 | /* STUB */ int num; 38 | }; 39 | 40 | struct ngx_buf_s ngx_bug_var1; 41 | 42 | struct A { 43 | int a, b, *c; 44 | int (*add)(int, int); 45 | } Aa; 46 | 47 | 48 | struct student { 49 | const int num; 50 | char* str; 51 | }; 52 | 53 | void main() { 54 | struct student s; 55 | s.num = 1; 56 | s.str = "abc"; 57 | p->func(s.str[0]); // char 58 | code((struct ngx_http_script_engine_t **) &e); 59 | } 60 | -------------------------------------------------------------------------------- /tests/testcases/struct_case1.c: -------------------------------------------------------------------------------- 1 | 2 | typedef struct mosquitto_plugin_id_t{ 3 | struct mosquitto__listener *listener; 4 | } mosquitto_plugin_id_t; 5 | 6 | // typedef中首先定义匿名结构体再定义类型别名 7 | typedef struct { 8 | const void* srcPtr; 9 | size_t srcSize; 10 | void* cPtr; 11 | size_t cRoom; 12 | size_t cSize; 13 | void* resPtr; 14 | size_t resSize; 15 | } blockParam_t; 16 | 17 | // 随便构造的,一个field_declaration定义多个field 18 | struct Student { 19 | int num, stunumber, *datas, friend_ids[10]; 20 | }; 21 | 22 | // 复杂匿名结构体定义+变量声明+初始化,from hdf5 23 | struct { 24 | herr_t (*func)(void); 25 | const char *descr; 26 | } initializer[] = { 27 | {H5E_init, "error"} 28 | ,{H5VL_init_phase1, "VOL"} 29 | ,{H5SL_init, "skip lists"} 30 | ,{H5FD_init, "VFD"} 31 | ,{H5_default_vfd_init, "default VFD"} 32 | ,{H5P_init_phase1, "property list"} 33 | ,{H5AC_init, "metadata caching"} 34 | ,{H5L_init, "link"} 35 | ,{H5S_init, "dataspace"} 36 | ,{H5PL_init, "plugins"} 37 | /* Finish initializing interfaces that depend on the interfaces above */ 38 | ,{H5P_init_phase2, "property list"} 39 | ,{H5VL_init_phase2, "VOL"} 40 | }; 41 | 42 | 43 | typedef struct H5FS_section_class_t { 44 | /* Class variables */ 45 | const unsigned type; /* Type of free space section */ 46 | size_t serial_size; /* Size of serialized form of section */ 47 | unsigned flags; /* Class flags */ 48 | void *cls_private; /* Class private information */ 49 | 50 | /* Class methods */ 51 | herr_t (*init_cls)(struct H5FS_section_class_t *, 52 | void *); /* Routine to initialize class-specific settings */ 53 | herr_t (*term_cls)(struct H5FS_section_class_t *); /* Routine to terminate class-specific settings */ 54 | 55 | /* Object methods */ 56 | herr_t (*add)(H5FS_section_info_t **, unsigned *, 57 | void *); /* Routine called when section is about to be added to manager */ 58 | herr_t (*serialize)(const struct H5FS_section_class_t *, const H5FS_section_info_t *, 59 | uint8_t *); /* Routine to serialize a "live" section into a buffer */ 60 | H5FS_section_info_t *(*deserialize)( 61 | const struct H5FS_section_class_t *, const uint8_t *, haddr_t, hsize_t, 62 | unsigned *); /* Routine to deserialize a buffer into a "live" section */ 63 | htri_t (*can_merge)(const H5FS_section_info_t *, const H5FS_section_info_t *, 64 | void *); /* Routine to determine if two nodes are mergeable */ 65 | herr_t (*merge)(H5FS_section_info_t **, H5FS_section_info_t *, void *); /* Routine to merge two nodes */ 66 | htri_t (*can_shrink)(const H5FS_section_info_t *, 67 | void *); /* Routine to determine if node can shrink container */ 68 | herr_t (*shrink)(H5FS_section_info_t **, void *); /* Routine to shrink container */ 69 | herr_t (*free)(H5FS_section_info_t *); /* Routine to free node */ 70 | herr_t (*valid)(const struct H5FS_section_class_t *, 71 | const H5FS_section_info_t *); /* Routine to check if a section is valid */ 72 | H5FS_section_info_t *(*split)(H5FS_section_info_t *, hsize_t); /* Routine to create the split section */ 73 | herr_t (*debug)(const H5FS_section_info_t *, FILE *, int, 74 | int); /* Routine to dump debugging information about a section */ 75 | } H5FS_section_class_t; -------------------------------------------------------------------------------- /tests/testcases/type_def_test.c: -------------------------------------------------------------------------------- 1 | 2 | size_t const a; 3 | static const size_t b; 4 | 5 | typedef HRESULT (WINAPI*D2D1CreateFactoryFunc)( 6 | D2D1_FACTORY_TYPE factoryType, 7 | REFIID iid, 8 | CONST D2D1_FACTORY_OPTIONS *pFactoryOptions, 9 | void **factory 10 | ); 11 | 12 | typedef H5PL_type_t(__cdecl *H5PL_get_plugin_type_t)(void); 13 | 14 | typedef ngx_int_t (*ngx_shm_zone_init_pt) (ngx_shm_zone_t *zone, void *data); 15 | 16 | typedef size_t ZSTD_sequenceProducer_F ( 17 | void* sequenceProducerState, 18 | ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, 19 | const void* src, size_t srcSize, 20 | const void* dict, size_t dictSize, 21 | int compressionLevel, 22 | size_t windowSize 23 | ); 24 | 25 | // 定义了3种不同的类型,其中2种为指针类型 26 | typedef struct _TRANSMIT_FILE_BUFFERS { 27 | LPVOID Head; 28 | DWORD HeadLength; 29 | LPVOID Tail; 30 | DWORD TailLength; 31 | } TRANSMIT_FILE_BUFFERS, *PTRANSMIT_FILE_BUFFERS, FAR *LPTRANSMIT_FILE_BUFFERS; 32 | 33 | XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) 34 | { 35 | } 36 | 37 | typedef cairo_status_t 38 | (*cairo_spline_add_point_func_t) (void *closure, 39 | const cairo_point_t *point, 40 | const cairo_slope_t *tangent); 41 | 42 | 43 | typedef BOOL (* LPFN_TRANSMITPACKETS) ( 44 | SOCKET hSocket, 45 | TRANSMIT_PACKETS_ELEMENT *lpPacketArray, 46 | DWORD nElementCount, 47 | DWORD nSendSize, 48 | LPOVERLAPPED lpOverlapped, 49 | DWORD dwFlags 50 | ); 51 | 52 | typedef int (*ngx_wsapoll_pt)( 53 | LPWSAPOLLFD fdArray, 54 | ULONG fds, 55 | INT timeout 56 | ); 57 | 58 | // 支持可变参数 59 | typedef void (*isc_errorcallback_t)(const char *, int, const char *, 60 | const char *, va_list); 61 | 62 | typedef ns_hooklist_t ns_hooktable_t[NS_HOOKPOINTS_COUNT]; 63 | 64 | typedef struct dns_rdata_ #{ 65 | dns_rdatacommon_t common; 66 | isc_mem_t *mctx; /* if required */ 67 | /* type & class specific elements */ 68 | } 69 | dns_rdata_ #_t; 70 | 71 | typedef void timeout_callback_t(void *context); 72 | 73 | typedef void iostream_pump_callback_t(enum iostream_pump_status status, 74 | void *context); 75 | 76 | // 宏定义影响 77 | typedef pam_const void *pam_item_t; 78 | 79 | // 宏定义影响 80 | typedef H5PL_type_t(__cdecl *H5PL_get_plugin_type_t)(void); 81 | 82 | // pjsip中的case 83 | typedef void *pj_hash_entry_buf[(PJ_HASH_ENTRY_BUF_SIZE+sizeof(void*)-1)/(sizeof(void*))]; 84 | 85 | // error type 86 | typedef __u64 __bitwise __be64; 87 | 88 | // krb5 case, KRB5_CALLCONV的存在使得类型解析错误 89 | typedef struct gss_config { 90 | gss_OID_desc mech_type; 91 | void * context; 92 | OM_uint32 (KRB5_CALLCONV *gss_acquire_cred) 93 | ( 94 | OM_uint32*, /* minor_status */ 95 | gss_name_t, /* desired_name */ 96 | OM_uint32, /* time_req */ 97 | gss_OID_set, /* desired_mechs */ 98 | int, /* cred_usage */ 99 | gss_cred_id_t*, /* output_cred_handle */ 100 | gss_OID_set*, /* actual_mechs */ 101 | OM_uint32* /* time_rec */ 102 | ); 103 | } *gss_mechanism; -------------------------------------------------------------------------------- /tests/visitor_test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/for-just-we/CodeAnalyzer/fe7c747356c557d31133497b47a1746819d15a1b/tests/visitor_test/__init__.py -------------------------------------------------------------------------------- /tests/visitor_test/declaration_visitor_test.py: -------------------------------------------------------------------------------- 1 | from tree_sitter import Tree 2 | 3 | from code_analyzer.config import parser 4 | from code_analyzer.visitors.global_visitor import GlobalVisitor 5 | from code_analyzer.preprocessor.node_processor import processor 6 | from code_analyzer.schemas.ast_node import ASTNode 7 | 8 | global_decl_case1 = """ 9 | static ngx_event_module_t ngx_epoll_module_ctx = { 10 | &epoll_name, 11 | ngx_epoll_create_conf, /* create configuration */ 12 | ngx_epoll_init_conf, /* init configuration */ 13 | 14 | { 15 | ngx_epoll_add_event, /* add an event */ 16 | ngx_epoll_del_event, /* delete an event */ 17 | ngx_epoll_add_event, /* enable an event */ 18 | ngx_epoll_del_event, /* disable an event */ 19 | ngx_epoll_add_connection, /* add an connection */ 20 | ngx_epoll_del_connection, /* delete an connection */ 21 | #if (NGX_HAVE_EVENTFD) 22 | ngx_epoll_notify, /* trigger a notify */ 23 | #else 24 | NULL, /* trigger a notify */ 25 | #endif 26 | ngx_epoll_process_events, /* process the events */ 27 | ngx_epoll_init, /* init the events */ 28 | ngx_epoll_done, /* done the events */ 29 | } 30 | }; 31 | """ 32 | 33 | global_decl_case2 = """ 34 | static const int a; 35 | char* b = "aaaa"; 36 | static int* p[]; 37 | int (*p)[]; 38 | const struct Node* node; 39 | int* (*add)(int a, int b); 40 | 41 | stk->auto_op.func2 = stk->auto_op.func2_default = (H5E_auto2_t)H5E__print2; 42 | """ 43 | 44 | global_decl_case3 = """ 45 | static void (*const test_functions[])(void) = { 46 | test_qp_decoder, 47 | NULL 48 | }; 49 | """ 50 | 51 | global_4 = """ 52 | event_create_passthrough(cmd->context.event)-> 53 | set_name("smtp_server_command_started"); 54 | """ 55 | 56 | global_5 = """ 57 | #define $LINE MakeString( Stringize, __LINE__ ) 58 | """ 59 | 60 | global_6 = """ 61 | static log_sink_t sinks[2] = { 62 | [LOG] = { .func = default_sink, .ptr = NULL, .out = NULL }, 63 | [TRACE] = { .func = default_sink, .ptr = NULL, .out = NULL } 64 | }; 65 | """ 66 | 67 | if __name__ == '__main__': 68 | decls = [global_decl_case1, global_decl_case2, global_decl_case3, global_4, global_5, 69 | global_6] 70 | visitor = GlobalVisitor() 71 | for i, decl in enumerate(decls): 72 | tree: Tree = parser.parse(decl.encode("utf-8")) 73 | root_node: ASTNode = processor.visit(tree.root_node) 74 | visitor.traverse_node(root_node) 75 | pass -------------------------------------------------------------------------------- /tests/visitor_test/file_test.py: -------------------------------------------------------------------------------- 1 | from code_analyzer.config import parser 2 | from code_analyzer.visitors.base_func_visitor import FunctionDefVisitor 3 | from code_analyzer.visitors.global_visitor import GlobalVisitor 4 | from tree_sitter import Tree 5 | from code_analyzer.schemas.ast_node import ASTNode 6 | from code_analyzer.preprocessor.node_processor import processor 7 | 8 | file = "../testcases/struct_case1.c" 9 | file2 = "../testcases/func_test2.c" 10 | file3 = "../testcases/error_file_vlc.c" 11 | file4 = "../testcases/macro_def_decl.c" 12 | file5 = "../testcases/error_struct_decl.c" 13 | file6 = "../testcases/error_global_var_decl.c" 14 | file7 = "../testcases/error_snd_global.c" 15 | 16 | if __name__ == '__main__': 17 | global_visitor = GlobalVisitor() 18 | visitor = FunctionDefVisitor() 19 | for i, file_name in enumerate([file, file2, file3, file4, file5, file6, 20 | file7]): 21 | code = open(file_name, 'rb').read() 22 | tree: Tree = parser.parse(code) 23 | root_node: ASTNode = processor.visit(tree.root_node) 24 | global_visitor.traverse_node(root_node) 25 | visitor.traverse_node(root_node) 26 | pass 27 | print() -------------------------------------------------------------------------------- /tests/visitor_test/macro_visitor_test.py: -------------------------------------------------------------------------------- 1 | from code_analyzer.config import parser 2 | from code_analyzer.visitors.base_func_visitor import FunctionDefVisitor, LocalVarVisitor 3 | from code_analyzer.visitors.global_visitor import GlobalVisitor 4 | from code_analyzer.visitors.macro_visitor import MacroCallExpandVisitor, ICallVisitor 5 | 6 | from tree_sitter import Tree 7 | from code_analyzer.schemas.ast_node import ASTNode 8 | from code_analyzer.schemas.function_info import FuncInfo 9 | 10 | from typing import Dict, Tuple, List, Set, DefaultDict 11 | from code_analyzer.definition_collector import BaseInfoCollector 12 | from code_analyzer.preprocessor.node_processor import processor, NodeProcessor 13 | from code_analyzer.macro_expand import MacroCallExpandUtil 14 | 15 | new_processor = NodeProcessor(unwanted_node_type=set()) 16 | file = "../testcases/macro_expand/macro_test.c" 17 | file1 = "../testcases/macro_expand/macro_test1.c" 18 | file2 = "../testcases/macro_expand/macro_test2.c" 19 | file3 = "../testcases/macro_expand/macro_test3.c" 20 | file4 = "../testcases/macro_expand/macro_test4.c" 21 | file5 = "../testcases/macro_expand/macro_test5.c" 22 | file6 = "../testcases/macro_expand/macro_test6.c" 23 | file7 = "../testcases/macro_expand/macro_test7.c" 24 | file8 = "../testcases/macro_expand/macro_test8.c" 25 | file9 = "../testcases/macro_expand/macro_test9.c" 26 | file10 = "../testcases/macro_expand/macro_test10.c" 27 | file11 = "../testcases/macro_expand/macro_test11.c" 28 | file12 = "../testcases/macro_expand/macro_test12.c" 29 | file13 = "../testcases/macro_expand/macro_test13.c" 30 | file14 = "../testcases/macro_expand/macro_test14.c" 31 | file15 = "../testcases/macro_expand/macro_test15.c" 32 | file16 = "../testcases/macro_expand/macro_test16.c" 33 | file17 = "../testcases/macro_expand/macro_test17.c" 34 | file18 = "../testcases/macro_expand/macro_test18.c" 35 | file19 = "../testcases/macro_expand/macro_test19.c" 36 | 37 | 38 | call_expr_idx = { 39 | file: 2, 40 | file1: 3, 41 | file2: 0, 42 | file3: 1, 43 | file4: 1, 44 | file5: 5, 45 | file6: 3, 46 | file7: 2, 47 | file8: 4, 48 | file9: 0, 49 | file10: 0, 50 | file11: 3, 51 | file12: 0, 52 | file13: 0, 53 | file14: 0, 54 | file15: 1, 55 | file16: 2, 56 | file17: 1, 57 | file18: 0, 58 | file19: 0 59 | } 60 | 61 | def main(): 62 | cur_file = file13 63 | global_visitor = GlobalVisitor() 64 | func_visitor = FunctionDefVisitor() 65 | code = open(cur_file, 'rb').read() 66 | tree: Tree = parser.parse(code) 67 | root_node: ASTNode = processor.visit(tree.root_node) 68 | global_visitor.traverse_node(root_node) 69 | func_visitor.traverse_node(root_node) 70 | 71 | func_info_dict: Dict[str, FuncInfo] = func_visitor.func_info_dict 72 | func_key_2_declarator: Dict[str, str] = dict() 73 | func_key = list(func_info_dict.keys())[0] 74 | func_info = list(func_info_dict.values())[0] 75 | 76 | func_key_2_declarator[func_key] = func_info.raw_declarator_text 77 | local_var_visitor = LocalVarVisitor(global_visitor) 78 | local_var_visitor.traverse_node(func_info.func_body) 79 | 80 | expand_util = MacroCallExpandUtil(global_visitor.macro_func_bodies, global_visitor.macro_func_args, 81 | global_visitor.var_arg_macro_funcs) 82 | call_expr: ASTNode = func_info.func_body.children[call_expr_idx[cur_file]].children[0] 83 | code_text = expand_util.expand_macro_call(call_expr) 84 | 85 | 86 | macro_local_var_visitor = LocalVarVisitor(global_visitor) 87 | expand_call_tree: Tree = parser.parse(code_text.encode("utf-8")) 88 | expand_root_node: ASTNode = processor.visit(expand_call_tree.root_node) 89 | macro_local_var_visitor.traverse_node(expand_root_node) 90 | 91 | args: Set[str] = set(func_info.name_2_declarator_text.keys()) 92 | global_vars: Set[str] = set(global_visitor.global_var_2_declarator_text.keys()) 93 | local_vars: Set[str] = local_var_visitor.local_var_2_declarator_text.keys() | \ 94 | macro_local_var_visitor.local_var_2_declarator_text.keys() 95 | icall_visitor = ICallVisitor(global_vars, local_vars, args) 96 | icall_visitor.traverse_node(expand_root_node) 97 | pass 98 | 99 | 100 | 101 | if __name__ == '__main__': 102 | main() -------------------------------------------------------------------------------- /tests/visitor_test/typedecl_visitor.py: -------------------------------------------------------------------------------- 1 | from code_analyzer.config import parser 2 | from tree_sitter import Tree 3 | from code_analyzer.visitors.global_visitor import GlobalVisitor 4 | from code_analyzer.schemas.ast_node import ASTNode 5 | from code_analyzer.preprocessor.node_processor import processor 6 | 7 | def testTypeDecl(): 8 | file = "../testcases/type_def_test.c" 9 | tree: Tree = parser.parse(open(file, 'rb').read()) 10 | root_node: ASTNode = processor.visit(tree.root_node) 11 | globalVisitor = GlobalVisitor() 12 | globalVisitor.traverse_node(root_node) 13 | pass 14 | 15 | if __name__ == '__main__': 16 | testTypeDecl() --------------------------------------------------------------------------------