├── LICENSE
├── README.md
├── concept.md
├── image
    ├── README
    │   ├── image-20221101172355617.png
    │   └── image-20221101172359239.png
    └── concept
    │   ├── 20180808092854c7d4c799-e91c-408d-8e8a-2902b90de5c8.png
    │   ├── 220px-S-expression_tree.svg.png
    │   ├── image-20221017210725900.png
    │   ├── image-20221017220650594.png
    │   ├── image-20221017224231772.png
    │   └── image-20221019212544350.png
└── notes
    ├── A_Survey_of_Binary_Code_Similarity.md
    ├── Asm2Vec.md
    ├── BLEX.md
    ├── BinGo.md
    ├── BinGo_E.md
    ├── BinHunt.md
    ├── CoP.md
    ├── Esh.md
    ├── Gemini.md
    ├── Genius.md
    ├── Graph-based_comparison_of_executable_objects.md
    ├── How_Machine_Learning_is_Solving_the_BInary_Function_Similarity_Problem.md
    ├── InnerEye.md
    ├── Multi-MH.md
    ├── Order_Matters.md
    ├── SAFE.md
    ├── TEDEM.md
    ├── Tracy.md
    ├── discovRE.md
    └── image
        ├── A_Survey_of_Binary_Code_Similarity
            ├── image-20221014112852989.png
            └── image-20221015210819269.png
        ├── Asm2Vec
            ├── image-20221031200624966.png
            ├── image-20221031203211929.png
            └── image-20221031204510624.png
        ├── BLEX
            ├── image-20221027222637730.png
            ├── image-20221027222646562.png
            ├── image-20221027222821612.png
            ├── image-20221027223048621.png
            ├── image-20221027223600635.png
            └── image-20221028113440022.png
        ├── BinGo
            ├── image-20221108150148825.png
            ├── image-20221108150551485.png
            ├── image-20221108150610480.png
            └── image-20221108155628263.png
        ├── BinGo_E
            ├── image-20221111230259570.png
            ├── image-20221114154606899.png
            └── image-20221114155853617.png
        ├── BinHunt
            └── image-20221110155012164.png
        ├── CoP
            └── image-20221111163317832.png
        ├── Gemini
            ├── image-20221025200017843.png
            ├── image-20221025202453422.png
            └── image-20221025204716133.png
        ├── Genius
            ├── image-20221017104002392.png
            └── image-20221017172441347.png
        ├── Graph-based_comparison_of_executable_objects
            ├── image-20221030221707934.png
            ├── image-20221030221916849.png
            ├── image-20221030222302179.png
            └── image-20221030222310518.png
        ├── InnerEye
            ├── image-20221110164501342.png
            ├── image-20221111104900669.png
            └── image-20221111105534404.png
        ├── Multi-MH
            ├── image-20221019220132563.png
            └── image-20221021170403764.png
        ├── Order_Matters
            ├── image-20221116144757272.png
            └── image-20221116155208628.png
        ├── SAFE
            ├── image-20221115094019450.png
            └── image-20221115095432970.png
        ├── TEDEM
            └── image-20221026145548233.png
        ├── Tracy
            ├── image-20221108101452921.png
            └── image-20221108102940405.png
        └── discovRE
            ├── image-20221018195211933.png
            ├── image-20221019162340206.png
            ├── image-20221019171324901.png
            ├── image-20221019171737100.png
            └── image-20221019201526215.png


/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # binary-similarity-learning
  2 | 
  3 | 二进制代码相似度分析（Binary Code Similarity Analysis）学习笔记
  4 | 
  5 | `[paper]`：论文发布页；`[note]`：论文笔记；`[github]`：github源码；`[dataset]`：数据集；`[model]`：算法模型
  6 | 
  7 | 方法名前的`*`表示该方法使用了动态分析
  8 | 
  9 | ## 基本概念解析
 10 | 
 11 | 已整理于[基本概念解析文档](./concept.md)，学习笔记中涉及相关概念位置均已设置超链接。
 12 | 
 13 | ## 综述 (review)
 14 | 
 15 | - A Survey of Binary Code Similarity (*WOS-Q1; 中科院-1区；2021*) [[paper]](https://dl.acm.org/doi/abs/10.1145/3446371) [[note]](./notes/A_Survey_of_Binary_Code_Similarity.md)
 16 |   - HAQ I U, CABALLERO J. A Survey of Binary Code Similarity [J]. ACM Comput Surv, 2021, 54(3): Article 51. 
 17 |   - 领域内常用方法的分类与概述，适合入门
 18 |   - 仅包含2019年及以前的文献
 19 | 
 20 | ## Binary Diffing
 21 | 
 22 | - Bindiff (*DIMVA2004*) [[paper]](https://www.researchgate.net/profile/Thomas-Dullien-3/publication/28356113_Structural_Comparison_of_Executable_Objects/links/568c0fb108ae197e426895bc/Structural-Comparison-of-Executable-Objects.pdf)
 23 |   - FLAKE H. Structural comparison of executable objects[C]//Proc. of the International GI Workshop on Detection of Intrusions and Malware & Vulnerability Assessment, number P-46 in Lecture Notes in Informatics.2004:161-174. 
 24 | - Graph-based comparison of executable objects (*SSTIC2005*) [[paper]](http://195.154.171.95/SSTIC05/Analyse_differentielle_de_binaires/SSTIC05-article-Flake-Graph_based_comparison_of_Executable_Objects.pdf) [[note]](./notes/Graph-based_comparison_of_executable_objects.md)
 25 |   - DULLIEN T, ROLLES R. Graph-based comparison of executable objects (english version) [J]. Sstic, 2005, 5(1): 3.
 26 | - BinHunt (*CCF-C；ICICS2008*) [[paper]](https://link.springer.com/chapter/10.1007/978-3-540-88625-9_16) [[note]](./notes/BinHunt.md)
 27 |   - GAO D, REITER M K, SONG D. BinHunt: Automatically Finding Semantic Differences in Binary Programs[C]//International Conference on Information and Communications Security. Berlin, Heidelberg:Springer Berlin Heidelberg,2008:238-255. 
 28 | 
 29 | 
 30 | ## Binary Similarity (one-to-one)
 31 | 
 32 | - \*BLEX (*CCF-A; USENIX2014*) [[paper]](https://www.usenix.org/conference/usenixsecurity14/technical-sessions/presentation/egele) [[note]](./notes/BLEX.md)
 33 |   - EGELE M, WOO M, CHAPMAN P, et al. Blanket execution: Dynamic similarity testing for program binaries and components[C]//23rd USENIX Security Symposium (USENIX Security 14).2014:303-317. 
 34 | 
 35 | ## Binary Search (one-to-many)
 36 | 
 37 | - TEDEM (*CCF-B; ACSAC2014*)  [[paper]](https://dl.acm.org/doi/abs/10.1145/2664243.2664269) [[note]](./notes/TEDEM.md)
 38 |   - PEWNY J, SCHUSTER F, BERNHARD L, et al. Leveraging semantic signatures for bug search in binary programs[C]//Proceedings of the 30th Annual Computer Security Applications Conference.2014:406-415. 
 39 | - Tracy (*CCF-A；PLDI2014*) [[paper]](https://dl.acm.org/doi/abs/10.1145/2666356.2594343) [[github]](https://github.com/Yanivmd/TRACY) [[note]](./notes/Tracy.md)
 40 |   - DAVID Y, YAHAV E. Tracelet-based code search in executables[C]//Proceedings of the 35th ACM SIGPLAN Conference on Programming Language Design and Implementation. Edinburgh, United Kingdom:Association for Computing Machinery,2014:349–360. 10.1145/2594291.2594343.
 41 | - Multi-MH (*CCF-A；S&P2015*) [[paper]](https://ieeexplore.ieee.org/abstract/document/7163056) [[note]](./notes/Multi-MH.md)
 42 |   - PEWNY J, GARMANY B, GAWLIK R, et al. Cross-Architecture Bug Search in Binary Executables[C]//2015 IEEE Symposium on Security and Privacy.2015:709-724. 10.1109/SP.2015.49.
 43 | - BinGo (*CCF-A；FSE2016*) [[paper]](https://dl.acm.org/doi/10.1145/2950290.2950350) [[note]](./notes/BinGo.md)
 44 |   - CHANDRAMOHAN M, XUE Y, XU Z, et al. Bingo: Cross-architecture cross-os binary search[C]//Proceedings of the 2016 24th ACM SIGSOFT International Symposium on Foundations of Software Engineering.2016:678-689. 
 45 | - discovRE (*CCF-A；NDSS2016*)  [[paper]](https://www.ndss-symposium.org/wp-content/uploads/2017/09/discovre-efficient-cross-architecture-identification-bugs-binary-code.pdf) [[note]](./notes/discovRE.md)
 46 |   - ESCHWEILER S, YAKDAN K, GERHARDS-PADILLA E. discovRE: Efficient Cross-Architecture Identification of Bugs in Binary Code[C]//NDSS.2016
 47 | - Esh (*CCF-A；PLDI2016*) [[paper]](https://dl.acm.org/doi/abs/10.1145/2908080.2908126) [[github]](https://github.com/tech-srl/esh) [[note]](./notes/Esh.md)
 48 |   - DAVID Y, PARTUSH N, YAHAV E. Statistical similarity of binaries[C]//Proceedings of the 37th ACM SIGPLAN Conference on Programming Language Design and Implementation. Santa Barbara, CA, USA:Association for Computing Machinery,2016:266–280. 10.1145/2908080.2908126.
 49 | - Genius (*CCF-A；CCS2016*) [[paper]](https://dl.acm.org/doi/abs/10.1145/2976749.2978370) [[github]](https://github.com/qian-feng/Gencoding) [[note]](./notes/Genius.md)
 50 |   - FENG Q, ZHOU R, XU C, et al. Scalable Graph-based Bug Search for Firmware Images[C]//Proceedings of the 2016 ACM SIGSAC Conference on Computer and Communications Security. Vienna, Austria:Association for Computing Machinery,2016:480–491. 10.1145/2976749.2978370.
 51 | - Gemini (*CCF-A；CCS2017*) [[paper]](https://dl.acm.org/doi/abs/10.1145/3133956.3134018) [[github]](https://github.com/Yunlongs/Gemini) [[note]](./notes/Gemini.md)
 52 |   - XU X, LIU C, FENG Q, et al. Neural Network-based Graph Embedding for Cross-Platform Binary Code Similarity Detection[C]//Proceedings of the 2017 ACM SIGSAC Conference on Computer and Communications Security. Dallas, Texas, USA:Association for Computing Machinery,2017:363–376. 10.1145/3133956.3134018.
 53 | - SAFE (*CCF-C；DIMVA2019*) [[paper]](https://link.springer.com/chapter/10.1007/978-3-030-22038-9_15) [[github]](https://github.com/gadiluna/SAFE) [[note]](./notes/SAFE.md)
 54 |   - MASSARELLI L, LUNA G A D, PETRONI F, et al. Safe: Self-attentive function embeddings for binary similarity[C]//International Conference on Detection of Intrusions and Malware, and Vulnerability Assessment.Springer,2019:309-329. 
 55 | 
 56 | - InnerEye (*CCF-A；NDSS2019*) [[paper]](https://www.ndss-symposium.org/ndss-paper/neural-machine-translation-inspired-binary-code-similarity-comparison-beyond-function-pairs/) [[model]](https://nmt4binaries.github.io/#model) [[note]](./notes/InnerEye.md)
 57 |   - ZUO F, LI X, YOUNG P, et al. Neural Machine Translation Inspired Binary Code Similarity Comparison beyond Function Pairs[C]//Network and Distributed Systems Security (NDSS) Symposium 2019.2019
 58 | - Order Matters: Semantic-Aware Neural Networks for Binary Code Similarity Detection (*CCF-A；AAAI2020*) [[paper]](https://ojs.aaai.org/index.php/AAAI/article/view/5466) [[note]](./notes/Order_Matters.md)
 59 |   - YU Z, CAO R, TANG Q, et al. Order Matters: Semantic-Aware Neural Networks for Binary Code Similarity Detection[C]//Proceedings of the AAAI Conference on Artificial Intelligence.2020:1145-1152. 10.1609/aaai.v34i01.5466.
 60 | 
 61 | 
 62 | ## Plagiarism Detection
 63 | 
 64 | - CoP (*CCF-A；FSE2014*) [[paper]](https://dl.acm.org/doi/abs/10.1145/2635868.2635900) [[note]](./notes/CoP.md)
 65 |   - LUO L, MING J, WU D, et al. Semantics-based obfuscation-resilient binary code similarity comparison with applications to software plagiarism detection[C]//Proceedings of the 22nd ACM SIGSOFT International Symposium on Foundations of Software Engineering.2014:389-400. 
 66 | 
 67 | ## Clone Search
 68 | 
 69 | - Kam1n0 *(CCF-A; KDD2016)*  [[paper]](https://dl.acm.org/doi/abs/10.1145/2939672.2939719) [[github]](https://github.com/McGill-DMaS/Kam1n0-Community)
 70 |   - DING S H H, FUNG B C M, CHARLAND P. Kam1n0: MapReduce-based Assembly Clone Search for Reverse Engineering[C]//Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining.2016:461-470. 
 71 |   - 侧重于新哈希算法和MapReduce方案的设计
 72 | - Asm2Vec (*CCF-A；S&P2019*) [[paper]](https://ieeexplore.ieee.org/abstract/document/8835340) [[note]](./notes/Asm2Vec.md)
 73 |   - DING S H H, FUNG B C M, CHARLAND P. Asm2Vec: Boosting Static Representation Robustness for Binary Clone Search against Code Obfuscation and Compiler Optimization[C]//2019 IEEE Symposium on Security and Privacy (SP).2019:472-489. 10.1109/SP.2019.00003.
 74 | - \*BinGo-E (*WOS-Q1; 中科院-1区；2019*) [[paper]](https://ieeexplore.ieee.org/document/8338420) [[note]](./notes/BinGo_E.md)
 75 |   - XUE Y, XU Z, CHANDRAMOHAN M, et al. 2019. Accurate and Scalable Cross-Architecture Cross-OS Binary Code Search with Emulation. IEEE Transactions on Software Engineering [J], 45: 1125-1149.
 76 | 
 77 | ## Measurement Study
 78 | 
 79 | - BinKit (*WOS-Q1; 中科院-1区；2022*) [[paper]](https://ieeexplore.ieee.org/abstract/document/9813408) [[dataset]](https://github.com/SoftSec-KAIST/binkit) [[github]](https://github.com/SoftSec-KAIST/tiknib)
 80 |   - KIM D, KIM E, CHA S K, et al. 2022. Revisiting Binary Code Similarity Analysis using Interpretable Feature Engineering and Lessons Learned. IEEE Transactions on Software Engineering [J]: 1-23.
 81 |   - 分析了非语义特征（句法、结构特征）在二进制相似性分析中的作用
 82 | 
 83 | - How machine learning is solving the binary function similarity problem *(CCF-A; USENIX2022)* [[paper]](https://www.usenix.org/conference/usenixsecurity22/presentation/marcelli) [[github]](https://github.com/Cisco-Talos/binary_function_similarity) [[note]](./notes/How_Machine_Learning_is_Solving_the_BInary_Function_Similarity_Problem.md)
 84 |   - MARCELLI A, GRAZIANO M, UGARTE-PEDRERO X, et al. How machine learning is solving the binary function similarity problem[C]//31st USENIX Security Symposium (USENIX Security 22).2022:2099-2116. 
 85 |   - 构建开源数据集，将现有方法在同一基准下进行测试
 86 |   - 阅读相关论文时可作为分析用参考，精读对应部分
 87 | 
 88 | ## Dataset
 89 | 
 90 | - Esh Dataset [[dataset]](https://github.com/nimrodpar/esh-dataset-1523)
 91 |   - 包含3015个二进制函数，覆盖8类实际漏洞
 92 | 
 93 | 
 94 | ## 专有名词及其缩写
 95 | 
 96 | | 缩写 | 名词全称                            | 中文释义           |
 97 | | ---- | ----------------------------------- | ------------------ |
 98 | | ACFG | Attributed Control Flow Graph       | 属性控制流图       |
 99 | | ALSH | Adaptive Locality Sensitive Hashing | 自适应局部敏感哈希 |
100 | | ASLR | Address Space Layout Randomization  | 地址空间布局随机化 |
101 | | BB   | Basic Block                         | 基本块             |
102 | | CDF  | Cumulative Distribution Function    | （累计）分布函数   |
103 | | CFG  | Control Flow Graph                  | 控制流图           |
104 | | CG   | Call Graph                          | 函数调用图         |
105 | | GI   | Graph Isomorphism                   | 图同构             |
106 | | IR   | Intermediate Representation         | 中间表示           |
107 | | IVL  | Intermediate Verification Language  | 中间验证语言       |
108 | | LCS  | Longest Common Subsequence          | 最长公共子序列     |
109 | | LSH  | Locality Sensitive Hashing          | 局部敏感哈希       |
110 | | MCS  | Maximum Common Subgraph             | 最大公共子图       |
111 | | MLP  | Multilayer Perceptron               | 多层感知机         |
112 | | MRR  | Mean Reciprocal Rank                | 平均倒数排名       |
113 | | PDG  | Program Dependence Graph            | 程序依赖图         |
114 | | TED  | Tree Edit Distance                  | 树编辑距离         |
115 | 
116 | # to-do list
117 | 
118 | - [ ] 略读文献：Graph-based Comparison of Executable Objects.
119 | - [x] 略读文献：BinHunt
120 | - [ ] 略读文献：Binary Function Clustering Using Semantic Hashes.
121 | - [ ] 略读文献：Fast Location of Similar Code Fragments Using Semantic ’Juice’
122 | - [ ] 略读文献：Discovering Potential Binary Code Re-use.
123 | - [x] 基本概念：S-Expression
124 | - [ ] 阅读文献：*Rendezvous
125 | - [ ] 阅读：https://googleprojectzero.blogspot.com/2018/12/searching-statically-linked-vulnerable.html
126 | - [ ] 阅读文献：Binary Similarity Detection Using Machine Learning.
127 | - [x] 阅读文献：Safe: Self-attentive function embeddings for binary similarity
128 | - [ ] 阅读文献：Investigating Graph Embedding Neural Networks with Unsupervised Features Extraction for Binary Analysis. 
129 | - [ ] 阅读文献：Codecmr: Cross-modal retrieval for function-level binary source code matching.
130 | - [ ] 阅读文献：Trex: Learning execution semantics from micro-traces for binary similarity
131 | - [ ] 阅读文献：Library functions identification in binary code by using graph isomorphism testings
132 | - [ ] 阅读文献：Cross-Architecture Bug Search in Binary Executables
133 | - [ ] 阅读文献：*Binary Code Clone Detection across Architectures and Compiling Configurations
134 | - [ ] 阅读文献：BinClone: Detecting Code Clones in Malware
135 | - [ ] 阅读文献：Compiler-agnostic function detection in binaries
136 | - [x] 阅读文献：Accurate and Scalable Cross-Architecture Cross-OS Binary Code Search with Emulation
137 | - [ ] 阅读文献：***Learning Program-Wide Code Representations for Binary Diffing
138 | - [ ] 阅读文献：*jTrans: jump-aware transformer for binary code similarity detection
139 | - [ ] 阅读文献：*[Semantic Learning and Emulation Based Cross-Platform Binary Vulnerability Seeker](https://www.semanticscholar.org/paper/4c16bf0be5ee1eff6f3749a56fea215cc812ba96)
140 | - [x] 阅读文献：[Revisiting Binary Code Similarity Analysis using Interpretable Feature Engineering and Lessons Learned](https://www.semanticscholar.org/paper/3121b307c1e1e893e001ac8f7742e8b3f87ea966)
141 | - [ ] 阅读文献：**BinSim: Trace-based Semantic Binary Diffing via System Call Sliced Segment Equivalence Checking
142 | - [ ] 阅读文献：**Similarity of binaries through re-optimization
143 | - [x] 阅读文献：semantics-based obfuscation-resilient binary code similarity comparison with applications to software plagiarism detection.
144 | - [ ] 略读文献：*value-based program characterization and its application to software plagiarism detection
145 | - [ ] 略读文献：Finer-grained control flow integrity for stripped binaries（bingo前置）
146 | - [ ] 略读文献：Towards automatic software lineage inference（经典文献）
147 | - [ ] 略读文献：*Binslayer: accurate comparison of binary executables
148 | - [ ] 阅读文献：*[Extracting Conditional Formulas for Cross-Platform Bug Search](https://www.semanticscholar.org/paper/c75d9f1ff9177b26b7681876d7ee810d14401a49)
149 | - [ ] 阅读文献：***Patch based vulnerability matching for binary programs
150 | - [ ] 略读文献：iBinHunt
151 | - [ ] 阅读文献：**[Function Representations for Binary Similarity](https://www.semanticscholar.org/paper/7c688e25d85326f3c0eea0e75c38d25f3c8c2f2e)
152 | - [ ] 阅读文献：[VulSeeker: A Semantic Learning Based Vulnerability Seeker for Cross-Platform Binary](https://www.semanticscholar.org/paper/6d361ffdcd75ebc75d2dd295dc30460c982ee7fb)
153 | - [ ] 阅读文献：*[$\alpha$ Diff: Cross-Version Binary Code Similarity Detection with DNN](https://www.semanticscholar.org/paper/4e9f86c4da00682276752778dd74642280ebe086)
154 | - [ ] 阅读文献：*[FirmUp: Precise Static Detection of Common Vulnerabilities in Firmware](https://www.semanticscholar.org/paper/b1ef9380982946089b7d619af1fc0555e2209110)
155 | - [ ] 阅读文献：[Binary Code Clone Detection across Architectures and Compiling Configurations](https://www.semanticscholar.org/paper/65823f9f70b00c245d283e9284a03258e68aeaff)
156 | - [ ] 阅读文献：[BinMatch: A Semantics-Based Hybrid Approach on Binary Code Clone Analysis](https://www.semanticscholar.org/paper/37564c8be9e7afdfb92a2886f669867dbe78e501)
157 | - [ ] 略读文献：cross-architecture binary semantics understanding via similar code comparison
158 | - [ ] 阅读文献：[A Semantics-Based Hybrid Approach on Binary Code Similarity Comparison](https://www.semanticscholar.org/paper/69e34b0c43addcbfc799403bea5c428b3b74d6dd)
159 | - [ ] 阅读文献：Binary Code Similarity Detection
160 | - [ ] 阅读文献：[Codee: A Tensor Embedding Scheme for Binary Code Search](https://www.semanticscholar.org/paper/02f5f13de274c237042ef5df6ed14ff16639722d)
161 | - [ ] 阅读文献：A deep learning approach to program similarity
162 | - [ ] 阅读文献：Hybrid firmware analysis for known mobile and IoT security vulnerabilities
163 | - [ ] 略读文献：Testing intermediate representations for binary analysis
164 | - [ ] 阅读文献：Similarity  of  binaries  across  optimization  levels  and  obfuscation,
165 | - [ ] 略读文献：Bert: Pre-training of deep bidirectional transformers for language understanding （基本概念）
166 | - [ ] 略读文献：Neural message passing for quantum chemistry（基本概念）
167 | - [ ] 阅读文献：[PalmTree: Learning an Assembly Language Model for Instruction Embedding](https://www.semanticscholar.org/paper/7d0c1cb43e8b398ad5b064e74f00802d4d585be6)
168 | - [ ] 阅读文献：Multi-relational Instruction Association Graph for Cross-architecture Binary Similarity Comparison
169 | - [ ] 阅读文献：[Hierarchical Attention Graph Embedding Networks for Binary Code Similarity against Compilation Diversity](https://www.semanticscholar.org/paper/c91ffc484fbd77c302b21dbb50398a11085f9d19)
170 | - [ ] 阅读文献：[Investigating Graph Embedding Neural Networks with Unsupervised Features Extraction for Binary Analysis](https://www.semanticscholar.org/paper/64a0f24f726fc7d67988ee6b58997c21a1aaa2d1)
171 | - [ ] 阅读文献：[EnBinDiff: Identifying Data-only Patches for Binaries](https://ieeexplore.ieee.org/iel7/8858/4358699/09645381.pdf)
172 | 
173 | 


--------------------------------------------------------------------------------
/concept.md:
--------------------------------------------------------------------------------
  1 | # 基本概念解析
  2 | 
  3 | ### P, NP, NP-Complete, NP-Hard问题
  4 | 
  5 | *标签：算法复杂度*
  6 | 
  7 | P：polynomial-time，指有多项式时间解的问题
  8 | 
  9 | NP：non-deterministic polynomial-time，非确定性多项式时间，指解可以在多项式时间内被验证的问题（但不确定能否在多项式时间内求出解）
 10 | 
 11 | NP-Complete：属于NP（即可以在多项式时间内被验证：得到一个答案，确定它是不是一个解），且任何NP问题都可以在多项式时间内归约到它的问题；目前还没有找到多项式时间解
 12 | 
 13 | NP-Hard：任何NP问题都可以在多项式时间内归约到它的问题；还没有找到多项式时间解，也不确定能不能在多项式时间内被验证（即不一定属于NP）
 14 | 
 15 | ![image-20221019212544350](./image/concept/image-20221019212544350.png)
 16 | 
 17 | ### 幂集（power set）
 18 | 
 19 | *标签：集合*
 20 | 
 21 | 集合的幂集定义为由该集合全部子集为元素构成的集合
 22 | 
 23 | 若 $\displaystyle S$ 是集合 $\displaystyle \{a,b,c\}$ ，则 $\displaystyle S$ 的全部子集如下：
 24 | 
 25 | -  $\displaystyle \varnothing $（空集）
 26 | -  $\displaystyle \{a\}$ 
 27 | -  $\displaystyle \{b\}$ 
 28 | -  $\displaystyle \{c\}$ 
 29 | -  $\displaystyle \{a,b\}$ 
 30 | -  $\displaystyle \{a,c\}$ 
 31 | -  $\displaystyle \{b,c\}$ 
 32 | -  $\displaystyle \{a,b,c\}$ 
 33 | 
 34 | 因此 $\displaystyle S$ 的幂集为
 35 | 
 36 | $${\mathcal  {P}}(S)=\{{\displaystyle \varnothing }, {\displaystyle \{a\}}, {\displaystyle \{b\}}, {\displaystyle \{c\}}, {\displaystyle \{a,b\}}, {\displaystyle \{a,c\}}, {\displaystyle \{b,c\}}, {\displaystyle \{a,b,c\}}{\displaystyle \}\,\!}$$
 37 | 
 38 | ### Jaccard相似度
 39 | 
 40 | *标签：统计学*
 41 | 
 42 | $$J(A,B)=\frac{|A\cap B|}{|A\cup B|}$$
 43 | 
 44 | 例如集合 $A=\{s_1,s_2,s_3,s_4,s_5\}$ 和 $B=\{s_1,s_5,s_6,s_7\}$ 的 Jaccard 相似度：
 45 | 
 46 | $$J(A,B)=\frac{|A\cap B|}{|A\cup B|}=\frac{|\{s_1,s_5\}|}{|\{s_1,s_2,s_3,s_4,s_5,s_6,s_7\}|}=\frac{2}{7}$$
 47 | 
 48 | ### MinHash
 49 | 
 50 | *标签：统计学；哈希算法*
 51 | 
 52 | 当两个集合很大时，计算Jaccard距离会很耗资源，若从集合A和B的合集中随机抽取一个元素X，那么X既属于A又属于B的概率是 $\frac{|A\cap B|}{|A\cup B|}$ ，即Jaccard相似度
 53 | 
 54 | 基于这一原理进行如下操作：首先找一个具有很好的均匀性的随机映射hash函数，对各自集合的每一个元素作哈希运算得到哈希值集合，并在各自哈希值集合里找出最小哈希值，记为 $h_{min}(A)$ 和 $h_{min}(B)$ ，称为最小哈希操作。由于随机的等概率性，那么 $h_{min}(A)$ 和 $h_{min}(B)$ 相等的概率等于 $\frac{|A\cap B|}{|A\cup B|}$ 。
 55 | 
 56 | 从另一角度解释：选取一个哈希函数进行最小哈希操作，相当于从集合A和B的并集中随机选取一个元素，由于随机性，这个最小哈希值对应的元素属于两个集合的交集的概率即是 Jaccard 相似度。
 57 | 
 58 | 假如我们使用k个不同的hash函数，其中k是固定的整数，使用这k个函数对这两个集合进行最小哈希操作，记y是使得 $h_{min}(A)$ 和 $h_{min}(B)$ 相等的哈希函数个数，那么 $\frac{y}{k}$ 可以作为集合A和B的 Jaccard 相似度的估计。且此估计是无偏估计，而且可以通过增加k来减少估计方差。
 59 | 
 60 | 但是计算多个哈希函数的代价太高，因此另一种实现方法是仅使用单一的哈希函数，选出其中的多个值来估计。例如对集合A和B使用一个哈希函数得到各自m个最小哈希值作为各自的特征集合，那么他们特征集合的Jaccard相似度可以作为原两个集合间的Jaccard相似度的估计。
 61 | 
 62 | ### 皮尔逊积矩相关系数（Pearson product-moment correlation coefficient）
 63 | 
 64 | *标签：统计学*
 65 | 
 66 | 两个变量之间的皮尔逊相关系数定义为两个变量之间的协方差与两者标准差乘积的商：
 67 | 
 68 | $$\rho_{X,Y}=\frac{\mathrm{cov}(X,Y)}{\sigma_X \sigma_Y}=\frac{\mathrm{E}[(X-\mu_X)(Y-\mu_Y)]}{\sigma_X \sigma_Y}$$
 69 | 
 70 | ### 平均倒数排名（Mean Reciprocal Rank，MRR）
 71 | 
 72 | *标签：统计学*
 73 | 
 74 | 评价搜索算法的通用评价指标，设搜索请求序列为 $Q$ ，对于第 $i$ 个搜索请求，正确结果在搜索结果中的排位为 $\mathrm{rank_i}$ ，则
 75 | 
 76 | $$\displaystyle\mathrm{MRR}=\frac{1}{|Q|}\sum_{i=1}^{|Q|}\frac{1}{\mathrm{rank_i}}$$ 
 77 | 
 78 | 例：
 79 | 
 80 | | 请求  | 搜索结果             | 正确结果 | 排名 | 倒数排名 |
 81 | | ----- | -------------------- | -------- | ---- | -------- |
 82 | | cat   | catten, cati, cats   | cats     | 3    | 1/3      |
 83 | | torus | torii, tori, toruses | tori     | 2    | 1/2      |
 84 | | virus | viruses, virii, viri | viruses  | 1    | 1        |
 85 | 
 86 | $$\mathrm{MRR}= (1/3 + 1/2 + 1)/3 = 11/18 \approx 0.61$$
 87 | 
 88 | ### 二分图
 89 | 
 90 | *标签：图论*
 91 | 
 92 | 如果图中点可以被分为两组，并且使得所有边都跨越组的边界，则这就是一个二分图。
 93 | 
 94 | 准确地说：把一个图的顶点划分为两个不相交集 $U$ 和 $V$ ，使得每一条边都分别连接 $U$ 、 $V$ 中的顶点。如果存在这样的划分，则此图为一个二分图。
 95 | 
 96 | 图 1 是一个二分图。为了清晰，可以转化为图 2 的形式。
 97 | 
 98 | ![image-20221017224231772](./image/concept/image-20221017224231772.png)
 99 | 
100 | **匹配**：在图论中，一个「匹配」（matching）是一个边的集合，其中任意两条边都没有公共顶点。例如，图 3、图 4 中红色的边就是图 2 的匹配。
101 | 
102 | **最大匹配**：一个图所有匹配中，所含匹配边数最多的匹配，称为这个图的最大匹配。图 4 是一个最大匹配，它包含 4 条匹配边。
103 | 
104 | **完美匹配**：如果一个图的某个匹配中，所有的顶点都是匹配点，那么它就是一个完美匹配。图 4 是一个完美匹配。显然，完美匹配一定是最大匹配（完美匹配的任何一个点都已经匹配，添加一条新的匹配边一定会与已有的匹配边冲突）。但并非每个图都存在完美匹配。
105 | 
106 | ### 介数中心性
107 | 
108 | *标签：图论；网络理论*
109 | 
110 | 在连通图中，任意两个节点之间均至少存在一条最短路径，每个节点的介数中心性即为这些最短路径穿过该节点的次数。
111 | 
112 | 节点 $v$ 的介数中心性可表达为以下公式：
113 | 
114 | $$g(v)=\sum_{s\ne v \ne t} \frac{\sigma_{st}(v)}{\sigma_{st}}$$
115 | 
116 | 其中 $\sigma_{st}$ 是节点 $s$ 到节点 $t$ 的最短路径的数量，而 $\sigma_{st}(v)$ 是这些路径经过 $v$ 的次数。
117 | 
118 | 下图中，每个点的介数中心性从数值最低（红色）到最高（蓝色）着色。
119 | 
120 | ![image-20221017220650594](./image/concept/image-20221017220650594.png)
121 | 
122 | ### S-expression
123 | 
124 | *标签：计算机编程*
125 | 
126 | Lisp语言源码的书写形式，通常使用二叉树来实现S-expression。
127 | 
128 | `a = b + c` 的S-expression为 `(= a (+ b c))` 。
129 | 
130 |  S-expression `(* 2 (+ 3 4))` 的二叉树表示如下图所示。
131 | 
132 | ![img](./image/concept/220px-S-expression_tree.svg.png)


--------------------------------------------------------------------------------
/image/README/image-20221101172355617.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/image/README/image-20221101172355617.png


--------------------------------------------------------------------------------
/image/README/image-20221101172359239.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/image/README/image-20221101172359239.png


--------------------------------------------------------------------------------
/image/concept/20180808092854c7d4c799-e91c-408d-8e8a-2902b90de5c8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/image/concept/20180808092854c7d4c799-e91c-408d-8e8a-2902b90de5c8.png


--------------------------------------------------------------------------------
/image/concept/220px-S-expression_tree.svg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/image/concept/220px-S-expression_tree.svg.png


--------------------------------------------------------------------------------
/image/concept/image-20221017210725900.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/image/concept/image-20221017210725900.png


--------------------------------------------------------------------------------
/image/concept/image-20221017220650594.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/image/concept/image-20221017220650594.png


--------------------------------------------------------------------------------
/image/concept/image-20221017224231772.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/image/concept/image-20221017224231772.png


--------------------------------------------------------------------------------
/image/concept/image-20221019212544350.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/image/concept/image-20221019212544350.png


--------------------------------------------------------------------------------
/notes/A_Survey_of_Binary_Code_Similarity.md:
--------------------------------------------------------------------------------
  1 | # A Survey of Binary Code Similarity
  2 | 
  3 | ## 基本概念
  4 | 
  5 | ### 编译流程
  6 | 
  7 | 下图为广义编译流程，在基本编译流程上增加了源代码之间/二进制之间的转义（混淆）
  8 | 
  9 | 二进制代码相似度分析的难点主要在于归并图中灰色部分所带来的“程序异构性”
 10 | 
 11 | <img src="./image/A_Survey_of_Binary_Code_Similarity/image-20221014112852989.png" alt="image-extended_complication_process"  />
 12 | 
 13 | ### 二进制代码相似度检测应用场景
 14 | 
 15 | - bug搜索：在检索范围内确定与已知bug相似的程序片段。*变种：跨架构bug搜索*
 16 | - 恶意软件检测：寻找与已知恶意软件同一家族的恶意软件
 17 | - 恶意软件聚类：将相似的恶意软件聚类为簇
 18 | - 恶意软件族谱生成：已知属于同一家族的恶意软件，构建此家族恶意软件的演变过程
 19 | - 补丁生成与分析：通过比较分析同一软件不同版本间的差异，确定补丁内容。*下游任务：可进一步用于后续补丁分析/迁移等*
 20 | - 代码窃取检测：通过相似度分析，判断是否存在代码抄袭/许可证滥用等
 21 | 
 22 | ## 二进制代码相似度分析概述
 23 | 
 24 | ### 句法相似度分析
 25 | 
 26 | 分析二进制表示（指令序列）的相似度。常用做法：提取指令序列-序列标准化-分析指令序列相似度
 27 | 
 28 | 滑动窗口：获取定长指令序列的常见方法，使用时需考虑窗口大小和步长。当步长为1时，窗口大小为n的滑动窗口被称为n-gram
 29 | 
 30 | 相似度计算方法：哈希、嵌入、对齐（对齐两段指令序列，计算差异值）
 31 | 
 32 | ### 语义相似度分析
 33 | 
 34 | 分析两段二进制程序是否有相似的运行效果，常用方法如下：
 35 | 
 36 | - 指令分类（instruction classification）：将指令按照功能分类，使用功能分类的组合来表示一个基本块的行为特征
 37 | 
 38 | - 输入输出对（input-output pairs）：直观上认为具有相同输入输出的代码片段在功能上等价。因此，此类方法仅关注代码执行完毕后的最终状态，给予代码片段多种不同的输入，观测各代码片段的输出是否完全相同
 39 | 
 40 | - 符号式（symbolic formula）：将汇编指令转化为符号式，利用符号式分析相似度
 41 | 
 42 |   例：`add %eax,%ebx --> EBX2 = EAX + EBX1`  
 43 | 
 44 |   符号式分析方法：
 45 | 
 46 |   1. 定理证明器：使用定理证明器证明两个符号式等价。*缺陷：只支持一对一分析，在一对多任务上耗时较长*
 47 | 
 48 |   2. 语义哈希：在标准化和化简后，判断符号式的哈希是否相同
 49 |   3. 图距离：将符号式看作树，通过计算图/树之间的编辑距离判断相似度
 50 | 
 51 | ### 结构相似度分析
 52 | 
 53 | 分析二进制程序图表示之间的相似度。其分析粒度介于句法相似度与语义相似度之间。
 54 | 
 55 | 常见的图表示：控制流图（control flow graph，CFG）、过程间控制流图（inter-procedural control flow graph，ICFG）、调用图（callgraph，CG）
 56 | 
 57 | 常用方法：
 58 | 
 59 | - 同构（子）图分析：直接判断两个图是否为同构图，算法复杂性较高。一些改良算法如下：
 60 |   1. 贪心算法：首先确认一组相同节点，通过不断分析邻居节点扩张匹配的范围
 61 |   2. 回溯法：当配对效果不好或需要修复错误配对时，回溯分析问题
 62 | - “优化”方法：将相似度分析问题转化为优化问题。设置代价函数，计算两个图之间的最小代价
 63 | - k-子图匹配：将图拆分为子图，每个子图至多有k个相连的节点。为每个子图生成指纹，将具有最多子图匹配的两个图视为相似
 64 | - 路径相似度：从控制流图中提取一组执行路径，通过分析执行路径的相似度推导二进制程序相似度
 65 | - 图嵌入：为每个图生成一个特征向量，比较特征向量之间的相似度
 66 | 
 67 | ### 基于特征的相似度分析
 68 | 
 69 | 从二进制程序中提取特征向量或一组特征，相似的二进制程序会具有相似的特征。通过计算特征间的相似度指标（Jaccard指数、点积、欧氏距离、余弦距离等）判断相似程度
 70 | 
 71 | 特征提取的两种方式：
 72 | 
 73 | - 特征选择-特征编码
 74 | - 图嵌入
 75 | 
 76 | ![image-alternative_method_for_feature-based_similarity](./image/A_Survey_of_Binary_Code_Similarity/image-20221015210819269.png)
 77 | 
 78 | ### 哈希算法
 79 | 
 80 | 哈希：将任意大小的数据映射为固定大小的一种函数。
 81 | 
 82 | 二进制代码相似度分析中常用的哈希算法：
 83 | 
 84 | - 密码学哈希：用于定位完全一致的输入（输入相差一点可能导致哈希值有很大差异）
 85 | - 局部敏感哈希：对于相似的输入会有相似的哈希值
 86 | - 可执行文件哈希：输入为可执行文件，哈希过程只对文件中的一部分进行（如可执行文件头部的部分内容等）。其目标在于为同一恶意软件的不同变种输出相同的哈希值
 87 | 
 88 | ### 跨架构相似度分析
 89 | 
 90 | 主要通过以下两种手段：
 91 | 
 92 | - 将不同架构的二进制程序转化为架构无关的中间表示
 93 | - 使用基于特征的相似度分析方法
 94 | 
 95 | ### 标准化
 96 | 
 97 | 去除具有相似功能汇编指令之间的句法差异，合并为相同的形式
 98 | 
 99 | - 移除操作数：只保留操作码（助记符），删除操作数
100 | 
101 | - 操作数标准化：将操作数替换为更广义的形式。例：寄存器：REG；内存：MEM；立即数：IMM
102 | 
103 |   `add %ecx,%edx --> add REG,REG` 
104 | 
105 | - 操作码（助记符）标准化：使用同一符号表示多个助记符
106 | 
107 | ### 难点及未来研究方向
108 | 
109 | - 分析小型二进制片段
110 | 
111 |   许多二进制代码相似度分析方法在考虑基本块时设置了块中指令数量的最小下限（指令数小于下限的基本块不考虑）
112 | 
113 |   难点：
114 | 
115 |   - 小型二进制片段可能只包含一个基本块，无法从结构角度进行分析
116 | 
117 |   - 具有不同语义行为的基本块可能有完全相同的语法信息
118 | 
119 | - 源代码-二进制相似度分析
120 | 
121 | - 数据相似度分析
122 | 
123 |   有时不同版本程序间的差异仅仅体现在数据的不同，如改变了机器学习分类器的参数
124 | 
125 |   数据结构隐含部分函数功能信息
126 | 
127 | - 语义关系
128 | 
129 |   定位具有相关性的一些函数，如加密函数与解密函数
130 | 
131 | - 抗混淆
132 | 


--------------------------------------------------------------------------------
/notes/Asm2Vec.md:
--------------------------------------------------------------------------------
  1 | # Asm2Vec
  2 | 
  3 | | Target（目标）     | 第四类克隆（语义相似）检测                                   |
  4 | | :----------------- | :----------------------------------------------------------- |
  5 | | Input（输入）      | 二进制程序                                                   |
  6 | | Process（处理）    | 1. 使用边覆盖率和随机游走策略构建序列，训练asm2vec嵌入模型<br />2. 使用模型将函数转化为嵌入向量<br />3. 计算被搜索函数的嵌入向量<br />4. 通过比较嵌入向量获取相近函数列表 |
  7 | | Output（输出）     | 与输入函数相近的函数的列表                                   |
  8 | | Problem（问题）    | 解决的问题：<br />1. 现有方法较少考虑二进制程序的语义信息<br />2. 基于图结构的方法难以应对混淆 |
  9 | | Condition（条件）  | 所有函数为同一指令架构                                       |
 10 | | Difficulty（难点） | 从具有差异的汇编代码中提取有价值的语义信息                   |
 11 | | Level（水平）      | S&P2019                                                      |
 12 | 
 13 | ## 算法原理
 14 | 
 15 | ### 算法原理图
 16 | 
 17 | ![image-20221031200624966](./image/Asm2Vec/image-20221031200624966.png)
 18 | 
 19 | 
 20 | 
 21 | ### 嵌入模型
 22 | 
 23 | #### PV-DM模型
 24 | 
 25 | 论文使用的嵌入模型基于PV-DM模型实现，PV-DM模型是对原始word2vec模型的扩展。
 26 | 
 27 | 模型使用滑动窗口遍历整个段落，使用段落编号和滑动窗口中单词编号结合生成嵌入向量
 28 | 
 29 |  
 30 | 
 31 | ![image-20221031203211929](./image/Asm2Vec/image-20221031203211929.png)
 32 | 
 33 | 记语料库为 $T$ ，段落为 $p$ ，语句为 $s$ ，单词为 $w$ ，则模型希望最大化下述log概率：
 34 | 
 35 | $$\displaystyle \sum_p^T\sum_s^p\sum_{t=k}^{|s|-k}log\mathrm{P}(w_t|p,w_{t-k},\dots,w_{t+k})$$
 36 | 
 37 | #### Asm2Vec模型
 38 | 
 39 | ![image-20221031204510624](./image/Asm2Vec/image-20221031204510624.png)
 40 | 
 41 | 该模型将每个函数映射为一个向量 $\overrightarrow{\theta}_{f_s}\in\mathbb{R}^{2\times d}$ ，其中 $d$ 为用户选定的参数。
 42 | 
 43 | 算法将函数库中的每个函数都视作由多个句子 $\mathcal{S}$ 组成（句子的前后顺序随机），每个句子由一系列指令 $\mathcal{I}$ 构成，每个指令由一个操作码 $\mathcal{P}$ 和多个操作数 $\mathcal{A}$ 组成
 44 | 
 45 | 将每一行指令中的操作码和操作数都视为token，映射为嵌入向量 $\overrightarrow{v}_t\in\mathbb{R}^d$ ；此外，将指令映射为另一个用于预测的向量 $\overrightarrow{v'}_t\in\mathbb{R}^{2\times d}$  
 46 | 
 47 | 每次考虑前后共三条指令，模型希望最大化下述log概率：
 48 | 
 49 | $$\displaystyle \sum_{f_s}^{RP}\sum_{seq_i}^{\mathcal{S}(f_s)}\sum_{in_j}^{\mathcal{I}(seq_i)}\sum_{t_c}^{\mathcal{T}(in_j)}log\mathrm{P}(t_c|f_s,in_{j-1},in_{j+1})$$
 50 | 
 51 | 建模某个指令需要考虑前后两条指令的信息，前后指令信息表示为操作码嵌入拼接上操作数嵌入的均值，具体计算方式为：
 52 | 
 53 | $$\delta(in_j,f_s)=\frac{1}{3}(\overrightarrow{\theta}_{f_s}+\mathcal{CT}(in_{j-1})+\mathcal{CT}(in_{j+1}))$$
 54 | 
 55 | $$\mathcal{CT}(in)=\overrightarrow{v}_{\mathcal{P}(in)}||\frac{1}{|\mathcal{A}(in)|}\displaystyle \sum_{t_b}^{\mathcal{A}(in)}\overrightarrow{v}_{t_b}$$
 56 | 
 57 | 由此，log概率中的 $\mathrm{P}(t_c|f_s,in_{j-1},in_{j+1})$ 可被写为 $\mathrm{P}(t_c|\delta(in_j,f_s))$ 
 58 | 
 59 | 进一步地：
 60 | 
 61 | $$\mathrm{P}(t_c|\delta(in_j,f_s))=\mathrm{P}(\overrightarrow{v'}_{t_c}|\delta(in_j,f_s))=\frac{f(\overrightarrow{v'}_{t_c},\delta(in_j,f_s))}{\sum_d^Df(\overrightarrow{v'}_{t_d},\delta(in_j,f_s))}\\f(\overrightarrow{v'}_{t_c},\delta(in_j,f_s))=Uh((\overrightarrow{v'}_{t_c})^T\times\delta(in_j,f_s))$$
 62 | 
 63 | 其中 $Uh$ 表示对向量中的每个值做sigmoid变换。
 64 | 
 65 | 遍历整个单词空间 $D$ 的计算复杂度太高，使用 $k$ 个样本采样来代替，于是log概率可以写为：
 66 | 
 67 | $$log\ \mathrm{P}(t_c|\delta(in_j,f_s))\approx log\ f(\overrightarrow{v'}_{t_c},\delta(in_j,f_s))+\displaystyle \sum_{i=1}^k\mathbb{E}_{t_d\sim P_n(t_c)}(log\ f(-1\times \overrightarrow{v'}_{t_d},\delta(in_j,f_s)))$$
 68 | 
 69 | 其中 $d\ne c$ ，随后使用梯度下降法即可优化嵌入向量生成
 70 | 
 71 | ### 汇编函数建模
 72 | 
 73 | 汇编函数与文本段落不同，不能直接用于嵌入。
 74 | 
 75 | #### 被调函数扩展
 76 | 
 77 | 造成控制流图差异的一大原因是优化过程中的函数内联。
 78 | 
 79 | 本文参考BinGo的方法，计算 $\alpha(f_c)=outdegree(f_c)/(outdegree(f_c)+indegree(f_c))$ ，当超过阈值0.01时主动内联调用的函数
 80 | 
 81 | 为了避免内联导致原函数更接近被内联的函数，计算下述指标 $\delta(f_s,f_c)=length(f_c)/length(f_s)$ ，仅内联小于0.6或短于10个指令的函数
 82 | 
 83 | #### 边覆盖率
 84 | 
 85 | 对控制流图中的边做随机采样，最终确保所有边都被采样到。将采样后边对应的汇编代码合并为一个新的序列。
 86 | 
 87 | #### 随机游走
 88 | 
 89 | 使用随机游走策略在控制流图中选择序列。在这一过程中，包含多个分支的关键节点更容易被覆盖到。
 90 | 
 91 | ## 实验设计
 92 | 
 93 | 1. 对比实验，搜索不同优化选项（O0和O3）下的二进制文件
 94 | 2. 对比实验，搜索混淆条件下的二进制文件。三种混淆方式：
 95 |    1. Bogus Control Flow Graph (BCF)：添加大量无关的基本块和分支
 96 |    2. Control Flow Flattening (FLA)：将控制流图扁平化（例：使用一个统一的switch函数）
 97 |    3. Instruction Substitution (SUB)：将一部分指令按照规则替换为等价的指令
 98 | 3. 对比实验，结合1和2中的数据
 99 | 4. 实际漏洞检测实验
100 | 
101 | ## 笔者总结
102 | 
103 | 算法特点：
104 | 
105 | - 从文本领域迁移相关方法，提取二进制程序语义信息
106 | - 基于边覆盖率和随机游走构建汇编序列用于嵌入模型训练
107 | 
108 | 可能存在的问题：
109 | 
110 | - 序列选择对于算法效果的影响？
111 | - 混淆对于序列上下文的影响？
112 | - 不支持跨架构分析
113 | - 缺少可解释性
114 | 


--------------------------------------------------------------------------------
/notes/BLEX.md:
--------------------------------------------------------------------------------
  1 | # BLEX
  2 | 
  3 | | Target（目标）     | 判断两个二进制函数是否语义相似                               |
  4 | | :----------------- | :----------------------------------------------------------- |
  5 | | Input（输入）      | 待比较的两个二进制函数                                       |
  6 | | Process（处理）    | 动态分析，记录函数执行过程中的特征。<br />特征相同的函数被判定为语义相似 |
  7 | | Output（输出）     | 两个二进制函数的相似度                                       |
  8 | | Problem（问题）    | 解决的问题：<br />1. 现有方法面对不同编译器/优化选项生成的二进制文件时效果不佳<br />2. 现有方法无法进行程序间分析 |
  9 | | Condition（条件）  | 1. 对比的二进制函数不包含debug信息<br />2. 二进制未被打包    |
 10 | | Difficulty（难点） | 设计动态分析方法，保证动态分析覆盖率                         |
 11 | | Level（水平）      | USENIX2014                                                   |
 12 | 
 13 | ## 问题分析
 14 | 
 15 | 1. 语义相似的函数在二进制程序中不一定句法相似（受编译选项影响），案例见下图
 16 | 
 17 |    ![image-20221027222646562](./image/BLEX/image-20221027222646562.png)
 18 | 
 19 | 2. 判断两个二进制函数是否相似需要考虑程序间调用，因为部分优化选项会把函数调用优化掉（inline）
 20 | 
 21 | 3. 不能仅考虑控制流图结构信息，案例见下图
 22 | 
 23 | ![image-20221027223048621](./image/BLEX/image-20221027223048621.png)
 24 | 
 25 | ## 算法原理
 26 | 
 27 | ### 算法原理图
 28 | 
 29 | ![image-20221027223600635](./image/BLEX/image-20221027223600635.png)
 30 | 
 31 | ### 动态分析
 32 | 
 33 | #### 环境设置
 34 | 
 35 | 环境：为所有寄存器和内存空间提前设置固定的值，一套这样的“固定值”称为一个环境
 36 | 
 37 | #### 动态分析
 38 | 
 39 | 每次从函数 $f$ 的第一个未被执行的指令开始，重复进行动态分析，直到函数的所有指令都被执行。具体流程如下：
 40 | 
 41 | 1. 指定执行环境
 42 | 1. 提取函数中未被执行过的指令，构建未执行指令集
 43 | 1. 选择未执行指令集中地址最低的指令开始执行，将被执行过的指令从未执行指令集删除
 44 | 1. 记录执行过程中的特征信息
 45 | 1. 重复3、4，直至未执行指令集为空
 46 | 
 47 | #### 实际实施
 48 | 
 49 | - 通过从程序的指定地址载入实现从任意指令开始执行
 50 | - 设置 `LD_BIND_NOW` 环境变量，使程序在开始执行前就载入所有动态链接库，避免动态载入导致无法记录函数特征
 51 | - 使用Pin框架监控动态分析结果
 52 | 
 53 | #### 动态分析停止时机
 54 | 
 55 | - 程序执行到函数尾。通过记录栈深度实现，函数调用使深度+1，返回使深度-1。当深度为0时，执行结束。
 56 | - 程序抛出异常
 57 | - 执行了指定数量的指令
 58 | - 超时
 59 | 
 60 | #### 记录的特征
 61 | 
 62 | - 从堆中读取的值
 63 | - 向堆中写入的值
 64 | - 从栈中读取的值
 65 | - 向栈中写入的值
 66 | - 通过plt表调用的函数
 67 | - 在执行过程中进行的系统调用
 68 | - 函数执行完毕后的返回值
 69 | 
 70 | ### 语义相似度计算
 71 | 
 72 | 使用[Jaccard相似度](../concept.md#jaccard相似度)计算特征间相似性，设 $v_i(f,env_k)$ 为函数 $f$ 在环境 $k$ 下生成的特征， $w$ 为权重，则环境 $k$ 下的语义相似度可记为
 73 | 
 74 | $$sim_k(f,g)=\displaystyle\sum_{i=1}^N(w_i\times \frac{|v_i(f,env_k)\cap v_i(g,env_k)|}{|v_i(f,env_k)\cup v_i(g,env_k)|})/\sum_{l=1}^Nw_l$$
 75 | 
 76 | 最终的相似度是多个环境下相似度的平均值
 77 | 
 78 | $$\displaystyle sim(f,g)=\frac{1}{K}\sum_k sim_k(f,g)$$
 79 | 
 80 | #### 权重设计
 81 | 
 82 | 进行7次小实验，每次仅使用7个特征中的一个特征，结果如下
 83 | 
 84 | ![image-20221028113440022](./image/BLEX/image-20221028113440022.png)
 85 | 
 86 | 使用支持向量机计算每个权重
 87 | 
 88 | ## 笔者总结
 89 | 
 90 | 算法特点：
 91 | 
 92 | - 使用动态分析应对跨架构、跨编译选项相似度分析
 93 | - 提出一种能够保证覆盖率的动态分析方法
 94 | 
 95 | 可能存在的问题：
 96 | 
 97 | - 算法复杂度高，需考虑执行过程中的许多特例情况
 98 | 
 99 | - 匹配准确率不高，只有80%左右，猜测与“环境”设计有很大关系，本文中设置的环境距离实际场景较远，且环境数量较少
100 | 


--------------------------------------------------------------------------------
/notes/BinGo.md:
--------------------------------------------------------------------------------
 1 | # BinGo
 2 | 
 3 | | Target（目标）     | 已知某个二进制函数，在其他二进制文件中检索具有与之相似的函数 |
 4 | | :----------------- | :----------------------------------------------------------- |
 5 | | Input（输入）      | 待搜索二进制代码库；已知二进制函数                           |
 6 | | Process（处理）    | 1. 函数内联：根据决策规则将部分函数的汇编代码内联<br />2. 函数筛选：设置规则缩小待检索范围<br />3. 函数匹配：提取函数片段，提取片段语义特征并以此进行函数匹配 |
 7 | | Output（输出）     | 与已知二进制函数相似的函数列表                               |
 8 | | Problem（问题）    | 需解决的问题：<br />1. 算法需要对于跨架构、操作系统和编译器的二进制程序鲁棒<br />2. 需要考虑算法的语义信息<br />3. 能适用于大范围搜索 |
 9 | | Condition（条件）  | 程序可被正常反编译                                           |
10 | | Difficulty（难点） | 需应对编译器优化所带来的控制流图结构差异                     |
11 | | Level（水平）      | FSE2016                                                      |
12 | 
13 | ## 算法原理
14 | 
15 | ### 算法原理图
16 | 
17 | ![image-20221108150148825](./image/BinGo/image-20221108150148825.png)
18 | 
19 | ### 选择性内联
20 | 
21 | #### 函数调用模式
22 | 
23 | 定义了6种常见的函数调用模式，见下图
24 | 
25 | ![image-20221108150610480](./image/BinGo/image-20221108150610480.png)
26 | 
27 | 1. 直接调用库函数，需内联
28 | 2. 调用用户自定义的函数，需内联
29 | 3. 调用用户自定义函数，该函数调用了几个库函数，几乎不调用用户自定义函数。此类函数多为用户撰写的需复用的工具函数，需内联
30 | 4. 与3类似，区别在于完全不调用用户自定义函数，需内联
31 | 5. 与4类似，但是只调用终止函数（exit/abort等），此类函数主要起终止作用，语义价值较少，不内联
32 | 6. 用户自定义函数，调用大量其他自定义函数，此时认为该函数起调度器作用，语义价值较少，不内联
33 | 
34 | #### 内联决策
35 | 
36 | 设置函数耦合分数（Function Coupling Score）：
37 | 
38 | $$\alpha=\lambda_e/(\lambda_e+\lambda_a)$$
39 | 
40 | 其中 $\lambda_e$ 代表被调用函数调用的自定义函数数量； $\lambda_a$ 代表调用被调用函数的自定义函数数量
41 | 
42 | 该值越小，说明函数越应该被内联
43 | 
44 | ### 函数筛选
45 | 
46 | 共设计三种筛选策略
47 | 
48 | 1. 仅查找与待搜索函数调用完全相同库函数的函数。缺点是无法跨操作系统，且不支持名称不同但是功能相似的库函数
49 | 2. 在1的基础上，提取库函数的高层次语义特征（见下图），使用图中level1级特征解决1中问题。缺点是无法涵盖用户自定义的相似功能函数以及在优化过程中被编译器内联的函数。
50 | 3. 将函数抽象为具体的语义类别（如mov被分类至数据移动；push被分类至栈操作），依据语义类别进行筛选
51 | 
52 | ![image-20221108155628263](./image/BinGo/image-20221108155628263.png)
53 | 
54 | 实际筛选过程中，给三个筛选策略分配不同的权重（策略1的权重最高，3的权重最低），使用Jaccard距离计算相似度，乘以权重后排序，提取排名靠前的用于下一步。
55 | 
56 | ### 函数匹配
57 | 
58 | #### 函数片段提取
59 | 
60 | 使用[Tracy](./Tracy.md)的方法提取函数片段
61 | 
62 | #### 语义特征提取
63 | 
64 | 使用I/O对表示语义特征。为避免语义提取过程中忽视了变量的前后关联性，额外考虑变量的前后状态，使用Z3证明器生成I/O对。
65 | 
66 | #### 路径剪枝
67 | 
68 | - 除去无效路径（程序片段）：如果Z3证明器无法确定某个变量在执行前后的状态，则认为是无效路径
69 | - 除去编译器优化相关路径（程序片段）：如果编译器相关指令在路径中占比过大，则剪枝
70 | 
71 | #### 相似度计算
72 | 
73 | 算法不固定路径长度（函数片段的长度可为1，2，3）
74 | 
75 | 使用Jaccard相似度评判两个函数的相似程度
76 | 
77 | ## 笔者总结
78 | 
79 | 算法特点：
80 | 
81 | - 通过主动内联应对编译器优化问题
82 | - 增强变量的前后关联性
83 | - 不固定路径长度（选用1，2，3长度）
84 | 
85 | 可能存在的问题：
86 | 
87 | - 算法效果受制于定理证明器的性能
88 | - 算法预处理/搜索时间较长
89 | - 考虑借用文本领域方法替代定理证明器
90 | 
91 | 
92 | 
93 | 
94 | 
95 | 


--------------------------------------------------------------------------------
/notes/BinGo_E.md:
--------------------------------------------------------------------------------
 1 | # BinGo-E
 2 | 
 3 | | Target（目标）     | 已知某个二进制函数，在其他二进制文件中检索具有与之相似的函数 |
 4 | | :----------------- | :----------------------------------------------------------- |
 5 | | Input（输入）      | 待搜索二进制代码库；已知二进制函数                           |
 6 | | Process（处理）    | 1. 提取高层次语义特征、结构特征进行初步筛选<br />2. 选择性内联函数<br />3. 低层次语义特征提取（模拟器）<br />4. 结合高层次和低层次特征进行相似性分析 |
 7 | | Output（输出）     | 与已知二进制函数相似的函数列表                               |
 8 | | Problem（问题）    | 解决的问题：<br />1. 使用低层次语义特征，忽视函数间语义关系<br />2. 基于符号执行的算法运算开销大 |
 9 | | Condition（条件）  | 程序可被正常反编译                                           |
10 | | Difficulty（难点） | 多层次特征的综合利用                                         |
11 | | Level（水平）      | 1区；2019                                                    |
12 | 
13 | ## 算法原理
14 | 
15 | ### 算法原理图
16 | 
17 | ![image-20221111230259570](./image/BinGo_E/image-20221111230259570.png)
18 | 
19 | ### 特征提取
20 | 
21 | ![image-20221114155853617](./image/BinGo_E/image-20221114155853617.png)
22 | 
23 | #### 3D-CFG（结构特征）
24 | 
25 | 为CFG中每个节点设置一个三元组 $(x,y,z)$ 其中 $x$ 代表节点在CFG中的序号； $y$ 代表节点的出度； $z$ 代表节点的循环深度
26 | 
27 | 设 $e(p,q)$ 为节点 $p$ 和 $q$ 之间的边， $w_p$ 为基本块中的汇编指令数量，则某个节点的“质心” $c$ 可表示为：
28 | 
29 | $$\displaystyle c_i=\frac{\sum_{e(p,q)\in \text{3D-CFG}}(w_pi_p+w_qi_q)}{w},\ \text{where}\ i\in\{x,y,z\}$$
30 | 
31 | $$w=\sum_{e(p,q)\in \text{3D-CFG}}(w_p+w_q)$$
32 | 
33 | ##### 加权质心
34 | 
35 | 在原本 $w$ 的基础上额外考虑函数调用数量 $N$
36 | 
37 | $$w'=w+N$$
38 | 
39 | 然后使用 $w'$ 重新计算 $c$
40 | 
41 | ##### 质心差异度（CDD）
42 | 
43 | $$\displaystyle CDD(\overrightarrow{c_1},\overrightarrow{c_2})=\max(\frac{|c_{1x}-c_{2x}|}{c_{1x}+c_{2x}},\frac{|c_{1y}-c_{2y}|}{c_{1y}+c_{2y}},\frac{|c_{1z}-c_{2z}|}{c_{1z}+c_{2z}},\frac{|w_1-w_2|}{w_1+w_2})$$
44 | 
45 | ##### 函数差异度（FDD）
46 | 
47 | 质心差异度和加权质心差异度中的最大值
48 | 
49 | #### 高层次语义特征
50 | 
51 | 论文中提取的高层次语义特征如图所示
52 | 
53 | ![image-20221114154606899](./image/BinGo_E/image-20221114154606899.png)
54 | 
55 | - 操作码类型：给不同的操作码分类，记录每一类出现几次
56 | - 系统调用标签：给系统调用分类，记录下调用的系统调用类型
57 | - 系统调用顺序
58 | - 函数输出值
59 | - 使用的局部变量
60 | - 操作码序列
61 | 
62 | #### 低层次语义特征
63 | 
64 | 提取类似[BLEX](./BLEX.md#记录的特征)中的语义特征，该特征使用模拟器分析获得
65 | 
66 | ### 选择性函数内联
67 | 
68 | 使用与[BinGo](./BinGo.md#选择性内联)相同的内联策略
69 | 
70 | ### 相似度计算
71 | 
72 | 高层次语义特征：使用Jaccard相似度计算
73 | 
74 | 结构特征： $1-FDD$
75 | 
76 | 低层次语义特征：与BinGo相同的计算方式
77 | 
78 | ## 笔者总结
79 | 
80 | 算法特点：
81 | 
82 | - 增加结构特征和高层次语义特征（跨架构时不使用结构特征）
83 | - 使用模拟器替代定理证明器
84 | 
85 | 可能存在的问题：
86 | 
87 | - 使用Jaccard相似度进行相似性计算，偏向于规则判断，可能在精度上还有提升空间
88 | - 可以考虑结合一些机器学习方法
89 | - 低层次语义相似的要求可能略微苛刻
90 | 
91 | 


--------------------------------------------------------------------------------
/notes/BinHunt.md:
--------------------------------------------------------------------------------
 1 | # BinHunt
 2 | 
 3 | | Target（目标）     | 对比分析两个二进制程序的相同/不同部分                        |
 4 | | :----------------- | :----------------------------------------------------------- |
 5 | | Input（输入）      | 待比较的两个二进制程序                                       |
 6 | | Process（处理）    | 1. 反编译，转化为中间表示<br />2. 提取控制流图、函数调用图<br />3. 计算基本块相似度，在此基础上发掘函数的最大公共子图 |
 7 | | Output（输出）     | 输入两个二进制程序的函数配对结果及对应的置信度               |
 8 | | Problem（问题）    | 解决的问题：<br />1. 现有方法未利用二进制语义信息            |
 9 | | Condition（条件）  | 输入的两个二进制程序确定为相似程序                           |
10 | | Difficulty（难点） | 设计算法完成从粗粒度到细粒度的二进制差分                     |
11 | | Level（水平）      | ICICS2008                                                    |
12 | 
13 | ## 算法原理
14 | 
15 | ### 算法原理图
16 | 
17 | ![image-20221110155012164](./image/BinHunt/image-20221110155012164.png)
18 | 
19 | ### 基本块相似度分析
20 | 
21 | 使用符号执行（定理证明器）判断两个基本块涉及的寄存器和变量的数值在执行完毕后是否一致
22 | 
23 | ### 函数相似度分析
24 | 
25 | 在完成基本块相似度分析的条件下，发掘函数的最大公共子图


--------------------------------------------------------------------------------
/notes/CoP.md:
--------------------------------------------------------------------------------
 1 | # CoP
 2 | 
 3 | | Target（目标）     | 判断程序之间是否存在抄袭行为                                 |
 4 | | :----------------- | :----------------------------------------------------------- |
 5 | | Input（输入）      | 待比较的两个二进制程序                                       |
 6 | | Process（处理）    | 1. 利用定理证明器完成基本块级相似度比较<br />2. 从源函数提取代码片段<br />3. 利用最长公共子序列算法计算目标函数和各代码片段的相似度（公共子序列长度）<br />4. 加权平均得到整体代码相似度 |
 7 | | Output（输出）     | 二进制相似度得分                                             |
 8 | | Problem（问题）    | 现有方法无法应对基本块合并/缺失等问题                        |
 9 | | Condition（条件）  | 单一架构；一对一分析                                         |
10 | | Difficulty（难点） | 设计弱控制流图路径依赖的代码片段匹配算法                     |
11 | | Level（水平）      | FSE2014                                                      |
12 | 
13 | ## 算法原理
14 | 
15 | ### 算法原理图
16 | 
17 | ![image-20221111163317832](./image/CoP/image-20221111163317832.png)
18 | 
19 | ### 基本块级相似度比较
20 | 
21 | 基本块等价的定义：
22 | 
23 | 两个基本块中输入数量较少的那一个可以在另一个基本块中找到与之等价的输入输出
24 | 
25 | 通过定理证明器完成比较。
26 | 
27 | ### 路径相似度比较
28 | 
29 | 1. 首先从输入程序中提取代码片段（路径）
30 | 2. 使用最长公共子序列算法计算目标程序与该代码片段的相似度（该算法支持应对基本块的缺失、增加等）
31 | 
32 | 回溯处理：如果输入函数中存在连续的基本块没法匹配，则将其合并为一个重新尝试匹配。部分解决基本块切分/合并带来的问题
33 | 
34 | ### 函数级相似度计算
35 | 
36 | 根据代码片段长度为该代码片段的相似度加权（越长权值越大），计算平均路径相似度
37 | 
38 | 


--------------------------------------------------------------------------------
/notes/Esh.md:
--------------------------------------------------------------------------------
 1 | # Esh
 2 | 
 3 | | Target（目标）     | 已知某个二进制函数，在其他二进制文件中检索具有与之相似的函数 |
 4 | | :----------------- | :----------------------------------------------------------- |
 5 | | Input（输入）      | 待搜索二进制代码库；已知二进制函数                           |
 6 | | Process（处理）    | 1. 将二进制函数拆分为片段<br />2. 根据输入输出和中间变量判断两个代码片段是否相同<br />3. 基于代码片段的相似度综合判断二进制函数的相似度 |
 7 | | Output（输出）     | 与已知二进制函数相似的函数列表                               |
 8 | | Problem（问题）    | 解决的问题：<br />1. 算法建立在程序控制流图基本不变的前提上（对控制流图结构敏感），跨编译器分析效果不佳<br />2. 算法分析粒度较粗（函数级/程序级）相似性分析误报率高 |
 9 | | Condition（条件）  | 程序未被混淆                                                 |
10 | | Difficulty（难点） | 1. 代码片段拆分<br />2. 不同代码片段间高效率的匹配分析       |
11 | | Level（水平）      | PLDI2016                                                     |
12 | 
13 | ## 算法原理
14 | 
15 | 利用图像领域中子图匹配的思想，如果一段二进制代码能够用另一段二进制代码的片段构成，则认为两段代码相似
16 | 
17 | ### 拆分为代码片段
18 | 
19 | 将汇编代码转换为中间验证语言（IVL），对函数中的每一个变量做程序切片，获得代码片段
20 | 
21 | ### 代码片段间比较
22 | 
23 | 1. 假设两个代码片段的输入等价
24 | 2. 设置判断代码片段输出等价的断言
25 | 3. 使用程序验证器检查断言，计算等价的变量
26 | 
27 | #### 匹配程度计算
28 | 
29 | - 程序状态 $\sigma$ ：一个对 $(l,values)$ ，表示一组变量在程序第 $l$ 行的值为 $values$ 
30 | 
31 | - 程序记录 $\pi$ ：描述程序一次运行的一个程序状态序列 $\langle\sigma_0,\dots,\sigma_n\rangle$ ，程序所有可能的记录集合表示为 $[\![P]\!]$ 
32 | 
33 | - 变量关联 $\gamma$ ：如果两个程序状态中存在取值相同的变量，则可将这些变量关联
34 | 
35 |   - 例： $values_q=\{x\mapsto3,y\mapsto4\},values_t=\{a\mapsto4\}$，则变量关联为 $\{y\mapsto a\}$ 
36 | 
37 | - 程序状态/记录等价 $\equiv$ ：程序状态中的所有变量都能匹配/程序记录中最后一个状态等价
38 | - 程序片段等价：需满足两个条件：1）片段输入能实现关联；2）与输入相同的所有程序记录等价
39 | - 程序状态间的VCP（variable containment proportion，VCP）：关联变量占全部变量的比例
40 |   - $\displaystyle VCP(\sigma_q,\sigma_t)\triangleq\frac{|\gamma_{max}|}{|\sigma_q|}$
41 | - 程序记录间的VCP：记录最后一个状态之间的VCP
42 | - 程序片段间的VCP：所有匹配程序记录中关联的最大变量数占所有变量的比例
43 |   - $\displaystyle VCP(s_q,s_t)\triangleq \frac{\max\{|\gamma|\big{|}\forall(\pi_q,\pi_t)\in(s_q,s_t):\pi_q\equiv\pi_t\}}{|Var(s_q)|}$
44 |   - 注意：VCP非对称，即将两个参数交换位置结果可能不同
45 | 
46 | 
47 | #### 实际实现过程
48 | 
49 | 将代码转换为BoogieIVL中间语言，使用Boogie程序验证器检查断言
50 | 
51 | #### 匹配重要性计算
52 | 
53 | 不同代码片段的重要程度不同（有些代码片段是通用代码，不能很好表征函数特征），代码片段匹配的重要性用局部置信度（local evidence score，LES）表示。
54 | 
55 | $$\displaystyle LES(s_q|t)=\log\frac{\max_{s_t\in t}Pr(s_q|s_t)}{Pr(s_q|H_0)}$$
56 | 
57 | 论文使用sigmoid函数近似某一函数中代码片段出现的概率（sigmoid函数的中心设置至 $x=0.5$ ，$k$ 取10）
58 | 
59 | $$\displaystyle Pr(s_q|s_t)\triangleq g(VCP(s_q,s_t))=1/(1+e^{-k(VCP(s_q,s_t)-0.5)})$$
60 | 
61 | 而 $Pr(s_q|H_0)$ 表示在随机函数中找到与被搜索代码片段匹配的代码片段的概率，设 $T$ 为待搜索二进制代码库：
62 | 
63 | $$\displaystyle Pr(s_q|H_0)=\frac{\sum_{s_t\in T}Pr(s_q|s_t)}{|T|}$$
64 | 
65 | ### 代码片段相似度上升为函数相似度
66 | 
67 | 根据LES计算全局置信度（global evidence score，GES）
68 | 
69 | $$\displaystyle GES(q|t)=\sum_{s_q\in q}LES(s_q|t)=\sum_{s_q\in q}\log\frac{\max_{s_t\in t}Pr(s_q|s_t)}{Pr(s_q|H_0)}$$
70 | 
71 | 


--------------------------------------------------------------------------------
/notes/Gemini.md:
--------------------------------------------------------------------------------
 1 | # Gemini
 2 | 
 3 | | Target（目标）     | 已知某个bug，在其他二进制文件中检索具有相同问题的函数        |
 4 | | :----------------- | :----------------------------------------------------------- |
 5 | | Input（输入）      | 二进制程序（函数级）                                         |
 6 | | Process（处理）    | 1. 利用改进后的structure2vec方法完成图嵌入<br />2. 使用孪生网络训练嵌入模型<br />3. 依据嵌入向量完成相似度计算 |
 7 | | Output（输出）     | 与输入函数相近的函数的列表                                   |
 8 | | Problem（问题）    | 解决的问题：<br />1. 现有算法嵌入过程效率低（“密码本”生成依赖图匹配算法）<br />2. 运行开销与“密码本”大小正相关，而小的“密码本”难以保证算法精度 |
 9 | | Condition（条件）  | 需做区分的程序在控制流图上需有明显差异                       |
10 | | Difficulty（难点） | 使用合适的嵌入方法替代二分图匹配过程                         |
11 | | Level（水平）      | CCS2017                                                      |
12 | 
13 | ## 算法原理
14 | 
15 | ### 图嵌入网络
16 | 
17 | 在Structure2vec的基础上进行优化改造而成
18 | 
19 | #### 基本Structure2vec方法
20 | 
21 | 其核心是一个嵌套的运算过程，设图 $g$ 的节点 $v$ 的特征向量为 $x_v$ ，邻居节点为 $\mathcal{N}(v)$ ，则嵌套运算过程可表示为：
22 | 
23 | $$\mu_v^{(t+1)}=\mathcal{F}(x_v,\displaystyle\sum_{u\in \mathcal{N}(v)}\mu_u^{(t)}), \forall v\in \mathcal{V}$$
24 | 
25 |  其中， $\mu_{v}^{(0)}$ 为全0向量， $\mathcal{F}$ 为一个非线性映射函数。嵌套过程按照拓扑关系进行，某个节点的特征向量会在嵌套过程中不断向后传播
26 | 
27 | 非线性映射函数的各项参数通过学习获得
28 | 
29 | #### 本文修改后的方法
30 | 
31 | 本文使用神经网络替代公式中的 $\mathcal{F}$ ：
32 | 
33 | $$\mathcal{F}(x_v,\displaystyle\sum_{u\in \mathcal{N}(v)}\mu_u^{(t)})=\tanh(W_1x_v+\sigma(\displaystyle\sum_{u\in \mathcal{N}(v)}\mu_u^{(t)}))$$
34 | 
35 | 其中 $\sigma$ 是非线性转移函数，论文使用一个n层的全连接替代：
36 | 
37 | $$\sigma(l)=\underbrace{P_1\times \mathrm{ReLU}(P_2\times \cdots\mathrm{ReLU}(P_n l))}_{n\ \mathrm{levels}}$$
38 | 
39 | ![image-20221025204716133](./image/Gemini/image-20221025204716133.png)
40 | 
41 | ![image-20221025200017843](./image/Gemini/image-20221025200017843.png)
42 | 
43 | #### 默认超参数
44 | 
45 | 嵌入向量大小 $p=64$ ，嵌入深度 $n=2$ ，嵌套次数 $T=5$ 
46 | 
47 | ACFG的特征在[Genius](./Genius.md)的基础上删除了[介数中心性](../concept.md#介数中心性)特征。
48 | 
49 | ### 使用孪生网络训练网络参数
50 | 
51 | 孪生网络的网络结构如下图所示，图中两个嵌入网络结构相同，共享参数。孪生网络输出为两个嵌入网络输出的余弦距离。
52 | 
53 | 当输入的两个样本为使用同一段代码在不同条件下编译而来时，标签为+1；否则标签为-1。
54 | 
55 | 该模型支持预训练（使用上述策略）和重训练（可适当修改标签计算方式）
56 | 
57 | ![image-20221025202453422](./image/Gemini/image-20221025202453422.png)
58 | 
59 | ## 笔者总结
60 | 
61 | 算法特点：
62 | 
63 | - 使用神经网络替代传统图匹配算法进行嵌入
64 | - 利用孪生神经网络进行训练，解决相似度分析任务标签问题
65 | - 支持模型预训练和重训练
66 | 
67 | 可能存在的问题：
68 | 
69 | - 无法区分控制流图结构上相似的函数（如补丁前和补丁后）
70 | - 图嵌入依赖提取的特征进行嵌入，并非真正意义上的“端到端”
71 | - 弱化图连接/结构特征
72 | 
73 | 
74 | 
75 | 
76 | 
77 | 


--------------------------------------------------------------------------------
/notes/Genius.md:
--------------------------------------------------------------------------------
  1 | # Genius——<u>G</u>raph <u>En</u>coding for B<u>u</u>g <u>S</u>earch
  2 | 
  3 | | Target（目标）     | 已知某个bug，在其他物联网固件中检索具有相同问题的固件        |
  4 | | :----------------- | :----------------------------------------------------------- |
  5 | | Input（输入）      | 二进制程序（函数级）                                         |
  6 | | Process（处理）    | 1. 特征提取：从二进制中提取属性控制流图（ACFG）<br />2. “密码本”生成：利用二分图匹配方法计算图间相似度，将聚类后簇的几何中心作为编码基准<br />3. 特征编码：使用VLAD嵌入完成ACFG在高维空间中的编码<br />4. 在线搜索：使用局部敏感哈希对编码后的特征进行搜索 |
  7 | | Output（输出）     | 与输入函数相近的函数的列表                                   |
  8 | | Problem（问题）    | 解决的问题：<br />1. bug搜索过程由多次一对一相似度匹配构成，搜索效率低<br />2. 现有方法的相似度分析主要依赖图结构的相似度匹配，面对跨架构搜索问题时准确率低 |
  9 | | Condition（条件）  | 程序可被正常反编译                                           |
 10 | | Difficulty（难点） | 如何构造包含二进制程序结构与语义信息的特征编码               |
 11 | | Level（水平）      | CCS2016                                                      |
 12 | 
 13 | ## 算法原理
 14 | 
 15 | ### 算法原理图
 16 | 
 17 | ![image-approach_overview](./image/Genius/image-20221017104002392.png)
 18 | 
 19 | ### 特征提取
 20 | 
 21 | #### 属性控制流图（Attributed Control Flow Graph）
 22 | 
 23 | 一个有向图 $G=\langle V,E,\phi \rangle$ ，其中 $V$ 是基本块集合， $E\subseteq V \times V$ 表示基本块之间边的集合， $\phi : V\rightarrow \Sigma$ 为基本块到属性集合 $\Sigma$ 的映射函数。
 24 | 
 25 | #### ACFG生成方法
 26 | 
 27 | 在CFG的基础上添加每个基本块的属性特征，属性特征由统计特征和结构特征两部分组成，特征列表如下表所示。
 28 | 
 29 | 部分特征说明：
 30 | 
 31 | - No. of offspring：控制流图中该节点的子节点个数
 32 | - Betweenness：节点的[介数中心性](../concept.md#介数中心性)
 33 | 
 34 | ![image-basic-block_level_features](./image/Genius/image-20221017172441347.png)
 35 | 
 36 | ### “密码本”生成
 37 | 
 38 | “密码本”（codebook）是一个有限离散集合，其中每个元素均为一个聚类簇的几何中心。
 39 | 
 40 | #### ACFG相似度计算
 41 | 
 42 | ##### [二分图](../concept.md#二分图)匹配
 43 | 
 44 | 利用两个ACFG $G_1$ 与 $G_2$ 构造二分图 $G_{bp}=(\hat V,\hat E)$ ，其中 $G_1$ 、 $G_2$ 的节点（代码块）分属于 $\hat V$ 的两个不相交集。
 45 | 
 46 | 使用两个ACFG之间的最小匹配损失计算这两个图之间的相似度，一条匹配边的损失函数记为：
 47 | 
 48 | $$cost(v,\hat v)=\frac{\sum_i\alpha_i|a_i-\hat a_i|}{\sum_i\alpha_i\max(a_i,\hat a_i)}$$
 49 | 
 50 | 其中 $a_i$ 和 $\hat a_i$ 分别表示进行匹配的两个基本块的第 $i$ 个特征， $\alpha$ 为上表中的权重。
 51 | 
 52 | ##### 标准化
 53 | 
 54 | 越复杂的ACFG之间通常会具有越大的匹配损失，因此需要对匹配损失做标准化。设置一个空的ACFG $\phi$ ，则ACFG间的相似度可记为：
 55 | 
 56 | $$\kappa (g_1,g_2)=1-\frac{cost(g_1,g_2)}{\max(cost(g_1,\phi),cost(g_2,\phi))}$$
 57 | 
 58 | ##### 实现
 59 | 
 60 | 实际实现过程中，使用[discovRE](./discovRE.md)的方法计算各个特征的权重。
 61 | 
 62 | #### 聚类
 63 | 
 64 | 使用谱聚类（核化）对ACFG进行聚类。为减少时间开销，对数据集进行了采样；采样时按照ACFG大小进行分级，避免因采样偏差影响聚类效果。
 65 | 
 66 | 聚类数量人工选定为16。
 67 | 
 68 | ### 特征编码
 69 | 
 70 | 定义 $NN(g_i)$ 为ACFG $g_i$ 最近的 $n$ 个聚类簇几何中心，其中 $\mathcal{C}$ 表示“密码本”：
 71 | 
 72 | $$NN(g_i)=\arg \max\limits_{c_j\in \mathcal{C}}\kappa(g_i,c_j)$$
 73 | 
 74 | #### “特征袋”嵌入
 75 | 
 76 | 类比了词袋（bag-of-words）嵌入，嵌入表示为：
 77 | 
 78 | $$q(g_i)=\sum\limits_{g_i:NN(g_i)=c_j}[\mathbb{1}(1=j),\cdots,\mathbb{1}(n=j)]^T$$
 79 | 
 80 | 即将离ACFG最近的 $n$ 个聚类簇几何中心的 one-hot 编码相加
 81 | 
 82 | #### VLAD嵌入
 83 | 
 84 | 在“特征袋”嵌入的基础上额外考虑ACFG到聚类簇几何中心的距离：
 85 | 
 86 | $$q(g_i)=\sum\limits_{g_i:NN(g_i)=c_j}[\mathbb{1}(1=j)\kappa(g_i,c_1),\cdots,\mathbb{1}(n=j)\kappa(g_i,c_n)]^T$$
 87 | 
 88 | 本文使用此类嵌入方法
 89 | 
 90 | ### 在线搜索
 91 | 
 92 | 使用局部敏感哈希对嵌入向量进行哈希，随后使用欧氏距离/余弦距离进行距离计算
 93 | 
 94 | ## 笔者总结
 95 | 
 96 | 算法特点：
 97 | 
 98 | - 在节点属性中增加图结构信息
 99 | - 使用二分图匹配计算图之间的相似性
100 | - 引入“嵌入”思想
101 | 
102 | 可能存在的问题：
103 | 
104 | - 依赖提取出的统计特征和结构特征是否能够全面描述一个控制流图？在二分图匹配过程中直接舍弃了节点间的连接信息，仅依赖提取的特征进行匹配
105 | - 各基本块特征的权重可能为具体某一数据集下的最优结果，泛化性有待考量
106 | - 似乎缺乏语义信息的提取与分析
107 | - 缺乏对于跨架构二进制程序的细节分析以及针对性优化
108 | 


--------------------------------------------------------------------------------
/notes/Graph-based_comparison_of_executable_objects.md:
--------------------------------------------------------------------------------
  1 | # Graph-based comparison of executable objects
  2 | 
  3 | | Target（目标）     | 对比分析两个二进制程序的相同/不同部分                        |
  4 | | :----------------- | :----------------------------------------------------------- |
  5 | | Input（输入）      | 待比较的两个二进制程序                                       |
  6 | | Process（处理）    | 1. 将程序看作多级有向图<br />2.  从程序级开始，不断寻找图中能够匹配的匹配点（fixedpoints），并对匹配点进行扩展<br />3. 当无法再扩展时，细化进入函数级<br />4. 使用相同的方法寻找匹配点并扩展，随后细化进入基本块级重复操作 |
  7 | | Output（输出）     | 两个二进制程序的相同/不同部分                                |
  8 | | Problem（问题）    | 解决的问题：<br />1. 尚未有较为成熟的二进制程序差分算法      |
  9 | | Condition（条件）  | 输入的两个二进制程序确定为相似程序                           |
 10 | | Difficulty（难点） | 设计算法完成从粗粒度到细粒度的二进制差分                     |
 11 | | Level（水平）      | SSTIC2005                                                    |
 12 | 
 13 | ## 基本概念
 14 | 
 15 | ### 二进制文件之间的差异
 16 | 
 17 | 同一源代码编译后的二进制文件之间的差异可能体现在：
 18 | 
 19 | - 调用了不同的寄存器
 20 | - 指令顺序出现变化
 21 | - 分支的判断条件发生调转（后续分支也对应调转）
 22 | 
 23 | ### 有向图的有向图（graph of graphs）
 24 | 
 25 | 将一个程序看作一个有向图（函数调用图，call graph）；其中图的每个节点代表一个函数，又是一个有向图（函数的控制流图）；控制流图中的每个基本块也可看作一个图结构（指令图）。
 26 | 
 27 | ## 算法原理
 28 | 
 29 | ### 基本思路
 30 | 
 31 | 1. 从程序级开始，不断寻找图中能够匹配的匹配点（fixedpoints），并对匹配点进行扩展
 32 | 2. 当无法再扩展时，细化进入函数级
 33 | 3. 使用相同的方法寻找匹配点并扩展，随后细化进入基本块级重复操作
 34 | 
 35 | ### 选择器
 36 | 
 37 | 选择器 $s$ 的输入为一个节点和另一个图中的一组节点，输出为后者中的一个节点或空集
 38 | 
 39 | $$\displaystyle s:A^n\times \mathfrak{B}(B^n)\rightarrow B^n\cup\varnothing$$
 40 | 
 41 | 上式中， $\mathfrak{B}$ 表示[幂集](../concept.md#幂集（power-set）) 。选择器的任务是从一组节点中选出与输入节点最相似的节点。
 42 | 
 43 | #### 论文中使用的选择器
 44 | 
 45 | - 函数调用图（节点为函数）
 46 |   - 函数中包含的基本块数量
 47 |   - CFG中边的数量
 48 |   - 子函数调用数量
 49 |   - 将上述三个特征合成为向量，计算欧氏距离
 50 | - CFG（节点为基本块）
 51 |   - 到函数出口的最短距离
 52 |   - 到函数入口的最短距离
 53 |   - 基本块中的子函数调用数量
 54 | - 基本块级
 55 |   - 到函数出口的最短距离
 56 |   - 到函数入口的最短距离
 57 | 
 58 | ### 属性
 59 | 
 60 | 定义为两个图 $A$ 和 $B$ 到他们节点子集的映射
 61 | 
 62 | $$\pi(A,B)\rightarrow(A'^n,B'^n)$$
 63 | 
 64 | 其作用主要是降低搜索空间的大小
 65 | 
 66 | #### 论文中选取的属性
 67 | 
 68 | - 通用属性
 69 |   - 选择入度/出度为k的节点
 70 |   - 选择循环节点（节点存在指向自己的环路）
 71 | - 函数调用图属性
 72 |   - 相同函数名称
 73 |   - 使用了相同的字符串
 74 |   - 含有相同的“小素数积”（见下文）
 75 | - CFG属性
 76 |   - 包含相同的子函数调用
 77 | 
 78 | ### 同构图获取
 79 | 
 80 | 1. 从 $\pi(A,B)$ 中选取起始点（见下图1）
 81 | 2. 从已匹配点的父/子节点中寻找匹配点（见下图2）
 82 | 
 83 | ![image-20221030222302179](./image/Graph-based_comparison_of_executable_objects/image-20221030222302179.png)
 84 | 
 85 | ![image-20221030222310518](./image/Graph-based_comparison_of_executable_objects/image-20221030222310518.png)
 86 | 
 87 | ### 兼容指令序列变化
 88 | 
 89 | 指令顺序的变化可以等价为下述问题：
 90 | 
 91 | 假设存在两个等长的单词 $a$ 、 $b$ ，需要判断是否存在一个排列 $\sigma$ ，使 $\sigma(a)=b$ 
 92 | 
 93 | 解决方案：使用“小素数积”算法。
 94 | 
 95 | 令 $P_m:=\{3,\dots,\rho_m\}$ 为前m个奇素数，m为构成单词的字母表长度。
 96 | 
 97 | 构造字母到奇素数的一对一映射 $\tau$ ，计算构成单词的各字母映射后的积
 98 | 
 99 | $$\displaystyle \prod_{i=1}^n\tau(a_i)=\prod_{i=1}^n\tau(b_i)$$
100 | 
101 | 若相同则认为存在 $\sigma(a)=b$ 
102 | 
103 | 当n较大时，算法计算复杂度较高，论文对该方面做出了优化，此处不再展开
104 | 
105 | 


--------------------------------------------------------------------------------
/notes/How_Machine_Learning_is_Solving_the_BInary_Function_Similarity_Problem.md:
--------------------------------------------------------------------------------
 1 | # How Machine Learning Is Solving the Binary Function Similarity Problem
 2 | 
 3 | ## 实验设计
 4 | 
 5 | ### 数据集构建
 6 | 
 7 | - 删除了基本块数量小于5的函数
 8 | - 删除了重复函数
 9 | 
10 | ### 任务分类
11 | 
12 | | 任务名称 | 任务中函数一致的部分             | 任务中函数不一致部分                                         |
13 | | -------- | -------------------------------- | ------------------------------------------------------------ |
14 | | XO       | 编译器类型；编译器版本；编译架构 | 优化选项                                                     |
15 | | XC       | 编译架构；程序位数（32/64位）    | 编译器类型；编译器版本；优化选项                             |
16 | | XC+XB    | 编译架构                         | 编译器类型；编译器版本；优化选项；程序位数（32/64位）        |
17 | | XA       | 编译器类型；编译器版本；优化选项 | 编译架构；程序位数（32/64位）                                |
18 | | XA+XO    | 编译器类型；编译器版本           | 编译架构；程序位数（32/64位）；优化选项                      |
19 | | XM       | 无                               | 编译架构；程序位数（32/64位）；优化选项；<br />编译器类型；编译器版本 |
20 | 
21 | ## 实验结果
22 | 
23 | 待阅读对应论文后补充分析
24 | 


--------------------------------------------------------------------------------
/notes/InnerEye.md:
--------------------------------------------------------------------------------
 1 | # InnerEye
 2 | 
 3 | | Target（目标）     | 1. 判断二进制语义相似度<br />2. 判断代码片段是否被包含在另一程序中 |
 4 | | :----------------- | :----------------------------------------------------------- |
 5 | | Input（输入）      | 被查询的二进制代码序列；待搜索二进制库                       |
 6 | | Process（处理）    | 1. 单一架构指令嵌入<br />2. 利用LSTM生成基本块级的嵌入向量<br />3. 最大公共子序列算法评判相似度 |
 7 | | Output（输出）     | 二进制相似度值                                               |
 8 | | Problem（问题）    | 1. 人工选择的特征导致指令间关系的信息损失<br />2. 不支持比函数级更细粒度的相似性分析 |
 9 | | Condition（条件）  | 需要包含基本块级相似度的数据集                               |
10 | | Difficulty（难点） | 1. 提取二进制程序间的语义信息<br />2. 实现跨架构语义间的对比 |
11 | | Level（水平）      | NDSS2019                                                     |
12 | 
13 | ## 算法原理
14 | 
15 | ### 算法原理图
16 | 
17 | ![image-20221110164501342](./image/InnerEye/image-20221110164501342.png)
18 | 
19 | ### 指令嵌入
20 | 
21 | #### word2vec 
22 | 
23 | 使用滑动窗口分析整个句子，最大化如下概率：
24 | 
25 | $$\displaystyle J(w)=\frac{1}{T}\sum_{t=1}^{T}\sum_{w_k\in C_t}(\log P(w_k|w_t))$$
26 | 
27 | 其中：
28 | 
29 | $$\displaystyle P(w_k\in C_t|w_t)=\frac{\exp(\mathbf{w}_t^T\mathbf{w}_k)}{\sum_{w_i\in C_t}\exp(\mathbf{w}_t^T\mathbf{w}_i)}$$
30 | 
31 | 即希望建立嵌入向量与上下文的关系
32 | 
33 | #### 数据集建立
34 | 
35 | 在训练前需要解决OOV（out of vocabulary）问题，因此提前对汇编代码做处理：
36 | 
37 | - 数值常量替代为0，保留正负号
38 | - 字符串替换为`<STR>`
39 | - 函数名称被替换为 FOO
40 | - 其他符号常量被替换为 `<TAG>`
41 | 
42 | #### 嵌入模型训练
43 | 
44 | 为每个架构的汇编代码训练一个嵌入模型
45 | 
46 | ### 基本块嵌入向量生成
47 | 
48 | 使用孪生网络结构使得不同架构下相似的“句子”在经过LSTM提取特征后相似。
49 | 
50 | **需要基本块级的训练数据**
51 | 
52 | ![image-20221111104900669](./image/InnerEye/image-20221111104900669.png)
53 | 
54 | ### 代码片段相似度比较
55 | 
56 | 首先从控制流图中提取代码片段，计算最大公共子序列（LCS）作为相似度值
57 | 
58 | 设 $\Gamma=\{\mathcal{P}_1^t,\dots,\mathcal{P}_n^t\}$ 是从目标程序 $T$ 中提取的代码片段集合，则被查询代码片段 $\mathcal{P}$ 与 $T$ 的相似度可写为：
59 | 
60 | $$\displaystyle\psi(\mathcal{P},T)=\frac{\max_{\mathcal{P}_i^t\in \Gamma}|\mathrm{LCS}(\mathcal{P},\mathcal{P}_i^t)|}{|\mathcal{P}|}$$
61 | 
62 | ## 笔者总结
63 | 
64 | 算法特色：
65 | 
66 | - 使用word2vec提取指令嵌入用于相似度分析
67 | - 指令替换，避免OOV问题
68 | - 算法效率大大提升
69 | 
70 | 可能存在的问题：
71 | 
72 | - 独立生成跨架构分析模型，可能存在跨架构精度不足的问题
73 | - 使用孪生网络“强行”使不同的嵌入表示可比
74 | - 还需思考语句嵌入方法下如何更好解决跨架构问题（中间表示？）


--------------------------------------------------------------------------------
/notes/Multi-MH.md:
--------------------------------------------------------------------------------
  1 | # Multi-MH
  2 | 
  3 | | Target（目标）     | 已知某个bug，在其他二进制文件中检索具有相同问题的函数        |
  4 | | :----------------- | :----------------------------------------------------------- |
  5 | | Input（输入）      | 二进制程序（基本块级）                                       |
  6 | | Process（处理）    | 1. 获取bug函数的指纹<br />2. 将bug函数的指纹与目标程序都转化为中间表示<br />3. 构造基本块级的语义哈希<br />4. 结合控制流图，使用最佳匹配扩展算法进行相似度计算 |
  7 | | Output（输出）     | 与已知漏洞所在函数相近的函数                                 |
  8 | | Problem（问题）    | 解决的问题：<br />1. 依赖源代码进行bug搜索<br />2. 仅能支持单一架构下的搜索【重点】<br />3. 依赖动态调试 |
  9 | | Condition（条件）  | 1. 算法在使用专家优化后的二进制片段（基本块+块间连接）时有最优效果<br />2. 编译器在编译时未使用多种不同的优化策略 |
 10 | | Difficulty（难点） | 寻找支持跨架构分析的特征表示手段                             |
 11 | | Level（水平）      | S&P2015                                                      |
 12 | 
 13 | ## 算法原理
 14 | 
 15 | ### 算法原理图
 16 | 
 17 | ![image-20221019220132563](./image/Multi-MH/image-20221019220132563.png)
 18 | 
 19 | ### bug指纹
 20 | 
 21 | 人工选择与待查询bug强相关的二进制片段作为bug指纹
 22 | 
 23 | ### 构建中间表示
 24 | 
 25 | - 使用VEX（pyvex）提取中间表示
 26 | - 将中间表示输入“Z3”定理证明器，获取可用于输入输出计算的[S-Expression](../concept.md#s-expression)
 27 | 
 28 | ### 采样获取语义特征
 29 | 
 30 | 随机生成向量，向量各元素的取值范围为 $[-1000,1000]$ 
 31 | 
 32 | 将输入长度、输入、输出组合，生成64位的CRC校验值作为输入输出对的表示
 33 | 
 34 | 只用拥有相同输入长度的基本块之间才具有可比性
 35 | 
 36 | #### 优化方案
 37 | 
 38 | -  $a:=b-c$ 与 $a:=c-b$ 之间应当是相似的，但是在输入相同时得到的结果完全相反。为解决这一问题，在构造输入时同时构造置换顺序后的输入。
 39 | 
 40 |   例：输入 $(b=1,c=2)$ 和 $(b=3,c=5)$ 时两个式子的输出分别为 $(-1,-2)$ 和 $(1,2)$ 。增加置换后的输入 $(b=2,c=1),(b=5,c=3)$ 后输出变为 $(-1,-2,1,2)$ 和 $(1,2,-1,-2)$ ，具有一定的相似性
 41 | 
 42 | - 当输入为内存中的值时，默认需要两个输入：内存地址和地址中的值。由于不同架构下的寄存器数量不同，可能导致输入数量不同（a架构使用寄存器作为输入，b架构使用内存中的值作为输入）。为解决这一问题，方法统一将内存中的值处理为单一输入
 43 | 
 44 | ### 语义哈希
 45 | 
 46 | 论文使用[MinHash](../concept.md#MinHash)对输入输出对的CRC校验值进行语义哈希。MinHash中使用的哈希变换函数为仿射哈希变换：
 47 | 
 48 | $$h(x)=(ax+b)\bmod p$$ 
 49 | 
 50 | 为了进一步优化MinHash效果，对上述函数的结果做轮转和异或变换，使得MinHash选中的CRC校验值更倾向于随机：
 51 | 
 52 | $$t(h(x))=rotate(h(x),a)\oplus b$$
 53 | 
 54 | #### 优化方案
 55 | 
 56 | - 为每个基本块按照不同的输入变量个数计算多次MinHash，两个基本块之间的相似度计算公式如下：
 57 | 
 58 |   $$\frac{\sum_is_i\cdot(\omega_i+\omega_i')}{\sum_i(\omega_i+\omega_i')}$$
 59 | 
 60 |   其中 $s_i$ 是输入变量个数为 $i$ 时的相似度， $\omega_i$ 和 $\omega_i'$ 分别为两个基本块中的式子数。这一加权方式用于缓解方法在式子数较少的基本块上产生的偏差
 61 | 
 62 | - 保存每个哈希函数计算后的 $k$ 个最小哈希值，以考虑一个基本块有多个相同匹配的情况，在本文中取 $k=3$
 63 | 
 64 | ### bug指纹匹配
 65 | 
 66 | 使用最佳匹配扩展（Best-Hit-Broadening）算法进行指纹匹配，算法流程：
 67 | 
 68 | 1. 在bug指纹中选择起始点，输入起始点在上一步骤中获得的相似基本块列表
 69 | 2. 选择列表中的一个，将起始点与该基本块设置为已匹配
 70 | 3. 根据CFG，分析已匹配基本块的直接前驱和直接后继，寻找每个基本块在搜索空间中的相似基本块，并计算其相似度
 71 | 4. 选择一个相似度最高的基本块对设置为已匹配
 72 | 5. 重复3-4，直至所有基本块都匹配完成
 73 | 6. 计算bug指纹与匹配后的基本块序列的相似度
 74 | 7. 重复2-6，直至基本块列表中的所有元素均计算完成
 75 | 
 76 | 经实验研究，论文将最初的相似基本块列表设置为200
 77 | 
 78 | ![image-20221021170403764](./image/Multi-MH/image-20221021170403764.png)
 79 | 
 80 | ## 笔者总结
 81 | 
 82 | 算法特点：
 83 | 
 84 | - 首次提出跨架构的二进制搜索方法，利用中间表示作为过渡
 85 | - 可以进行基本块粒度的搜索
 86 | - 使用输入输出对提取了二进制语义信息
 87 | 
 88 | 可能存在的问题：
 89 | 
 90 | - 认为控制流图在跨架构编译后基本不变，基本不考虑不同编译选项对于控制流图结构的影响，对于不同编译选项的鲁棒性不佳
 91 | 
 92 | 
 93 | - 语义特征的采样过程需考虑覆盖率问题，当前覆盖率存疑
 94 | 
 95 | 
 96 | - 可扩展性较弱，对于一种新语言的支持需要人工设计汇编-中间表示的规则。如当前不支持x64
 97 | - 在进行较大函数的匹配时可能会出错
 98 | - 【？】方法对于控制流图的结构十分敏感
 99 | - 目前无法区分未修复/已修复的漏洞
100 | 


--------------------------------------------------------------------------------
/notes/Order_Matters.md:
--------------------------------------------------------------------------------
 1 | # Order Matters: Semantic-Aware Neural Networks for Binary Code Similarity Detection
 2 | 
 3 | | Target（目标）     | 函数级二进制相似度分析                                       |
 4 | | :----------------- | :----------------------------------------------------------- |
 5 | | Input（输入）      | 二进制函数的控制流图                                         |
 6 | | Process（处理）    | 1. 使用BERT进行基本块语义建模<br />2. MPNN对BERT的输出进一步做嵌入，获取整个函数的嵌入<br />3. 提取邻接矩阵，使用CNN提取特征，与2中的特征合并用于相似度分析 |
 7 | | Output（输出）     | 相似度分值                                                   |
 8 | | Problem（问题）    | 解决的问题：<br />1. 未利用节点顺序信息<br />2. 获取相似基本块需要依靠专家经验 |
 9 | | Condition（条件）  | 程序可被正常反编译                                           |
10 | | Difficulty（难点） | 节点顺序信息和结构信息的提取                                 |
11 | | Level（水平）      | AAAI2020                                                     |
12 | 
13 | ## 算法原理
14 | 
15 | ### 算法原理图
16 | 
17 | ![image-20221116144757272](./image/Order_Matters/image-20221116144757272.png)
18 | 
19 | ### 语义特征建模
20 | 
21 | 使用4任务BERT提取语义信息，详细结构如下：
22 | 
23 | ![image-20221116155208628](./image/Order_Matters/image-20221116155208628.png)
24 | 
25 | MLM：Masked language model，提取基本块内的语义信息。在输入层掩盖某个token，尝试在输出层进行预测
26 | 
27 | ANP：Adjacency node prediction，提取图中所有的相邻的基本块，预测随机采样的两个基本块是否相邻
28 | 
29 | BIG：Block inside graph，让模型判断两个节点是否同时存在于同一个图中
30 | 
31 | GC：Graph classification，分类不同平台、架构、优化选项下的图
32 | 
33 | ### 结构特征建模
34 | 
35 | 使用MPNN对BERT的输出进一步做嵌入，获取整个函数的嵌入
36 | 
37 | ### 基本块顺序建模
38 | 
39 | 提取邻接矩阵，使用CNN提取特征
40 | 
41 | 除最后一层外，CNN不使用池化层，确保对于连接信息的提取
42 | 
43 | ## 笔者总结
44 | 
45 | 算法特色：
46 | 
47 | - 综合考虑多方面信息
48 | - 使用CNN分析邻接矩阵，提取连接信息
49 | 
50 | 可能存在的问题：
51 | 
52 | - CNN如何处理输入大小不一致问题？是否会导致对于基本块数量较多的函数分类效果较差？
53 | 


--------------------------------------------------------------------------------
/notes/SAFE.md:
--------------------------------------------------------------------------------
 1 | # SAFE
 2 | 
 3 | | Target（目标）     | 已知某个二进制函数，在其他二进制文件中检索具有与之相似的函数 |
 4 | | :----------------- | :----------------------------------------------------------- |
 5 | | Input（输入）      | 二进制程序（函数级）                                         |
 6 | | Process（处理）    | 1. 使用word2vec构造汇编指令嵌入<br />2. 带注意力机制的双向RNN实现函数嵌入<br />3. 利用函数嵌入实现相似度匹配 |
 7 | | Output（输出）     | 与输入函数相近的函数的列表                                   |
 8 | | Problem（问题）    | 解决问题：<br />1. 先前方法使用人工选择的特征进行嵌入，特征向量存在偏差<br />2. 先前方法特征提取速度慢 |
 9 | | Condition（条件）  | 使用不同模型处理单一架构/跨架构任务                          |
10 | | Difficulty（难点） | 嵌入向量的端到端提取                                         |
11 | | Level（水平）      | DIMVA2019                                                    |
12 | 
13 | ## 算法原理
14 | 
15 | ### 算法原理图
16 | 
17 | ![image-20221115094019450](./image/SAFE/image-20221115094019450.png)
18 | 
19 | ### 汇编指令嵌入（i2v）
20 | 
21 | 预处理：将汇编指令中所有内存都替换为MEM；将大于5000的立即数都替换为5000
22 | 
23 | 使用 word2vec 进行嵌入
24 | 
25 | ### 函数嵌入
26 | 
27 | 将指令嵌入输入带注意力机制的双向RNN，提取函数嵌入
28 | 
29 | 使用孪生网络进行训练
30 | 
31 | ![image-20221115095432970](./image/SAFE/image-20221115095432970.png)


--------------------------------------------------------------------------------
/notes/TEDEM.md:
--------------------------------------------------------------------------------
 1 | # TEDEM——Tree Edit Distance based Equational Matching
 2 | 
 3 | | Target（目标）     | 已知某个bug，在其他二进制文件中检索具有相同问题的代码片段    |
 4 | | :----------------- | :----------------------------------------------------------- |
 5 | | Input（输入）      | 二进制程序（分析粒度为基本块级）                             |
 6 | | Process（处理）    | 1. 预处理：反编译二进制程序，提取表达式树<br />2. 使用基本块的统计特征进行初步筛选<br />3. 使用树编辑距离对初筛结果进行相似度分析<br />4. 使用贪婪算法对bug指纹（CFG）进行匹配 |
 7 | | Output（输出）     | 与输入函数相近的函数的列表                                   |
 8 | | Problem（问题）    | 解决的问题：<br />1. 仅基于结构的相似度分析方法无法区分结构相似，语义不同的二进制代码<br />2. 函数级的搜索方法在处理较小的代码片段时效果较差 |
 9 | | Condition（条件）  | 1. 程序未被混淆<br />2. 搜索空间仅包含单一架构下的二进制程序 |
10 | | Difficulty（难点） | 提取二进制代码的语义信息并做相似度比较                       |
11 | | Level（水平）      | ACSAC2014                                                    |
12 | 
13 | ## 算法原理
14 | 
15 | ### 算法原理图
16 | 
17 | ![image-20221026145548233](./image/TEDEM/image-20221026145548233.png)
18 | 
19 | ### 语义提取
20 | 
21 | 将基本块的指令序列转化为符号式，不考虑式子变量之间的前后联系。
22 | 
23 | - 使用[METASM](https://link.springer.com/article/10.1007/s11416-009-0126-4)（此处提供论文链接，不详细展开）的方法提取中间表示，并对中间表示进行化简
24 | - 将符号式转化为[S-expression](../concept.md#s-expression)格式，方便后续利用树结构进行分析处理
25 | 
26 | 
27 | ### bug指纹匹配
28 | 
29 | bug指纹定义为存在漏洞函数CFG的一部分，仅包含漏洞的关键特征。
30 | 
31 | #### 基本块相似度比较
32 | 
33 | 将符号式转化为树结构，其中，根节点为赋值或跳转；叶子节点为寄存器或内存地址；中间节点为运算符。
34 | 
35 | 使用树编辑距离判别符号式之间的相似程度
36 | 
37 | #### 候选项筛选/搜索
38 | 
39 | 1. 在bug指纹中选择具有典型特征的基本块（统计特征与其他基本块重合的概率小于 $t$ ）作为搜索起始点
40 | 2. 使用粗粒度统计特征（符号式数量、树的深度、树的节点数量等）进行初步筛选
41 | 3. 论文设定 $t=5\%$ , 候选基本块数量（用于下一步）为20
42 | 
43 | #### 邻居节点扩展
44 | 
45 | 利用CFG，使用与[Multi-MH](./Multi-MH.md#bug指纹匹配)基本相同的贪婪算法进行邻居节点扩展，此处不再重复记录。
46 | 
47 | ## 笔者总结
48 | 
49 | 算法特点
50 | 
51 | - 将基本块转化为树结构进行相似度计算
52 | - 考虑了二进制程序的语义信息
53 | 
54 | 可能存在的问题
55 | 
56 | - 无法处理误报，即当搜索空间中没有相似的指纹时，依然会输出匹配结果（不会显示“没有匹配项”）
57 | - 算法时间复杂度高
58 | - 要求匹配项的控制流图基本完全相同
59 | 
60 | 


--------------------------------------------------------------------------------
/notes/Tracy.md:
--------------------------------------------------------------------------------
 1 | # Tracy
 2 | 
 3 | | Target（目标）     | 已知某个二进制函数，在其他二进制文件中检索具有与之相似的函数 |
 4 | | :----------------- | :----------------------------------------------------------- |
 5 | | Input（输入）      | 待搜索二进制代码库；已知二进制函数                           |
 6 | | Process（处理）    | 1. 将二进制程序分解为片段<br />2. 使用重写规则判断片段之间的距离 |
 7 | | Output（输出）     | 与已知二进制函数相似的函数列表                               |
 8 | | Problem（问题）    | 尚未有较为成熟的二进制搜索算法 |
 9 | | Condition（条件）  | 1. 程序未被混淆<br />2. 需要部分debug信息<br />3. 要求被分析的函数至少有100个基本块 |
10 | | Difficulty（难点） | 评判二进制程序片段的相似度 |
11 | | Level（水平）      | PLDI2014                                                     |
12 | 
13 | ## 算法原理
14 | 
15 | ### 预处理
16 | 
17 | - 使用被调用函数的名称替换被调用函数的偏移地址
18 | - 使用全局变量的具体值替代全局变量的起始地址
19 | - 使用导入表中的变量名替代变量偏移量
20 | 
21 | ### 将二进制程序分解为片段
22 | 
23 | 从程序控制流图中，提取连续的k个基本块作为片段（提取全部能够拆分出的片段）
24 | 
25 | 提取片段的汇编指令，删除其中的跳转（因为在提取过程中已经指定了），作为片段之间距离计算的输入
26 | 
27 | ### 计算二进制片段之间的距离
28 | 
29 | 使用编辑距离评估片段相似度，将距离计算问题转化为重写问题，即计算需要花费多少代价才能从片段一修改为片段二
30 | 
31 | #### 指令对齐
32 | 
33 | 使用“最长公共子序列”算法完成相似指令的对齐，详见下一小节。
34 | 
35 | #### 相似度计算
36 | 
37 | $$sim(c,c')=\begin{cases}2+|\{i\mid args(c)[i]=args(c')[i]\}|, & SameKind(c,c') \\\\ -1, & otherwise\end{cases}$$
38 | 
39 | 相似度计算伪代码如下图所示
40 | 
41 | ![image-20221108101452921](./image/Tracy/image-20221108101452921.png)
42 | 
43 | 该算法可以得到三类结果：
44 | 
45 | - 匹配指令的集合（实现指令对齐）
46 | - 两个片段的相似度
47 | - 增加/删除的指令
48 | 
49 | #### 相似度值归一化
50 | 
51 | 包含两种归一化方法：
52 | 
53 | containment： $S/\min(RIdent,TIdent)$
54 | 
55 | ratio： $(S*2)/(RIdent+TIdent)$ 
56 | 
57 | ### 使用重写引擎进一步提升效果
58 | 
59 | ![image-20221108102940405](./image/Tracy/image-20221108102940405.png)
60 | 
61 | 将目标片段中的寄存器、偏移地址等使用变量替代（上图中中间部分）
62 | 
63 | 使用solver尝试对代码片段进一步匹配
64 | 
65 | 匹配完毕后重新计算相似度分数
66 | 
67 | 


--------------------------------------------------------------------------------
/notes/discovRE.md:
--------------------------------------------------------------------------------
  1 | # discovRE
  2 | 
  3 | | Target（目标）     | 已知某个bug，在其他二进制文件中检索具有相同问题的函数        |
  4 | | :----------------- | :----------------------------------------------------------- |
  5 | | Input（输入）      | 待搜索二进制代码库；已知漏洞（bug）                         |
  6 | | Process（处理）    | 1. 提取算法统计特征，使用knn进行初步筛选<br />2. 通过求解最大公共子图计算两个控制流图之间的相似度 |
  7 | | Output（输出）     | 与已知漏洞所在函数相近的函数 |
  8 | | Problem（问题）    | 解决的问题：<br />1. 现有方法的搜索条件较为苛刻，不支持跨架构搜索<br />2. 搜索效率低 |
  9 | | Condition（条件）  | 1. 程序可被正常反编译<br />2. 程序编译过程中未使用函数内联（function inlining） |
 10 | | Difficulty（难点） | 1. 自行构建数据集<br />2. 分析数据，提出trick进一步优化准确率/搜索速度 |
 11 | | Level（水平）      | NDSS2016                                                     |
 12 | 
 13 | ## 算法原理
 14 | 
 15 | ### 算法原理图
 16 | 
 17 | ![image-20221018195211933](./image/discovRE/image-20221018195211933.png)
 18 | 
 19 | ### 数据准备
 20 | 
 21 | 选择7个开源项目，在不同平台（Windows、Linux），不同架构（x86、x64、ARM），不同编译器（GCC、CL、ICC、VC），不同编译器优化选项下进行编译。
 22 | 
 23 | 优化选项中排除了函数内联（function inlining），即取消部分小型函数的函数调用，直接将函数体并入上级函数中。
 24 | 
 25 | 数据集中剔除了编译选项不同，但是编译结果相同的重复项；剔除了统计和结构特征完全相同的重复函数。
 26 | 
 27 | ### 统计特征选择
 28 | 
 29 | 基本思想：使用一组统计特征来描述一个二进制程序。
 30 | 
 31 | 统计特征基本要求：1. 该值在不同编译器/编译选项下应当基本保持不变；2. 该值应当在一个较大的范围内分布
 32 | 
 33 | 下图中灰色部分为算法中使用的统计特征
 34 | 
 35 | ![image-20221019201526215](./image/discovRE/image-20221019201526215.png)
 36 | 
 37 | ### 统计相似度分析
 38 | 
 39 | 首先根据统计特征对二进制函数相似性做初步筛选
 40 | 
 41 | - 在进行筛选前对统计特征进行标准化
 42 | - 使用knn（k-d树版本）对候选函数进行初筛
 43 | 
 44 | ### 结构相似度分析
 45 | 
 46 | #### 块属性特征
 47 | 
 48 | 在提取的控制流图基础上为每个节点（基本块）添加属性特征，使用求解最大公共子图（maximum common subgraph，MCS）的方法计算相似度。
 49 | 
 50 | 添加的属性信息如下图所示，定义属性间差异 $d_{BB}$ ，其中 $c_{if}$ 表示函数 $f$ 的第 $i$ 个属性特征：
 51 | 
 52 | $$d_{BB}=\frac{\sum \alpha_i |c_{if}-c_{ig}|}{\sum \alpha_i \max(c_{if},c_{ig})}$$
 53 | 
 54 | ![image-20221019162340206](./image/discovRE/image-20221019162340206.png)
 55 | 
 56 | 各属性权重的确定源自下述优化问题：
 57 | 
 58 | $$\max (d_{BB}(f_i,g_j)-d_{BB}(f_i,f_j))$$
 59 | 
 60 | 即不同块之间的差异尽可能大，相同块（同一代码在不同编译器下的编译结果）之间的差异尽可能小
 61 | 
 62 | 论文使用遗传算法求解该问题
 63 | 
 64 | #### 距离计算
 65 | 
 66 | 传统的MCS距离计算方法可表示为：
 67 | 
 68 | $$d_{mcs.orig}(G_1,G_2)=1-\frac{|mcs(G_1,G_2)|}{\max(|G_1|,|G_2|)}$$
 69 | 
 70 | 论文方法在此基础上增加了属性特征损失项：
 71 | 
 72 | $$d_{mcs}(G_1,G_2)=1-\frac{|mcs(G_1,G_2)|-\sum d_{BB}(b_i,b_j)}{\max(|G_1|,|G_2|)}$$
 73 | 
 74 | 求解最大公共子图的问题是一个NP完全问题，为节约时间开销，有必要在一定时间后终止计算，使用当前近似解替代最终结果。下图为求解最大公共子图的迭代次数与获得结果之间的关系。
 75 | 
 76 | 经作者研究，使用 $16\max (|G_1|,|G_2|)$ 次迭代后的近似解，与经过10000次迭代后的最终结果有99.11%的概率相同，因此可提前结束迭代节约时间。
 77 | 
 78 | ![](./image/discovRE/image-20221019171324901.png)
 79 | 
 80 | 此外，控制流图的大小将直接影响MCS距离（见下图），作者发现相同函数的平均MCS距离与控制流图大小的关系近似为对数关系（左图中实线为实际拟合的关系曲线，虚线为使用对数函数拟合的关系曲线）。为了弥补控制流图大小带来的相似度差异，进一步修改距离计算方法：
 81 | 
 82 | $$d_{mcs\_comp}(G_1,G_2)=\frac{d_{mcs}(G_1,G_2)}{comp(G_1,G_2)}$$
 83 | 
 84 | $$comp(G_1,G_2)=i+k\log(\max(|G_1|,|G_2|))$$
 85 | 
 86 | ![image-20221019171737100](./image/discovRE/image-20221019171737100.png)
 87 | 
 88 | ## 笔者总结
 89 | 
 90 | 算法特点：
 91 | 
 92 | - 研究并使用了许多小trick以提高准确率/搜索速度
 93 | - 利用统计特征进行初筛，结构特征进行精确查找
 94 | 
 95 | 可能存在的问题：
 96 | 
 97 | - 将控制流图直接作为跨架构二进制相似度分析的中间表示，实际上同一程序在不同架构下的控制流图是有部分差异的
 98 | - 各基本块特征的权重可能为具体某一数据集下的最优结果，泛化性有待考量
 99 | - 无法应对混淆
100 | 


--------------------------------------------------------------------------------
/notes/image/A_Survey_of_Binary_Code_Similarity/image-20221014112852989.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/A_Survey_of_Binary_Code_Similarity/image-20221014112852989.png


--------------------------------------------------------------------------------
/notes/image/A_Survey_of_Binary_Code_Similarity/image-20221015210819269.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/A_Survey_of_Binary_Code_Similarity/image-20221015210819269.png


--------------------------------------------------------------------------------
/notes/image/Asm2Vec/image-20221031200624966.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/Asm2Vec/image-20221031200624966.png


--------------------------------------------------------------------------------
/notes/image/Asm2Vec/image-20221031203211929.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/Asm2Vec/image-20221031203211929.png


--------------------------------------------------------------------------------
/notes/image/Asm2Vec/image-20221031204510624.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/Asm2Vec/image-20221031204510624.png


--------------------------------------------------------------------------------
/notes/image/BLEX/image-20221027222637730.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/BLEX/image-20221027222637730.png


--------------------------------------------------------------------------------
/notes/image/BLEX/image-20221027222646562.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/BLEX/image-20221027222646562.png


--------------------------------------------------------------------------------
/notes/image/BLEX/image-20221027222821612.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/BLEX/image-20221027222821612.png


--------------------------------------------------------------------------------
/notes/image/BLEX/image-20221027223048621.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/BLEX/image-20221027223048621.png


--------------------------------------------------------------------------------
/notes/image/BLEX/image-20221027223600635.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/BLEX/image-20221027223600635.png


--------------------------------------------------------------------------------
/notes/image/BLEX/image-20221028113440022.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/BLEX/image-20221028113440022.png


--------------------------------------------------------------------------------
/notes/image/BinGo/image-20221108150148825.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/BinGo/image-20221108150148825.png


--------------------------------------------------------------------------------
/notes/image/BinGo/image-20221108150551485.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/BinGo/image-20221108150551485.png


--------------------------------------------------------------------------------
/notes/image/BinGo/image-20221108150610480.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/BinGo/image-20221108150610480.png


--------------------------------------------------------------------------------
/notes/image/BinGo/image-20221108155628263.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/BinGo/image-20221108155628263.png


--------------------------------------------------------------------------------
/notes/image/BinGo_E/image-20221111230259570.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/BinGo_E/image-20221111230259570.png


--------------------------------------------------------------------------------
/notes/image/BinGo_E/image-20221114154606899.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/BinGo_E/image-20221114154606899.png


--------------------------------------------------------------------------------
/notes/image/BinGo_E/image-20221114155853617.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/BinGo_E/image-20221114155853617.png


--------------------------------------------------------------------------------
/notes/image/BinHunt/image-20221110155012164.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/BinHunt/image-20221110155012164.png


--------------------------------------------------------------------------------
/notes/image/CoP/image-20221111163317832.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/CoP/image-20221111163317832.png


--------------------------------------------------------------------------------
/notes/image/Gemini/image-20221025200017843.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/Gemini/image-20221025200017843.png


--------------------------------------------------------------------------------
/notes/image/Gemini/image-20221025202453422.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/Gemini/image-20221025202453422.png


--------------------------------------------------------------------------------
/notes/image/Gemini/image-20221025204716133.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/Gemini/image-20221025204716133.png


--------------------------------------------------------------------------------
/notes/image/Genius/image-20221017104002392.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/Genius/image-20221017104002392.png


--------------------------------------------------------------------------------
/notes/image/Genius/image-20221017172441347.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/Genius/image-20221017172441347.png


--------------------------------------------------------------------------------
/notes/image/Graph-based_comparison_of_executable_objects/image-20221030221707934.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/Graph-based_comparison_of_executable_objects/image-20221030221707934.png


--------------------------------------------------------------------------------
/notes/image/Graph-based_comparison_of_executable_objects/image-20221030221916849.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/Graph-based_comparison_of_executable_objects/image-20221030221916849.png


--------------------------------------------------------------------------------
/notes/image/Graph-based_comparison_of_executable_objects/image-20221030222302179.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/Graph-based_comparison_of_executable_objects/image-20221030222302179.png


--------------------------------------------------------------------------------
/notes/image/Graph-based_comparison_of_executable_objects/image-20221030222310518.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/Graph-based_comparison_of_executable_objects/image-20221030222310518.png


--------------------------------------------------------------------------------
/notes/image/InnerEye/image-20221110164501342.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/InnerEye/image-20221110164501342.png


--------------------------------------------------------------------------------
/notes/image/InnerEye/image-20221111104900669.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/InnerEye/image-20221111104900669.png


--------------------------------------------------------------------------------
/notes/image/InnerEye/image-20221111105534404.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/InnerEye/image-20221111105534404.png


--------------------------------------------------------------------------------
/notes/image/Multi-MH/image-20221019220132563.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/Multi-MH/image-20221019220132563.png


--------------------------------------------------------------------------------
/notes/image/Multi-MH/image-20221021170403764.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/Multi-MH/image-20221021170403764.png


--------------------------------------------------------------------------------
/notes/image/Order_Matters/image-20221116144757272.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/Order_Matters/image-20221116144757272.png


--------------------------------------------------------------------------------
/notes/image/Order_Matters/image-20221116155208628.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/Order_Matters/image-20221116155208628.png


--------------------------------------------------------------------------------
/notes/image/SAFE/image-20221115094019450.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/SAFE/image-20221115094019450.png


--------------------------------------------------------------------------------
/notes/image/SAFE/image-20221115095432970.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/SAFE/image-20221115095432970.png


--------------------------------------------------------------------------------
/notes/image/TEDEM/image-20221026145548233.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/TEDEM/image-20221026145548233.png


--------------------------------------------------------------------------------
/notes/image/Tracy/image-20221108101452921.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/Tracy/image-20221108101452921.png


--------------------------------------------------------------------------------
/notes/image/Tracy/image-20221108102940405.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/Tracy/image-20221108102940405.png


--------------------------------------------------------------------------------
/notes/image/discovRE/image-20221018195211933.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/discovRE/image-20221018195211933.png


--------------------------------------------------------------------------------
/notes/image/discovRE/image-20221019162340206.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/discovRE/image-20221019162340206.png


--------------------------------------------------------------------------------
/notes/image/discovRE/image-20221019171324901.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/discovRE/image-20221019171324901.png


--------------------------------------------------------------------------------
/notes/image/discovRE/image-20221019171737100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/discovRE/image-20221019171737100.png


--------------------------------------------------------------------------------
/notes/image/discovRE/image-20221019201526215.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vict0rShen/binary-similarity-learning/61cf50186d0703ead416e2e516bcd20a15ec765a/notes/image/discovRE/image-20221019201526215.png


--------------------------------------------------------------------------------