├── .gitmodules ├── LICENSE ├── README.md ├── __init__.py ├── act ├── __init__.py ├── act.py ├── actionfactory.py ├── actiontype.py ├── asbuilder.py ├── astbuilder.py ├── bcbuilder.py ├── cluster.py ├── featureextractor.py ├── getstatistics.py ├── pdgbuilder.py ├── queryinconsistency.py └── sampledownloader.py ├── argsparser.py ├── arguments.py ├── iBench └── groundtruth.py ├── install.sh ├── learning ├── __init__.py ├── clustering.py ├── graph2vec │ ├── __init__.py │ ├── corpus_parser.py │ ├── graph2vec.py │ ├── parallelgraph2vec.py │ ├── skipgram.py │ ├── train_utils.py │ └── utils.py ├── graphkernel │ ├── __init__.py │ └── weisfeiler_lehman.py ├── node2vec │ ├── __init__.py │ ├── main.py │ └── node2vec.py ├── similarity.py ├── statistics.py └── struc2vec │ ├── algorithms.py │ ├── algorithms_distances.py │ ├── graph.py │ ├── main.py │ ├── struc2vec.py │ └── utils.py ├── requirements.txt ├── sample ├── __init__.py ├── astfile.py ├── bitcodefile.py ├── languagetype.py ├── projectcode.py ├── slicer.py └── sourcefile.py ├── scripts ├── get_inconsistencies.sh ├── get_inconsistencies_NN_G2v.sh ├── get_inconsistencies_g2v.sh ├── get_inconsistencies_real_programs.sh └── get_inconsistencies_real_programs_NN_G2v.sh ├── settings-bak.py ├── settings.py ├── ssh_private_key_password.py └── utils ├── __init__.py ├── computation.py ├── inout.py └── progress.py /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "dg"] 2 | path = dg 3 | url = https://github.com/ManSoSec/dg 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FICS 2 | 3 | 4 | # Download & configure FICS 5 | 6 | 1. Clone the repository 7 | - For example: mkdir /home/mansour/code 8 | - cd /home/mansour/code 9 | - ```git clone --recurse-submodules https://github.com/RiS3-Lab/FICS.git``` 10 | - cd FICS 11 | 2. ```sh install.sh``` 12 | 3. create a directory as the root of your data (e.g., source code, bitcodes, graphs, etc.) 
13 | - For example: mkdir /home/mansour/data 14 | - cd /home/mansour/data 15 | - create a directory inside and name it 'projects': mkdir projects 16 | - cd /home/mansour/data/projects 17 | 4. Modify settings.py and update DATA_DIR to the root of your data 18 | - For example: DATA_DIR = '/home/mansour/data' 19 | 20 | # Prepare target codebase 21 | 22 | 5. In the "projects" directory, clone the source code of a codebase you target: 23 | - For example: git clone https://gitlab.com/libtiff/libtiff.git libtiff-19f6b70 24 | - cd libtiff-19f6b70 25 | - git checkout 19f6b70 . 26 | 6. Compile the project with clang-3.8 and generate a compilation database (FICS only supports clang 3.8 and llvm 3.8) 27 | - For example: cmake -D CMAKE_C_COMPILER="/usr/bin/clang-3.8" -D CMAKE_CXX_COMPILER="/usr/bin/clang++-3.8" . 28 | - generate the compilation database: bear make 29 | 30 | # Discover the inconsistencies 31 | 32 | 7. Run FICS on the target codebase: 33 | - For example: ```sh scripts/get_inconsistencies_real_programs_NN_G2v.sh libtiff-19f6b70 p ns``` 34 | - If you need to run FICS on larger projects like QEMU, change 'ns' to 's' so that FICS splits the codebase into submodules 35 | - *The inconsistencies are saved in MongoDB* 36 | 37 | # Query the found inconsistencies!!! 38 | 8. To query the saved inconsistencies, you need to run the following command: 39 | - ```python __init__.py -a=QI -p=libtiff-19f6b70 -it=check -f``` 40 | - "-it" argument is the inconsistency type and can be: check | call | type | store | order | all 41 | - If you need to disable filtering, just remove -f 42 | 43 | # Here is the list of bugs found by FICS 44 | 45 | | Bug | Link | 46 | | ------------- | ------------- | 47 | | Codebase | OpenSSL | 48 | | Missing check | [Report/Patch](https://github.com/openssl/openssl/issues/7650) | 49 | | Missing check | [Patch](https://github.com/openssl/openssl/pull/7427)| 50 | | Wrong use of clear_free | [Report/Patch](https://github.com/openssl/openssl/issues/10406)| 51 | | Null dereference | [Report/Patch](https://github.com/openssl/openssl/issues/10404)| 52 | | Null dereference | [Report/Patch](https://github.com/openssl/openssl/issues/10405)| 53 | | Inconsistent Check | [Report/Patch](https://github.com/openssl/openssl/pull/7880)| 54 | | Memory Leak | [Report/Patch](https://github.com/openssl/openssl/issues/10294)| 55 | | Missing clear_free | [Report/Patch](https://github.com/openssl/openssl/issues/7657)| 56 | | Codebase | QEMU | 57 | | 2 Missing checks | [Report/Patch](https://patchew.org/QEMU/20200414133052.13712-1-philmd@redhat.com/20200414133052.13712-11-philmd@redhat.com/) | 58 | | Undefined Behaviour | [Report](https://lists.gnu.org/archive/html/qemu-devel/2020-03/msg05749.html)/[Patch](https://patchwork.kernel.org/patch/11446203/) | 59 | | Uninitialized variable | [Report/Patch](https://lists.gnu.org/archive/html/qemu-trivial/2020-03/msg00239.html) | 60 | | Codebase | LibTIFF | 61 | | Missing checks | [Patch](https://gitlab.com/libtiff/libtiff/-/merge_requests/96) 62 | | Mislocated check - Bad casting | [Report/Patch](https://gitlab.com/libtiff/libtiff/-/issues/162)| 63 | | Missing TIFFClose | [Report/Patch](https://gitlab.com/libtiff/libtiff/-/issues/171) 64 | | Codebase | wolfSSL | 65 | | Missing check | [Report/Patch](https://github.com/wolfSSL/wolfssl/issues/2038) | 66 | | Missing check | [Report/Patch](https://github.com/wolfSSL/wolfssl/issues/2037)| 67 | | Memory exhaustion | [Report/Patch](https://github.com/wolfSSL/wolfssl/issues/2527)| 68 | | Codebase | OpenSSH | 69 | | Missing bzero | 
[Patch](https://github.com/openssh/openssh-portable/commit/2d1428b11c8b6f616f070f2ecedce12328526944)| 70 | | Codebase | libredwg | 71 | | Bad casting (Overflow) | [Report](https://github.com/LibreDWG/libredwg/issues/174)/[Patch](https://github.com/LibreDWG/libredwg/commit/631bbacb3e18403db1015ef4063c3d19e9c8e11a) | 72 | | Null dereference | [Report](https://github.com/LibreDWG/libredwg/issues/172)/[Patch](https://github.com/LibreDWG/libredwg/commit/373c8e4849f2013d7123913bca8edb35ff6bc3d6) | 73 | | Null dereference | [Report](https://github.com/LibreDWG/libredwg/issues/173)/[Patch](https://github.com/LibreDWG/libredwg/commit/373c8e4849f2013d7123913bca8edb35ff6bc3d6) | 74 | | Codebase | TCPdump | 75 | | Missing initialization | [Report](https://github.com/the-tcpdump-group/tcpdump/issues/801) | 76 | 77 | # Citation 78 | 79 | If you found FICS useful for your research, please cite the following paper: 80 | 81 | ```Latex 82 | @inproceedings{fics, 83 | abstract = { 84 | Probabilistic classification has shown success in detecting known types of software bugs. However, the works following this approach tend to require a large amount of specimens to train their models. We present a new machine learning-based bug detection technique that does not require any external code or samples for training. Instead, our technique learns from the very codebase on which the bug detection is performed, and therefore, obviates the need for the cumbersome task of gathering and cleansing training samples (e.g., buggy code of certain kinds). The key idea behind our technique is a novel two-step clustering process applied on a given codebase. This clustering process identifies code snippets in a project that are functionally-similar yet appear in inconsistent forms. Such inconsistencies are found to cause a wide range of bugs, anything from missing checks to unsafe type conversions. Unlike previous works, our technique is generic and not specific to one type of inconsistency or bug. We prototyped our technique and evaluated it using 5 popular open source software, including QEMU and OpenSSL. With a minimal amount of manual analysis on the inconsistencies detected by our tool, we discovered 22 new unique bugs, despite the fact that many of these programs are constantly undergoing bug scans and new bugs in them are believed to be rare. 
85 | }, 86 | author = {Ahmadi, Mansour and Mirzazade Farkhani, Reza and Williams, Ryan and Lu, Long}, 87 | booktitle = {Proceedings of the 30th USENIX Security Symposium}, 88 | month = {August}, 89 | series = {USENIX Security'21}, 90 | title = {Finding Bugs Using Your Own Code: Detecting Functionally-similar yet Inconsistent Code}, 91 | year = {2021} 92 | } 93 | ``` 94 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from act.actionfactory import ActionFactory 4 | from argsparser import ArgsParser 5 | 6 | if __name__ == '__main__': 7 | reload(sys) 8 | sys.setdefaultencoding('utf8') 9 | args_parser = ArgsParser() 10 | args_parser.parse() 11 | args_parser.do_basic_checks() 12 | ActionFactory(args_parser.arguments).perform_actions() 13 | -------------------------------------------------------------------------------- /act/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiS3-Lab/FICS/82c8abef52ca943946b7e82a16998cf67f1d2049/act/__init__.py -------------------------------------------------------------------------------- /act/act.py: -------------------------------------------------------------------------------- 1 | class Act: 2 | 3 | def __init__(self, arguments): 4 | self.arguments = arguments 5 | 6 | def start(self): 7 | pass 8 | -------------------------------------------------------------------------------- /act/actionfactory.py: -------------------------------------------------------------------------------- 1 | from asbuilder import ASBuilder 2 | from pdgbuilder import PDGBuilder 3 | from bcbuilder import BCBuilder 4 | from cluster import Cluster 5 | from actiontype import ActionType 6 | from featureextractor import FeatureExtractor 7 | from astbuilder import ASTBuilder 8 | from getstatistics import GetStatistics 9 | from queryinconsistency import QueryInconsistency 10 | 11 | 12 | class ActionFactory: 13 | 14 | def __init__(self, arguments): 15 | self.arguments = arguments 16 | self.action = None 17 | 18 | def perform_actions(self): 19 | 20 | for action in self.arguments.actions: 21 | if action == ActionType.AST.name: 22 | print '=======================================' 23 | print '| Retrieve ASTs from the source codes |' 24 | print '=======================================' 25 | self.start(ASTBuilder(arguments=self.arguments)) 26 | elif action == ActionType.BC.name: 27 | print '=======================================' 28 | print '| Retrieve bc from the source codes |' 29 | print '=======================================' 30 | self.start(BCBuilder(arguments=self.arguments)) 31 | elif action == ActionType.PDG.name: 32 | print '=======================================' 33 | print '| Retrieve PDG from the LLVM bitcodes |' 34 | print '=======================================' 35 | self.start(PDGBuilder(arguments=self.arguments)) 36 | elif action == ActionType.AS.name: 37 | print '=======================================' 38 | print '| Extract Abstract Slices from PDG |' 39 | print '=======================================' 40 | self.start(ASBuilder(arguments=self.arguments)) 41 | elif action == ActionType.FE.name: 42 | print '=======================================' 43 | print '| Extract features |' 44 | print '=======================================' 45 | self.start(FeatureExtractor(arguments=self.arguments)) 46 | elif action == ActionType.MC.name: 47 | print 
'=======================================' 48 | print '| Cluster samples |' 49 | print '=======================================' 50 | self.start(Cluster(arguments=self.arguments)) 51 | elif action == ActionType.ST.name: 52 | print '=======================================' 53 | print '| Print Clusters stats |' 54 | print '=======================================' 55 | self.start(GetStatistics(arguments=self.arguments)) 56 | elif action == ActionType.QI.name: 57 | print '=======================================' 58 | print '| Query Inconsistencies |' 59 | print '=======================================' 60 | self.start(QueryInconsistency(arguments=self.arguments)) 61 | 62 | def start(self, action): 63 | action.start() 64 | 65 | -------------------------------------------------------------------------------- /act/actiontype.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class ActionType(Enum): 5 | AST = 'Retrieve Abstract Syntax Tree' 6 | BC = 'Retrieve bitcode' 7 | PDG = 'Retrieve Program Dependence Graph' 8 | AS = 'Extract Abstract Forward Slices' 9 | FE = 'Feature Extraction' 10 | MC = 'Model Construction' 11 | ST = 'Get cluster statistics' 12 | QI = 'Query inconsistencies' 13 | 14 | @staticmethod 15 | def get_names(): 16 | return [e.name for e in ActionType] 17 | 18 | @staticmethod 19 | def get_detail(): 20 | return ['{}: {}'.format(e.name, e.value) for e in ActionType] 21 | -------------------------------------------------------------------------------- /act/asbuilder.py: -------------------------------------------------------------------------------- 1 | 2 | from act import Act 3 | from sample.projectcode import ProjectCode 4 | from utils.inout import * 5 | 6 | 7 | class ASBuilder(Act): 8 | 9 | def start(self): 10 | projects_dir = join_path(self.arguments.data_dir, self.arguments.projects_dir) 11 | dir_names = get_directories(projects_dir) 12 | for dir_name in dir_names: 13 | for project_name in self.arguments.projects: 14 | if get_basename(dir_name) == project_name or len(self.arguments.projects) == 0: 15 | print 'Analyzing {}'.format(get_basename(dir_name)) 16 | project_code = ProjectCode(project_dir=dir_name, arguments=self.arguments) 17 | project_code.retrieve_as() 18 | -------------------------------------------------------------------------------- /act/astbuilder.py: -------------------------------------------------------------------------------- 1 | 2 | from act import Act 3 | from sample.projectcode import ProjectCode 4 | from utils.inout import * 5 | 6 | 7 | class ASTBuilder(Act): 8 | 9 | def start(self): 10 | projects_dir = join_path(self.arguments.data_dir, self.arguments.projects_dir) 11 | dir_names = get_directories(projects_dir) 12 | for dir_name in dir_names: 13 | for project_name in self.arguments.projects: 14 | if get_basename(dir_name) == project_name or len(self.arguments.projects) == 0: 15 | print 'Analyzing {}'.format(get_basename(dir_name)) 16 | project_code = ProjectCode(project_dir=dir_name, arguments=self.arguments) 17 | project_code.retrieve_ast() 18 | -------------------------------------------------------------------------------- /act/bcbuilder.py: -------------------------------------------------------------------------------- 1 | 2 | from act import Act 3 | from sample.projectcode import ProjectCode 4 | from utils.inout import * 5 | 6 | 7 | class BCBuilder(Act): 8 | 9 | def start(self): 10 | projects_dir = join_path(self.arguments.data_dir, self.arguments.projects_dir) 11 | dir_names = 
get_directories(projects_dir) 12 | for dir_name in dir_names: 13 | for project_name in self.arguments.projects: 14 | if get_basename(dir_name) == project_name or len(self.arguments.projects) == 0: 15 | print 'Analyzing {}'.format(get_basename(dir_name)) 16 | project_code = ProjectCode(project_dir=dir_name, arguments=self.arguments) 17 | if self.arguments.prepare: 18 | project_code.prepare_bc() 19 | else: 20 | project_code.retrieve_bc() 21 | # project_code.link_bc_files() 22 | -------------------------------------------------------------------------------- /act/featureextractor.py: -------------------------------------------------------------------------------- 1 | from act import Act 2 | from sample.projectcode import ProjectCode 3 | from utils.inout import * 4 | from timeit import default_timer 5 | 6 | 7 | class FeatureExtractor(Act): 8 | 9 | def start(self): 10 | projects_dir = join_path(self.arguments.data_dir, self.arguments.bcs_dir) 11 | datasets_dir = join_path(self.arguments.data_dir, self.arguments.datasets_dir) 12 | dir_names = get_directories(projects_dir) 13 | # feature_types = self.arguments.feature_types.split(',') 14 | for feature_type in self.arguments.feature_types: 15 | for dir_name in dir_names: 16 | for project_name in self.arguments.projects: 17 | if get_basename(dir_name) == project_name or len(self.arguments.projects) == 0: 18 | if self.arguments.split != 'True': 19 | print 'Extracting {} features for {}'.format(feature_type, get_basename(dir_name)) 20 | project_code = ProjectCode(project_dir=dir_name, arguments=self.arguments, 21 | feature_type=feature_type) 22 | start_time = default_timer() 23 | project_code.extract_features(save=True) 24 | elapsed_time = default_timer() - start_time 25 | time_file = join_path(datasets_dir, get_basename(dir_name), 26 | '{}.feature_extraction.time.txt'.format( 27 | feature_type)) 28 | print 'Feature Extraction Time:', elapsed_time 29 | write_file(time_file, '{}'.format(elapsed_time)) 30 | # project_code.save_features() 31 | else: 32 | for module in get_directories(dir_name): 33 | print 'Module:', get_basename(module) 34 | project_code = ProjectCode(project_dir=dir_name, arguments=self.arguments, 35 | feature_type=feature_type, module_name=get_basename(module)) 36 | 37 | start_time = default_timer() 38 | project_code.extract_features(save=True) 39 | elapsed_time = default_timer() - start_time 40 | if project_code.num_abstract_slices != 0: 41 | time_file = join_path(datasets_dir, get_basename(dir_name), 42 | '{}.{}.feature_extraction.time.txt'.format( 43 | get_basename(module), feature_type)) 44 | print 'Feature Extraction Time:', elapsed_time 45 | write_file(time_file, '{}'.format(elapsed_time)) 46 | # project_code.save_features() 47 | -------------------------------------------------------------------------------- /act/getstatistics.py: -------------------------------------------------------------------------------- 1 | from act import Act 2 | from learning.statistics import Statistics 3 | from utils.inout import * 4 | 5 | 6 | class GetStatistics(Act): 7 | 8 | def start(self): 9 | if self.arguments.stat_type == 'VI': 10 | datasets_dir = join_path(self.arguments.data_dir, self.arguments.datasets_dir) 11 | cluster_files = get_files_in_dir(datasets_dir, ext='.clusters.txt') 12 | for cluster_file in cluster_files: 13 | for project_name in self.arguments.projects: 14 | if get_basename(get_parent_dir(get_parent_dir(cluster_file))) == project_name or \ 15 | len(self.arguments.projects) == 0: 16 | print cluster_file 17 | statistics = 
Statistics(arguments=self.arguments, 18 | project_clusters_info_file=cluster_file) 19 | statistics.print_vul_info() 20 | 21 | elif self.arguments.stat_type == 'SI': 22 | projects_dir = join_path(self.arguments.data_dir, self.arguments.bcs_dir) 23 | dir_names = get_directories(projects_dir) 24 | for dir_name in dir_names: 25 | for project_name in self.arguments.projects: 26 | if get_basename(dir_name) == project_name or len(self.arguments.projects) == 0: 27 | statistics = Statistics(arguments=self.arguments, project_dir=dir_name) 28 | statistics.print_slices_info() 29 | 30 | elif self.arguments.stat_type == 'SS': 31 | projects_dir = join_path(self.arguments.data_dir, self.arguments.projects_dir) 32 | dir_names = get_directories(projects_dir) 33 | for dir_name in dir_names: 34 | for project_name in self.arguments.projects: 35 | if get_basename(dir_name) == project_name or len(self.arguments.projects) == 0: 36 | statistics = Statistics(arguments=self.arguments, project_dir=dir_name) 37 | statistics.print_slices_similarities() 38 | 39 | elif self.arguments.stat_type == 'ST': 40 | projects_dir = join_path(self.arguments.data_dir, self.arguments.projects_dir) 41 | time_data_min = {} 42 | time_data_hour = {} 43 | project_name_mapping = {'libpcap-545e77d8': 'libpcap', 'libtiff-19f6b70d': 'libtiff', 44 | 'mbedtls-0592ea7': 'mbedtls', 'openssh-c2fa53c': 'openssh', 45 | 'openssl-a75be9f': 'openssl', 'nginx-0098761': 'nginx', 46 | 'wolfssl-c26cb53': 'wolfssl'} 47 | for project_name in self.arguments.projects: 48 | project_dir = join_path(projects_dir, project_name) 49 | statistics = Statistics(arguments=self.arguments, project_dir=project_dir) 50 | 51 | if project_name in project_name_mapping.keys(): 52 | project_name = project_name_mapping[project_name] 53 | time_data_min[project_name], time_data_hour[project_name] = statistics.print_performance_time() 54 | 55 | Statistics.draw_bar_chart(self.arguments, time_data_min, time_data_hour) 56 | -------------------------------------------------------------------------------- /act/pdgbuilder.py: -------------------------------------------------------------------------------- 1 | 2 | from act import Act 3 | from sample.projectcode import ProjectCode 4 | from utils.inout import * 5 | 6 | 7 | class PDGBuilder(Act): 8 | 9 | def start(self): 10 | projects_dir = join_path(self.arguments.data_dir, self.arguments.projects_dir) 11 | dir_names = get_directories(projects_dir) 12 | for dir_name in dir_names: 13 | for project_name in self.arguments.projects: 14 | if get_basename(dir_name) == project_name or len(self.arguments.projects) == 0: 15 | print 'Analyzing {}'.format(get_basename(dir_name)) 16 | project_code = ProjectCode(project_dir=dir_name, arguments=self.arguments) 17 | project_code.retrieve_pdg() 18 | -------------------------------------------------------------------------------- /act/sampledownloader.py: -------------------------------------------------------------------------------- 1 | 2 | from act import Act 3 | 4 | 5 | class SampleDownloader(Act): 6 | 7 | def start(self): 8 | pass 9 | -------------------------------------------------------------------------------- /argsparser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from act.actiontype import ActionType 4 | from arguments import Arguments 5 | from sample.languagetype import LanguageType 6 | from settings import * 7 | from utils import inout 8 | 9 | 10 | class ArgsParser: 11 | def __init__(self): 12 | self.parser = 
argparse.ArgumentParser() 13 | self.init_arguments() 14 | self.arguments = None 15 | 16 | def parse(self): 17 | args, unparsed = self.parser.parse_known_args() 18 | self.arguments = Arguments( 19 | actions=args.actions.split(','), 20 | languages=args.languages.split(','), 21 | data_dir=args.data_dir, 22 | projects_dir=args.projects_dir, 23 | asts_dir=args.asts_dir, 24 | bcs_dir=BCS_DIR, 25 | datasets_dir=DATASETS_DIR, 26 | plots_dir=PLOTS_DIR, 27 | search_spaces=SEARCH_SPACES, 28 | clang_lib_dir=args.clang_lib_dir, 29 | clustering_algs=args.clustering_algs.split(','), 30 | clustering_feat=args.clustering_feat.split(','), 31 | second_clustering=args.second_clustering, 32 | cose_similarity_chunk_size=args.cose_similarity_chunk_size, 33 | big_clusters_ignore=args.big_clusters_ignore, 34 | chunk_window_size=args.chunk_window_size, 35 | split=args.split, 36 | projects=args.projects.split(','), 37 | save_format=SAVE_FORMAT, 38 | ignore_compile_commands=IGNORE_COMPILE_COMMANDS, 39 | feature_types=args.feature_types.split(','), 40 | llvm_config=LLVM_CONFIG, 41 | includes=args.includes.split(','), 42 | pdg_dumper=PDG_DUMPER, 43 | clang=CLANG, 44 | stat_type=STAT_TYPE, 45 | stat_sim_types=STAT_SIM_TYPES.split(','), 46 | has_control_flow=args.has_control_flow, 47 | inconsistency_type=args.inconsistency_type, 48 | similarity_threshold=args.similarity_threshold, 49 | granularity=args.granularity.split(','), 50 | dependency=args.dependency, 51 | call_inconsistency=args.call_inconsistency.split(','), 52 | type_inconsistency=args.type_inconsistency.split(','), 53 | store_inconsistency=args.store_inconsistency.split(','), 54 | inconsistency_query_options=args.inconsistency_query_options, 55 | ssh=args.ssh, 56 | filtering=args.filtering, 57 | count_cpu=args.count_cpu, 58 | ids=args.ids.split(','), 59 | starting_report_item=args.starting_report_item, 60 | prepare=args.prepare 61 | ) 62 | 63 | def init_arguments(self): 64 | self.parser.add_argument( 65 | '--actions', 66 | '-a', 67 | type=str, 68 | default=ACTIONS, 69 | help='Action must be among these: {}'.format(ActionType.get_detail()) 70 | ) 71 | 72 | self.parser.add_argument( 73 | '--languages', 74 | '-l', 75 | type=str, 76 | default=LANGUAGES, 77 | help='Target language must be among these: {}'.format(LanguageType.get_detail()) 78 | ) 79 | 80 | self.parser.add_argument( 81 | '--data_dir', 82 | '-dd', 83 | type=str, 84 | default=DATA_DIR, 85 | help='Base directory of the data' 86 | ) 87 | 88 | self.parser.add_argument( 89 | '--projects_dir', 90 | '-pd', 91 | type=str, 92 | default=PROJECTS_DIR, 93 | help='Base directory of the projects' 94 | ) 95 | 96 | self.parser.add_argument( 97 | '--asts_dir', 98 | '-ad', 99 | type=str, 100 | default=ASTS_DIR, 101 | help='Base directory of the ast of files' 102 | ) 103 | 104 | self.parser.add_argument( 105 | '--clang_lib_dir', 106 | '-cld', 107 | type=str, 108 | default=CLANG_LIB_DIR, 109 | help='Base directory of the clang library files' 110 | ) 111 | 112 | self.parser.add_argument( 113 | '--projects', 114 | '-p', 115 | type=str, 116 | default=PROJECTS, 117 | help='An array containing list of checking projects' 118 | ) 119 | 120 | self.parser.add_argument( 121 | '--clustering_algs', 122 | '-ca', 123 | type=str, 124 | default=CLUSTERING_ALGS, 125 | help='An array containing list of clustering algorithms and their thresholds' 126 | ) 127 | 128 | self.parser.add_argument( 129 | '--clustering_feat', 130 | '-cf', 131 | type=str, 132 | default=CLUSTERING_FEAT, 133 | help='An array containing list of clustering features' 
134 | ) 135 | 136 | self.parser.add_argument( 137 | '--second_clustering', 138 | '-sc', 139 | type=str, 140 | default=SECOND_CLUSTERING, 141 | help='Type of second clustering, online vs offline' 142 | ) 143 | 144 | self.parser.add_argument( 145 | '--cose_similarity_chunk_size', 146 | '-cscs', 147 | type=int, 148 | default=COSE_SIMILARITY_CHUNK_SIZE, 149 | help='Batch size when compute cosine similarity, depends to available RAM' 150 | ) 151 | 152 | self.parser.add_argument( 153 | '--big_clusters_ignore', 154 | '-bci', 155 | type=int, 156 | default=BIG_CLUSTERS_IGNORE, 157 | help='Size of big clusters that should be ignored from the first step clustering' 158 | ) 159 | 160 | self.parser.add_argument( 161 | '--chunk_window_size', 162 | '-cws', 163 | type=int, 164 | default=CHUNK_WINDOW_SIZE, 165 | help='Size of basic block window from data flows' 166 | ) 167 | 168 | self.parser.add_argument( 169 | '--split', 170 | '-s', 171 | type=str, 172 | default=SPLIT, 173 | help='A boolean value use for splitting a project' 174 | ) 175 | 176 | self.parser.add_argument( 177 | '--feature_types', 178 | '-ft', 179 | type=str, 180 | default=FEATURE_TYPES, 181 | help='An array containing list of feature types' 182 | ) 183 | 184 | self.parser.add_argument( 185 | '--includes', 186 | '-i', 187 | type=str, 188 | default=INCLUDES, 189 | help='An array containing list of checking projects' 190 | ) 191 | 192 | self.parser.add_argument( 193 | '--has_control_flow', 194 | '-hcf', 195 | action='store_true', 196 | help='If true, it considers control flow as well during construct extraction' 197 | ) 198 | 199 | self.parser.add_argument( 200 | '--inconsistency_type', 201 | '-it', 202 | type=str, 203 | default=INCONSISTENCY_TYPE, 204 | help='show the result of a type of inconsistency' 205 | ) 206 | 207 | self.parser.add_argument( 208 | '--similarity_threshold', 209 | '-st', 210 | type=float, 211 | default=SIMILARITY_THRESHOLD, 212 | help='Only show the inconsistencies having a similarity greater than a threshold' 213 | ) 214 | 215 | self.parser.add_argument( 216 | '--granularity', 217 | '-g', 218 | type=str, 219 | default=GRANULARITY, 220 | help='Granularity of the construct' 221 | ) 222 | 223 | self.parser.add_argument( 224 | '--dependency', 225 | '-d', 226 | type=str, 227 | default=DEPENDENCY, 228 | help='Dependency of the construct' 229 | ) 230 | 231 | self.parser.add_argument( 232 | '--call_inconsistency', 233 | '-ci', 234 | type=str, 235 | default=CALL_INCONSISTENCY, 236 | help='Select the inconsistencies containing specific calls' 237 | ) 238 | 239 | self.parser.add_argument( 240 | '--type_inconsistency', 241 | '-ti', 242 | type=str, 243 | default=TYPE_INCONSISTENCY, 244 | help='Select the inconsistencies containing specific types' 245 | ) 246 | 247 | self.parser.add_argument( 248 | '--store_inconsistency', 249 | '-sti', 250 | type=str, 251 | default=STORE_INCONSISTENCY, 252 | help='Select the inconsistencies containing specific stores' 253 | ) 254 | 255 | self.parser.add_argument( 256 | '--inconsistency_query_options', 257 | '-iqo', 258 | type=str, 259 | default=INCONSISTENCY_QUERY_OPTIONS, 260 | help='Set specific options during querying the inconsistencies' 261 | ) 262 | 263 | self.parser.add_argument( 264 | '--ssh', 265 | '-ssh', 266 | action='store_true', 267 | help='If it needs to connect to a remote mongodb server' 268 | ) 269 | 270 | self.parser.add_argument( 271 | '--filtering', 272 | '-f', 273 | action='store_true', 274 | help='Filter the less potential inconsistencies' 275 | ) 276 | 277 | 
self.parser.add_argument( 278 | '--count_cpu', 279 | '-cc', 280 | type=str, 281 | default=COUNT_CPU, 282 | help='Number of cores' 283 | ) 284 | 285 | self.parser.add_argument( 286 | '--ids', 287 | '-ids', 288 | type=str, 289 | default='', 290 | help='IDs of inconsistencies' 291 | ) 292 | 293 | self.parser.add_argument( 294 | '--starting_report_item', 295 | '-si', 296 | type=int, 297 | default=1, 298 | help='Show inconsistencies starting from specific item' 299 | ) 300 | 301 | self.parser.add_argument( 302 | '--prepare', 303 | '-pp', 304 | action='store_true', 305 | help='If bitcodes are given, prepare them for pdg extraction' 306 | ) 307 | 308 | def do_basic_checks(self): 309 | 310 | possible_actions = ActionType.get_names() 311 | for action in self.arguments.actions: 312 | if action not in possible_actions: 313 | self.parser.print_help() 314 | inout.show_error('action argument is wrong!\n') 315 | 316 | possible_languages = LanguageType.get_names() 317 | for language in self.arguments.languages: 318 | if language not in possible_languages: 319 | self.parser.print_help() 320 | inout.show_error('language argument is wrong!\n') 321 | 322 | if self.arguments.data_dir is None or self.arguments.data_dir == '' or \ 323 | not inout.exist_dir(self.arguments.data_dir): 324 | self.parser.print_help() 325 | inout.show_error('data_dir argument is not valid!\n') 326 | 327 | if self.arguments.projects_dir is None or self.arguments.projects_dir == '' or \ 328 | not inout.exist_dir(inout.join_path(self.arguments.data_dir, self.arguments.projects_dir)): 329 | self.parser.print_help() 330 | inout.show_error('projects_dir argument is not valid!\n') 331 | 332 | -------------------------------------------------------------------------------- /arguments.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | 3 | class Arguments: 4 | def __init__(self, actions, languages, data_dir, projects_dir, asts_dir, bcs_dir, 5 | datasets_dir, plots_dir, search_spaces, clang_lib_dir, clustering_algs, clustering_feat, 6 | second_clustering, cose_similarity_chunk_size, big_clusters_ignore, chunk_window_size, 7 | split, projects, save_format, 8 | ignore_compile_commands, 9 | feature_types, llvm_config, includes, pdg_dumper, clang, stat_type, stat_sim_types, has_control_flow, 10 | inconsistency_type, similarity_threshold, granularity, dependency, call_inconsistency, 11 | type_inconsistency, store_inconsistency, inconsistency_query_options, ssh, filtering, count_cpu, ids, 12 | starting_report_item, prepare): 13 | self.actions = actions 14 | self.languages = languages 15 | self.data_dir = data_dir 16 | self.projects_dir = projects_dir 17 | self.asts_dir = asts_dir 18 | self.bcs_dir = bcs_dir 19 | self.datasets_dir = datasets_dir 20 | self.plots_dir = plots_dir 21 | self.search_spaces = search_spaces 22 | self.clang_lib_dir = clang_lib_dir 23 | self.clustering_algs = clustering_algs 24 | self.clustering_feat = clustering_feat 25 | self.second_clustering = second_clustering 26 | self.cose_similarity_chunk_size = cose_similarity_chunk_size 27 | self.big_clusters_ignore = big_clusters_ignore 28 | self.chunk_window_size = chunk_window_size 29 | self.split = split 30 | self.projects = projects 31 | self.save_format = save_format 32 | self.ignore_compile_commands = ignore_compile_commands 33 | self.feature_types = feature_types 34 | self.llvm_config = llvm_config 35 | self.includes = includes 36 | self.pdg_dumper = pdg_dumper 37 | self.clang = clang 38 | self.stat_type = 
stat_type 39 | self.stat_sim_types = stat_sim_types 40 | self.has_control_flow = has_control_flow 41 | self.inconsistency_type = inconsistency_type 42 | self.similarity_threshold = similarity_threshold 43 | self.granularity = granularity 44 | self.dependency = dependency 45 | self.call_inconsistency = call_inconsistency 46 | self.type_inconsistency = type_inconsistency 47 | self.store_inconsistency = store_inconsistency 48 | self.inconsistency_query_options = inconsistency_query_options 49 | self.ssh = ssh 50 | self.filtering = filtering 51 | self.count_cpu = count_cpu 52 | self.ids = ids 53 | self.starting_report_item = starting_report_item 54 | self.prepare = prepare 55 | 56 | try: 57 | mp.cpu_count() 58 | except: 59 | self.count_cpu = 1 60 | print 'Running the code by {} CPUs'.format(self.count_cpu) 61 | -------------------------------------------------------------------------------- /iBench/groundtruth.py: -------------------------------------------------------------------------------- 1 | ground_truth = [] 2 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | sudo apt-get update 2 | sudo apt-get -y install clang-3.8 3 | sudo apt-get -y install clang-6.0 4 | sudo apt-get -y install llvm-3.8 5 | sudo apt-get -y install llvm-6.0 6 | sudo apt-get -y install bear 7 | sudo apt-get -y install git 8 | sudo apt-get -y install cmake 9 | sudo apt-get -y install libpng-dev libfreetype6-dev 10 | sudo apt-get -y install python-dev graphviz libgraphviz-dev pkg-config 11 | sudo apt-get -y install python-pip 12 | pip install --upgrade pip 13 | pip install --upgrade setuptools 14 | pip install -r requirements.txt 15 | cd dg 16 | cmake -D CMAKE_C_COMPILER="/usr/bin/clang-6.0" -D CMAKE_CXX_COMPILER="/usr/bin/clang++-6.0" . 
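# Note: the cmake line above and the `make` below build the bundled 'dg' submodule with
# clang-6.0; dg appears to be the dependence-graph library that FICS relies on (via
# PDG_DUMPER in settings.py and act/pdgbuilder.py) to dump program dependence graphs
# from LLVM bitcode. Target codebases themselves must still be compiled with clang-3.8,
# as described in the README.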
17 | make 18 | wget -qO - https://www.mongodb.org/static/pgp/server-4.4.asc | sudo apt-key add - 19 | echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu xenial/mongodb-org/4.4 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-4.4.list 20 | sudo apt-get update 21 | sudo apt-get install -y mongodb-org=4.4.2 mongodb-org-server=4.4.2 mongodb-org-shell=4.4.2 mongodb-org-mongos=4.4.2 mongodb-org-tools=4.4.2 22 | sudo systemctl start mongod 23 | -------------------------------------------------------------------------------- /learning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiS3-Lab/FICS/82c8abef52ca943946b7e82a16998cf67f1d2049/learning/__init__.py -------------------------------------------------------------------------------- /learning/clustering.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | # from hdbscan import HDBSCAN 4 | from scipy.sparse.csgraph import connected_components 5 | from sklearn.cluster import AffinityPropagation, MeanShift, DBSCAN 6 | from sklearn.decomposition import LatentDirichletAllocation 7 | from sklearn.metrics.pairwise import * 8 | import dask.dataframe as dd 9 | 10 | from sample.projectcode import ProjectCode 11 | from utils import inout 12 | from utils.inout import * 13 | from scipy import sparse 14 | from sklearn.metrics import pairwise_distances 15 | from scipy.spatial.distance import cosine 16 | 17 | 18 | class Clustering: 19 | 20 | def __init__(self, dataset='', clustering_alg='dbscancos_0.9', features_set=None, locations=None, min_samples=2, 21 | from_chuck=None, module_name='root', arguments=None, project_dir='', feature_type='', 22 | node_features=None, node_features_locations=None): 23 | self.arguments = arguments 24 | self.project_dir = project_dir 25 | self.dataset = dataset 26 | self.module_name = module_name 27 | self.locations = locations 28 | if from_chuck is not None: 29 | project_code = ProjectCode(project_dir=self.project_dir, arguments=self.arguments, 30 | feature_type=from_chuck, file_locations=self.locations) 31 | project_code.extract_features() 32 | self.features_set = pd.DataFrame(project_code.afs_features_counters) 33 | self.locations = project_code.afs_file_infos 34 | else: 35 | if self.locations is None: 36 | self.features_set, self.locations = Clustering.split_dataset(self.dataset, 37 | feature_type) 38 | else: 39 | self.features_set = features_set 40 | self.locations = locations 41 | # if 'location' in features_set.columns.values: 42 | # features_set.drop('location', axis=1, inplace=True) 43 | params = clustering_alg.split('_') 44 | self.clustering_alg = params[0] 45 | self.param = float(params[1]) 46 | self.model = None 47 | # LDA Parameters 48 | self.n_features = 1000 49 | self.n_samples = len(self.features_set) 50 | self.n_topics = 1000 51 | self.max_iter = 50 52 | self.learning_offset = 2 53 | # DBScan Parameters 54 | self.eps = self.param # 0.9 55 | self.min_samples = min_samples 56 | # Affinity Parameters 57 | self.preference = self.param # 100 58 | # Mean Shift 59 | self.bandwidth = self.param # 1.2 60 | # hdbscan 61 | self.min_cluster_size = int(self.param) 62 | ######### 63 | self.set_default_settings() 64 | self.clusters_samples = defaultdict(list) 65 | self.node_features = node_features 66 | self.node_features_locations = node_features_locations 67 | self.node_differences = defaultdict(list) 68 | self.clusters_samples_len_sorted_keys 
= list() 69 | self.cluster_labels = None 70 | 71 | def set_default_settings(self): 72 | if self.clustering_alg == 'dbscan' or self.clustering_alg == 'dbscancos': 73 | # self.features_set = StandardScaler().fit_transform(self.features_set) 74 | self.model = DBSCAN(eps=self.eps, min_samples=self.min_samples, n_jobs=5) 75 | elif self.clustering_alg == 'lda': 76 | self.model = LatentDirichletAllocation(n_components=self.n_topics, max_iter=self.max_iter, 77 | learning_method='online', learning_offset=self.learning_offset, 78 | random_state=0, n_jobs=10) 79 | elif self.clustering_alg == 'aff' or self.clustering_alg == 'affcos': 80 | self.model = AffinityPropagation() 81 | elif self.clustering_alg == 'means': 82 | self.model = MeanShift(n_jobs=5, bandwidth=self.bandwidth) 83 | self.param = self.bandwidth 84 | # elif self.clustering_alg == 'hdbscan': 85 | # self.model = HDBSCAN(min_cluster_size=self.min_cluster_size, min_samples=self.min_samples 86 | # , metric='manhattan') 87 | # # , algorithm='generic', metric='cosine') 88 | elif self.clustering_alg == 'cc': 89 | self.model = 'cc' 90 | 91 | def cluster(self): 92 | 93 | if self.model is None or len(self.features_set) == 0: 94 | return False 95 | if self.clustering_alg == 'affcos': 96 | self.model.affinity = 'euclidean' 97 | self.model.preference = self.preference 98 | # print cosine_distances(self.features_set) 99 | self.model.fit(cosine_similarity(self.features_set)) 100 | # self.param = self.preference 101 | elif self.clustering_alg == 'dbscancos': 102 | # self.model.metric = 'euclidean' 103 | self.model.metric = 'precomputed' 104 | distances = cosine_distances(self.features_set) 105 | # distances[distances < (1 - self.param)] = 0 106 | self.model.fit(distances) 107 | # self.param = self.eps 108 | elif self.clustering_alg == 'hdbscan': 109 | self.model.fit(self.features_set) 110 | self.cluster_labels = self.model.labels_ 111 | elif self.clustering_alg == 'cc': 112 | # similarity = cosine_similarity(self.features_set) 113 | # similarity = self.get_cosine_similarity() 114 | similarity = self.get_similarity_sparse_input() 115 | adjacency_mask = similarity >= self.param 116 | del similarity 117 | del self.features_set 118 | nb_clusters, self.cluster_labels = connected_components(adjacency_mask, connection='strong') 119 | # print nb_clusters 120 | # print self.cluster_labels 121 | else: 122 | self.model.fit(self.features_set) 123 | return True 124 | 125 | def get_similarity_scipy(self): 126 | return 1 - pairwise_distances(self.features_set, metric="cosine") 127 | 128 | def get_similarity_sparse_input(self): 129 | # sparse_features = sparse.csr_matrix(self.features_set) 130 | # return cosine_similarity(sparse_features) 131 | return cosine_similarity(self.features_set) 132 | 133 | def similarity_cosine_by_chunk(self, len, start, end): 134 | if end > len: 135 | end = len 136 | return cosine_similarity(X=self.features_set[start:end], Y=self.features_set, dense_output=False) 137 | 138 | def get_cosine_similarity(self): 139 | chunk_size = int(self.arguments.cose_similarity_chunk_size) 140 | len = self.features_set.shape[0] 141 | # cosine_similarities = None 142 | # if len <= chunk_size: 143 | # cosine_similarities = cosine_similarity(self.features_set) 144 | # else: 145 | filesnames = [] 146 | similarity_files_dir = join(self.arguments.data_dir, self.arguments.datasets_dir, 147 | get_basename(self.project_dir)) 148 | for filename in get_files_in_dir(similarity_files_dir, start='tmp-sim-'): 149 | if os.path.exists(filename): 150 | 
inout.remove_file(filename) 151 | for chunk_start in xrange(0, len, chunk_size): 152 | print 'chunk start index', chunk_start 153 | filename = join(self.arguments.data_dir, self.arguments.datasets_dir, 154 | get_basename(self.project_dir), 'tmp-sim-{}.txt'.format(chunk_start)) 155 | sim_file = open(filename, "wb") 156 | cosine_similarity_chunk = self.similarity_cosine_by_chunk(len, chunk_start, chunk_start + chunk_size) 157 | np.savetxt(sim_file, cosine_similarity_chunk, fmt="%.2g", delimiter=',', newline='\n') 158 | del cosine_similarity_chunk 159 | sim_file.close() 160 | filesnames.append(filename) 161 | 162 | cosine_similarities = None 163 | for filename in filesnames: 164 | if cosine_similarities is None: 165 | cosine_similarities = np.genfromtxt(filename, delimiter=',') 166 | else: 167 | cosine_similarities = np.concatenate((cosine_similarities, np.genfromtxt(filename, delimiter=',')), 168 | axis=0) 169 | return cosine_similarities 170 | 171 | def get_clusters(self): 172 | 173 | if self.clustering_alg == 'dbscan' or self.clustering_alg == 'dbscancos': 174 | # print self.model.labels_ 175 | for i in range(len(self.model.labels_)): 176 | self.clusters_samples[self.model.labels_[i]].append(self.locations[i]) 177 | elif self.clustering_alg == 'lda': 178 | sample_cluster_distrib = self.model.transform(self.features_set) 179 | counter = 0 180 | for i in range(len(self.locations)): 181 | counter += 1 182 | sample_cluster = np.argmax(sample_cluster_distrib[i]) 183 | self.clusters_samples[sample_cluster].append(self.locations[i]) 184 | elif self.clustering_alg == 'aff' or self.clustering_alg == 'affcos': 185 | for i in range(len(self.model.labels_)): 186 | self.clusters_samples[self.model.labels_[i]].append(self.locations[i]) 187 | elif self.clustering_alg == 'means': 188 | for i in range(len(self.model.labels_)): 189 | self.clusters_samples[self.model.labels_[i]].append(self.locations[i]) 190 | elif self.clustering_alg == 'hdbscan': 191 | for i in range(len(self.cluster_labels)): 192 | self.clusters_samples[self.cluster_labels[i]].append(self.locations[i]) 193 | elif self.clustering_alg == 'cc': 194 | 195 | for i in range(len(self.cluster_labels)): 196 | self.clusters_samples[self.cluster_labels[i]].append(self.locations[i]) 197 | 198 | if self.node_features is not None: 199 | self.set_node_difference(i) 200 | self.sort_clusters() 201 | 202 | def set_node_difference(self, i): 203 | location_index = self.node_features_locations[self.locations[i]] 204 | node_features = self.node_features.iloc[location_index] 205 | node_features = node_features.iloc[node_features.to_numpy().nonzero()[0]].to_dict() 206 | self.node_differences[self.cluster_labels[i]].append(node_features) 207 | 208 | def sort_clusters(self): 209 | self.clusters_samples_len_sorted_keys = sorted(self.clusters_samples, 210 | key=lambda k: len(self.clusters_samples[k]), 211 | reverse=True) 212 | 213 | @staticmethod 214 | def split_dataset(dataset, feature_type): 215 | df = pd.read_csv(dataset, nrows=1) 216 | features = list(df.columns.values) 217 | features.remove('location') 218 | features_type = {'location': 'str'} 219 | if feature_type == 'NN': 220 | for feature in features: 221 | features_type[feature] = 'Int64' 222 | elif feature_type == 'G2v': 223 | for feature in features: 224 | features_type[feature] = 'float64' 225 | elif feature_type == '': 226 | print 'Feature Type is empty ...' 227 | # print 'Reading dataset ...' 
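        # Each row of the dataset CSV describes one construct: a 'location' string
        # identifying where it came from, plus its feature columns (integer counts for
        # 'NN' features, float graph2vec embeddings for 'G2v'). The dtype map built above
        # is handed to read_csv, and 'location' is then split off from the numeric
        # feature matrix that is returned to the clustering step.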
228 | dataframe = pd.read_csv(dataset, dtype=features_type) 229 | # dataframe = dd.read_csv(dataset, dtype=features_type, header=0, blocksize=int(5e5), sample=1e9) 230 | # print 'Converting dataframe to pandas ...' 231 | # dataframe.compute() 232 | # print 'Filling Null values with 0 ...' 233 | dataframe = dataframe.fillna(0) 234 | return dataframe.drop('location', axis=1), dataframe['location'].tolist() 235 | 236 | @staticmethod 237 | def make_dataset_binary(dataframe): 238 | columns = dataframe.columns.values.tolist() 239 | columns.remove('location') 240 | for column in columns: 241 | dataframe.ix[dataframe[column] > 0, column] = 1 242 | return dataframe 243 | 244 | def save_clusters(self, step=''): 245 | 246 | content = '\n' 247 | for key in self.clusters_samples_len_sorted_keys: 248 | dic_value = self.clusters_samples[key] 249 | content = '{} Cluster #{} :\n'.format(content, key) 250 | content = '{} # Items: {} \n'.format(content, len(dic_value)) 251 | for item in dic_value: 252 | content = '{} {}\n'.format(content, item) 253 | content = '{} {}\n'.format(content, '=' * 100) 254 | 255 | clusters_file_directory = get_parent_dir(get_filename_without_ext(self.dataset)) 256 | clustering_feature_name = str(get_basename(get_filename_without_ext(self.dataset))) 257 | clusters_file = '{}_{}.{}_{}.{}.clusters.txt'.format(step, 258 | clustering_feature_name.split('_')[1], 259 | clustering_feature_name.split('_')[0], 260 | str(self.param), self.module_name) 261 | clusters_file = join_path(clusters_file_directory, clusters_file) 262 | cluster_file_path = join_path(get_parent_dir(clusters_file), self.clustering_alg) 263 | make_dir_if_not_exist(cluster_file_path) 264 | clusters_file = join_path(cluster_file_path, get_basename(clusters_file)) 265 | write_file(clusters_file, content) 266 | -------------------------------------------------------------------------------- /learning/graph2vec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiS3-Lab/FICS/82c8abef52ca943946b7e82a16998cf67f1d2049/learning/graph2vec/__init__.py -------------------------------------------------------------------------------- /learning/graph2vec/corpus_parser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import logging 3 | from collections import Counter 4 | from random import shuffle 5 | 6 | import numpy as np 7 | 8 | from utils import get_files 9 | 10 | 11 | class Corpus(object): 12 | def __init__(self, fnames=None, extn='WL2', max_files=0): 13 | assert fnames != None, "please specify the corpus folder" 14 | self.fnames = fnames 15 | self.subgraph_index = 0 16 | self.graph_index = 0 17 | self.epoch_flag = 0 18 | self.max_files = max_files 19 | self.graph_ids_for_batch_traversal = [] 20 | self.extn = extn 21 | 22 | def scan_corpus(self): 23 | 24 | subgraphs = [] 25 | for fname in self.graph_fname_list: 26 | subgraphs.extend( 27 | [l.split()[0] for l in open(fname).xreadlines()]) # just take the first word of every sentence 28 | subgraphs.append('UNK') 29 | 30 | subgraph_to_freq_map = Counter(subgraphs) 31 | del subgraphs 32 | 33 | subgraph_to_id_map = {sg: i for i, sg in 34 | enumerate(subgraph_to_freq_map.iterkeys())} # output layer of the skipgram network 35 | 36 | self._subgraph_to_freq_map = subgraph_to_freq_map # to be used for negative sampling 37 | self._subgraph_to_id_map = subgraph_to_id_map 38 | self._id_to_subgraph_map = {v: k for k, v in subgraph_to_id_map.iteritems()} 39 
| self._subgraphcount = sum(subgraph_to_freq_map.values()) # total num subgraphs in all graphs 40 | 41 | self.num_graphs = len(self.graph_fname_list) # doc size 42 | self.num_subgraphs = len(subgraph_to_id_map) # vocab of word size 43 | 44 | self.subgraph_id_freq_map_as_list = [] # id of this list is the word id and value is the freq of word with corresponding word id 45 | for i in xrange(len(self._subgraph_to_freq_map)): 46 | self.subgraph_id_freq_map_as_list.append(self._subgraph_to_freq_map[self._id_to_subgraph_map[i]]) 47 | 48 | return self._subgraph_to_id_map 49 | 50 | def scan_and_load_corpus(self): 51 | 52 | self.graph_fname_list = get_files(self.fnames, extn=self.extn, max_files=self.max_files) 53 | self._graph_name_to_id_map = {g: i for i, g in 54 | enumerate(self.graph_fname_list)} # input layer of the skipgram network 55 | self._id_to_graph_name_map = {i: g for g, i in self._graph_name_to_id_map.iteritems()} 56 | subgraph_to_id_map = self.scan_corpus() 57 | 58 | logging.info('number of graphs: %d' % self.num_graphs) 59 | logging.info('subgraph vocabulary size: %d' % self.num_subgraphs) 60 | logging.info('total number of subgraphs to be trained: %d' % self._subgraphcount) 61 | 62 | self.graph_ids_for_batch_traversal = range(self.num_graphs) 63 | shuffle(self.graph_ids_for_batch_traversal) 64 | 65 | def generate_batch_from_file(self, batch_size): 66 | target_graph_ids = [] 67 | context_subgraph_ids = [] 68 | 69 | graph_name = self.graph_fname_list[self.graph_ids_for_batch_traversal[self.graph_index]] 70 | graph_contents = open(graph_name).readlines() 71 | while self.subgraph_index >= len(graph_contents): 72 | self.subgraph_index = 0 73 | self.graph_index += 1 74 | if self.graph_index == len(self.graph_fname_list): 75 | self.graph_index = 0 76 | np.random.shuffle(self.graph_ids_for_batch_traversal) 77 | self.epoch_flag = True 78 | graph_name = self.graph_fname_list[self.graph_ids_for_batch_traversal[self.graph_index]] 79 | graph_contents = open(graph_name).readlines() 80 | 81 | while len(context_subgraph_ids) < batch_size: 82 | line_id = self.subgraph_index 83 | context_subgraph = graph_contents[line_id].split()[0] 84 | target_graph = graph_name 85 | 86 | context_subgraph_ids.append(self._subgraph_to_id_map[context_subgraph]) 87 | target_graph_ids.append(self._graph_name_to_id_map[target_graph]) 88 | 89 | self.subgraph_index += 1 90 | while self.subgraph_index == len(graph_contents): 91 | self.subgraph_index = 0 92 | self.graph_index += 1 93 | if self.graph_index == len(self.graph_fname_list): 94 | self.graph_index = 0 95 | np.random.shuffle(self.graph_ids_for_batch_traversal) 96 | self.epoch_flag = True 97 | 98 | graph_name = self.graph_fname_list[self.graph_ids_for_batch_traversal[self.graph_index]] 99 | graph_contents = open(graph_name).readlines() 100 | 101 | target_context_pairs = zip(target_graph_ids, context_subgraph_ids) 102 | shuffle(target_context_pairs) 103 | target_graph_ids, context_subgraph_ids = zip(*target_context_pairs) 104 | 105 | target_graph_ids = np.array(target_graph_ids, dtype=np.int32) 106 | context_subgraph_ids = np.array(context_subgraph_ids, dtype=np.int32) 107 | 108 | contextword_outputs = np.reshape(context_subgraph_ids, [len(context_subgraph_ids), 1]) 109 | 110 | return target_graph_ids, contextword_outputs 111 | -------------------------------------------------------------------------------- /learning/graph2vec/graph2vec.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os 3 | from copy 
import deepcopy 4 | from time import time 5 | 6 | import networkx as nx 7 | import tensorflow as tf 8 | import logging 9 | 10 | from learning.graph2vec.train_utils import train_skipgram 11 | 12 | 13 | def get_int_node_label(x): 14 | return int(x.split('+')[-1]) 15 | 16 | 17 | class Graph2Vec: 18 | 19 | def __init__(self, project_dir, files_paths, arguments): 20 | # os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 21 | # os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3' 22 | # tf.logging.set_verbosity(tf.logging.INFO) 23 | # logging.getLogger('tensorflow').disabled = True 24 | # logging.getLogger('tensorflow').propagate = False 25 | 26 | # tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 27 | 28 | self.fnames = files_paths 29 | self.project_dir = project_dir 30 | self.graphs = None 31 | self.node_label_attr_name = 'label' 32 | self.label_to_compressed_label_map = {} 33 | self.wlk_h = 2 34 | self.learning_rate = 0.1 35 | self.embedding_size = 512 # 512 36 | self.num_negative_samples = 6 37 | self.epochs = 500 # 1000 38 | self.batch_size = 128 39 | self.wl_extn = 'g2v' + str(self.wlk_h) 40 | self.final_embeddings = None 41 | self.corpus = None 42 | self.arguments = arguments 43 | 44 | def run(self): 45 | t0 = time() 46 | self.wlk_relabel_and_dump_memory_version(self.fnames, max_h=self.wlk_h) 47 | print 'dumped sg2vec sentences in {} sec.'.format(time() - t0) 48 | t0 = time() 49 | 50 | self.corpus, self.final_embeddings = train_skipgram(self.fnames, self.wl_extn, self.learning_rate, 51 | self.embedding_size, 52 | self.num_negative_samples, self.epochs, self.batch_size, 53 | arguments=self.arguments) 54 | print 'Trained the skipgram model in {} sec.'.format(round(time() - t0, 2)) 55 | 56 | def load_graphs(self): 57 | self.graphs = [nx.drawing.nx_agraph.read_dot(file_path) for file_path in self.fnames] 58 | 59 | def wlk_relabel_and_dump_memory_version(self, fnames, max_h): 60 | 61 | t0 = time() 62 | self.load_graphs() 63 | print 'loaded all graphs in {} sec'.format(round(time() - t0, 2)) 64 | 65 | t0 = time() 66 | self.graphs = [self.initial_relabel(g) for g in self.graphs] 67 | print 'initial relabeling done in {} sec'.format(round(time() - t0, 2)) 68 | 69 | for it in xrange(1, max_h + 1): 70 | t0 = time() 71 | self.label_to_compressed_label_map = {} 72 | self.graphs = [self.wl_relabel(g, it) for g in self.graphs] 73 | print 'WL iteration {} done in {} sec.'.format(it, round(time() - t0, 2)) 74 | print 'num of WL rooted subgraphs in iter {} is {}'.format(it, len(self.label_to_compressed_label_map)) 75 | 76 | t0 = time() 77 | for fname, g in zip(fnames, self.graphs): 78 | self.dump_sg2vec_str(fname, max_h, g) 79 | print 'dumped sg2vec sentences in {} sec.'.format(round(time() - t0, 2)) 80 | 81 | def dump_sg2vec_str(self, fname, max_h, g=None): 82 | if not g: 83 | g = nx.read_gexf(fname + '.tmpg') 84 | new_g = deepcopy(g) 85 | for n in g.nodes(): 86 | del new_g.nodes[n]['relabel'] 87 | new_g.nodes[n]['relabel'] = ast.literal_eval(g.nodes[n]['relabel']) 88 | g = new_g 89 | 90 | opfname = fname + '.' 
+ self.wl_extn 91 | 92 | # if os.path.isfile(opfname): 93 | # return 94 | 95 | with open(opfname, 'w') as fh: 96 | for n, d in g.nodes(data=True): 97 | for i in xrange(0, max_h + 1): 98 | try: 99 | center = d['relabel'][i] 100 | except: 101 | continue 102 | neis_labels_prev_deg = [] 103 | neis_labels_next_deg = [] 104 | 105 | if i != 0: 106 | neis_labels_prev_deg = list( 107 | set([g.node[nei]['relabel'][i - 1] for nei in nx.all_neighbors(g, n)])) 108 | neis_labels_prev_deg.sort() 109 | NeisLabelsSameDeg = list(set([g.node[nei]['relabel'][i] for nei in nx.all_neighbors(g, n)])) 110 | if i != max_h: 111 | neis_labels_next_deg = list( 112 | set([g.node[nei]['relabel'][i + 1] for nei in nx.all_neighbors(g, n)])) 113 | neis_labels_next_deg.sort() 114 | 115 | nei_list = NeisLabelsSameDeg + neis_labels_prev_deg + neis_labels_next_deg 116 | nei_list = ' '.join(nei_list) 117 | 118 | sentence = center + ' ' + nei_list 119 | print>> fh, sentence 120 | 121 | if os.path.isfile(fname + '.tmpg'): 122 | os.system('rm ' + fname + '.tmpg') 123 | 124 | def wl_relabel(self, g, it): 125 | 126 | try: 127 | opfname = g + '.tmpg' 128 | g = nx.drawing.nx_agraph.read_dot(g + '.tmpg') 129 | new_g = deepcopy(g) 130 | for n in g.nodes(): 131 | new_g.nodes[n]['relabel'] = ast.literal_eval(g.nodes[n]['relabel']) 132 | g = new_g 133 | except: 134 | opfname = None 135 | pass 136 | 137 | prev_iter = it - 1 138 | for node in g.nodes(): 139 | prev_iter_node_label = get_int_node_label(g.nodes[node]['relabel'][prev_iter]) 140 | node_label = [prev_iter_node_label] 141 | neighbors = list(nx.all_neighbors(g, node)) 142 | neighborhood_label = sorted([get_int_node_label(g.nodes[nei]['relabel'][prev_iter]) for nei in neighbors]) 143 | node_neighborhood_label = tuple(node_label + neighborhood_label) 144 | if not self.label_to_compressed_label_map.has_key(node_neighborhood_label): 145 | compressed_label = len(self.label_to_compressed_label_map) + 1 146 | self.label_to_compressed_label_map[node_neighborhood_label] = compressed_label 147 | g.node[node]['relabel'][it] = str(it) + '+' + str(compressed_label) 148 | else: 149 | g.node[node]['relabel'][it] = str(it) + '+' + str( 150 | self.label_to_compressed_label_map[node_neighborhood_label]) 151 | 152 | if opfname: 153 | nx.drawing.nx_agraph.write_dot(g, opfname) 154 | else: 155 | return g 156 | 157 | def initial_relabel(self, g): 158 | 159 | try: 160 | opfname = g + '.tmpg' 161 | g = nx.drawing.nx_agraph.read_dot(g) 162 | except: 163 | opfname = None 164 | pass 165 | 166 | nx.convert_node_labels_to_integers(g, 167 | first_label=0) # this needs to be done for the initial interation only 168 | for node in g.nodes(): g.node[node]['relabel'] = {} 169 | 170 | for node in g.nodes(): 171 | try: 172 | label = g.node[node][self.node_label_attr_name] 173 | except: 174 | # no node label referred in 'node_label_attr_name' is present, hence assigning an invalid compressd label 175 | g.node[node]['relabel'][0] = '0+0' 176 | continue 177 | 178 | if not self.label_to_compressed_label_map.has_key(label): 179 | compressed_label = len( 180 | self.label_to_compressed_label_map) + 1 # starts with 1 and incremented every time a new node label is seen 181 | self.label_to_compressed_label_map[label] = compressed_label # inster the new label to the label map 182 | g.node[node]['relabel'][0] = '0+' + str(compressed_label) 183 | else: 184 | g.node[node]['relabel'][0] = '0+' + str(self.label_to_compressed_label_map[label]) 185 | 186 | if opfname: 187 | nx.drawing.nx_agraph.write_dot(g, opfname) 188 | else: 189 | 
return g 190 | -------------------------------------------------------------------------------- /learning/graph2vec/parallelgraph2vec.py: -------------------------------------------------------------------------------- 1 | import json 2 | import glob 3 | import hashlib 4 | import logging 5 | from collections import namedtuple 6 | 7 | import pandas as pd 8 | import networkx as nx 9 | # from nltk.cluster import cosine_distance 10 | # from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, cosine_distances 11 | from tqdm import tqdm 12 | from joblib import Parallel, delayed 13 | # from parser import parameter_parser 14 | # import numpy.distutils.system_info as sysinfo 15 | from gensim.models.doc2vec import Doc2Vec, TaggedDocument 16 | 17 | 18 | class WeisfeilerLehmanMachine: 19 | """ 20 | Weisfeiler Lehman feature extractor class. 21 | """ 22 | 23 | def __init__(self, graph, features, iterations): 24 | """ 25 | Initialization method which also executes feature extraction. 26 | :param graph: The Nx graph object. 27 | :param features: Feature hash table. 28 | :param iterations: Number of WL iterations. 29 | """ 30 | self.iterations = iterations 31 | self.graph = graph 32 | self.features = features 33 | self.nodes = self.graph.nodes() 34 | self.extracted_features = [str(v) for k, v in features.items()] 35 | self.do_recursions() 36 | 37 | def do_a_recursion(self): 38 | """ 39 | The method does a single WL recursion. 40 | :return new_features: The hash table with extracted WL features. 41 | """ 42 | new_features = {} 43 | for node in self.nodes: 44 | nebs = self.graph.neighbors(node) 45 | degs = [self.features[neb] for neb in nebs] 46 | features = "_".join([str(self.features[node])] + sorted([str(deg) for deg in degs])) 47 | hash_object = hashlib.md5(features.encode()) 48 | hashing = hash_object.hexdigest() 49 | new_features[node] = hashing 50 | self.extracted_features = self.extracted_features + list(new_features.values()) 51 | return new_features 52 | 53 | def do_recursions(self): 54 | """ 55 | The method does a series of WL recursions. 56 | """ 57 | for iteration in range(self.iterations): 58 | self.features = self.do_a_recursion() 59 | 60 | 61 | def feature_extractor(path, rounds): 62 | """ 63 | Function to extract WL features from a graph. 64 | :param path: The path to the graph json. 65 | :param rounds: Number of WL iterations. 66 | :return doc: Document collection object. 67 | """ 68 | graph, features, name, graph_len, graph_hash = dataset_reader(path) 69 | machine = WeisfeilerLehmanMachine(graph, features, rounds) 70 | doc = TaggedDocument(words=machine.extracted_features, tags=[name, str(graph_len), graph_hash]) 71 | return doc 72 | 73 | 74 | def dataset_reader(path): 75 | """ 76 | Function to read the graph and features from a json file. 77 | :param path: The path to the graph json. 78 | :return graph: The graph object. 79 | :return features: Features hash table. 80 | :return name: Name of the graph. 
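    :return graph_len: Number of nodes in the graph.
    :return graph_hash: SHA-1 hash of the graph's construct, used later to skip duplicate graphs.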
81 | """ 82 | # name = path.strip(".json").split("/")[-1] 83 | # data = json.load(open(path)) 84 | # graph = nx.from_edgelist(data["edges"]) 85 | name = path 86 | graph = nx.drawing.nx_agraph.read_dot(path) 87 | graph_len = 0 88 | graph_hash = extract_node_names_features(graph, name) 89 | features = {} 90 | for node in graph.nodes: 91 | features[node] = graph.nodes[node]['label'] 92 | graph_len += 1 93 | 94 | # if "features" in data.keys(): 95 | # features = data["features"] 96 | # else: 97 | # features = nx.degree(graph) 98 | # 99 | # features = {int(k): v for k, v, in features.items()} 100 | return graph, features, name, graph_len, graph_hash 101 | 102 | 103 | def extract_node_names_features(graph, name): 104 | lines_numbers = set() 105 | basic_block_ids = set() 106 | llvm_instructions = set() 107 | for node in graph.nodes: 108 | label = graph.nodes[node]['label'] 109 | if 'line' in graph.nodes[node]: 110 | line = graph.nodes[node]['line'] 111 | lines_numbers.add(line) 112 | if 'basic_block_id' in graph.nodes[node]: 113 | bb_id = graph.nodes[node]['basic_block_id'] 114 | basic_block_ids.add(bb_id) 115 | llvm_instructions.add(label) 116 | 117 | return compute_construct_hash(name, lines_numbers, basic_block_ids, llvm_instructions) 118 | 119 | 120 | def compute_construct_hash(name, lines_numbers, basic_block_ids, llvm_instructions): 121 | construct_string = '' 122 | # print self.file_info 123 | construct_string += name[:name.find('.c/pdg') + 2] 124 | # print construct_string 125 | for id in basic_block_ids: 126 | construct_string += str(id) 127 | for line in lines_numbers: 128 | construct_string += str(line) 129 | for llvm_instruction in llvm_instructions: 130 | construct_string += str(llvm_instruction) 131 | # print construct_string 132 | # self.construct_hash = int(hashlib.sha1(construct_string).hexdigest(), 16) % (10 ** 8) 133 | return hashlib.sha1(construct_string).hexdigest() 134 | 135 | 136 | def save_embedding(output_path, model, files, dimensions): 137 | """ 138 | Function to save the embedding. 139 | :param output_path: Path to the embedding csv. 140 | :param model: The embedding model object. 141 | :param files: The list of files. 142 | :param dimensions: The embedding dimension parameter. 
143 | """ 144 | out = [] 145 | for f in files: 146 | identifier = f.split("/")[-1].strip(".json") 147 | out.append([int(identifier)] + list(model.docvecs["g_" + identifier])) 148 | 149 | out = pd.DataFrame(out, columns=["type"] + ["x_" + str(dimension) for dimension in range(dimensions)]) 150 | out = out.sort_values(["type"]) 151 | out.to_csv(output_path, index=None) 152 | 153 | 154 | class Graph2Vec: 155 | 156 | def __init__(self, project_dir, files_paths, arguments=None): 157 | self.graph_files = files_paths 158 | self.project_dir = project_dir 159 | self.arguments = arguments 160 | self.graphs = None 161 | self.node_label_attr_name = 'label' 162 | 163 | self.wlk_h = 2 164 | self.wl_iterations = 5 165 | if self.arguments: 166 | self.workers = self.arguments.count_cpu 167 | else: 168 | self.workers = 4 169 | self.learning_rate = 0.1 170 | self.embedding_size = 1024 # 512 171 | self.num_negative_samples = 6 172 | self.epochs = 100 # 1000 173 | self.batch_size = 10 174 | self.final_embeddings = None 175 | self.corpus = None 176 | self.min_count = 0 177 | self.down_sampling = 0.0001 178 | 179 | def run(self): 180 | # print("\nFeature extraction started ...\n") 181 | document_collections = \ 182 | Parallel(n_jobs=self.workers)( 183 | delayed(feature_extractor)(g, self.wl_iterations) for g in self.graph_files) 184 | # print("\nOptimization started.\n") 185 | unique_hashes = set() 186 | docs = [] 187 | # analyzedDocument = namedtuple('AnalyzedDocument', 'words tags') 188 | for index, text in enumerate(document_collections): 189 | tags = text[1] 190 | graph_file = tags[0] 191 | graph_len = int(tags[1]) 192 | graph_hash = tags[2] 193 | if graph_len > 2 and graph_hash not in unique_hashes: 194 | docs.append(text) 195 | unique_hashes.add(graph_hash) 196 | else: 197 | self.graph_files.remove(graph_file) 198 | 199 | document_collections = docs 200 | model = Doc2Vec(document_collections, 201 | vector_size=self.embedding_size, 202 | window=0, 203 | min_count=self.min_count, 204 | dm=0, 205 | sample=self.down_sampling, 206 | workers=self.workers, 207 | epochs=self.epochs, 208 | alpha=self.learning_rate) 209 | out = [] 210 | for f in self.graph_files: 211 | out.append(list(model.docvecs[f])) 212 | self.final_embeddings = out 213 | -------------------------------------------------------------------------------- /learning/graph2vec/skipgram.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | from time import time 4 | 5 | import tensorflow as tf 6 | 7 | 8 | class skipgram(object): 9 | ''' 10 | skipgram model - refer Mikolov et al (2013) 11 | ''' 12 | 13 | def __init__(self, num_graphs, num_subgraphs, learning_rate, embedding_size, 14 | num_negsample, num_steps, corpus, arguments): 15 | self.num_graphs = num_graphs 16 | self.num_subgraphs = num_subgraphs 17 | self.embedding_size = embedding_size 18 | self.num_negsample = num_negsample 19 | self.learning_rate = learning_rate 20 | self.num_steps = num_steps 21 | self.corpus = corpus 22 | self.graph, self.batch_inputs, self.batch_labels, self.normalized_embeddings, \ 23 | self.loss, self.optimizer = self.trainer_initial() 24 | self.arguments = arguments 25 | 26 | def trainer_initial(self): 27 | graph = tf.Graph() 28 | with graph.as_default(): 29 | batch_inputs = tf.placeholder(tf.int32, shape=([None, ])) 30 | batch_labels = tf.placeholder(tf.int64, shape=([None, 1])) 31 | num_negsample = self.num_negsample 32 | if self.num_subgraphs < num_negsample: 33 | num_negsample = self.num_subgraphs 
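            # The fixed_unigram_candidate_sampler below draws `num_negsample` *unique* negative
            # classes out of `num_subgraphs` possible classes, so it can never request more samples
            # than the vocabulary contains; the clamp above keeps very small corpora from failing.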
34 | 35 | graph_embeddings = tf.Variable( 36 | tf.random_uniform([self.num_graphs, self.embedding_size], -0.5 / self.embedding_size, 37 | 0.5 / self.embedding_size)) 38 | 39 | batch_graph_embeddings = tf.nn.embedding_lookup(graph_embeddings, batch_inputs) # hidden layer 40 | 41 | weights = tf.Variable(tf.truncated_normal([self.num_subgraphs, self.embedding_size], 42 | stddev=1.0 / math.sqrt(self.embedding_size))) # output layer wt 43 | biases = tf.Variable(tf.zeros(self.num_subgraphs)) # output layer biases 44 | 45 | # negative sampling part 46 | loss = tf.reduce_mean( 47 | tf.nn.nce_loss(weights=weights, 48 | biases=biases, 49 | labels=batch_labels, 50 | inputs=batch_graph_embeddings, 51 | num_sampled=self.num_negsample, 52 | num_classes=self.num_subgraphs, 53 | sampled_values=tf.nn.fixed_unigram_candidate_sampler( 54 | true_classes=batch_labels, 55 | num_true=1, 56 | num_sampled=num_negsample, 57 | unique=True, 58 | range_max=self.num_subgraphs, 59 | distortion=0.75, 60 | unigrams=self.corpus.subgraph_id_freq_map_as_list) # word_id_freq_map_as_list is the 61 | # frequency of each word in vocabulary 62 | )) 63 | 64 | global_step = tf.Variable(0, trainable=False) 65 | learning_rate = tf.train.exponential_decay(self.learning_rate, 66 | global_step, 100000, 0.96, 67 | staircase=True) # linear decay over time 68 | 69 | learning_rate = tf.maximum(learning_rate, 70 | 0.001) # cannot go below 0.001 to ensure at least a minimal learning 71 | 72 | optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step) 73 | 74 | norm = tf.sqrt(tf.reduce_mean(tf.square(graph_embeddings), 1, keep_dims=True)) 75 | normalized_embeddings = graph_embeddings / norm 76 | 77 | return graph, batch_inputs, batch_labels, normalized_embeddings, loss, optimizer 78 | 79 | def train(self, corpus, batch_size): 80 | with tf.Session(graph=self.graph, 81 | config=tf.ConfigProto(log_device_placement=False, allow_soft_placement=True, 82 | intra_op_parallelism_threads=self.arguments.count_cpu, 83 | inter_op_parallelism_threads=self.arguments.count_cpu, 84 | device_count={'CPU': self.arguments.count_cpu})) as sess: 85 | 86 | init = tf.global_variables_initializer() 87 | sess.run(init) 88 | 89 | loss = 0 90 | 91 | for i in xrange(self.num_steps): 92 | t0 = time() 93 | step = 0 94 | while corpus.epoch_flag == False: 95 | batch_data, batch_labels = corpus.generate_batch_from_file( 96 | batch_size) # get (target,context) wordid tuples 97 | 98 | feed_dict = {self.batch_inputs: batch_data, self.batch_labels: batch_labels} 99 | _, loss_val = sess.run([self.optimizer, self.loss], feed_dict=feed_dict) 100 | 101 | loss += loss_val 102 | 103 | if step % 100 == 0: 104 | if step > 0: 105 | average_loss = loss / step 106 | logging.info('Epoch: %d : Average loss for step: %d : %f' % (i, step, average_loss)) 107 | step += 1 108 | 109 | corpus.epoch_flag = False 110 | epoch_time = time() - t0 111 | logging.info('######################### Epoch: %d : %f, %.2f sec. 
#####################' % ( 112 | i, loss / step, epoch_time)) 113 | loss = 0 114 | 115 | # done with training 116 | final_embeddings = self.normalized_embeddings.eval() 117 | return final_embeddings 118 | -------------------------------------------------------------------------------- /learning/graph2vec/train_utils.py: -------------------------------------------------------------------------------- 1 | from corpus_parser import Corpus 2 | from skipgram import skipgram 3 | 4 | 5 | def train_skipgram(fnames, extn, learning_rate, embedding_size, num_negsample, epochs, 6 | batch_size, arguments): # , output_dir): 7 | ''' 8 | 9 | :param corpus_dir: folder containing WL kernel relabeled files. All the files in this folder will be relabled 10 | according to WL relabeling strategy and the format of each line in these folders shall be: .... 11 | :param extn: Extension of the WL relabled file 12 | :param learning_rate: learning rate for the skipgram model (will involve a linear decay) 13 | :param embedding_size: number of dimensions to be used for learning subgraph representations 14 | :param num_negsample: number of negative samples to be used by the skipgram model 15 | :param epochs: number of iterations the dataset is traversed by the skipgram model 16 | :param batch_size: size of each batch for the skipgram model 17 | :param output_dir: the folder where embedding file will be stored 18 | :return: name of the file that contains the subgraph embeddings (in word2vec format proposed by Mikolov et al (2013)) 19 | ''' 20 | 21 | # op_fname = '_'.join([os.path.basename(corpus_dir), 'dims', str(embedding_size), 'epochs', 22 | # str(epochs),'lr',str(learning_rate),'embeddings.txt']) 23 | # op_fname = os.path.join(output_dir, op_fname) 24 | # if os.path.isfile(op_fname): 25 | # logging.info('The embedding file: {} is already present, hence NOT training skipgram model ' 26 | # 'for subgraph vectors'.format(op_fname)) 27 | # return op_fname 28 | 29 | print "Initializing SKIPGRAM..." 30 | corpus = Corpus(fnames, extn=extn, max_files=0) # just load 'max_files' files from this folder 31 | corpus.scan_and_load_corpus() 32 | 33 | model_skipgram = skipgram( 34 | num_graphs=corpus.num_graphs, 35 | num_subgraphs=corpus.num_subgraphs, 36 | learning_rate=learning_rate, 37 | embedding_size=embedding_size, 38 | num_negsample=num_negsample, 39 | num_steps=epochs, # no. 
of time the training set will be iterated through 40 | corpus=corpus, # data set of (target,context) tuples 41 | arguments=arguments 42 | ) 43 | 44 | final_embeddings = model_skipgram.train(corpus=corpus, batch_size=batch_size) 45 | 46 | # logging.info('Write the matrix to a word2vec format file') 47 | # save_graph_embeddings(corpus, final_embeddings, op_fname) 48 | # logging.info('Completed writing the final embeddings, pls check file: {} for the same'.format(op_fname)) 49 | # return op_fname 50 | return corpus, final_embeddings 51 | 52 | 53 | if __name__ == '__main__': 54 | pass 55 | -------------------------------------------------------------------------------- /learning/graph2vec/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | 5 | def get_files_from_dir(dirname, extn, max_files=0): 6 | all_files = [os.path.join(dirname, f) for f in os.listdir(dirname) if f.endswith(extn)] 7 | for root, dirs, files in os.walk(dirname): 8 | for f in files: 9 | if f.endswith(extn): 10 | all_files.append(os.path.join(root, f)) 11 | 12 | all_files = list(set(all_files)) 13 | all_files.sort() 14 | if max_files: 15 | return all_files[:max_files] 16 | else: 17 | return all_files 18 | 19 | 20 | def get_files(fnames, extn, max_files=0): 21 | all_files = ['{}.{}'.format(f, extn) for f in fnames] 22 | for file_name in all_files: 23 | if not os.path.isfile(file_name): 24 | print 'error, missing graph', file_name 25 | all_files = list(set(all_files)) 26 | all_files.sort() 27 | if max_files: 28 | return all_files[:max_files] 29 | else: 30 | return all_files 31 | 32 | 33 | def save_graph_embeddings(corpus, final_embeddings, opfname): 34 | dict_to_save = {} 35 | for i in range(len(final_embeddings)): 36 | graph_fname = corpus._id_to_graph_name_map[i] 37 | graph_embedding = final_embeddings[i, :].tolist() 38 | dict_to_save[graph_fname] = graph_embedding 39 | 40 | with open(opfname, 'w') as fh: 41 | json.dump(dict_to_save, fh, indent=4) 42 | 43 | 44 | def get_class_labels(graph_files, class_labels_fname): 45 | graph_to_class_label_map = {l.split()[0].split('.')[0]: int(l.split()[1].strip()) for l in open(class_labels_fname)} 46 | labels = [graph_to_class_label_map[os.path.basename(g).split('.')[0]] for g in graph_files] 47 | return labels 48 | 49 | 50 | if __name__ == '__main__': 51 | print 'nothing to do' 52 | -------------------------------------------------------------------------------- /learning/graphkernel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiS3-Lab/FICS/82c8abef52ca943946b7e82a16998cf67f1d2049/learning/graphkernel/__init__.py -------------------------------------------------------------------------------- /learning/graphkernel/weisfeiler_lehman.py: -------------------------------------------------------------------------------- 1 | """Weisfeiler_Lehman graph kernel. 2 | Python implementation based on: "Weisfeiler-Lehman Graph Kernels", by: 3 | Nino Shervashidze, Pascal Schweitzer, Erik J. van Leeuwen, Kurt 4 | Mehlhorn, Karsten M. Borgwardt, JMLR, 2012. 5 | http://jmlr.csail.mit.edu/papers/v12/shervashidze11a.html 6 | Author : Sandro Vega-Pons, Emanuele Olivetti 7 | """ 8 | 9 | import numpy as np 10 | import networkx as nx 11 | import copy 12 | 13 | import pandas as pd 14 | 15 | 16 | class GK_WL(): 17 | """ 18 | Weisfeiler_Lehman graph kernel. 
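    compare_list() computes the normalized kernel matrix for a list of graphs;
    compare() returns the kernel value for a single pair of graphs.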
19 | """ 20 | 21 | def compare_list(self, graph_list, h=1, node_label=True): 22 | """Compute the all-pairs kernel values for a list of graphs. 23 | This function can be used to directly compute the kernel 24 | matrix for a list of graphs. The direct computation of the 25 | kernel matrix is faster than the computation of all individual 26 | pairwise kernel values. 27 | Parameters 28 | ---------- 29 | graph_list: list 30 | A list of graphs (list of networkx graphs) 31 | h : interger 32 | Number of iterations. 33 | node_label : boolean 34 | Whether to use original node labels. True for using node labels 35 | saved in the attribute 'node_label'. False for using the node 36 | degree of each node as node attribute. 37 | Return 38 | ------ 39 | K: numpy.array, shape = (len(graph_list), len(graph_list)) 40 | The similarity matrix of all graphs in graph_list. 41 | """ 42 | graph_list = self.convert_node_names_to_int(graph_list) 43 | self.graphs = graph_list 44 | n = len(graph_list) 45 | lists = [0] * n 46 | k = [0] * (h + 1) 47 | n_nodes = 0 48 | n_max = 0 49 | 50 | # Compute adjacency lists and n_nodes, the total number of 51 | # nodes in the dataset. 52 | for i in range(n): 53 | self.get_adj_list(graph_list, i, lists) 54 | n_nodes = n_nodes + graph_list[i].number_of_nodes() 55 | 56 | # Computing the maximum number of nodes in the graphs. It 57 | # will be used in the computation of vectorial 58 | # representation. 59 | if (n_max < graph_list[i].number_of_nodes()): 60 | n_max = graph_list[i].number_of_nodes() 61 | 62 | phi = np.zeros((n_max, n), dtype=np.uint64) 63 | 64 | # INITIALIZATION: initialize the nodes labels for each graph 65 | # with their labels or with degrees (for unlabeled graphs) 66 | 67 | labels = [0] * n 68 | label_lookup = {} 69 | label_counter = 0 70 | 71 | # label_lookup is an associative array, which will contain the 72 | # mapping from multiset labels (strings) to short labels 73 | # (integers) 74 | 75 | if node_label is True: 76 | for i in range(n): 77 | l_aux = nx.get_node_attributes(graph_list[i], 78 | 'label').values() 79 | # It is assumed that the graph has an attribute 80 | # 'label' 81 | labels[i] = np.zeros(len(l_aux), dtype=np.int32) 82 | 83 | for j in range(len(l_aux)): 84 | if not (l_aux[j] in label_lookup): 85 | label_lookup[l_aux[j]] = label_counter 86 | labels[i][j] = label_counter 87 | label_counter += 1 88 | else: 89 | labels[i][j] = label_lookup[l_aux[j]] 90 | # labels are associated to a natural number 91 | # starting with 0. 92 | phi[labels[i][j], i] += 1 93 | else: 94 | for i in range(n): 95 | labels[i] = np.array(graph_list[i].degree().values()) 96 | for j in range(len(labels[i])): 97 | phi[labels[i][j], i] += 1 98 | 99 | # Simplified vectorial representation of graphs (just taking 100 | # the vectors before the kernel iterations), i.e., it is just 101 | # the original nodes degree. 
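        # phi is a (label x graph) count matrix; the base kernel is seeded below as phi^T * phi,
        # i.e. the pairwise dot products of the per-graph label histograms, and every WL
        # iteration adds its own phi^T * phi term before the final normalization.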
102 | self.vectors = np.copy(phi.transpose()) 103 | 104 | k = np.dot(phi.transpose(), phi) 105 | 106 | # MAIN LOOP 107 | it = 0 108 | new_labels = copy.deepcopy(labels) 109 | 110 | while it < h: 111 | # create an empty lookup table 112 | label_lookup = {} 113 | label_counter = 0 114 | 115 | phi = np.zeros((n_nodes, n), dtype=np.uint64) 116 | for i in range(n): 117 | for v in range(len(lists[i])): 118 | # form a multiset label of the node v of the i'th graph 119 | # and convert it to a string 120 | long_label = np.concatenate((np.array([labels[i][v]]), 121 | np.sort(labels[i] 122 | [lists[i][v]]))) 123 | long_label_string = str(long_label) 124 | # if the multiset label has not yet occurred, add it to the 125 | # lookup table and assign a number to it 126 | if not (long_label_string in label_lookup): 127 | label_lookup[long_label_string] = label_counter 128 | new_labels[i][v] = label_counter 129 | label_counter += 1 130 | else: 131 | new_labels[i][v] = label_lookup[long_label_string] 132 | # fill the column for i'th graph in phi 133 | aux = np.bincount(new_labels[i]) 134 | # phi[new_labels[i], i] += aux[new_labels[i]] 135 | np.add(phi[new_labels[i], i], aux[new_labels[i]], out=phi[new_labels[i], i], casting="unsafe") 136 | 137 | k += np.dot(phi.transpose(), phi) 138 | labels = copy.deepcopy(new_labels) 139 | it = it + 1 140 | 141 | # Compute the normalized version of the kernel 142 | k_norm = np.zeros(k.shape) 143 | for i in range(k.shape[0]): 144 | for j in range(k.shape[1]): 145 | k_norm[i, j] = k[i, j] / np.sqrt(k[i, i] * k[j, j]) 146 | 147 | return k_norm 148 | 149 | def convert_node_names_to_int(self, graph_list): 150 | graphs = [] 151 | for i in range(len(graph_list)): 152 | # nodes = list(graph.nodes) 153 | # mapping = zip(nodes, pd.Series(nodes).astype('category').cat.codes.values) 154 | graphs.append(nx.convert_node_labels_to_integers(graph_list[i])) 155 | # print graph_list[i].nodes 156 | return graphs 157 | 158 | def get_adj_list(self, graph_list, i, lists): 159 | adj_list = [] 160 | for n, nbrdict in graph_list[i].adjacency(): 161 | adj_list.append(nbrdict.keys()) 162 | print adj_list 163 | lists[i] = adj_list 164 | 165 | def compare(self, g_1, g_2, h=1, node_label=True): 166 | """Compute the kernel value (similarity) between two graphs. 167 | The kernel is normalized to [0,1] by the equation: 168 | k_norm(g1, g2) = k(g1, g2) / sqrt(k(g1,g1) * k(g2,g2)) 169 | Parameters 170 | ---------- 171 | g_1 : networkx.Graph 172 | First graph. 173 | g_2 : networkx.Graph 174 | Second graph. 175 | h : interger 176 | Number of iterations. 177 | node_label : boolean 178 | Whether to use the values under the graph attribute 'node_label' 179 | as node labels. If False, the degree of the nodes are used as 180 | labels. 181 | Returns 182 | ------- 183 | k : The similarity value between g1 and g2. 184 | """ 185 | gl = [g_1, g_2] 186 | return self.compare_list(gl, h, node_label)[0, 1] 187 | -------------------------------------------------------------------------------- /learning/node2vec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiS3-Lab/FICS/82c8abef52ca943946b7e82a16998cf67f1d2049/learning/node2vec/__init__.py -------------------------------------------------------------------------------- /learning/node2vec/main.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Reference implementation of node2vec. 
3 | 4 | Author: Aditya Grover 5 | 6 | For more details, refer to the paper: 7 | node2vec: Scalable Feature Learning for Networks 8 | Aditya Grover and Jure Leskovec 9 | Knowledge Discovery and Data Mining (KDD), 2016 10 | ''' 11 | 12 | import argparse 13 | import numpy as np 14 | import networkx as nx 15 | import node2vec 16 | from gensim.models import Word2Vec 17 | 18 | def parse_args(): 19 | ''' 20 | Parses the node2vec arguments. 21 | ''' 22 | parser = argparse.ArgumentParser(description="Run node2vec.") 23 | 24 | parser.add_argument('--input', nargs='?', default='graph/karate.edgelist', 25 | help='Input graph path') 26 | 27 | parser.add_argument('--output', nargs='?', default='emb/karate.emb', 28 | help='Embeddings path') 29 | 30 | parser.add_argument('--dimensions', type=int, default=128, 31 | help='Number of dimensions. Default is 128.') 32 | 33 | parser.add_argument('--walk-length', type=int, default=80, 34 | help='Length of walk per source. Default is 80.') 35 | 36 | parser.add_argument('--num-walks', type=int, default=10, 37 | help='Number of walks per source. Default is 10.') 38 | 39 | parser.add_argument('--window-size', type=int, default=10, 40 | help='Context size for optimization. Default is 10.') 41 | 42 | parser.add_argument('--iter', default=1, type=int, 43 | help='Number of epochs in SGD') 44 | 45 | parser.add_argument('--workers', type=int, default=8, 46 | help='Number of parallel workers. Default is 8.') 47 | 48 | parser.add_argument('--p', type=float, default=1, 49 | help='Return hyperparameter. Default is 1.') 50 | 51 | parser.add_argument('--q', type=float, default=1, 52 | help='Inout hyperparameter. Default is 1.') 53 | 54 | parser.add_argument('--weighted', dest='weighted', action='store_true', 55 | help='Boolean specifying (un)weighted. Default is unweighted.') 56 | parser.add_argument('--unweighted', dest='unweighted', action='store_false') 57 | parser.set_defaults(weighted=False) 58 | 59 | parser.add_argument('--directed', dest='directed', action='store_true', 60 | help='Graph is (un)directed. Default is undirected.') 61 | parser.add_argument('--undirected', dest='undirected', action='store_false') 62 | parser.set_defaults(directed=False) 63 | 64 | return parser.parse_args() 65 | 66 | def read_graph(): 67 | ''' 68 | Reads the input network in networkx. 69 | ''' 70 | if args.weighted: 71 | G = nx.read_edgelist(args.input, nodetype=int, data=(('weight',float),), create_using=nx.DiGraph()) 72 | else: 73 | G = nx.read_edgelist(args.input, nodetype=int, create_using=nx.DiGraph()) 74 | for edge in G.edges(): 75 | G[edge[0]][edge[1]]['weight'] = 1 76 | 77 | if not args.directed: 78 | G = G.to_undirected() 79 | 80 | return G 81 | 82 | def learn_embeddings(walks): 83 | ''' 84 | Learn embeddings by optimizing the Skipgram objective using SGD. 85 | ''' 86 | walks = [map(str, walk) for walk in walks] 87 | model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, sg=1, workers=args.workers, iter=args.iter) 88 | model.save_word2vec_format(args.output) 89 | 90 | return 91 | 92 | def main(args): 93 | ''' 94 | Pipeline for representational learning for all nodes in a graph. 
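    Reads the edge list, precomputes the transition probabilities, simulates the random
    walks, and learns the node embeddings with Word2Vec.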
95 | ''' 96 | nx_G = read_graph() 97 | G = node2vec.Graph(nx_G, args.directed, args.p, args.q) 98 | G.preprocess_transition_probs() 99 | walks = G.simulate_walks(args.num_walks, args.walk_length) 100 | learn_embeddings(walks) 101 | 102 | if __name__ == "__main__": 103 | args = parse_args() 104 | main(args) 105 | -------------------------------------------------------------------------------- /learning/node2vec/node2vec.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import networkx as nx 3 | import random 4 | 5 | 6 | class Graph(): 7 | def __init__(self, nx_G, is_directed, p, q): 8 | self.G = nx_G 9 | self.is_directed = is_directed 10 | self.p = p 11 | self.q = q 12 | 13 | def node2vec_walk(self, walk_length, start_node): 14 | ''' 15 | Simulate a random walk starting from start node. 16 | ''' 17 | G = self.G 18 | alias_nodes = self.alias_nodes 19 | alias_edges = self.alias_edges 20 | 21 | walk = [start_node] 22 | 23 | while len(walk) < walk_length: 24 | cur = walk[-1] 25 | cur_nbrs = sorted(G.neighbors(cur)) 26 | if len(cur_nbrs) > 0: 27 | if len(walk) == 1: 28 | walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])]) 29 | else: 30 | prev = walk[-2] 31 | next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0], 32 | alias_edges[(prev, cur)][1])] 33 | walk.append(next) 34 | else: 35 | break 36 | 37 | return walk 38 | 39 | def simulate_walks(self, num_walks, walk_length): 40 | ''' 41 | Repeatedly simulate random walks from each node. 42 | ''' 43 | G = self.G 44 | walks = [] 45 | nodes = list(G.nodes()) 46 | # print 'Walk iteration:' 47 | for walk_iter in range(num_walks): 48 | # print str(walk_iter + 1), '/', str(num_walks) 49 | random.shuffle(nodes) 50 | for node in nodes: 51 | walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node)) 52 | 53 | return walks 54 | 55 | def get_alias_edge(self, src, dst): 56 | ''' 57 | Get the alias edge setup lists for a given edge. 58 | ''' 59 | G = self.G 60 | p = self.p 61 | q = self.q 62 | 63 | unnormalized_probs = [] 64 | for dst_nbr in sorted(G.neighbors(dst)): 65 | if dst_nbr == src: 66 | unnormalized_probs.append(G[dst][dst_nbr]['weight'] / p) 67 | elif G.has_edge(dst_nbr, src): 68 | unnormalized_probs.append(G[dst][dst_nbr]['weight']) 69 | else: 70 | unnormalized_probs.append(G[dst][dst_nbr]['weight'] / q) 71 | norm_const = sum(unnormalized_probs) 72 | normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs] 73 | 74 | return alias_setup(normalized_probs) 75 | 76 | def preprocess_transition_probs(self): 77 | ''' 78 | Preprocessing of transition probabilities for guiding the random walks. 
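        Builds alias-sampling tables for every node and every edge so that each step of a
        biased walk can be drawn in O(1) time.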
79 | ''' 80 | G = self.G 81 | is_directed = self.is_directed 82 | 83 | alias_nodes = {} 84 | for node in G.nodes(): 85 | unnormalized_probs = [G[node][nbr]['weight'] for nbr in sorted(G.neighbors(node))] 86 | norm_const = sum(unnormalized_probs) 87 | normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs] 88 | alias_nodes[node] = alias_setup(normalized_probs) 89 | 90 | alias_edges = {} 91 | triads = {} 92 | 93 | if is_directed: 94 | for edge in G.edges(): 95 | alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) 96 | else: 97 | for edge in G.edges(): 98 | alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) 99 | alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0]) 100 | 101 | self.alias_nodes = alias_nodes 102 | self.alias_edges = alias_edges 103 | 104 | return 105 | 106 | 107 | def alias_setup(probs): 108 | ''' 109 | Compute utility lists for non-uniform sampling from discrete distributions. 110 | Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/ 111 | for details 112 | ''' 113 | K = len(probs) 114 | q = np.zeros(K) 115 | J = np.zeros(K, dtype=np.int) 116 | 117 | smaller = [] 118 | larger = [] 119 | for kk, prob in enumerate(probs): 120 | q[kk] = K * prob 121 | if q[kk] < 1.0: 122 | smaller.append(kk) 123 | else: 124 | larger.append(kk) 125 | 126 | while len(smaller) > 0 and len(larger) > 0: 127 | small = smaller.pop() 128 | large = larger.pop() 129 | 130 | J[small] = large 131 | q[large] = q[large] + q[small] - 1.0 132 | if q[large] < 1.0: 133 | smaller.append(large) 134 | else: 135 | larger.append(large) 136 | 137 | return J, q 138 | 139 | 140 | def alias_draw(J, q): 141 | ''' 142 | Draw sample from a non-uniform discrete distribution using alias sampling. 
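    J holds the alias indices and q the acceptance probabilities produced by alias_setup;
    a single draw costs O(1).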
143 | ''' 144 | K = len(J) 145 | 146 | kk = int(np.floor(np.random.rand() * K)) 147 | if np.random.rand() < q[kk]: 148 | return kk 149 | else: 150 | return J[kk] 151 | -------------------------------------------------------------------------------- /learning/similarity.py: -------------------------------------------------------------------------------- 1 | 2 | import math 3 | 4 | import networkx as nx 5 | 6 | 7 | def counter_cosine_similarity(c1, c2): 8 | terms = set(c1).union(c2) 9 | dotprod = sum(c1.get(k, 0) * c2.get(k, 0) for k in terms) 10 | magA = math.sqrt(sum(c1.get(k, 0) ** 2 for k in terms)) 11 | magB = math.sqrt(sum(c2.get(k, 0) ** 2 for k in terms)) 12 | return dotprod / (magA * magB) 13 | 14 | 15 | def get_graph_similarity(graph1, graph2): 16 | laplacian1 = nx.spectrum.laplacian_spectrum(graph1) 17 | laplacian2 = nx.spectrum.laplacian_spectrum(graph2) 18 | k1 = select_k(laplacian1) 19 | k2 = select_k(laplacian2) 20 | k = min(k1, k2) 21 | 22 | similarity = sum((laplacian1[:k] - laplacian2[:k]) ** 2) 23 | 24 | return similarity 25 | 26 | 27 | def select_k(spectrum, minimum_energy=1): 28 | running_total = 0.0 29 | total = sum(spectrum) 30 | if total == 0.0: 31 | return len(spectrum) 32 | for i in range(len(spectrum)): 33 | running_total += spectrum[i] 34 | if running_total / total >= minimum_energy: 35 | return i + 1 36 | return len(spectrum) 37 | -------------------------------------------------------------------------------- /learning/struc2vec/algorithms.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from time import time 3 | from collections import deque 4 | import numpy as np 5 | import math,random,logging 6 | from concurrent.futures import ProcessPoolExecutor, as_completed 7 | import multiprocessing as mp 8 | from collections import defaultdict 9 | 10 | from utils import * 11 | 12 | 13 | def generate_parameters_random_walk(workers): 14 | 15 | logging.info('Loading distances_nets from disk...') 16 | 17 | sum_weights = {} 18 | amount_edges = {} 19 | 20 | layer = 0 21 | while(isPickle('distances_nets_weights-layer-'+str(layer))): 22 | logging.info('Executing layer {}...'.format(layer)) 23 | weights = restoreVariableFromDisk('distances_nets_weights-layer-'+str(layer)) 24 | 25 | for k,list_weights in weights.iteritems(): 26 | if(layer not in sum_weights): 27 | sum_weights[layer] = 0 28 | if(layer not in amount_edges): 29 | amount_edges[layer] = 0 30 | 31 | for w in list_weights: 32 | sum_weights[layer] += w 33 | amount_edges[layer] += 1 34 | 35 | logging.info('Layer {} executed.'.format(layer)) 36 | layer += 1 37 | 38 | average_weight = {} 39 | for layer in sum_weights.keys(): 40 | average_weight[layer] = sum_weights[layer] / amount_edges[layer] 41 | 42 | logging.info("Saving average_weights on disk...") 43 | saveVariableOnDisk(average_weight,'average_weight') 44 | 45 | amount_neighbours = {} 46 | 47 | layer = 0 48 | while(isPickle('distances_nets_weights-layer-'+str(layer))): 49 | logging.info('Executing layer {}...'.format(layer)) 50 | weights = restoreVariableFromDisk('distances_nets_weights-layer-'+str(layer)) 51 | 52 | amount_neighbours[layer] = {} 53 | 54 | for k,list_weights in weights.iteritems(): 55 | cont_neighbours = 0 56 | for w in list_weights: 57 | if(w > average_weight[layer]): 58 | cont_neighbours += 1 59 | amount_neighbours[layer][k] = cont_neighbours 60 | 61 | logging.info('Layer {} executed.'.format(layer)) 62 | layer += 1 63 | 64 | logging.info("Saving amount_neighbours on disk...") 65 | 
saveVariableOnDisk(amount_neighbours,'amount_neighbours') 66 | 67 | def chooseNeighbor(v,graphs,alias_method_j,alias_method_q,layer): 68 | v_list = graphs[layer][v] 69 | 70 | idx = alias_draw(alias_method_j[layer][v],alias_method_q[layer][v]) 71 | v = v_list[idx] 72 | 73 | return v 74 | 75 | 76 | def exec_random_walk(graphs,alias_method_j,alias_method_q,v,walk_length,amount_neighbours): 77 | original_v = v 78 | t0 = time() 79 | initialLayer = 0 80 | layer = initialLayer 81 | 82 | 83 | path = deque() 84 | path.append(v) 85 | 86 | while len(path) < walk_length: 87 | r = random.random() 88 | 89 | if(r < 0.3): 90 | v = chooseNeighbor(v,graphs,alias_method_j,alias_method_q,layer) 91 | path.append(v) 92 | 93 | else: 94 | r = random.random() 95 | limiar_moveup = prob_moveup(amount_neighbours[layer][v]) 96 | if(r > limiar_moveup): 97 | if(layer > initialLayer): 98 | layer = layer - 1 99 | else: 100 | if((layer + 1) in graphs and v in graphs[layer + 1]): 101 | layer = layer + 1 102 | 103 | t1 = time() 104 | logging.info('RW - vertex {}. Time : {}s'.format(original_v,(t1-t0))) 105 | 106 | return path 107 | 108 | 109 | def exec_ramdom_walks_for_chunck(vertices,graphs,alias_method_j,alias_method_q,walk_length,amount_neighbours): 110 | walks = deque() 111 | for v in vertices: 112 | walks.append(exec_random_walk(graphs,alias_method_j,alias_method_q,v,walk_length,amount_neighbours)) 113 | return walks 114 | 115 | def generate_random_walks_large_graphs(num_walks,walk_length,workers,vertices): 116 | 117 | logging.info('Loading distances_nets from disk...') 118 | 119 | graphs = restoreVariableFromDisk('distances_nets_graphs') 120 | alias_method_j = restoreVariableFromDisk('nets_weights_alias_method_j') 121 | alias_method_q = restoreVariableFromDisk('nets_weights_alias_method_q') 122 | amount_neighbours = restoreVariableFromDisk('amount_neighbours') 123 | 124 | logging.info('Creating RWs...') 125 | t0 = time() 126 | 127 | walks = deque() 128 | initialLayer = 0 129 | 130 | parts = workers 131 | 132 | with ProcessPoolExecutor(max_workers=workers) as executor: 133 | 134 | for walk_iter in range(num_walks): 135 | random.shuffle(vertices) 136 | logging.info("Execution iteration {} ...".format(walk_iter)) 137 | walk = exec_ramdom_walks_for_chunck(vertices,graphs,alias_method_j,alias_method_q,walk_length,amount_neighbours) 138 | walks.extend(walk) 139 | logging.info("Iteration {} executed.".format(walk_iter)) 140 | 141 | 142 | 143 | t1 = time() 144 | logging.info('RWs created. 
Time : {}m'.format((t1-t0)/60)) 145 | logging.info("Saving Random Walks on disk...") 146 | save_random_walks(walks) 147 | 148 | def generate_random_walks(num_walks,walk_length,workers,vertices): 149 | 150 | logging.info('Loading distances_nets on disk...') 151 | 152 | graphs = restoreVariableFromDisk('distances_nets_graphs') 153 | alias_method_j = restoreVariableFromDisk('nets_weights_alias_method_j') 154 | alias_method_q = restoreVariableFromDisk('nets_weights_alias_method_q') 155 | amount_neighbours = restoreVariableFromDisk('amount_neighbours') 156 | 157 | logging.info('Creating RWs...') 158 | t0 = time() 159 | 160 | walks = deque() 161 | initialLayer = 0 162 | 163 | if(workers > num_walks): 164 | workers = num_walks 165 | 166 | with ProcessPoolExecutor(max_workers=workers) as executor: 167 | futures = {} 168 | for walk_iter in range(num_walks): 169 | random.shuffle(vertices) 170 | job = executor.submit(exec_ramdom_walks_for_chunck,vertices,graphs,alias_method_j,alias_method_q,walk_length,amount_neighbours) 171 | futures[job] = walk_iter 172 | #part += 1 173 | logging.info("Receiving results...") 174 | for job in as_completed(futures): 175 | walk = job.result() 176 | r = futures[job] 177 | logging.info("Iteration {} executed.".format(r)) 178 | walks.extend(walk) 179 | del futures[job] 180 | 181 | 182 | t1 = time() 183 | logging.info('RWs created. Time: {}m'.format((t1-t0)/60)) 184 | logging.info("Saving Random Walks on disk...") 185 | save_random_walks(walks) 186 | 187 | def save_random_walks(walks): 188 | with open('random_walks.txt', 'w') as file: 189 | for walk in walks: 190 | line = '' 191 | for v in walk: 192 | line += str(v)+' ' 193 | line += '\n' 194 | file.write(line) 195 | return 196 | 197 | def prob_moveup(amount_neighbours): 198 | x = math.log(amount_neighbours + math.e) 199 | p = (x / ( x + 1)) 200 | return p 201 | 202 | 203 | 204 | def alias_draw(J, q): 205 | ''' 206 | Draw sample from a non-uniform discrete distribution using alias sampling. 
207 | ''' 208 | K = len(J) 209 | 210 | kk = int(np.floor(np.random.rand()*K)) 211 | if np.random.rand() < q[kk]: 212 | return kk 213 | else: 214 | return J[kk] 215 | -------------------------------------------------------------------------------- /learning/struc2vec/graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Graph utilities.""" 5 | 6 | import logging 7 | import sys 8 | import math 9 | from io import open 10 | from os import path 11 | from time import time 12 | from glob import glob 13 | from six.moves import range, zip, zip_longest 14 | from six import iterkeys 15 | from collections import defaultdict, Iterable 16 | from multiprocessing import cpu_count 17 | import random 18 | from random import shuffle 19 | from itertools import product,permutations 20 | import collections 21 | 22 | from concurrent.futures import ProcessPoolExecutor 23 | 24 | from multiprocessing import Pool 25 | from multiprocessing import cpu_count 26 | 27 | #novas importações 28 | import numpy as np 29 | import operator 30 | 31 | 32 | class Graph(defaultdict): 33 | """Efficient basic implementation of nx `Graph' – Undirected graphs with self loops""" 34 | def __init__(self): 35 | super(Graph, self).__init__(list) 36 | 37 | def nodes(self): 38 | return self.keys() 39 | 40 | def adjacency_iter(self): 41 | return self.iteritems() 42 | 43 | def subgraph(self, nodes={}): 44 | subgraph = Graph() 45 | 46 | for n in nodes: 47 | if n in self: 48 | subgraph[n] = [x for x in self[n] if x in nodes] 49 | 50 | return subgraph 51 | 52 | def make_undirected(self): 53 | 54 | t0 = time() 55 | 56 | for v in self.keys(): 57 | for other in self[v]: 58 | if v != other: 59 | self[other].append(v) 60 | 61 | t1 = time() 62 | #logger.info('make_directed: added missing edges {}s'.format(t1-t0)) 63 | 64 | self.make_consistent() 65 | return self 66 | 67 | def make_consistent(self): 68 | t0 = time() 69 | for k in iterkeys(self): 70 | self[k] = list(sorted(set(self[k]))) 71 | 72 | t1 = time() 73 | #logger.info('make_consistent: made consistent in {}s'.format(t1-t0)) 74 | 75 | #self.remove_self_loops() 76 | 77 | return self 78 | 79 | def remove_self_loops(self): 80 | 81 | removed = 0 82 | t0 = time() 83 | 84 | for x in self: 85 | if x in self[x]: 86 | self[x].remove(x) 87 | removed += 1 88 | 89 | t1 = time() 90 | 91 | #logger.info('remove_self_loops: removed {} loops in {}s'.format(removed, (t1-t0))) 92 | return self 93 | 94 | def check_self_loops(self): 95 | for x in self: 96 | for y in self[x]: 97 | if x == y: 98 | return True 99 | 100 | return False 101 | 102 | def has_edge(self, v1, v2): 103 | if v2 in self[v1] or v1 in self[v2]: 104 | return True 105 | return False 106 | 107 | def degree(self, nodes=None): 108 | if isinstance(nodes, Iterable): 109 | return {v:len(self[v]) for v in nodes} 110 | else: 111 | return len(self[nodes]) 112 | 113 | def order(self): 114 | "Returns the number of nodes in the graph" 115 | return len(self) 116 | 117 | def number_of_edges(self): 118 | "Returns the number of nodes in the graph" 119 | return sum([self.degree(x) for x in self.keys()])/2 120 | 121 | def number_of_nodes(self): 122 | "Returns the number of nodes in the graph" 123 | return self.order() 124 | 125 | def gToDict(self): 126 | d = {} 127 | for k,v in self.iteritems(): 128 | d[k] = v 129 | return d 130 | 131 | def printAdjList(self): 132 | for key,value in self.iteritems(): 133 | print (key,":",value) 134 | 135 | 136 | 137 | def clique(size): 
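    """Return the complete graph on nodes 1..size, built from all permutations of the node list."""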
138 | return from_adjlist(permutations(range(1,size+1))) 139 | 140 | # http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python 141 | def grouper(n, iterable, padvalue=None): 142 | "grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')" 143 | return zip_longest(*[iter(iterable)]*n, fillvalue=padvalue) 144 | 145 | def parse_adjacencylist(f): 146 | adjlist = [] 147 | for l in f: 148 | if l and l[0] != "#": 149 | introw = [int(x) for x in l.strip().split()] 150 | row = [introw[0]] 151 | row.extend(set(sorted(introw[1:]))) 152 | adjlist.extend([row]) 153 | 154 | return adjlist 155 | 156 | def parse_adjacencylist_unchecked(f): 157 | adjlist = [] 158 | for l in f: 159 | if l and l[0] != "#": 160 | adjlist.extend([[int(x) for x in l.strip().split()]]) 161 | return adjlist 162 | 163 | def load_adjacencylist(file_, undirected=False, chunksize=10000, unchecked=True): 164 | 165 | if unchecked: 166 | parse_func = parse_adjacencylist_unchecked 167 | convert_func = from_adjlist_unchecked 168 | else: 169 | parse_func = parse_adjacencylist 170 | convert_func = from_adjlist 171 | 172 | adjlist = [] 173 | 174 | t0 = time() 175 | 176 | with open(file_) as f: 177 | with ProcessPoolExecutor(max_workers=cpu_count()) as executor: 178 | total = 0 179 | for idx, adj_chunk in enumerate(executor.map(parse_func, grouper(int(chunksize), f))): 180 | adjlist.extend(adj_chunk) 181 | total += len(adj_chunk) 182 | 183 | t1 = time() 184 | 185 | logging.info('Parsed {} edges with {} chunks in {}s'.format(total, idx, t1-t0)) 186 | 187 | t0 = time() 188 | G = convert_func(adjlist) 189 | t1 = time() 190 | 191 | logging.info('Converted edges to graph in {}s'.format(t1-t0)) 192 | 193 | if undirected: 194 | t0 = time() 195 | G = G.make_undirected() 196 | t1 = time() 197 | logging.info('Made graph undirected in {}s'.format(t1-t0)) 198 | 199 | return G 200 | 201 | 202 | def load_edgelist(file_, undirected=True): 203 | G = Graph() 204 | with open(file_) as f: 205 | for l in f: 206 | if(len(l.strip().split()[:2]) > 1): 207 | x, y = l.strip().split()[:2] 208 | x = int(x) 209 | y = int(y) 210 | G[x].append(y) 211 | if undirected: 212 | G[y].append(x) 213 | else: 214 | x = l.strip().split()[:2] 215 | x = int(x[0]) 216 | G[x] = [] 217 | 218 | G.make_consistent() 219 | return G 220 | 221 | 222 | def load_matfile(file_, variable_name="network", undirected=True): 223 | mat_varables = loadmat(file_) 224 | mat_matrix = mat_varables[variable_name] 225 | 226 | return from_numpy(mat_matrix, undirected) 227 | 228 | 229 | def from_networkx(G_input, undirected=True): 230 | G = Graph() 231 | 232 | for idx, x in enumerate(G_input.nodes_iter()): 233 | for y in iterkeys(G_input[x]): 234 | G[x].append(y) 235 | 236 | if undirected: 237 | G.make_undirected() 238 | 239 | return G 240 | 241 | 242 | def from_numpy(x, undirected=True): 243 | G = Graph() 244 | 245 | if issparse(x): 246 | cx = x.tocoo() 247 | for i,j,v in zip(cx.row, cx.col, cx.data): 248 | G[i].append(j) 249 | else: 250 | raise Exception("Dense matrices not yet supported.") 251 | 252 | if undirected: 253 | G.make_undirected() 254 | 255 | G.make_consistent() 256 | return G 257 | 258 | 259 | def from_adjlist(adjlist): 260 | G = Graph() 261 | 262 | for row in adjlist: 263 | node = row[0] 264 | neighbors = row[1:] 265 | G[node] = list(sorted(set(neighbors))) 266 | 267 | return G 268 | 269 | 270 | def from_adjlist_unchecked(adjlist): 271 | G = Graph() 272 | 273 | for row in adjlist: 274 | node = row[0] 275 | neighbors = row[1:] 276 | 
G[node] = neighbors 277 | 278 | return G 279 | 280 | 281 | def from_dict(d): 282 | G = Graph() 283 | for k,v in d.iteritems(): 284 | G[k] = v 285 | 286 | return G 287 | 288 | -------------------------------------------------------------------------------- /learning/struc2vec/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse, logging 5 | import numpy as np 6 | import struc2vec 7 | from gensim.models import Word2Vec 8 | from gensim.models.word2vec import LineSentence 9 | from time import time 10 | 11 | import graph 12 | 13 | logging.basicConfig(filename='struc2vec.log',filemode='w',level=logging.DEBUG,format='%(asctime)s %(message)s') 14 | 15 | def parse_args(): 16 | ''' 17 | Parses the struc2vec arguments. 18 | ''' 19 | parser = argparse.ArgumentParser(description="Run struc2vec.") 20 | 21 | parser.add_argument('--input', nargs='?', default='graph/karate.edgelist', 22 | help='Input graph path') 23 | 24 | parser.add_argument('--output', nargs='?', default='emb/karate.emb', 25 | help='Embeddings path') 26 | 27 | parser.add_argument('--dimensions', type=int, default=128, 28 | help='Number of dimensions. Default is 128.') 29 | 30 | parser.add_argument('--walk-length', type=int, default=80, 31 | help='Length of walk per source. Default is 80.') 32 | 33 | parser.add_argument('--num-walks', type=int, default=10, 34 | help='Number of walks per source. Default is 10.') 35 | 36 | parser.add_argument('--window-size', type=int, default=10, 37 | help='Context size for optimization. Default is 10.') 38 | 39 | parser.add_argument('--until-layer', type=int, default=None, 40 | help='Calculation until the layer.') 41 | 42 | parser.add_argument('--iter', default=5, type=int, 43 | help='Number of epochs in SGD') 44 | 45 | parser.add_argument('--workers', type=int, default=4, 46 | help='Number of parallel workers. Default is 8.') 47 | 48 | parser.add_argument('--weighted', dest='weighted', action='store_true', 49 | help='Boolean specifying (un)weighted. Default is unweighted.') 50 | parser.add_argument('--unweighted', dest='unweighted', action='store_false') 51 | parser.set_defaults(weighted=False) 52 | 53 | parser.add_argument('--directed', dest='directed', action='store_true', 54 | help='Graph is (un)directed. Default is undirected.') 55 | parser.add_argument('--undirected', dest='undirected', action='store_false') 56 | parser.set_defaults(directed=False) 57 | 58 | parser.add_argument('--OPT1', default=False, type=bool, 59 | help='optimization 1') 60 | parser.add_argument('--OPT2', default=False, type=bool, 61 | help='optimization 2') 62 | parser.add_argument('--OPT3', default=False, type=bool, 63 | help='optimization 3') 64 | return parser.parse_args() 65 | 66 | def read_graph(): 67 | ''' 68 | Reads the input network. 69 | ''' 70 | logging.info(" - Loading graph...") 71 | G = graph.load_edgelist(args.input,undirected=True) 72 | logging.info(" - Graph loaded.") 73 | return G 74 | 75 | def learn_embeddings(): 76 | ''' 77 | Learn embeddings by optimizing the Skipgram objective using SGD. 
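    (Added clarification, hedged against the code below: the walks produced in the
    struc2vec phase -- presumably one walk per line in random_walks.txt -- are read
    back with gensim's LineSentence and fed to Word2Vec with sg=1 (skip-gram) and
    hs=1 (hierarchical softmax); the resulting per-node vectors are saved to
    args.output in word2vec text format.)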
78 | ''' 79 | logging.info("Initializing creation of the representations...") 80 | walks = LineSentence('random_walks.txt') 81 | model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, hs=1, sg=1, workers=args.workers, iter=args.iter) 82 | model.wv.save_word2vec_format(args.output) 83 | logging.info("Representations created.") 84 | 85 | return 86 | 87 | def exec_struc2vec(args): 88 | ''' 89 | Pipeline for representational learning for all nodes in a graph. 90 | ''' 91 | if(args.OPT3): 92 | until_layer = args.until_layer 93 | else: 94 | until_layer = None 95 | 96 | G = read_graph() 97 | G = struc2vec.Graph(G, args.directed, args.workers, untilLayer = until_layer) 98 | 99 | if(args.OPT1): 100 | G.preprocess_neighbors_with_bfs_compact() 101 | else: 102 | G.preprocess_neighbors_with_bfs() 103 | 104 | if(args.OPT2): 105 | G.create_vectors() 106 | G.calc_distances(compactDegree = args.OPT1) 107 | else: 108 | G.calc_distances_all_vertices(compactDegree = args.OPT1) 109 | 110 | 111 | G.create_distances_network() 112 | G.preprocess_parameters_random_walk() 113 | 114 | G.simulate_walks(args.num_walks, args.walk_length) 115 | 116 | 117 | return G 118 | 119 | def main(args): 120 | 121 | G = exec_struc2vec(args) 122 | 123 | learn_embeddings() 124 | 125 | 126 | if __name__ == "__main__": 127 | args = parse_args() 128 | main(args) 129 | 130 | -------------------------------------------------------------------------------- /learning/struc2vec/struc2vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import random,sys,logging 5 | from concurrent.futures import ProcessPoolExecutor, as_completed 6 | from multiprocessing import Manager 7 | from time import time 8 | from collections import deque 9 | 10 | from utils import * 11 | from algorithms import * 12 | from algorithms_distances import * 13 | import graph 14 | 15 | 16 | class Graph(): 17 | def __init__(self, g, is_directed, workers, untilLayer = None): 18 | 19 | logging.info(" - Converting graph to dict...") 20 | self.G = g.gToDict() 21 | logging.info("Graph converted.") 22 | 23 | self.num_vertices = g.number_of_nodes() 24 | self.num_edges = g.number_of_edges() 25 | self.is_directed = is_directed 26 | self.workers = workers 27 | self.calcUntilLayer = untilLayer 28 | logging.info('Graph - Number of vertices: {}'.format(self.num_vertices)) 29 | logging.info('Graph - Number of edges: {}'.format(self.num_edges)) 30 | 31 | 32 | def preprocess_neighbors_with_bfs(self): 33 | 34 | with ProcessPoolExecutor(max_workers=self.workers) as executor: 35 | job = executor.submit(exec_bfs,self.G,self.workers,self.calcUntilLayer) 36 | 37 | job.result() 38 | 39 | return 40 | 41 | def preprocess_neighbors_with_bfs_compact(self): 42 | 43 | with ProcessPoolExecutor(max_workers=self.workers) as executor: 44 | job = executor.submit(exec_bfs_compact,self.G,self.workers,self.calcUntilLayer) 45 | 46 | job.result() 47 | 48 | return 49 | 50 | def preprocess_degree_lists(self): 51 | 52 | with ProcessPoolExecutor(max_workers=self.workers) as executor: 53 | job = executor.submit(preprocess_degreeLists) 54 | 55 | job.result() 56 | 57 | return 58 | 59 | 60 | def create_vectors(self): 61 | logging.info("Creating degree vectors...") 62 | degrees = {} 63 | degrees_sorted = set() 64 | G = self.G 65 | for v in G.keys(): 66 | degree = len(G[v]) 67 | degrees_sorted.add(degree) 68 | if(degree not in degrees): 69 | degrees[degree] = {} 70 | degrees[degree]['vertices'] = 
deque() 71 | degrees[degree]['vertices'].append(v) 72 | degrees_sorted = np.array(list(degrees_sorted),dtype='int') 73 | degrees_sorted = np.sort(degrees_sorted) 74 | 75 | l = len(degrees_sorted) 76 | for index, degree in enumerate(degrees_sorted): 77 | if(index > 0): 78 | degrees[degree]['before'] = degrees_sorted[index - 1] 79 | if(index < (l - 1)): 80 | degrees[degree]['after'] = degrees_sorted[index + 1] 81 | logging.info("Degree vectors created.") 82 | logging.info("Saving degree vectors...") 83 | saveVariableOnDisk(degrees,'degrees_vector') 84 | 85 | 86 | def calc_distances_all_vertices(self,compactDegree = False): 87 | 88 | logging.info("Using compactDegree: {}".format(compactDegree)) 89 | if(self.calcUntilLayer): 90 | logging.info("Calculations until layer: {}".format(self.calcUntilLayer)) 91 | 92 | futures = {} 93 | 94 | count_calc = 0 95 | 96 | vertices = list(reversed(sorted(self.G.keys()))) 97 | 98 | if(compactDegree): 99 | logging.info("Recovering degreeList from disk...") 100 | degreeList = restoreVariableFromDisk('compactDegreeList') 101 | else: 102 | logging.info("Recovering compactDegreeList from disk...") 103 | degreeList = restoreVariableFromDisk('degreeList') 104 | 105 | parts = self.workers 106 | chunks = partition(vertices,parts) 107 | 108 | t0 = time() 109 | 110 | with ProcessPoolExecutor(max_workers = self.workers) as executor: 111 | 112 | part = 1 113 | for c in chunks: 114 | logging.info("Executing part {}...".format(part)) 115 | list_v = [] 116 | for v in c: 117 | list_v.append([vd for vd in degreeList.keys() if vd > v]) 118 | job = executor.submit(calc_distances_all, c, list_v, degreeList,part, compactDegree = compactDegree) 119 | futures[job] = part 120 | part += 1 121 | 122 | 123 | logging.info("Receiving results...") 124 | 125 | for job in as_completed(futures): 126 | job.result() 127 | r = futures[job] 128 | logging.info("Part {} Completed.".format(r)) 129 | 130 | logging.info('Distances calculated.') 131 | t1 = time() 132 | logging.info('Time : {}m'.format((t1-t0)/60)) 133 | 134 | return 135 | 136 | 137 | def calc_distances(self, compactDegree = False): 138 | 139 | logging.info("Using compactDegree: {}".format(compactDegree)) 140 | if(self.calcUntilLayer): 141 | logging.info("Calculations until layer: {}".format(self.calcUntilLayer)) 142 | 143 | futures = {} 144 | #distances = {} 145 | 146 | count_calc = 0 147 | 148 | G = self.G 149 | vertices = G.keys() 150 | 151 | parts = self.workers 152 | chunks = partition(vertices,parts) 153 | 154 | with ProcessPoolExecutor(max_workers = 1) as executor: 155 | 156 | logging.info("Split degree List...") 157 | part = 1 158 | for c in chunks: 159 | job = executor.submit(splitDegreeList,part,c,G,compactDegree) 160 | job.result() 161 | logging.info("degreeList {} completed.".format(part)) 162 | part += 1 163 | 164 | 165 | with ProcessPoolExecutor(max_workers = self.workers) as executor: 166 | 167 | part = 1 168 | for c in chunks: 169 | logging.info("Executing part {}...".format(part)) 170 | job = executor.submit(calc_distances, part, compactDegree = compactDegree) 171 | futures[job] = part 172 | part += 1 173 | 174 | logging.info("Receiving results...") 175 | for job in as_completed(futures): 176 | job.result() 177 | r = futures[job] 178 | logging.info("Part {} completed.".format(r)) 179 | 180 | 181 | return 182 | 183 | def consolide_distances(self): 184 | 185 | distances = {} 186 | 187 | parts = self.workers 188 | for part in range(1,parts + 1): 189 | d = restoreVariableFromDisk('distances-'+str(part)) 190 | 
preprocess_consolides_distances(distances) 191 | distances.update(d) 192 | 193 | 194 | preprocess_consolides_distances(distances) 195 | saveVariableOnDisk(distances,'distances') 196 | 197 | 198 | def create_distances_network(self): 199 | 200 | with ProcessPoolExecutor(max_workers=1) as executor: 201 | job = executor.submit(generate_distances_network,self.workers) 202 | 203 | job.result() 204 | 205 | return 206 | 207 | def preprocess_parameters_random_walk(self): 208 | 209 | with ProcessPoolExecutor(max_workers=1) as executor: 210 | job = executor.submit(generate_parameters_random_walk,self.workers) 211 | 212 | job.result() 213 | 214 | return 215 | 216 | 217 | def simulate_walks(self,num_walks,walk_length): 218 | 219 | # for large graphs, it is serially executed, because of memory use. 220 | if(len(self.G) > 500000): 221 | 222 | with ProcessPoolExecutor(max_workers=1) as executor: 223 | job = executor.submit(generate_random_walks_large_graphs,num_walks,walk_length,self.workers,self.G.keys()) 224 | 225 | job.result() 226 | 227 | else: 228 | 229 | with ProcessPoolExecutor(max_workers=1) as executor: 230 | job = executor.submit(generate_random_walks,num_walks,walk_length,self.workers,self.G.keys()) 231 | 232 | job.result() 233 | 234 | 235 | return 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | -------------------------------------------------------------------------------- /learning/struc2vec/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from time import time 3 | import logging,inspect 4 | import cPickle as pickle 5 | from itertools import islice 6 | import os.path 7 | 8 | dir_f = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) 9 | folder_pickles = dir_f+"/../pickles/" 10 | 11 | def returnPathStruc2vec(): 12 | return dir_f 13 | 14 | def isPickle(fname): 15 | return os.path.isfile(dir_f+'/../pickles/'+fname+'.pickle') 16 | 17 | def chunks(data, SIZE=10000): 18 | it = iter(data) 19 | for i in xrange(0, len(data), SIZE): 20 | yield {k:data[k] for k in islice(it, SIZE)} 21 | 22 | def partition(lst, n): 23 | division = len(lst) / float(n) 24 | return [ lst[int(round(division * i)): int(round(division * (i + 1)))] for i in xrange(n) ] 25 | 26 | def restoreVariableFromDisk(name): 27 | logging.info('Recovering variable...') 28 | t0 = time() 29 | val = None 30 | with open(folder_pickles + name + '.pickle', 'rb') as handle: 31 | val = pickle.load(handle) 32 | t1 = time() 33 | logging.info('Variable recovered. Time: {}m'.format((t1-t0)/60)) 34 | 35 | return val 36 | 37 | def saveVariableOnDisk(f,name): 38 | logging.info('Saving variable on disk...') 39 | t0 = time() 40 | with open(folder_pickles + name + '.pickle', 'wb') as handle: 41 | pickle.dump(f, handle, protocol=pickle.HIGHEST_PROTOCOL) 42 | t1 = time() 43 | logging.info('Variable saved. 
Time: {}m'.format((t1-t0)/60)) 44 | 45 | return 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bcrypt==3.1.7 2 | boto3==1.15.1 3 | botocore==1.18.1 4 | bz2file==0.98 5 | certifi==2020.6.20 6 | cffi==1.14.2 7 | chardet==3.0.4 8 | cloudpickle==1.3.0 9 | cryptography==3.1 10 | dask==1.2.2 11 | decorator==4.4.2 12 | enum34==1.1.10 13 | funcsigs==1.0.2 14 | futures==3.3.0 15 | gensim==3.8.3 16 | graphviz==0.14.1 17 | idna==2.10 18 | ipaddress==1.0.23 19 | jmespath==0.10.0 20 | joblib==0.14.1 21 | locket==0.2.0 22 | matplotlib==1.4.3 23 | mock==3.0.5 24 | networkx==2.2 25 | nose==1.3.7 26 | numpy==1.16.1 27 | pandas==0.24.2 28 | paramiko==2.7.2 29 | partd==1.0.0 30 | pycparser==2.20 31 | pydot==1.0.28 32 | pygraphviz==1.5 33 | pymongo==3.11.0 34 | PyNaCl==1.4.0 35 | pyparsing==1.5.7 36 | python-dateutil==2.8.1 37 | pytz==2020.1 38 | requests==2.24.0 39 | s3transfer==0.3.3 40 | scikit-learn==0.20.4 41 | scipy==1.2.3 42 | scour==0.32 43 | six==1.10.0 44 | smart-open==1.10.1 45 | sshtunnel==0.1.5 46 | toolz==0.10.0 47 | tqdm==4.49.0 48 | urllib3==1.25.10 49 | -------------------------------------------------------------------------------- /sample/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiS3-Lab/FICS/82c8abef52ca943946b7e82a16998cf67f1d2049/sample/__init__.py -------------------------------------------------------------------------------- /sample/astfile.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import networkx as nx 3 | 4 | from learning.similarity import counter_cosine_similarity 5 | 6 | 7 | class ASTFile: 8 | 9 | def __init__(self, ast_file, arguments, ast=None, feature_type=''): 10 | self.ast_file = ast_file 11 | self.arguments = arguments 12 | self.ast = ast 13 | if self.ast is None: 14 | try: 15 | self.ast = nx.read_graphml(self.ast_file) 16 | # self.ast = self.index.read(self.ast_file) 17 | except Exception, e: 18 | print e 19 | print self.ast_file 20 | 21 | self.functions_root_nodes = [] 22 | self.features = [] 23 | self.functions_features_counters = [] 24 | self.function_names = [] 25 | self.feature_type = feature_type 26 | 27 | def extract_features(self): 28 | self.functions_root_nodes = [x for x, y in self.ast.nodes(data=True) 29 | if 'type' in y and y['type'] == '"FUNCTION_DECL"'] 30 | 31 | for root_node in self.functions_root_nodes: 32 | # print root_node 33 | self.extract_potential_features(root_node) 34 | self.functions_features_counters.append(Counter(self.features)) 35 | self.function_names.append(self.ast.node[root_node]['spelling'].replace('"', '')) 36 | 37 | def extract_potential_features(self, root_node): 38 | # self.print_graph(root_node) 39 | s = list(nx.dfs_preorder_nodes(self.ast, root_node)) 40 | self.features = [] 41 | 42 | feature_types = self.feature_type.split('+') 43 | for feature_type in feature_types: 44 | if feature_type == 'MR': 45 | self.extract_members(s) 46 | elif feature_type == 'C': 47 | self.extract_calls(s) 48 | elif feature_type == 'NT': 49 | self.extract_node_types(s) 50 | 51 | def extract_members(self, s): 52 | 53 | for item in s: 54 | node_type = self.ast.node[item]['type'].replace('"', '') 55 | if node_type == 'MEMBER_REF_EXPR' or node_type == 'MEMBER_REF': 56 | node_spelling = 
self.ast.node[item]['spelling'].replace('"', '') 57 | if node_spelling != '': 58 | self.features.append('{}_{}'.format(node_type, node_spelling)) 59 | 60 | def extract_calls(self, s): 61 | 62 | for item in s: 63 | node_type = self.ast.node[item]['type'].replace('"', '') 64 | if node_type == 'CALL_EXPR': 65 | node_spelling = self.ast.node[item]['spelling'].replace('"', '') 66 | if node_spelling != '': 67 | self.features.append('{}_{}'.format(node_type, node_spelling)) 68 | 69 | def extract_node_types(self, s): 70 | 71 | for item in s: 72 | node_type = self.ast.node[item]['type'].replace('"', '') 73 | self.features.append('NODE_TYPE_{}'.format(node_type)) 74 | 75 | def print_graph(self, root_node): 76 | if self.ast.node[root_node]['spelling'].replace('"', '') in \ 77 | ['X509v3_addr_get_afi', 'ssl3_get_record', 'aes_gcm_ctrl']: 78 | print root_node, self.ast.node[root_node]['spelling'] 79 | s = list(nx.dfs_preorder_nodes(self.ast, root_node)) 80 | for item in s: 81 | print self.ast.node[item] 82 | 83 | def compute_functions_similarities(self): 84 | functions_similarities = [] 85 | 86 | for i in range(len(self.functions_features_counters) - 1): 87 | for j in range(len(self.functions_features_counters)): 88 | if i == i + j: 89 | continue 90 | if i + j >= len(self.functions_features_counters): 91 | continue 92 | functions_similarities.append({'func1': self.ast.node[self.functions_root_nodes[i]]['spelling'], 93 | 'func2': self.ast.node[self.functions_root_nodes[i + j]]['spelling'], 94 | 'score': counter_cosine_similarity(self.functions_features_counters[i], 95 | self.functions_features_counters[i + 96 | j])}) 97 | 98 | return sorted(functions_similarities, key=lambda k: k['score'], reverse=True) 99 | 100 | def extract_backup_features(self, root_node): 101 | # self.print_graph(root_node) 102 | s = list(nx.dfs_preorder_nodes(self.ast, root_node)) 103 | features = [] 104 | for item in s: 105 | node_type = self.ast.node[item]['type'].replace('"', '') 106 | features.append(node_type) 107 | if node_type == 'MEMBER_REF_EXPR' or node_type == 'MEMBER_REF' or node_type =='TYPEDEF_DECL': 108 | node_spelling = self.ast.node[item]['spelling'].replace('"', '') 109 | if node_spelling != '': 110 | features.append(node_spelling) 111 | # print features 112 | return features 113 | -------------------------------------------------------------------------------- /sample/bitcodefile.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import subprocess 3 | from collections import Counter, defaultdict 4 | # from node2vec import Node2Vec 5 | from operator import itemgetter 6 | from subprocess import call 7 | from timeit import default_timer 8 | 9 | import networkx as nx 10 | import numpy as np 11 | from gensim.models import Word2Vec 12 | 13 | from learning.node2vec import node2vec 14 | from slicer import Slicer 15 | from utils.inout import * 16 | 17 | 18 | class BitCodeFile: 19 | 20 | def __init__(self, file_info, arguments, analysis_type='', feature_type=''): 21 | self.file_info = file_info 22 | self.arguments = arguments 23 | self.analysis_type = analysis_type 24 | self.feature_type = feature_type 25 | 26 | self.features = [] 27 | self.afs_features_counters_list = defaultdict(list) 28 | self.afs_features_counters = {} 29 | self.afs_graph = None 30 | self.graph_len = 0 31 | self.basic_block_ids = set() 32 | self.lines_numbers = set() 33 | self.llvm_instructions = set() 34 | self.construct_hash = None 35 | 36 | def analyze(self): 37 | if self.analysis_type == 
'pdg': 38 | return self.extract_pdg() 39 | elif self.analysis_type == 'as': 40 | return self.extract_as() 41 | else: 42 | return False 43 | 44 | def extract_pdg(self): 45 | pdg_dir = join_path(get_parent_dir(self.file_info), 'pdg') 46 | make_dir_if_not_exist(pdg_dir) 47 | for function_name in self.get_functions(): 48 | function_pdg_file = join_path(pdg_dir, '{}.pdg.dot'.format(function_name)) 49 | # llvm_pdg_log_file = join_path(pdg_dir, '{}.log.txt'.format(function)) 50 | time_file = join_path(pdg_dir, '{}.pdg.time.txt'.format(function_name)) 51 | # args = ['-entrypoint', function, '-nocfg', '>', function_pdg_file] 52 | parse_start_time = default_timer() 53 | try: 54 | OUTPUT = open(function_pdg_file, 'w') 55 | 56 | # set 60 seconds timeout for pdg extraction 57 | call(['timeout', '60', self.arguments.pdg_dumper, self.file_info, 58 | '-entrypoint', function_name, '-nocfg'], 59 | stdout=OUTPUT, stderr=subprocess.STDOUT, close_fds=True) 60 | OUTPUT.close() 61 | 62 | if not exist_file(function_pdg_file): 63 | print 'error in:', function_name, self.file_info 64 | # output = open(function_pdg_file, 'r') 65 | # output_lines = output.readlines() 66 | # for i in range(len(output_lines)): 67 | # if 'WARNING' in output_lines[i]: 68 | # # digraph "DependenceGraph" 69 | # print function_pdg_file 70 | 71 | parse_elapsed = default_timer() - parse_start_time 72 | write_file(time_file, '{}'.format(parse_elapsed)) 73 | except: 74 | print 'crash in pdg dumper', self.file_info, function_name 75 | return False 76 | 77 | return True 78 | 79 | def get_functions(self): 80 | functions = [] 81 | functions_file = join_path(get_parent_dir(self.file_info), 'functions.txt') 82 | try: 83 | for line in read_lines(functions_file): 84 | # Do not consider inlinehint functions 85 | if ' inlinehint ' not in line.split(':')[1]: 86 | functions.append(line.split('@')[1].split('(')[0]) 87 | except Exception, e: 88 | print functions_file, e 89 | return functions 90 | 91 | def extract_as(self): 92 | slicer = Slicer(pdg_graph_file=self.file_info, arguments=self.arguments) 93 | if not slicer.error: 94 | slicer.run() 95 | del slicer 96 | return True 97 | else: 98 | return False 99 | 100 | def extract_features(self): 101 | self.extract_potential_features() 102 | 103 | def extract_potential_features(self): 104 | self.afs_graph = nx.drawing.nx_agraph.read_dot(self.file_info) 105 | self.graph_len = len(self.afs_graph) 106 | if self.feature_type == 'NN': 107 | self.extract_node_names_features() 108 | self.afs_features_counters = Counter(self.features) 109 | elif self.feature_type == 'NNMD': 110 | self.extract_node_names_features() 111 | self.afs_features_counters = Counter(self.features) 112 | self.extract_metadata_features() 113 | elif self.feature_type == 'SM': 114 | self.extract_semantic_features() 115 | 116 | def extract_node_names_features(self): 117 | for node in self.afs_graph.nodes: 118 | label = self.afs_graph.nodes[node]['label'] 119 | if 'line' in self.afs_graph.nodes[node]: 120 | line = self.afs_graph.nodes[node]['line'] 121 | self.lines_numbers.add(line) 122 | if 'basic_block_id' in self.afs_graph.nodes[node]: 123 | bb_id = self.afs_graph.nodes[node]['basic_block_id'] 124 | self.basic_block_ids.add(bb_id) 125 | self.features.append(label) 126 | self.llvm_instructions.add(label) 127 | 128 | self.compute_construct_hash() 129 | 130 | def extract_metadata_features(self): 131 | self.afs_features_counters['metadata_num_edges'] = len(self.afs_graph.edges) 132 | self.afs_features_counters['metadata_num_nodes'] = 
len(self.afs_graph.nodes) 133 | # d = self.centrality_distribution(self.afs_graph) 134 | # self.afs_features_counters['metadata_entropy_centrality_distribution'] = self.entropy(d) 135 | 136 | def entropy(self, dist): 137 | """ 138 | Returns the entropy of `dist` in bits (base-2). 139 | 140 | """ 141 | dist = np.asarray(dist) 142 | ent = np.nansum(dist * np.log2(1 / dist)) 143 | return ent 144 | 145 | def centrality_distribution(self, G): 146 | """ 147 | Returns a centrality distribution. 148 | 149 | Each normalized centrality is divided by the sum of the normalized 150 | centralities. Note, this assumes the graph is simple. 151 | 152 | """ 153 | if len(G) == 1: 154 | print self.file_info 155 | centrality = nx.degree_centrality(G).values() 156 | centrality = np.asarray(centrality) 157 | centrality /= centrality.sum() 158 | return centrality 159 | 160 | def extract_semantic_features(self): 161 | # self.build_laplacian_features() 162 | self.build_node2vec_features_node_representation() 163 | # self.build_graph2vec_features_node_representation() 164 | 165 | def build_node2vec_features_node_representation(self): 166 | afs_graph = nx.DiGraph() 167 | for e in self.afs_graph.edges(): 168 | afs_graph.add_weighted_edges_from([(e[0], e[1], 1)]) 169 | walks = self.get_node2vec_walks(afs_graph) 170 | if len(walks): 171 | node2vec_ref = self.learn_embeddings(walks) 172 | else: 173 | node2vec_ref = {} 174 | 175 | data = [] 176 | for node in afs_graph.nodes: 177 | data.append(node2vec_ref.get_vector(node)) 178 | 179 | data = np.array(data) 180 | data = np.average(data, axis=0) 181 | 182 | for index, value in enumerate(data): 183 | feature_name = 'representation_{}'.format(index) 184 | self.afs_features_counters[feature_name] = value 185 | 186 | def build_node2vec_features_similar_nodes(self): 187 | afs_graph = nx.DiGraph() 188 | for e in self.afs_graph.edges(): 189 | afs_graph.add_weighted_edges_from([(e[0], e[1], 1)]) 190 | walks = self.get_node2vec_walks(afs_graph) 191 | if len(walks): 192 | node2vec_ref = self.learn_embeddings(walks) 193 | else: 194 | node2vec_ref = {} 195 | 196 | # for index, value in enumerate(node2vec_ref.get_vector(node)): 197 | for node in afs_graph.nodes: 198 | similar_nodes = sorted(node2vec_ref.wv.most_similar(node), key=itemgetter(1), reverse=True) 199 | sn_len = len(similar_nodes) 200 | for item in similar_nodes[0:min(sn_len, 5)]: 201 | 202 | similar_label = self.afs_graph.node[item[0]]['label'] 203 | node_label = self.afs_graph.node[node]['label'] 204 | # feature_name = '{}_{}'.format(similar_label, node_label) 205 | feature_name = '{}'.format(similar_label) 206 | if feature_name not in self.afs_features_counters_list.keys(): 207 | self.afs_features_counters[feature_name] = 1 208 | else: 209 | self.afs_features_counters[feature_name] += 1 210 | 211 | def build_node2vec_features_single_node(self): 212 | afs_graph = nx.DiGraph() 213 | for e in self.afs_graph.edges(): 214 | afs_graph.add_weighted_edges_from([(e[0], e[1], 1)]) 215 | walks = self.get_node2vec_walks(afs_graph) 216 | if len(walks): 217 | node2vec_ref = self.learn_embeddings(walks) 218 | else: 219 | node2vec_ref = {} 220 | 221 | for node in afs_graph.nodes: 222 | # print node, self.afs_graph.node[node]['label'] 223 | # print node2vec_ref.get_vector(node) 224 | 225 | for index, value in enumerate(node2vec_ref.get_vector(node)): 226 | feature_name = '{} ({})'.format(self.afs_graph.node[node]['label'], index) 227 | if feature_name not in self.afs_features_counters_list.keys(): 228 | 
self.afs_features_counters_list[feature_name].append(round(value, 2)) 229 | else: 230 | self.afs_features_counters_list[feature_name].append(round(value, 2)) 231 | for key, value in self.afs_features_counters_list.iteritems(): 232 | self.afs_features_counters[key] = round(np.average(value), 2) 233 | 234 | def get_node2vec_walks(self, afs_graph): 235 | num_walks = 10 236 | walk_length = 10 237 | p = 0.25 238 | q = 0.25 239 | node2vec_graph = node2vec.Graph(afs_graph, True, p, q) 240 | node2vec_graph.preprocess_transition_probs() 241 | walks = node2vec_graph.simulate_walks(num_walks, walk_length) 242 | return walks 243 | 244 | def learn_embeddings(self, walks): 245 | ''' 246 | Learn embeddings by optimizing the Skipgram objective using SGD. 247 | ''' 248 | dimensions = 128 249 | window_size = 10 250 | workers = 5 251 | iteration = 1 252 | output = '/Users/mansourahmadi/Desktop/aaa.out' 253 | walks = [map(str, walk) for walk in walks] 254 | model = Word2Vec(walks, size=dimensions, window=window_size, min_count=0, sg=1, workers=workers, iter=iteration) 255 | # print model 256 | return model.wv 257 | 258 | def compute_construct_hash(self): 259 | construct_string = '' 260 | # print self.file_info 261 | construct_string += self.file_info[:self.file_info.find('.c/pdg') + 2] 262 | # print construct_string 263 | for id in self.basic_block_ids: 264 | construct_string += str(id) 265 | for line in self.lines_numbers: 266 | construct_string += str(line) 267 | for llvm_instruction in self.llvm_instructions: 268 | construct_string += str(llvm_instruction) 269 | # print construct_string 270 | # self.construct_hash = int(hashlib.sha1(construct_string).hexdigest(), 16) % (10 ** 8) 271 | self.construct_hash = hashlib.sha1(construct_string).hexdigest() 272 | -------------------------------------------------------------------------------- /sample/languagetype.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class LanguageType(Enum): 5 | C = 'C' 6 | 7 | @staticmethod 8 | def get_names(): 9 | return [e.name for e in LanguageType] 10 | 11 | @staticmethod 12 | def get_detail(): 13 | return ['{}: {}'.format(e.name, e.value) for e in LanguageType] 14 | -------------------------------------------------------------------------------- /sample/sourcefile.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from pprint import pprint 3 | from subprocess import call 4 | from timeit import default_timer 5 | 6 | # import clang.cindex as cl 7 | import networkx as nx 8 | import pandas 9 | 10 | from utils.inout import * 11 | 12 | 13 | def get_diagnostics_info(diagnostics): 14 | return {'severity': diagnostics.severity, 15 | 'location': diagnostics.location, 16 | 'spelling': diagnostics.spelling 17 | # 'ranges': diagnostics.ranges, 18 | # 'fixits': diagnostics.fixits 19 | } 20 | 21 | 22 | class SourceFile: 23 | 24 | def __init__(self, source_file, arguments, project_dir='', compile_arguments=[], analysis_type=''): 25 | self.source_file = source_file 26 | self.source_file_ast_dir = '' 27 | self.source_file_bc_dir = '' 28 | self.project_dir = project_dir 29 | # self.project_includes = project_includes 30 | self.compile_arguments = compile_arguments 31 | self.arguments = arguments 32 | # if not cl.Config.loaded: 33 | # cl.Config.set_library_path(self.arguments.clang_lib_dir) 34 | # self.index = cl.Index.create() 35 | self.index = None 36 | self.ast = None 37 | self.root_nodes = [] 38 | 
self.function_count = 0 39 | self.function_name = '' 40 | self.file_ast_json = {} 41 | self.cursor_list = {} 42 | self.analysis_type = analysis_type 43 | # self.cursor_list = [] 44 | 45 | def emit_llvm_bc(self): 46 | parent_dir = get_parent_dir(self.source_file).replace(self.arguments.projects_dir, self.arguments.bcs_dir) 47 | make_dir_if_not_exist(parent_dir) 48 | self.source_file_bc_dir = join_path(parent_dir, get_basename(self.source_file)) 49 | make_dir_if_not_exist(self.source_file_bc_dir) 50 | bc_file = join_path(self.source_file_bc_dir, get_filename_without_ext(get_basename(self.source_file)) + '.bc') 51 | ll_file = join_path(self.source_file_bc_dir, get_filename_without_ext(get_basename(self.source_file)) + '.ll') 52 | llvm_log_file = join_path(self.source_file_bc_dir, 'llvm.log.txt') 53 | llvm_dis_log_file = join_path(self.source_file_bc_dir, 'llvm-dis.log.txt') 54 | function_names_file = join_path(self.source_file_bc_dir, 'functions.txt') 55 | time_file = join_path(self.source_file_bc_dir, 'bc.time.txt') 56 | 57 | args = [] 58 | # print self.compile_arguments 59 | if self.compile_arguments is None and not self.arguments.ignore_compile_commands: 60 | return True 61 | self.compile_arguments = [ca for ca in self.compile_arguments if not ca.startswith('-O')] + ['-O0'] 62 | if "-Wexpansion-to-defined" in self.compile_arguments: 63 | self.compile_arguments.remove("-Wexpansion-to-defined") 64 | # self.compile_arguments = [ca for ca in self.compile_arguments if ca.startswith('-I')] + ['-O0'] 65 | args += self.compile_arguments 66 | args += ['-I/usr/lib/clang/3.8.0/include/'] 67 | args += self.arguments.includes 68 | # print 'args', args 69 | 70 | cwd = get_current_directory() 71 | change_directory(self.project_dir) 72 | # print [self.arguments.clang, '-emit-llvm', '-c', '-g', self.source_file, '-o', bc_file] + args 73 | 74 | LOG = open(llvm_log_file, 'w') 75 | parse_start_time = default_timer() 76 | try: 77 | call([self.arguments.clang, '-emit-llvm', '-c', '-g', self.source_file, '-o', bc_file] + args, 78 | stdout=LOG, stderr=subprocess.STDOUT, close_fds=True) 79 | except: 80 | print 'crash in clang', self.source_file 81 | return True 82 | parse_elapsed = default_timer() - parse_start_time 83 | write_file(time_file, '{}'.format(parse_elapsed)) 84 | change_directory(cwd) 85 | # llvm_dis = join_path(self.arguments.llvm_config, 'llvm-dis') 86 | llvm_dis = 'llvm-dis-3.8' 87 | LOG = open(llvm_dis_log_file, 'w') 88 | try: 89 | if is_file(bc_file): 90 | call([llvm_dis, bc_file, '-o', ll_file], 91 | stdout=LOG, stderr=subprocess.STDOUT, close_fds=True) 92 | else: 93 | print 'No bc file:', bc_file 94 | print [arg for arg in args if arg.startswith('-I')] 95 | except: 96 | print llvm_dis, ' cannot be found' 97 | return True 98 | functions = open(function_names_file, 'w') 99 | lines = read_lines(ll_file) 100 | functions_name = [] 101 | prev_line = '' 102 | for line in lines: 103 | if line[0:6] == 'define': 104 | functions_name.append(line + prev_line + '\n') 105 | prev_line = line 106 | functions.writelines(functions_name) 107 | return True 108 | 109 | def emit_llvm_ll_and_functions(self, bitcode_file): 110 | bitcode_filename = get_basename(bitcode_file) 111 | sourcecode_filename = bitcode_filename[:-2] + 'c' 112 | ll_filename = bitcode_filename[:-2] + 'll' 113 | c_dir = join_path(get_parent_dir(bitcode_file), sourcecode_filename) 114 | make_dir_if_not_exist(c_dir) 115 | new_bitcode_file = join_path(c_dir, bitcode_filename) 116 | move_file(bitcode_file, new_bitcode_file) 117 | ll_file = 
join_path(c_dir, ll_filename) 118 | llvm_dis_log_file = join_path(c_dir, 'llvm-dis.log.txt') 119 | function_names_file = join_path(c_dir, 'functions.txt') 120 | 121 | llvm_dis = join_path(self.arguments.llvm_config, 'llvm-dis-3.8') 122 | LOG = open(llvm_dis_log_file, 'w') 123 | try: 124 | if is_file(new_bitcode_file): 125 | call([llvm_dis, new_bitcode_file, '-o', ll_file], 126 | stdout=LOG, stderr=subprocess.STDOUT, close_fds=True) 127 | else: 128 | print 'No bc file:', new_bitcode_file 129 | except: 130 | print llvm_dis, ' cannot be found' 131 | return True 132 | functions = open(function_names_file, 'w') 133 | lines = read_lines(ll_file) 134 | functions_name = [] 135 | prev_line = '' 136 | for line in lines: 137 | if line[0:6] == 'define': 138 | functions_name.append(line + prev_line + '\n') 139 | prev_line = line 140 | functions.writelines(functions_name) 141 | 142 | def emit_llvm_ast(self): 143 | # I commented the following lines to be able to run on conda 144 | # self.ast = ig.Graph(directed=True) 145 | # if not cl.Config.loaded: 146 | # cl.Config.set_library_path(self.arguments.clang_lib_dir) 147 | # self.index = cl.Index.create() 148 | self.ast = nx.DiGraph() 149 | include_args = [] 150 | parent_dir = get_parent_dir(self.source_file).replace(self.arguments.projects_dir, self.arguments.asts_dir) 151 | make_dir_if_not_exist(parent_dir) 152 | self.source_file_ast_dir = join_path(parent_dir, get_basename(self.source_file)) 153 | make_dir_if_not_exist(self.source_file_ast_dir) 154 | # for include in self.project_includes: 155 | # include_args.append('-I{}'.format(include)) 156 | 157 | args = [] 158 | # args += ['--no-standard-includes', '-nostdinc++', '-nobuiltininc'] 159 | # args += ['-nostdinc','-nostdinc++'] 160 | # args += include_args 161 | if len(self.compile_arguments) == 0 and not self.arguments.ignore_compile_commands: 162 | return True 163 | args += self.compile_arguments 164 | args += ['-I/usr/lib/clang/3.8.0/include/'] 165 | # args += ['-S', '-emit-llvm', '-c', '-o', 'xx.bc'] 166 | # args += ['-I/home/mansour/nfs/vulfinder/tools/clang+llvm-5.0.1-x86_64-linux-gnu-ubuntu-16.04/include/clang'] 167 | 168 | parse_start_time = default_timer() 169 | cwd = get_current_directory() 170 | change_directory(self.project_dir) 171 | translation_unit = self.index.parse(self.source_file, args=args) 172 | change_directory(cwd) 173 | parse_elapsed = default_timer() - parse_start_time 174 | diagnostics = map(get_diagnostics_info, translation_unit.diagnostics) 175 | diag_file = join_path(self.source_file_ast_dir, '{}.diag.txt'.format(get_basename(self.source_file))) 176 | pandas.DataFrame(diagnostics).to_csv(diag_file, index=False) 177 | iteration_start_time = default_timer() 178 | # self.add_node(translation_unit.cursor) 179 | if self.arguments.save_format == 'graph': 180 | self.get_info_graph(translation_unit.cursor) 181 | self.save_ast(self.function_name) 182 | elif self.arguments.save_format == 'json': 183 | ast_json = self.get_info_json(translation_unit.cursor) 184 | self.save_ast_json(ast_json) 185 | pprint(ast_json) 186 | elif self.arguments.save_format == 'ast': 187 | self.save_tu(translation_unit) 188 | iteration_elapsed = default_timer() - iteration_start_time 189 | time_file = join_path(self.source_file_ast_dir, '{}.time.txt'.format(get_basename(self.source_file))) 190 | write_file(time_file, 'Parsed Time: {} , Iteration Time: {}'.format(parse_elapsed, iteration_elapsed)) 191 | return True 192 | 193 | def analyze(self): 194 | if self.analysis_type == 'ast': 195 | return 
self.emit_llvm_ast() 196 | elif self.analysis_type == 'bc': 197 | return self.emit_llvm_bc() 198 | else: 199 | return False 200 | 201 | def get_info_graph(self, node, depth=0): 202 | 203 | flag = True if self.source_file in str(node.location) else False 204 | 205 | if flag: 206 | parent_vertex_id = self.add_node(node) 207 | 208 | # children_info = [] 209 | for c in node.get_children(): 210 | self.get_info_graph(c, depth + 1) 211 | if flag: 212 | child_vertex_id = self.add_node(c) 213 | if parent_vertex_id != child_vertex_id: 214 | self.ast.add_edge(parent_vertex_id, child_vertex_id) 215 | 216 | # Should be tested 217 | def get_info_json(self, node, depth=0): 218 | node_kind = str(node.kind).split('.')[1] 219 | flag = True if self.source_file in str(node.location) or node_kind == 'TRANSLATION_UNIT' else False 220 | 221 | if not flag: 222 | return None 223 | 224 | children = [self.get_info_json(c, depth + 1) for c in node.get_children()] 225 | children = [c for c in children if c is not None] 226 | 227 | return {'id': self.get_cursor_id(node), 228 | 'kind': node_kind, 229 | #'usr': node.get_usr(), 230 | 'spelling': node.spelling, 231 | 'location': str(node.location).split(',')[1:], 232 | #'extent.start': str(node.extent.start), 233 | #'extent.end': str(node.extent.end), 234 | 'is_definition': node.is_definition(), 235 | # 'definition id': get_cursor_id(node.get_definition()), 236 | 'children': children} 237 | 238 | def add_node(self, node): 239 | node_kind = str(node.kind).split('.')[1] 240 | if node_kind == 'FUNCTION_DECL': 241 | if self.function_count >= 1: 242 | self.save_ast(self.function_name) 243 | # self.ast = None 244 | self.ast = nx.DiGraph() 245 | self.cursor_list = {} 246 | # self.cursor_list = [] 247 | self.function_count += 1 248 | self.function_name = node.spelling 249 | 250 | # node_id = self.get_cursor_id(node) 251 | node_id = self.get_cursor_id(self.get_unique_hash(node)) 252 | self.ast.add_node(node_id, 253 | type='"{}"'.format(node_kind), 254 | usr='"{}"'.format(node.get_usr()), 255 | spelling=u'"{}"'.format(str(node.spelling).replace('"', '')), 256 | location='"{}"'.format(node.location), 257 | extent_start='"{}"'.format(node.extent.start), 258 | extent_end='"{}"'.format(node.extent.end), 259 | is_definition=node.is_definition() 260 | # definition_id = self.get_cursor_id(node.get_definition()) 261 | ) 262 | 263 | return node_id 264 | 265 | def get_cursor_id_bk(self, cursor_hash): 266 | 267 | if cursor_hash is None: 268 | return None 269 | 270 | self.cursor_list.append(cursor_hash) 271 | index = self.cursor_list.index(cursor_hash) 272 | return index - 1 273 | 274 | def get_cursor_id(self, cursor_hash): 275 | 276 | if cursor_hash is None: 277 | return None 278 | 279 | for key, value in self.cursor_list.iteritems(): 280 | if cursor_hash == value: 281 | return key 282 | len_cursor_list = len(self.cursor_list) 283 | self.cursor_list[len_cursor_list] = cursor_hash 284 | return len_cursor_list 285 | 286 | def get_unique_hash(self, cursor): 287 | return hash(('"{}"'.format(cursor.kind), 288 | '"{}"'.format(cursor.get_usr()), 289 | u'"{}"'.format(str(cursor.spelling).replace('"', '')), 290 | '"{}"'.format(cursor.location), 291 | '"{}"'.format(cursor.extent.start), 292 | '"{}"'.format(cursor.extent.end), 293 | cursor.is_definition() 294 | )) 295 | 296 | def save_ast(self, function_name): 297 | existing_files = get_files_in_dir(self.source_file_ast_dir, ext='{}.graphml'.format(function_name)) 298 | if len(existing_files) == 0: 299 | ast_file = join_path(self.source_file_ast_dir, 
'{}.graphml'.format(function_name)) 300 | else: 301 | ast_file = join_path(self.source_file_ast_dir, '+{}'.format(get_basename(existing_files[0]))) 302 | 303 | nx.write_graphml(self.ast, ast_file) 304 | 305 | def save_ast_json(self, ast_json): 306 | # pprint(ast_json) 307 | try: 308 | write_file_json(join_path(self.source_file_ast_dir, 'ast.json'), ast_json) 309 | except Exception as e: 310 | print e 311 | 312 | def save_tu(self, translation_unit): 313 | translation_unit.save(join_path(self.source_file_ast_dir, '{}.ast'.format(get_basename(self.source_file_ast_dir)))) 314 | 315 | -------------------------------------------------------------------------------- /scripts/get_inconsistencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | benchmark=$1 4 | threshold=$2 5 | granularity=$3 6 | preprocess=$4 7 | 8 | if [ -z "${benchmark}" ]; then 9 | echo "benchmark is unset or set to the empty string" 10 | exit 1; 11 | fi 12 | 13 | if [ -z "${preprocess}" ]; then 14 | echo "No preprocessing" 15 | preprocess="np" 16 | fi 17 | 18 | rm output 19 | 20 | if [ "${preprocess}" = "p" ]; then 21 | datasets=$(cat ./settings.py | grep "DATASETS_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 22 | bcs=$(cat ./settings.py | grep "BCS_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 23 | data=$(cat ./settings.py | grep "DATA_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 24 | 25 | rm -rf "$data/$datasets/$benchmark" 26 | rm -rf "$data/$bcs/$benchmark" 27 | python __init__.py -p=$benchmark -a=BC 28 | python __init__.py -p=$benchmark -a=PDG 29 | python __init__.py -p=$benchmark -a=AS 30 | python __init__.py -p=$benchmark -a=AS -cws=2 31 | python __init__.py -p=$benchmark -a=AS -hcf 32 | python __init__.py -p=$benchmark -a=AS -hcf -cws=2 33 | python __init__.py -p=$benchmark -a=FE -ft=afs_NN 34 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb1_NN 35 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb2_NN 36 | python __init__.py -p=$benchmark -a=FE -hcf -ft=afs_NN 37 | python __init__.py -p=$benchmark -a=FE -hcf -ft=afs.bb1_NN 38 | python __init__.py -p=$benchmark -a=FE -hcf -ft=afs.bb2_NN 39 | fi 40 | 41 | if [ "$threshold" = "all" ]; then 42 | if [ "$granularity" = "all" ]; then 43 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.70,cc_0.99 44 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.75,cc_0.99 45 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.80,cc_0.99 46 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.85,cc_0.99 47 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.90,cc_0.99 48 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.95,cc_0.99 49 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.70,cc_0.99 50 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.75,cc_0.99 51 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.80,cc_0.99 52 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.85,cc_0.99 53 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.90,cc_0.99 54 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.95,cc_0.99 55 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.70,cc_0.99 56 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v 
-ca=cc_0.75,cc_0.99 57 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.80,cc_0.99 58 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.85,cc_0.99 59 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.90,cc_0.99 60 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.95,cc_0.99 61 | 62 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.70,cc_0.99 63 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.75,cc_0.99 64 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.80,cc_0.99 65 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.85,cc_0.99 66 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.90,cc_0.99 67 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.95,cc_0.99 68 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.70,cc_0.99 69 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.75,cc_0.99 70 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.80,cc_0.99 71 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.85,cc_0.99 72 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.90,cc_0.99 73 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.95,cc_0.99 74 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.70,cc_0.99 75 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.75,cc_0.99 76 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.80,cc_0.99 77 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.85,cc_0.99 78 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.90,cc_0.99 79 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.95,cc_0.99 80 | elif [ "$granularity" = "afs" ]; then 81 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.70,cc_0.99 82 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.75,cc_0.99 83 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.80,cc_0.99 84 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.85,cc_0.99 85 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.90,cc_0.99 86 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.95,cc_0.99 87 | 88 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.70,cc_0.99 89 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.75,cc_0.99 90 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.80,cc_0.99 91 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.85,cc_0.99 92 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.90,cc_0.99 93 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.95,cc_0.99 94 | elif [ "$granularity" = "afs.bb1" ]; then 95 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.70,cc_0.99 96 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.75,cc_0.99 97 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.80,cc_0.99 98 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.85,cc_0.99 
99 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.90,cc_0.99 100 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.95,cc_0.99 101 | 102 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.70,cc_0.99 103 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.75,cc_0.99 104 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.80,cc_0.99 105 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.85,cc_0.99 106 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.90,cc_0.99 107 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.95,cc_0.99 108 | elif [ "$granularity" = "afs.bb2" ]; then 109 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.70,cc_0.99 110 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.75,cc_0.99 111 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.80,cc_0.99 112 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.85,cc_0.99 113 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.90,cc_0.99 114 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.95,cc_0.99 115 | 116 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.70,cc_0.99 117 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.75,cc_0.99 118 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.80,cc_0.99 119 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.85,cc_0.99 120 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.90,cc_0.99 121 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.95,cc_0.99 122 | fi 123 | 124 | elif [ "$threshold" = "most" ]; then 125 | if [ "$granularity" = "all" ]; then 126 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.95,cc_0.99 127 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.95,cc_0.99 128 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.95,cc_0.99 129 | 130 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.95,cc_0.99 131 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.95,cc_0.99 132 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.95,cc_0.99 133 | elif [ "$granularity" = "afs" ]; then 134 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.95,cc_0.99 135 | 136 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.95,cc_0.99 137 | elif [ "$granularity" = "afs.bb1" ]; then 138 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.95,cc_0.99 139 | 140 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.95,cc_0.99 141 | elif [ "$granularity" = "afs.bb2" ]; then 142 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.95,cc_0.99 143 | 144 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.95,cc_0.99 145 | fi 146 | fi 147 | 148 | -------------------------------------------------------------------------------- /scripts/get_inconsistencies_NN_G2v.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | benchmark=$1 4 | threshold=$2 5 | granularity=$3 6 | preprocess=$4 7 | 8 | if [ -z "${benchmark}" ]; then 9 | echo "benchmark is unset or set to the empty string" 10 | exit 1; 11 | fi 12 | 13 | if [ -z "${preprocess}" ]; then 14 | echo "No preprocessing" 15 | preprocess="np" 16 | fi 17 | 18 | rm output 19 | 20 | if [ "${preprocess}" = "p" ]; then 21 | datasets=$(cat ./settings.py | grep "DATASETS_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 22 | bcs=$(cat ./settings.py | grep "BCS_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 23 | data=$(cat ./settings.py | grep "DATA_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 24 | 25 | rm -rf "$data/$datasets/$benchmark" 26 | rm -rf "$data/$bcs/$benchmark" 27 | python __init__.py -p=$benchmark -a=BC 28 | python __init__.py -p=$benchmark -a=PDG 29 | python __init__.py -p=$benchmark -a=AS 30 | python __init__.py -p=$benchmark -a=AS -cws=2 31 | 32 | python __init__.py -p=$benchmark -a=FE -ft=afs_NN 33 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb1_NN 34 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb2_NN 35 | 36 | python __init__.py -p=$benchmark -a=FE -ft=afs_G2v 37 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb1_G2v 38 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb2_G2v 39 | fi 40 | 41 | if [ "$threshold" = "all" ]; then 42 | if [ "$granularity" = "all" ]; then 43 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.70,cc_0.98 44 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.75,cc_0.98 45 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.80,cc_0.98 46 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.85,cc_0.98 47 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.90,cc_0.98 48 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.95,cc_0.98 49 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.70,cc_0.98 50 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.75,cc_0.98 51 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.80,cc_0.98 52 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.85,cc_0.98 53 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.90,cc_0.98 54 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.95,cc_0.98 55 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.70,cc_0.98 56 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.75,cc_0.98 57 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.80,cc_0.98 58 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.85,cc_0.98 59 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.90,cc_0.98 60 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.95,cc_0.98 61 | 62 | elif [ "$granularity" = "afs" ]; then 63 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.70,cc_0.98 64 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.75,cc_0.98 65 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.80,cc_0.98 66 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.85,cc_0.98 67 | python __init__.py -p=$benchmark -a=MC 
-cf=afs_NN,afs_G2v -ca=cc_0.90,cc_0.98 68 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.95,cc_0.98 69 | 70 | elif [ "$granularity" = "afs.bb1" ]; then 71 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.70,cc_0.98 72 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.75,cc_0.98 73 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.80,cc_0.98 74 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.85,cc_0.98 75 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.90,cc_0.98 76 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.95,cc_0.98 77 | 78 | elif [ "$granularity" = "afs.bb2" ]; then 79 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.70,cc_0.98 80 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.75,cc_0.98 81 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.80,cc_0.98 82 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.85,cc_0.98 83 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.90,cc_0.98 84 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.95,cc_0.98 85 | 86 | fi 87 | 88 | elif [ "$threshold" = "most" ]; then 89 | if [ "$granularity" = "all" ]; then 90 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.95,cc_0.98 91 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.95,cc_0.98 92 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.95,cc_0.98 93 | 94 | elif [ "$granularity" = "afs" ]; then 95 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.95,cc_0.98 96 | 97 | elif [ "$granularity" = "afs.bb1" ]; then 98 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.95,cc_0.98 99 | 100 | elif [ "$granularity" = "afs.bb2" ]; then 101 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.95,cc_0.98 102 | 103 | fi 104 | fi 105 | 106 | -------------------------------------------------------------------------------- /scripts/get_inconsistencies_g2v.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | benchmark=$1 4 | threshold=$2 5 | granularity=$3 6 | preprocess=$4 7 | 8 | if [ -z "${benchmark}" ]; then 9 | echo "benchmark is unset or set to the empty string" 10 | exit 1; 11 | fi 12 | 13 | if [ -z "${preprocess}" ]; then 14 | echo "No preprocessing" 15 | preprocess="np" 16 | fi 17 | 18 | rm output 19 | 20 | if [ "${preprocess}" = "p" ]; then 21 | datasets=$(cat ./settings.py | grep "DATASETS_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 22 | bcs=$(cat ./settings.py | grep "BCS_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 23 | data=$(cat ./settings.py | grep "DATA_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 24 | 25 | rm -rf "$data/$datasets/$benchmark" 26 | rm -rf "$data/$bcs/$benchmark" 27 | python __init__.py -p=$benchmark -a=BC 28 | python __init__.py -p=$benchmark -a=PDG 29 | python __init__.py -p=$benchmark -a=AS 30 | python __init__.py -p=$benchmark -a=AS -cws=2 31 | 32 | python __init__.py -p=$benchmark -a=FE -ft=afs_NN 33 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb1_NN 34 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb2_NN 35 | python __init__.py -p=$benchmark 
-a=FE -hcf -ft=afs_NN 36 | python __init__.py -p=$benchmark -a=FE -hcf -ft=afs.bb1_NN 37 | python __init__.py -p=$benchmark -a=FE -hcf -ft=afs.bb2_NN 38 | 39 | python __init__.py -p=$benchmark -a=FE -ft=afs_G2v 40 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb1_G2v 41 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb2_G2v 42 | python __init__.py -p=$benchmark -a=FE -hcf -ft=afs_G2v 43 | python __init__.py -p=$benchmark -a=FE -hcf -ft=afs.bb1_G2v 44 | python __init__.py -p=$benchmark -a=FE -hcf -ft=afs.bb2_G2v 45 | fi 46 | 47 | if [ "$threshold" = "all" ]; then 48 | if [ "$granularity" = "all" ]; then 49 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.70,cc_0.99 50 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.75,cc_0.99 51 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.80,cc_0.99 52 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.85,cc_0.99 53 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.90,cc_0.99 54 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.95,cc_0.99 55 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.70,cc_0.99 56 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.75,cc_0.99 57 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.80,cc_0.99 58 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.85,cc_0.99 59 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.90,cc_0.99 60 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.95,cc_0.99 61 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.70,cc_0.99 62 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.75,cc_0.99 63 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.80,cc_0.99 64 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.85,cc_0.99 65 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.90,cc_0.99 66 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.95,cc_0.99 67 | 68 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.70,cc_0.99 69 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.75,cc_0.99 70 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.80,cc_0.99 71 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.85,cc_0.99 72 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.90,cc_0.99 73 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.95,cc_0.99 74 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.70,cc_0.99 75 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.75,cc_0.99 76 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.80,cc_0.99 77 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.85,cc_0.99 78 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.90,cc_0.99 79 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.95,cc_0.99 80 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.70,cc_0.99 81 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.75,cc_0.99 82 | python 
__init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.80,cc_0.99 83 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.85,cc_0.99 84 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.90,cc_0.99 85 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.95,cc_0.99 86 | elif [ "$granularity" = "afs" ]; then 87 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.70,cc_0.99 88 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.75,cc_0.99 89 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.80,cc_0.99 90 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.85,cc_0.99 91 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.90,cc_0.99 92 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.95,cc_0.99 93 | 94 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.70,cc_0.99 95 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.75,cc_0.99 96 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.80,cc_0.99 97 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.85,cc_0.99 98 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.90,cc_0.99 99 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.95,cc_0.99 100 | elif [ "$granularity" = "afs.bb1" ]; then 101 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.70,cc_0.99 102 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.75,cc_0.99 103 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.80,cc_0.99 104 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.85,cc_0.99 105 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.90,cc_0.99 106 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.95,cc_0.99 107 | 108 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.70,cc_0.99 109 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.75,cc_0.99 110 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.80,cc_0.99 111 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.85,cc_0.99 112 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.90,cc_0.99 113 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.95,cc_0.99 114 | elif [ "$granularity" = "afs.bb2" ]; then 115 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.70,cc_0.99 116 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.75,cc_0.99 117 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.80,cc_0.99 118 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.85,cc_0.99 119 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.90,cc_0.99 120 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.95,cc_0.99 121 | 122 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.70,cc_0.99 123 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.75,cc_0.99 124 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf 
-ca=cc_0.80,cc_0.99 125 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.85,cc_0.99 126 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.90,cc_0.99 127 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.95,cc_0.99 128 | fi 129 | 130 | elif [ "$threshold" = "most" ]; then 131 | if [ "$granularity" = "all" ]; then 132 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.95,cc_0.99 133 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.95,cc_0.99 134 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.95,cc_0.99 135 | 136 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.95,cc_0.99 137 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.95,cc_0.99 138 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.95,cc_0.99 139 | elif [ "$granularity" = "afs" ]; then 140 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.95,cc_0.99 141 | 142 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.95,cc_0.99 143 | elif [ "$granularity" = "afs.bb1" ]; then 144 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.95,cc_0.99 145 | 146 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.95,cc_0.99 147 | elif [ "$granularity" = "afs.bb2" ]; then 148 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.95,cc_0.99 149 | 150 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.95,cc_0.99 151 | fi 152 | fi 153 | 154 | -------------------------------------------------------------------------------- /scripts/get_inconsistencies_real_programs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | benchmark=$1 4 | preprocess=$2 5 | 6 | if [ -z "${benchmark}" ]; then 7 | echo "benchmark is unset or set to the empty string" 8 | exit 1; 9 | fi 10 | 11 | if [ -z "${preprocess}" ]; then 12 | echo "No preprocessing" 13 | preprocess="np" 14 | fi 15 | 16 | rm output 17 | 18 | if [ "${preprocess}" = "p" ]; then 19 | datasets=$(cat ./settings.py | grep "DATASETS_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 20 | bcs=$(cat ./settings.py | grep "BCS_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 21 | data=$(cat ./settings.py | grep "DATA_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 22 | 23 | rm -rf "$data/$datasets/$benchmark" 24 | rm -rf "$data/$bcs/$benchmark" 25 | python __init__.py -p=$benchmark -a=BC 26 | python __init__.py -p=$benchmark -a=PDG 27 | python __init__.py -p=$benchmark -a=AS 28 | python __init__.py -p=$benchmark -a=FE -ft=afs_NN 29 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb1_NN 30 | fi 31 | 32 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.95,cc_0.99 33 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.95,cc_0.99 34 | 35 | -------------------------------------------------------------------------------- /scripts/get_inconsistencies_real_programs_NN_G2v.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | benchmark=$1 4 | preprocess=$2 5 | split=$3 6 | 7 | if [ -z "${benchmark}" ]; then 8 | echo "benchmark is unset or set to the empty string" 9 | exit 1; 10 | fi 11 
| 12 | if [ -z "${preprocess}" ]; then 13 | echo "No preprocessing" 14 | preprocess="np" 15 | fi 16 | 17 | if [ "${preprocess}" = "p" ]; then 18 | datasets=$(cat ./settings.py | grep "DATASETS_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 19 | bcs=$(cat ./settings.py | grep "BCS_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 20 | data=$(cat ./settings.py | grep "DATA_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 21 | echo "Removing dataset folder of $benchmark" 22 | rm -rf "$data/$datasets/$benchmark" 23 | echo "Removing IR folder of $benchmark" 24 | rm -rf "$data/$bcs/$benchmark" 25 | python __init__.py -p=$benchmark -a=BC 26 | python __init__.py -p=$benchmark -a=PDG 27 | python __init__.py -p=$benchmark -a=AS 28 | if [ "${split}" = "ns" ]; then 29 | python __init__.py -p=$benchmark -a=FE -ft=afs_NN 30 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb1_NN 31 | else 32 | python __init__.py -p=$benchmark -a=FE -ft=afs_NN -s=True 33 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb1_NN -s=True 34 | fi 35 | fi 36 | 37 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.95,cc_0.98 -sc=online 38 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.95,cc_0.98 -sc=online 39 | -------------------------------------------------------------------------------- /settings-bak.py: -------------------------------------------------------------------------------- 1 | ACTIONS = 'AS' 2 | LANGUAGES = 'C' 3 | DATA_DIR = '/Users/mansourahmadi/Bank/Work/NEU/vulfinder/data/' # Full path must be provided 4 | PROJECTS_DIR = 'projects' 5 | ASTS_DIR = 'asts' 6 | SAVE_FORMAT = 'graph' # json , graph , ast 7 | BCS_DIR = 'bcs' 8 | DATASETS_DIR = 'datasets' 9 | PLOTS_DIR = 'plots' 10 | CLUSTERING_ALGS = 'dbscancos_0.3,dbscancos_0.01' # aff_cos dbscan_cos means 11 | COSE_SIMILARITY_CHUNK_SIZE = 25000 # If there is 200G Ram 12 | CLUSTERING_FEAT = 'afs_NN,afs_G2v' 13 | SECOND_CLUSTERING = 'offline' # offline, online 14 | BIG_CLUSTERS_IGNORE = 200 # Size of big clusters that should be ignored from the first step clustering 15 | # as they might not contain useful data 16 | CHUNK_WINDOW_SIZE = 2 17 | SPLIT = True 18 | SEARCH_SPACES = [] # Empty means everywhere in the projects 19 | PROJECTS = 'juliet-test-suite' # 'openssl-41bff72' # 'ffmpeg-b2f0f37' # Empty means all projects 20 | IGNORE_COMPILE_COMMANDS = True 21 | # UNIFIED_PROJECTS = [['test']] 22 | FEATURE_TYPES = 'afs_NN' # afs_G2v, afs.bb2_NN 23 | # CLANG_LIB_DIR = '../../tools/clang+llvm-5.0.1-x86_64-linux-gnu-ubuntu-16.04/lib' 24 | CLANG_LIB_DIR = './../tools/clang+llvm-5.0.1/lib' 25 | LLVM_CONFIG = '/usr/local/Cellar/llvm/6.0.0/bin/' 26 | INCLUDES = '' 27 | CLANG = 'clang' 28 | PDG_DUMPER = '' 29 | STAT_TYPE = 'ST' # SS, SI, ST 30 | STAT_SIM_TYPES = 'NN,G2v' 31 | INCONSISTENCY_TYPE = 'check' # check, call, type, order 32 | SIMILARITY_THRESHOLD = 0.7 33 | GRANULARITY = 'afs,afs.bb1,afs.bb2' 34 | DEPENDENCY = '' # all , odd , cdd 35 | CALL_INCONSISTENCY = 'free,close,memset,clear,zero,remove,unlock,end,clean,cleanse,assert' 36 | TYPE_INCONSISTENCY = 'sext,trunc' # 'fptrunc,sext,zext,call zeroext,call signext,sitofp,uitofp,bitcast' 37 | STORE_INCONSISTENCY = 'null' 38 | INCONSISTENCY_QUERY_OPTIONS = 'top_10' 39 | COUNT_CPU = 10 40 | BENCHMARK_GROUNDTRUTH_PATH = '../iBench/' -------------------------------------------------------------------------------- /settings.py: 
-------------------------------------------------------------------------------- 1 | ACTIONS = '' 2 | LANGUAGES = 'C' 3 | DATA_DIR = '/home/mansour/bank/FICS/data/' # Full path must be provided 4 | PROJECTS_DIR = 'projects' 5 | ASTS_DIR = 'asts' 6 | SAVE_FORMAT = 'graph' # json , graph , ast 7 | BCS_DIR = 'bcs' 8 | DATASETS_DIR = 'datasets' 9 | PLOTS_DIR = 'plots' 10 | CLUSTERING_ALGS = 'dbscancos_0.3,dbscancos_0.02' 11 | CLUSTERING_FEAT = 'afs_NN,afs_G2v' 12 | SECOND_CLUSTERING = 'offline' 13 | COSE_SIMILARITY_CHUNK_SIZE = 200000 # BB 150000 # If there is 200G Ram 14 | BIG_CLUSTERS_IGNORE = 50 15 | CHUNK_WINDOW_SIZE = 1 16 | SPLIT = False 17 | SEARCH_SPACES = [] # Empty means everywhere in the projects 18 | PROJECTS = '' # 'ffmpeg-b2f0f37' # Empty means all projects 19 | IGNORE_COMPILE_COMMANDS = False 20 | FEATURE_TYPES = 'afs_NN' # afs_G2v, afs.bb2_NN 21 | # CLANG_LIB_DIR = '../../tools/clang+llvm-5.0.1-x86_64-linux-gnu-ubuntu-16.04/lib' 22 | CLANG_LIB_DIR = '/usr/lib/x86_64-linux-gnu' 23 | LLVM_CONFIG = '' 24 | INCLUDES = '' 25 | CLANG = 'clang-3.8' # 'clang-6.0' #'clang-3.8' 26 | PDG_DUMPER = './dg/tools/llvm-dg-dump' 27 | STAT_TYPE = 'ST' # SI, SS , ST 28 | STAT_SIM_TYPES = 'NN,G2v' 29 | INCONSISTENCY_TYPE = 'check' # check, call, type, order 30 | SIMILARITY_THRESHOLD = 0.7 31 | GRANULARITY = 'afs,afs.bb1,afs.bb2' 32 | DEPENDENCY = '' # all , odd , cdd 33 | CALL_INCONSISTENCY = 'free,close,memset,clear,bzero,remove,unlock,end,clean,cleanse,assert' 34 | TYPE_INCONSISTENCY = 'trunc' # 'fptrunc,sext,zext,call zeroext,call signext,sitofp,uitofp,bitcast' 35 | STORE_INCONSISTENCY = 'null' 36 | INCONSISTENCY_QUERY_OPTIONS = 'top_10' 37 | COUNT_CPU = 8 38 | BENCHMARK_GROUNDTRUTH_PATH = 'iBench/' 39 | -------------------------------------------------------------------------------- /ssh_private_key_password.py: -------------------------------------------------------------------------------- 1 | ip='' 2 | username='' 3 | password='' 4 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiS3-Lab/FICS/82c8abef52ca943946b7e82a16998cf67f1d2049/utils/__init__.py -------------------------------------------------------------------------------- /utils/computation.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | 3 | 4 | def is_number(num): 5 | try: 6 | if isinstance(float(num), numbers.Number): 7 | return True 8 | else: 9 | return False 10 | except: 11 | return False 12 | 13 | -------------------------------------------------------------------------------- /utils/inout.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | import sys 5 | from cgitb import grey 6 | from os.path import join, normpath 7 | from shutil import move, copy 8 | 9 | import pandas as pd 10 | 11 | 12 | def exist_file(file_path): 13 | return os.path.isfile(file_path) 14 | 15 | 16 | def exist_dir(path): 17 | return os.path.isdir(path) 18 | 19 | 20 | def make_dir_if_not_exist(path): 21 | if not exist_dir(path): 22 | os.makedirs(path) 23 | 24 | 25 | def get_basename(file_path): 26 | return os.path.basename(file_path) 27 | 28 | 29 | def join_path(*args): 30 | return normpath(join(*args)) 31 | 32 | 33 | def get_dataframe(file_path, columns=''): 34 | try: 35 | dataframe = pd.read_csv(file_path) 36 | except: 37 | with open(file_path, 'w') 
as f: 38 | f.write(columns) 39 | f.close() 40 | dataframe = pd.read_csv(file_path) 41 | return dataframe 42 | 43 | 44 | def show_error(message): 45 | print 46 | print '*' * 50 47 | print 'Error: {}'.format(message) 48 | sys.exit(2) 49 | 50 | 51 | def get_current_directory(): 52 | current_path = os.getcwd() 53 | return current_path 54 | 55 | 56 | def change_directory(directory): 57 | os.chdir(directory) 58 | 59 | 60 | def load_from_csv(modes_files_path, file_path, separated=False): 61 | mode_file_path = modes_files_path + '/' + file_path 62 | mode_file = pd.read_csv(mode_file_path, delimiter=',') 63 | if separated is True: 64 | mode_feature_vectors = mode_file.ix[:, :-1] 65 | mode_class_labels = mode_file.ix[:, -1] 66 | 67 | return mode_feature_vectors, mode_class_labels 68 | else: 69 | return mode_file 70 | 71 | 72 | def get_directories(directory_path): 73 | return [join_path(directory_path, i) for i in os.listdir(directory_path) if not i.startswith('.')] 74 | 75 | 76 | def remove_file(file_name): 77 | os.remove(file_name) 78 | 79 | 80 | def remove_directory(dir_name): 81 | if exist_dir(dir_name): 82 | shutil.rmtree(dir_name) 83 | 84 | 85 | def move_file(src_file, dst_file): 86 | move(src_file, dst_file) 87 | 88 | 89 | def copy_file(src_file, dst_file): 90 | copy(src_file, dst_file) 91 | 92 | 93 | def get_parent_dir(file_name): 94 | return os.path.dirname(file_name) 95 | 96 | 97 | def is_file(file_name): 98 | return os.path.isfile(file_name) 99 | 100 | 101 | def get_files_in_dir(dir, ext='', search_spaces=[], start=''): 102 | files = [] 103 | for path, sub_dirs, file_names in os.walk(dir): 104 | 105 | for file_name in file_names: 106 | if file_name.endswith(ext) and file_name.startswith(start): 107 | in_search_space = False 108 | if len(search_spaces) == 0: 109 | in_search_space = True 110 | for search_space in search_spaces: 111 | if search_space in path: 112 | in_search_space = True 113 | if not in_search_space: 114 | continue 115 | files.append(join_path(path, file_name)) 116 | 117 | return files 118 | 119 | 120 | def get_cfiles_compile_db(compile_db): 121 | cfiles = {} 122 | for json_values in compile_db: 123 | if 'file' in json_values: 124 | file_path = join_path(json_values['directory'], json_values['file']) 125 | # print file_path 126 | if file_path.endswith(".c"): 127 | final_args = [] 128 | if 'arguments' in json_values: 129 | args = list(json_values['arguments']) 130 | else: 131 | args = [item for item in json_values['command'].split() if not item.endswith('.o') and not item.endswith('.c')] 132 | remove_items = ('-c', 'cc', '-o', '-g') 133 | for remove_item in remove_items: 134 | if remove_item in args: 135 | args.remove(remove_item) 136 | 137 | for i in range(len(args)): 138 | arg = args[i] 139 | if arg.startswith('-I.') or arg.startswith('-I..'): 140 | arg = '-I' + join_path(json_values['directory'], arg[2:]) 141 | if arg == '.' 
or arg == '..': 142 | arg = join_path(json_values['directory'], arg) 143 | final_args.append(arg) 144 | 145 | cfiles[file_path] = final_args 146 | # print file_path, final_args 147 | else: 148 | print "Not a C file:", file_path 149 | 150 | return cfiles 151 | 152 | 153 | def read_file(file_name): 154 | with open(file_name, 'r') as f: 155 | return f.read() 156 | 157 | 158 | def write_file(file_name, content): 159 | with open(file_name, 'w') as f: 160 | f.write(content) 161 | f.close() 162 | 163 | 164 | def write_file_json(file_name, content): 165 | with open(file_name, 'w') as f: 166 | json.dump(content, f) 167 | f.close() 168 | 169 | 170 | def get_filename_without_ext(file_name): 171 | return os.path.splitext(file_name)[0] 172 | 173 | 174 | def load_json(file_path): 175 | return json.load(open(file_path)) 176 | 177 | 178 | def load_json_file(file_path): 179 | with open(file_path) as f: 180 | json_content = eval(f.read()) 181 | return json_content 182 | 183 | 184 | def get_arguments(file_path, json_data): 185 | for json_values in json_data: 186 | if 'file' in json_values: 187 | if join_path(json_values['directory'], json_values['file']) == file_path: 188 | args = list(json_values['arguments']) 189 | for i in range(len(args)): 190 | arg = args[i] 191 | if arg.startswith('-I./'): 192 | arg = '-I{}'.format(arg[4:]) 193 | if arg.startswith('-I..'): 194 | parent_include = json_values['directory'] 195 | include_path = arg[2:] 196 | for item in range(arg.count('..')): 197 | parent_include = get_parent_dir(parent_include) 198 | include_path = include_path[3:] 199 | parent_include = join_path(parent_include, include_path) 200 | args[i] = '-I{}'.format(parent_include) 201 | remove_items = ('-c', 'cc', '-o') 202 | for item in remove_items: 203 | if item in args: 204 | args.remove(item) 205 | return args 206 | 207 | # print file_path 208 | return None 209 | 210 | 211 | def check_missing_files(c_files, json_data): 212 | for json_values in json_data: 213 | flag = 0 214 | for c_file in c_files: 215 | if 'file' in json_values: 216 | if join_path(json_values['directory'], json_values['file']) == c_file: 217 | flag = 1 218 | if flag == 0: 219 | if 'file' in json_values: 220 | print join_path(json_values['directory'], json_values['file']) 221 | elif 'files' in json_values: 222 | print json_values['directory'], json_values['files'] 223 | 224 | 225 | def read_lines(file_path): 226 | with open(file_path) as f: 227 | content = f.readlines() 228 | return [x.strip() for x in content] 229 | 230 | 231 | def read_csv_header(file_path): 232 | first_row = pd.read_csv(file_path, index_col=0, nrows=1) 233 | return first_row.columns.values 234 | 235 | 236 | class bcolors: 237 | HEADER = '\033[95m' 238 | OKBLUE = '\033[94m' 239 | OKGREEN = '\033[92m' 240 | WARNING = '\033[93m' 241 | FAIL = '\033[91m' 242 | ENDC = '\033[0m' 243 | BOLD = '\033[1m' 244 | UNDERLINE = '\033[4m' 245 | GREY = '\033[90m' 246 | -------------------------------------------------------------------------------- /utils/progress.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | class Progress: 5 | def __init__(self, total, message='progress'): 6 | self.message = message 7 | self.total = total 8 | self.current = 0 9 | 10 | def next(self, step=1, extra_message=''): 11 | bar_length, status = 20, "" 12 | self.current += step 13 | progress = float(self.current) / float(self.total) 14 | if progress >= 1.: 15 | progress = 1 16 | block = int(round(bar_length * progress)) 17 | text = "\r{} [{}] {:.0f}% 
{} {}".format(self.message, 18 | "#" * block + "-" * (bar_length - block), round(progress * 100, 0), 19 | status, extra_message) 20 | sys.stdout.write(text) 21 | sys.stdout.flush() 22 | 23 | @staticmethod 24 | def finish(): 25 | sys.stdout.write("\n") 26 | sys.stdout.flush() 27 | 28 | @staticmethod 29 | def print_counter(counter, message): 30 | text = "\r{}: {}".format(message, counter) 31 | sys.stdout.write(text) 32 | sys.stdout.flush() 33 | --------------------------------------------------------------------------------