├── .gitmodules ├── LICENSE ├── README.md ├── __init__.py ├── act ├── __init__.py ├── act.py ├── actionfactory.py ├── actiontype.py ├── asbuilder.py ├── astbuilder.py ├── bcbuilder.py ├── cluster.py ├── featureextractor.py ├── getstatistics.py ├── pdgbuilder.py ├── queryinconsistency.py └── sampledownloader.py ├── argsparser.py ├── arguments.py ├── iBench └── groundtruth.py ├── install.sh ├── learning ├── __init__.py ├── clustering.py ├── graph2vec │ ├── __init__.py │ ├── corpus_parser.py │ ├── graph2vec.py │ ├── parallelgraph2vec.py │ ├── skipgram.py │ ├── train_utils.py │ └── utils.py ├── graphkernel │ ├── __init__.py │ └── weisfeiler_lehman.py ├── node2vec │ ├── __init__.py │ ├── main.py │ └── node2vec.py ├── similarity.py ├── statistics.py └── struc2vec │ ├── algorithms.py │ ├── algorithms_distances.py │ ├── graph.py │ ├── main.py │ ├── struc2vec.py │ └── utils.py ├── requirements.txt ├── sample ├── __init__.py ├── astfile.py ├── bitcodefile.py ├── languagetype.py ├── projectcode.py ├── slicer.py └── sourcefile.py ├── scripts ├── get_inconsistencies.sh ├── get_inconsistencies_NN_G2v.sh ├── get_inconsistencies_g2v.sh ├── get_inconsistencies_real_programs.sh └── get_inconsistencies_real_programs_NN_G2v.sh ├── settings-bak.py ├── settings.py ├── ssh_private_key_password.py └── utils ├── __init__.py ├── computation.py ├── inout.py └── progress.py /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "dg"] 2 | path = dg 3 | url = https://github.com/ManSoSec/dg 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FICS 2 | 3 | 4 | # Download & configure FICS 5 | 6 | 1. Clone the repository 7 | - For example: mkdir /home/mansour/code 8 | - cd /home/mansour/code 9 | - ```git clone --recurse-submodules https://github.com/RiS3-Lab/FICS.git``` 10 | - cd FICS 11 | 2. ```sh install.sh``` 12 | 3. create a directory as the root of your data (e.g., source code, bitcodes, graphs, etc.) 
13 | - For example: mkdir /home/mansour/data 14 | - cd /home/mansour/data 15 | - create a directory inside and name it 'projects': mkdir projects 16 | - cd /home/mansour/data/projects 17 | 4. Modify settings.py and update DATA_DIR to the root of your data 18 | - For example: DATA_DIR = '/home/mansour/data' 19 | 20 | # Prepare target codebase 21 | 22 | 5. In the "projects" directory, clone the source code of a codebase you target: 23 | - For example: git clone https://gitlab.com/libtiff/libtiff.git libtiff-19f6b70 24 | - cd libtiff-19f6b70 25 | - git checkout 19f6b70 . 26 | 6. Compile the project with clang-3.8 and generate a compilation database (FICS only supports clang 3.8 and llvm 3.8) 27 | - For example: cmake -D CMAKE_C_COMPILER="/usr/bin/clang-3.8" -D CMAKE_CXX_COMPILER="/usr/bin/clang++-3.8" . 28 | - generate the compilation database: bear make 29 | 30 | # Discover the inconsistencies 31 | 32 | 7. Run FICS on the target codebase: 33 | - For example: ```sh scripts/get_inconsistencies_real_programs_NN_G2v.sh libtiff-19f6b70 p ns``` 34 | - If you need to run FICS on larger projects like QEMU, change 'ns' to 's' so that FICS splits the codebase into submodules 35 | - *The inconsistencies are saved in MongoDB* 36 | 37 | # Query the found inconsistencies!!! 38 | 8. To query the saved inconsistencies, you need to run the following command: 39 | - ```python __init__.py -a=QI -p=libtiff-19f6b70 -it=check -f``` 40 | - "-it" argument is the inconsistency type and can be: check | call | type | store | order | all 41 | - If you need to disable filtering, just remove -f 42 | 43 | # Here is the list of bugs found by FICS 44 | 45 | | Bug | Link | 46 | | ------------- | ------------- | 47 | | Codebase | OpenSSL | 48 | | Missing check | [Report/Patch](https://github.com/openssl/openssl/issues/7650) | 49 | | Missing check | [Patch](https://github.com/openssl/openssl/pull/7427)| 50 | | Wrong use of clear_free | [Report/Patch](https://github.com/openssl/openssl/issues/10406)| 51 | | Null dereference | [Report/Patch](https://github.com/openssl/openssl/issues/10404)| 52 | | Null dereference | [Report/Patch](https://github.com/openssl/openssl/issues/10405)| 53 | | Inconsistent Check | [Report/Patch](https://github.com/openssl/openssl/pull/7880)| 54 | | Memory Leak | [Report/Patch](https://github.com/openssl/openssl/issues/10294)| 55 | | Missing clear_free | [Report/Patch](https://github.com/openssl/openssl/issues/7657)| 56 | | Codebase | QEMU | 57 | | 2 Missing checks | [Report/Patch](https://patchew.org/QEMU/20200414133052.13712-1-philmd@redhat.com/20200414133052.13712-11-philmd@redhat.com/) | 58 | | Undefined Behaviour | [Report](https://lists.gnu.org/archive/html/qemu-devel/2020-03/msg05749.html)/[Patch](https://patchwork.kernel.org/patch/11446203/) | 59 | | Uninitialized variable | [Report/Patch](https://lists.gnu.org/archive/html/qemu-trivial/2020-03/msg00239.html) | 60 | | Codebase | LibTIFF | 61 | | Missing checks | [Patch](https://gitlab.com/libtiff/libtiff/-/merge_requests/96) 62 | | Mislocated check - Bad casting | [Report/Patch](https://gitlab.com/libtiff/libtiff/-/issues/162)| 63 | | Missing TIFFClose | [Report/Patch](https://gitlab.com/libtiff/libtiff/-/issues/171) 64 | | Codebase | wolfSSL | 65 | | Missing check | [Report/Patch](https://github.com/wolfSSL/wolfssl/issues/2038) | 66 | | Missing check | [Report/Patch](https://github.com/wolfSSL/wolfssl/issues/2037)| 67 | | Memory exhaustion | [Report/Patch](https://github.com/wolfSSL/wolfssl/issues/2527)| 68 | | Codebase | OpenSSH | 69 | | Missing bzero | 
[Patch](https://github.com/openssh/openssh-portable/commit/2d1428b11c8b6f616f070f2ecedce12328526944)| 70 | | Codebase | libredwg | 71 | | Bad casting (Overflow) | [Report](https://github.com/LibreDWG/libredwg/issues/174)/[Patch](https://github.com/LibreDWG/libredwg/commit/631bbacb3e18403db1015ef4063c3d19e9c8e11a) | 72 | | Null dereference | [Report](https://github.com/LibreDWG/libredwg/issues/172)/[Patch](https://github.com/LibreDWG/libredwg/commit/373c8e4849f2013d7123913bca8edb35ff6bc3d6) | 73 | | Null dereference | [Report](https://github.com/LibreDWG/libredwg/issues/173)/[Patch](https://github.com/LibreDWG/libredwg/commit/373c8e4849f2013d7123913bca8edb35ff6bc3d6) | 74 | | Codebase | TCPdump | 75 | | Missing initialization | [Report](https://github.com/the-tcpdump-group/tcpdump/issues/801) | 76 | 77 | # Citation 78 | 79 | If you found FICS useful for your research, please cite the following paper: 80 | 81 | ```Latex 82 | @inproceedings{fics, 83 | abstract = { 84 | Probabilistic classification has shown success in detecting known types of software bugs. However, the works following this approach tend to require a large amount of specimens to train their models. We present a new machine learning-based bug detection technique that does not require any external code or samples for training. Instead, our technique learns from the very codebase on which the bug detection is performed, and therefore, obviates the need for the cumbersome task of gathering and cleansing training samples (e.g., buggy code of certain kinds). The key idea behind our technique is a novel two-step clustering process applied on a given codebase. This clustering process identifies code snippets in a project that are functionally-similar yet appear in inconsistent forms. Such inconsistencies are found to cause a wide range of bugs, anything from missing checks to unsafe type conversions. Unlike previous works, our technique is generic and not specific to one type of inconsistency or bug. We prototyped our technique and evaluated it using 5 popular open source software, including QEMU and OpenSSL. With a minimal amount of manual analysis on the inconsistencies detected by our tool, we discovered 22 new unique bugs, despite the fact that many of these programs are constantly undergoing bug scans and new bugs in them are believed to be rare. 
85 | }, 86 | author = {Ahmadi, Mansour and Mirzazade Farkhani, Reza and Williams, Ryan and Lu, Long}, 87 | booktitle = {Proceedings of the 30th USENIX Security Symposium}, 88 | month = {August}, 89 | series = {USENIX Security'21}, 90 | title = {Finding Bugs Using Your Own Code: Detecting Functionally-similar yet Inconsistent Code}, 91 | year = {2021} 92 | } 93 | ``` 94 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from act.actionfactory import ActionFactory 4 | from argsparser import ArgsParser 5 | 6 | if __name__ == '__main__': 7 | reload(sys) 8 | sys.setdefaultencoding('utf8') 9 | args_parser = ArgsParser() 10 | args_parser.parse() 11 | args_parser.do_basic_checks() 12 | ActionFactory(args_parser.arguments).perform_actions() 13 | -------------------------------------------------------------------------------- /act/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiS3-Lab/FICS/82c8abef52ca943946b7e82a16998cf67f1d2049/act/__init__.py -------------------------------------------------------------------------------- /act/act.py: -------------------------------------------------------------------------------- 1 | class Act: 2 | 3 | def __init__(self, arguments): 4 | self.arguments = arguments 5 | 6 | def start(self): 7 | pass 8 | -------------------------------------------------------------------------------- /act/actionfactory.py: -------------------------------------------------------------------------------- 1 | from asbuilder import ASBuilder 2 | from pdgbuilder import PDGBuilder 3 | from bcbuilder import BCBuilder 4 | from cluster import Cluster 5 | from actiontype import ActionType 6 | from featureextractor import FeatureExtractor 7 | from astbuilder import ASTBuilder 8 | from getstatistics import GetStatistics 9 | from queryinconsistency import QueryInconsistency 10 | 11 | 12 | class ActionFactory: 13 | 14 | def __init__(self, arguments): 15 | self.arguments = arguments 16 | self.action = None 17 | 18 | def perform_actions(self): 19 | 20 | for action in self.arguments.actions: 21 | if action == ActionType.AST.name: 22 | print '=======================================' 23 | print '| Retrieve ASTs from the source codes |' 24 | print '=======================================' 25 | self.start(ASTBuilder(arguments=self.arguments)) 26 | elif action == ActionType.BC.name: 27 | print '=======================================' 28 | print '| Retrieve bc from the source codes |' 29 | print '=======================================' 30 | self.start(BCBuilder(arguments=self.arguments)) 31 | elif action == ActionType.PDG.name: 32 | print '=======================================' 33 | print '| Retrieve PDG from the LLVM bitcodes |' 34 | print '=======================================' 35 | self.start(PDGBuilder(arguments=self.arguments)) 36 | elif action == ActionType.AS.name: 37 | print '=======================================' 38 | print '| Extract Abstract Slices from PDG |' 39 | print '=======================================' 40 | self.start(ASBuilder(arguments=self.arguments)) 41 | elif action == ActionType.FE.name: 42 | print '=======================================' 43 | print '| Extract features |' 44 | print '=======================================' 45 | self.start(FeatureExtractor(arguments=self.arguments)) 46 | elif action == ActionType.MC.name: 47 | print 
'=======================================' 48 | print '| Cluster samples |' 49 | print '=======================================' 50 | self.start(Cluster(arguments=self.arguments)) 51 | elif action == ActionType.ST.name: 52 | print '=======================================' 53 | print '| Print Clusters stats |' 54 | print '=======================================' 55 | self.start(GetStatistics(arguments=self.arguments)) 56 | elif action == ActionType.QI.name: 57 | print '=======================================' 58 | print '| Query Inconsistencies |' 59 | print '=======================================' 60 | self.start(QueryInconsistency(arguments=self.arguments)) 61 | 62 | def start(self, action): 63 | action.start() 64 | 65 | -------------------------------------------------------------------------------- /act/actiontype.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class ActionType(Enum): 5 | AST = 'Retrieve Abstract Syntax Tree' 6 | BC = 'Retrieve bitcode' 7 | PDG = 'Retrieve Program Dependence Graph' 8 | AS = 'Extract Abstract Forward Slices' 9 | FE = 'Feature Extraction' 10 | MC = 'Model Construction' 11 | ST = 'Get cluster statistics' 12 | QI = 'Query inconsistencies' 13 | 14 | @staticmethod 15 | def get_names(): 16 | return [e.name for e in ActionType] 17 | 18 | @staticmethod 19 | def get_detail(): 20 | return ['{}: {}'.format(e.name, e.value) for e in ActionType] 21 | -------------------------------------------------------------------------------- /act/asbuilder.py: -------------------------------------------------------------------------------- 1 | 2 | from act import Act 3 | from sample.projectcode import ProjectCode 4 | from utils.inout import * 5 | 6 | 7 | class ASBuilder(Act): 8 | 9 | def start(self): 10 | projects_dir = join_path(self.arguments.data_dir, self.arguments.projects_dir) 11 | dir_names = get_directories(projects_dir) 12 | for dir_name in dir_names: 13 | for project_name in self.arguments.projects: 14 | if get_basename(dir_name) == project_name or len(self.arguments.projects) == 0: 15 | print 'Analyzing {}'.format(get_basename(dir_name)) 16 | project_code = ProjectCode(project_dir=dir_name, arguments=self.arguments) 17 | project_code.retrieve_as() 18 | -------------------------------------------------------------------------------- /act/astbuilder.py: -------------------------------------------------------------------------------- 1 | 2 | from act import Act 3 | from sample.projectcode import ProjectCode 4 | from utils.inout import * 5 | 6 | 7 | class ASTBuilder(Act): 8 | 9 | def start(self): 10 | projects_dir = join_path(self.arguments.data_dir, self.arguments.projects_dir) 11 | dir_names = get_directories(projects_dir) 12 | for dir_name in dir_names: 13 | for project_name in self.arguments.projects: 14 | if get_basename(dir_name) == project_name or len(self.arguments.projects) == 0: 15 | print 'Analyzing {}'.format(get_basename(dir_name)) 16 | project_code = ProjectCode(project_dir=dir_name, arguments=self.arguments) 17 | project_code.retrieve_ast() 18 | -------------------------------------------------------------------------------- /act/bcbuilder.py: -------------------------------------------------------------------------------- 1 | 2 | from act import Act 3 | from sample.projectcode import ProjectCode 4 | from utils.inout import * 5 | 6 | 7 | class BCBuilder(Act): 8 | 9 | def start(self): 10 | projects_dir = join_path(self.arguments.data_dir, self.arguments.projects_dir) 11 | dir_names = 
get_directories(projects_dir) 12 | for dir_name in dir_names: 13 | for project_name in self.arguments.projects: 14 | if get_basename(dir_name) == project_name or len(self.arguments.projects) == 0: 15 | print 'Analyzing {}'.format(get_basename(dir_name)) 16 | project_code = ProjectCode(project_dir=dir_name, arguments=self.arguments) 17 | if self.arguments.prepare: 18 | project_code.prepare_bc() 19 | else: 20 | project_code.retrieve_bc() 21 | # project_code.link_bc_files() 22 | -------------------------------------------------------------------------------- /act/featureextractor.py: -------------------------------------------------------------------------------- 1 | from act import Act 2 | from sample.projectcode import ProjectCode 3 | from utils.inout import * 4 | from timeit import default_timer 5 | 6 | 7 | class FeatureExtractor(Act): 8 | 9 | def start(self): 10 | projects_dir = join_path(self.arguments.data_dir, self.arguments.bcs_dir) 11 | datasets_dir = join_path(self.arguments.data_dir, self.arguments.datasets_dir) 12 | dir_names = get_directories(projects_dir) 13 | # feature_types = self.arguments.feature_types.split(',') 14 | for feature_type in self.arguments.feature_types: 15 | for dir_name in dir_names: 16 | for project_name in self.arguments.projects: 17 | if get_basename(dir_name) == project_name or len(self.arguments.projects) == 0: 18 | if self.arguments.split != 'True': 19 | print 'Extracting {} features for {}'.format(feature_type, get_basename(dir_name)) 20 | project_code = ProjectCode(project_dir=dir_name, arguments=self.arguments, 21 | feature_type=feature_type) 22 | start_time = default_timer() 23 | project_code.extract_features(save=True) 24 | elapsed_time = default_timer() - start_time 25 | time_file = join_path(datasets_dir, get_basename(dir_name), 26 | '{}.feature_extraction.time.txt'.format( 27 | feature_type)) 28 | print 'Feature Extraction Time:', elapsed_time 29 | write_file(time_file, '{}'.format(elapsed_time)) 30 | # project_code.save_features() 31 | else: 32 | for module in get_directories(dir_name): 33 | print 'Module:', get_basename(module) 34 | project_code = ProjectCode(project_dir=dir_name, arguments=self.arguments, 35 | feature_type=feature_type, module_name=get_basename(module)) 36 | 37 | start_time = default_timer() 38 | project_code.extract_features(save=True) 39 | elapsed_time = default_timer() - start_time 40 | if project_code.num_abstract_slices != 0: 41 | time_file = join_path(datasets_dir, get_basename(dir_name), 42 | '{}.{}.feature_extraction.time.txt'.format( 43 | get_basename(module), feature_type)) 44 | print 'Feature Extraction Time:', elapsed_time 45 | write_file(time_file, '{}'.format(elapsed_time)) 46 | # project_code.save_features() 47 | -------------------------------------------------------------------------------- /act/getstatistics.py: -------------------------------------------------------------------------------- 1 | from act import Act 2 | from learning.statistics import Statistics 3 | from utils.inout import * 4 | 5 | 6 | class GetStatistics(Act): 7 | 8 | def start(self): 9 | if self.arguments.stat_type == 'VI': 10 | datasets_dir = join_path(self.arguments.data_dir, self.arguments.datasets_dir) 11 | cluster_files = get_files_in_dir(datasets_dir, ext='.clusters.txt') 12 | for cluster_file in cluster_files: 13 | for project_name in self.arguments.projects: 14 | if get_basename(get_parent_dir(get_parent_dir(cluster_file))) == project_name or \ 15 | len(self.arguments.projects) == 0: 16 | print cluster_file 17 | statistics = 
Statistics(arguments=self.arguments, 18 | project_clusters_info_file=cluster_file) 19 | statistics.print_vul_info() 20 | 21 | elif self.arguments.stat_type == 'SI': 22 | projects_dir = join_path(self.arguments.data_dir, self.arguments.bcs_dir) 23 | dir_names = get_directories(projects_dir) 24 | for dir_name in dir_names: 25 | for project_name in self.arguments.projects: 26 | if get_basename(dir_name) == project_name or len(self.arguments.projects) == 0: 27 | statistics = Statistics(arguments=self.arguments, project_dir=dir_name) 28 | statistics.print_slices_info() 29 | 30 | elif self.arguments.stat_type == 'SS': 31 | projects_dir = join_path(self.arguments.data_dir, self.arguments.projects_dir) 32 | dir_names = get_directories(projects_dir) 33 | for dir_name in dir_names: 34 | for project_name in self.arguments.projects: 35 | if get_basename(dir_name) == project_name or len(self.arguments.projects) == 0: 36 | statistics = Statistics(arguments=self.arguments, project_dir=dir_name) 37 | statistics.print_slices_similarities() 38 | 39 | elif self.arguments.stat_type == 'ST': 40 | projects_dir = join_path(self.arguments.data_dir, self.arguments.projects_dir) 41 | time_data_min = {} 42 | time_data_hour = {} 43 | project_name_mapping = {'libpcap-545e77d8': 'libpcap', 'libtiff-19f6b70d': 'libtiff', 44 | 'mbedtls-0592ea7': 'mbedtls', 'openssh-c2fa53c': 'openssh', 45 | 'openssl-a75be9f': 'openssl', 'nginx-0098761': 'nginx', 46 | 'wolfssl-c26cb53': 'wolfssl'} 47 | for project_name in self.arguments.projects: 48 | project_dir = join_path(projects_dir, project_name) 49 | statistics = Statistics(arguments=self.arguments, project_dir=project_dir) 50 | 51 | if project_name in project_name_mapping.keys(): 52 | project_name = project_name_mapping[project_name] 53 | time_data_min[project_name], time_data_hour[project_name] = statistics.print_performance_time() 54 | 55 | Statistics.draw_bar_chart(self.arguments, time_data_min, time_data_hour) 56 | -------------------------------------------------------------------------------- /act/pdgbuilder.py: -------------------------------------------------------------------------------- 1 | 2 | from act import Act 3 | from sample.projectcode import ProjectCode 4 | from utils.inout import * 5 | 6 | 7 | class PDGBuilder(Act): 8 | 9 | def start(self): 10 | projects_dir = join_path(self.arguments.data_dir, self.arguments.projects_dir) 11 | dir_names = get_directories(projects_dir) 12 | for dir_name in dir_names: 13 | for project_name in self.arguments.projects: 14 | if get_basename(dir_name) == project_name or len(self.arguments.projects) == 0: 15 | print 'Analyzing {}'.format(get_basename(dir_name)) 16 | project_code = ProjectCode(project_dir=dir_name, arguments=self.arguments) 17 | project_code.retrieve_pdg() 18 | -------------------------------------------------------------------------------- /act/sampledownloader.py: -------------------------------------------------------------------------------- 1 | 2 | from act import Act 3 | 4 | 5 | class SampleDownloader(Act): 6 | 7 | def start(self): 8 | pass 9 | -------------------------------------------------------------------------------- /argsparser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from act.actiontype import ActionType 4 | from arguments import Arguments 5 | from sample.languagetype import LanguageType 6 | from settings import * 7 | from utils import inout 8 | 9 | 10 | class ArgsParser: 11 | def __init__(self): 12 | self.parser = 
argparse.ArgumentParser() 13 | self.init_arguments() 14 | self.arguments = None 15 | 16 | def parse(self): 17 | args, unparsed = self.parser.parse_known_args() 18 | self.arguments = Arguments( 19 | actions=args.actions.split(','), 20 | languages=args.languages.split(','), 21 | data_dir=args.data_dir, 22 | projects_dir=args.projects_dir, 23 | asts_dir=args.asts_dir, 24 | bcs_dir=BCS_DIR, 25 | datasets_dir=DATASETS_DIR, 26 | plots_dir=PLOTS_DIR, 27 | search_spaces=SEARCH_SPACES, 28 | clang_lib_dir=args.clang_lib_dir, 29 | clustering_algs=args.clustering_algs.split(','), 30 | clustering_feat=args.clustering_feat.split(','), 31 | second_clustering=args.second_clustering, 32 | cose_similarity_chunk_size=args.cose_similarity_chunk_size, 33 | big_clusters_ignore=args.big_clusters_ignore, 34 | chunk_window_size=args.chunk_window_size, 35 | split=args.split, 36 | projects=args.projects.split(','), 37 | save_format=SAVE_FORMAT, 38 | ignore_compile_commands=IGNORE_COMPILE_COMMANDS, 39 | feature_types=args.feature_types.split(','), 40 | llvm_config=LLVM_CONFIG, 41 | includes=args.includes.split(','), 42 | pdg_dumper=PDG_DUMPER, 43 | clang=CLANG, 44 | stat_type=STAT_TYPE, 45 | stat_sim_types=STAT_SIM_TYPES.split(','), 46 | has_control_flow=args.has_control_flow, 47 | inconsistency_type=args.inconsistency_type, 48 | similarity_threshold=args.similarity_threshold, 49 | granularity=args.granularity.split(','), 50 | dependency=args.dependency, 51 | call_inconsistency=args.call_inconsistency.split(','), 52 | type_inconsistency=args.type_inconsistency.split(','), 53 | store_inconsistency=args.store_inconsistency.split(','), 54 | inconsistency_query_options=args.inconsistency_query_options, 55 | ssh=args.ssh, 56 | filtering=args.filtering, 57 | count_cpu=args.count_cpu, 58 | ids=args.ids.split(','), 59 | starting_report_item=args.starting_report_item, 60 | prepare=args.prepare 61 | ) 62 | 63 | def init_arguments(self): 64 | self.parser.add_argument( 65 | '--actions', 66 | '-a', 67 | type=str, 68 | default=ACTIONS, 69 | help='Action must be among these: {}'.format(ActionType.get_detail()) 70 | ) 71 | 72 | self.parser.add_argument( 73 | '--languages', 74 | '-l', 75 | type=str, 76 | default=LANGUAGES, 77 | help='Target language must be among these: {}'.format(LanguageType.get_detail()) 78 | ) 79 | 80 | self.parser.add_argument( 81 | '--data_dir', 82 | '-dd', 83 | type=str, 84 | default=DATA_DIR, 85 | help='Base directory of the data' 86 | ) 87 | 88 | self.parser.add_argument( 89 | '--projects_dir', 90 | '-pd', 91 | type=str, 92 | default=PROJECTS_DIR, 93 | help='Base directory of the projects' 94 | ) 95 | 96 | self.parser.add_argument( 97 | '--asts_dir', 98 | '-ad', 99 | type=str, 100 | default=ASTS_DIR, 101 | help='Base directory of the ast of files' 102 | ) 103 | 104 | self.parser.add_argument( 105 | '--clang_lib_dir', 106 | '-cld', 107 | type=str, 108 | default=CLANG_LIB_DIR, 109 | help='Base directory of the clang library files' 110 | ) 111 | 112 | self.parser.add_argument( 113 | '--projects', 114 | '-p', 115 | type=str, 116 | default=PROJECTS, 117 | help='An array containing list of checking projects' 118 | ) 119 | 120 | self.parser.add_argument( 121 | '--clustering_algs', 122 | '-ca', 123 | type=str, 124 | default=CLUSTERING_ALGS, 125 | help='An array containing list of clustering algorithms and their thresholds' 126 | ) 127 | 128 | self.parser.add_argument( 129 | '--clustering_feat', 130 | '-cf', 131 | type=str, 132 | default=CLUSTERING_FEAT, 133 | help='An array containing list of clustering features' 
134 | ) 135 | 136 | self.parser.add_argument( 137 | '--second_clustering', 138 | '-sc', 139 | type=str, 140 | default=SECOND_CLUSTERING, 141 | help='Type of second clustering, online vs offline' 142 | ) 143 | 144 | self.parser.add_argument( 145 | '--cose_similarity_chunk_size', 146 | '-cscs', 147 | type=int, 148 | default=COSE_SIMILARITY_CHUNK_SIZE, 149 | help='Batch size when compute cosine similarity, depends to available RAM' 150 | ) 151 | 152 | self.parser.add_argument( 153 | '--big_clusters_ignore', 154 | '-bci', 155 | type=int, 156 | default=BIG_CLUSTERS_IGNORE, 157 | help='Size of big clusters that should be ignored from the first step clustering' 158 | ) 159 | 160 | self.parser.add_argument( 161 | '--chunk_window_size', 162 | '-cws', 163 | type=int, 164 | default=CHUNK_WINDOW_SIZE, 165 | help='Size of basic block window from data flows' 166 | ) 167 | 168 | self.parser.add_argument( 169 | '--split', 170 | '-s', 171 | type=str, 172 | default=SPLIT, 173 | help='A boolean value use for splitting a project' 174 | ) 175 | 176 | self.parser.add_argument( 177 | '--feature_types', 178 | '-ft', 179 | type=str, 180 | default=FEATURE_TYPES, 181 | help='An array containing list of feature types' 182 | ) 183 | 184 | self.parser.add_argument( 185 | '--includes', 186 | '-i', 187 | type=str, 188 | default=INCLUDES, 189 | help='An array containing list of checking projects' 190 | ) 191 | 192 | self.parser.add_argument( 193 | '--has_control_flow', 194 | '-hcf', 195 | action='store_true', 196 | help='If true, it considers control flow as well during construct extraction' 197 | ) 198 | 199 | self.parser.add_argument( 200 | '--inconsistency_type', 201 | '-it', 202 | type=str, 203 | default=INCONSISTENCY_TYPE, 204 | help='show the result of a type of inconsistency' 205 | ) 206 | 207 | self.parser.add_argument( 208 | '--similarity_threshold', 209 | '-st', 210 | type=float, 211 | default=SIMILARITY_THRESHOLD, 212 | help='Only show the inconsistencies having a similarity greater than a threshold' 213 | ) 214 | 215 | self.parser.add_argument( 216 | '--granularity', 217 | '-g', 218 | type=str, 219 | default=GRANULARITY, 220 | help='Granularity of the construct' 221 | ) 222 | 223 | self.parser.add_argument( 224 | '--dependency', 225 | '-d', 226 | type=str, 227 | default=DEPENDENCY, 228 | help='Dependency of the construct' 229 | ) 230 | 231 | self.parser.add_argument( 232 | '--call_inconsistency', 233 | '-ci', 234 | type=str, 235 | default=CALL_INCONSISTENCY, 236 | help='Select the inconsistencies containing specific calls' 237 | ) 238 | 239 | self.parser.add_argument( 240 | '--type_inconsistency', 241 | '-ti', 242 | type=str, 243 | default=TYPE_INCONSISTENCY, 244 | help='Select the inconsistencies containing specific types' 245 | ) 246 | 247 | self.parser.add_argument( 248 | '--store_inconsistency', 249 | '-sti', 250 | type=str, 251 | default=STORE_INCONSISTENCY, 252 | help='Select the inconsistencies containing specific stores' 253 | ) 254 | 255 | self.parser.add_argument( 256 | '--inconsistency_query_options', 257 | '-iqo', 258 | type=str, 259 | default=INCONSISTENCY_QUERY_OPTIONS, 260 | help='Set specific options during querying the inconsistencies' 261 | ) 262 | 263 | self.parser.add_argument( 264 | '--ssh', 265 | '-ssh', 266 | action='store_true', 267 | help='If it needs to connect to a remote mongodb server' 268 | ) 269 | 270 | self.parser.add_argument( 271 | '--filtering', 272 | '-f', 273 | action='store_true', 274 | help='Filter the less potential inconsistencies' 275 | ) 276 | 277 | 
self.parser.add_argument( 278 | '--count_cpu', 279 | '-cc', 280 | type=str, 281 | default=COUNT_CPU, 282 | help='Number of cores' 283 | ) 284 | 285 | self.parser.add_argument( 286 | '--ids', 287 | '-ids', 288 | type=str, 289 | default='', 290 | help='IDs of inconsistencies' 291 | ) 292 | 293 | self.parser.add_argument( 294 | '--starting_report_item', 295 | '-si', 296 | type=int, 297 | default=1, 298 | help='Show inconsistencies starting from specific item' 299 | ) 300 | 301 | self.parser.add_argument( 302 | '--prepare', 303 | '-pp', 304 | action='store_true', 305 | help='If bitcodes are given, prepare them for pdg extraction' 306 | ) 307 | 308 | def do_basic_checks(self): 309 | 310 | possible_actions = ActionType.get_names() 311 | for action in self.arguments.actions: 312 | if action not in possible_actions: 313 | self.parser.print_help() 314 | inout.show_error('action argument is wrong!\n') 315 | 316 | possible_languages = LanguageType.get_names() 317 | for language in self.arguments.languages: 318 | if language not in possible_languages: 319 | self.parser.print_help() 320 | inout.show_error('language argument is wrong!\n') 321 | 322 | if self.arguments.data_dir is None or self.arguments.data_dir == '' or \ 323 | not inout.exist_dir(self.arguments.data_dir): 324 | self.parser.print_help() 325 | inout.show_error('data_dir argument is not valid!\n') 326 | 327 | if self.arguments.projects_dir is None or self.arguments.projects_dir == '' or \ 328 | not inout.exist_dir(inout.join_path(self.arguments.data_dir, self.arguments.projects_dir)): 329 | self.parser.print_help() 330 | inout.show_error('projects_dir argument is not valid!\n') 331 | 332 | -------------------------------------------------------------------------------- /arguments.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | 3 | class Arguments: 4 | def __init__(self, actions, languages, data_dir, projects_dir, asts_dir, bcs_dir, 5 | datasets_dir, plots_dir, search_spaces, clang_lib_dir, clustering_algs, clustering_feat, 6 | second_clustering, cose_similarity_chunk_size, big_clusters_ignore, chunk_window_size, 7 | split, projects, save_format, 8 | ignore_compile_commands, 9 | feature_types, llvm_config, includes, pdg_dumper, clang, stat_type, stat_sim_types, has_control_flow, 10 | inconsistency_type, similarity_threshold, granularity, dependency, call_inconsistency, 11 | type_inconsistency, store_inconsistency, inconsistency_query_options, ssh, filtering, count_cpu, ids, 12 | starting_report_item, prepare): 13 | self.actions = actions 14 | self.languages = languages 15 | self.data_dir = data_dir 16 | self.projects_dir = projects_dir 17 | self.asts_dir = asts_dir 18 | self.bcs_dir = bcs_dir 19 | self.datasets_dir = datasets_dir 20 | self.plots_dir = plots_dir 21 | self.search_spaces = search_spaces 22 | self.clang_lib_dir = clang_lib_dir 23 | self.clustering_algs = clustering_algs 24 | self.clustering_feat = clustering_feat 25 | self.second_clustering = second_clustering 26 | self.cose_similarity_chunk_size = cose_similarity_chunk_size 27 | self.big_clusters_ignore = big_clusters_ignore 28 | self.chunk_window_size = chunk_window_size 29 | self.split = split 30 | self.projects = projects 31 | self.save_format = save_format 32 | self.ignore_compile_commands = ignore_compile_commands 33 | self.feature_types = feature_types 34 | self.llvm_config = llvm_config 35 | self.includes = includes 36 | self.pdg_dumper = pdg_dumper 37 | self.clang = clang 38 | self.stat_type = 
stat_type 39 | self.stat_sim_types = stat_sim_types 40 | self.has_control_flow = has_control_flow 41 | self.inconsistency_type = inconsistency_type 42 | self.similarity_threshold = similarity_threshold 43 | self.granularity = granularity 44 | self.dependency = dependency 45 | self.call_inconsistency = call_inconsistency 46 | self.type_inconsistency = type_inconsistency 47 | self.store_inconsistency = store_inconsistency 48 | self.inconsistency_query_options = inconsistency_query_options 49 | self.ssh = ssh 50 | self.filtering = filtering 51 | self.count_cpu = count_cpu 52 | self.ids = ids 53 | self.starting_report_item = starting_report_item 54 | self.prepare = prepare 55 | 56 | try: 57 | mp.cpu_count() 58 | except: 59 | self.count_cpu = 1 60 | print 'Running the code by {} CPUs'.format(self.count_cpu) 61 | -------------------------------------------------------------------------------- /iBench/groundtruth.py: -------------------------------------------------------------------------------- 1 | ground_truth = [] 2 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | sudo apt-get update 2 | sudo apt-get -y install clang-3.8 3 | sudo apt-get -y install clang-6.0 4 | sudo apt-get -y install llvm-3.8 5 | sudo apt-get -y install llvm-6.0 6 | sudo apt-get -y install bear 7 | sudo apt-get -y install git 8 | sudo apt-get -y install cmake 9 | sudo apt-get -y install libpng-dev libfreetype6-dev 10 | sudo apt-get -y install python-dev graphviz libgraphviz-dev pkg-config 11 | sudo apt-get -y install python-pip 12 | pip install --upgrade pip 13 | pip install --upgrade setuptools 14 | pip install -r requirements.txt 15 | cd dg 16 | cmake -D CMAKE_C_COMPILER="/usr/bin/clang-6.0" -D CMAKE_CXX_COMPILER="/usr/bin/clang++-6.0" . 
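# Note: the cmake line above and the `make` below build the bundled 'dg' submodule with
# clang-6.0; dg appears to be the dependence-graph library that FICS relies on (via
# PDG_DUMPER in settings.py and act/pdgbuilder.py) to dump program dependence graphs
# from LLVM bitcode. Target codebases themselves must still be compiled with clang-3.8,
# as described in the README.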
17 | make 18 | wget -qO - https://www.mongodb.org/static/pgp/server-4.4.asc | sudo apt-key add - 19 | echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu xenial/mongodb-org/4.4 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-4.4.list 20 | sudo apt-get update 21 | sudo apt-get install -y mongodb-org=4.4.2 mongodb-org-server=4.4.2 mongodb-org-shell=4.4.2 mongodb-org-mongos=4.4.2 mongodb-org-tools=4.4.2 22 | sudo systemctl start mongod 23 | -------------------------------------------------------------------------------- /learning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiS3-Lab/FICS/82c8abef52ca943946b7e82a16998cf67f1d2049/learning/__init__.py -------------------------------------------------------------------------------- /learning/clustering.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | # from hdbscan import HDBSCAN 4 | from scipy.sparse.csgraph import connected_components 5 | from sklearn.cluster import AffinityPropagation, MeanShift, DBSCAN 6 | from sklearn.decomposition import LatentDirichletAllocation 7 | from sklearn.metrics.pairwise import * 8 | import dask.dataframe as dd 9 | 10 | from sample.projectcode import ProjectCode 11 | from utils import inout 12 | from utils.inout import * 13 | from scipy import sparse 14 | from sklearn.metrics import pairwise_distances 15 | from scipy.spatial.distance import cosine 16 | 17 | 18 | class Clustering: 19 | 20 | def __init__(self, dataset='', clustering_alg='dbscancos_0.9', features_set=None, locations=None, min_samples=2, 21 | from_chuck=None, module_name='root', arguments=None, project_dir='', feature_type='', 22 | node_features=None, node_features_locations=None): 23 | self.arguments = arguments 24 | self.project_dir = project_dir 25 | self.dataset = dataset 26 | self.module_name = module_name 27 | self.locations = locations 28 | if from_chuck is not None: 29 | project_code = ProjectCode(project_dir=self.project_dir, arguments=self.arguments, 30 | feature_type=from_chuck, file_locations=self.locations) 31 | project_code.extract_features() 32 | self.features_set = pd.DataFrame(project_code.afs_features_counters) 33 | self.locations = project_code.afs_file_infos 34 | else: 35 | if self.locations is None: 36 | self.features_set, self.locations = Clustering.split_dataset(self.dataset, 37 | feature_type) 38 | else: 39 | self.features_set = features_set 40 | self.locations = locations 41 | # if 'location' in features_set.columns.values: 42 | # features_set.drop('location', axis=1, inplace=True) 43 | params = clustering_alg.split('_') 44 | self.clustering_alg = params[0] 45 | self.param = float(params[1]) 46 | self.model = None 47 | # LDA Parameters 48 | self.n_features = 1000 49 | self.n_samples = len(self.features_set) 50 | self.n_topics = 1000 51 | self.max_iter = 50 52 | self.learning_offset = 2 53 | # DBScan Parameters 54 | self.eps = self.param # 0.9 55 | self.min_samples = min_samples 56 | # Affinity Parameters 57 | self.preference = self.param # 100 58 | # Mean Shift 59 | self.bandwidth = self.param # 1.2 60 | # hdbscan 61 | self.min_cluster_size = int(self.param) 62 | ######### 63 | self.set_default_settings() 64 | self.clusters_samples = defaultdict(list) 65 | self.node_features = node_features 66 | self.node_features_locations = node_features_locations 67 | self.node_differences = defaultdict(list) 68 | self.clusters_samples_len_sorted_keys 
= list() 69 | self.cluster_labels = None 70 | 71 | def set_default_settings(self): 72 | if self.clustering_alg == 'dbscan' or self.clustering_alg == 'dbscancos': 73 | # self.features_set = StandardScaler().fit_transform(self.features_set) 74 | self.model = DBSCAN(eps=self.eps, min_samples=self.min_samples, n_jobs=5) 75 | elif self.clustering_alg == 'lda': 76 | self.model = LatentDirichletAllocation(n_components=self.n_topics, max_iter=self.max_iter, 77 | learning_method='online', learning_offset=self.learning_offset, 78 | random_state=0, n_jobs=10) 79 | elif self.clustering_alg == 'aff' or self.clustering_alg == 'affcos': 80 | self.model = AffinityPropagation() 81 | elif self.clustering_alg == 'means': 82 | self.model = MeanShift(n_jobs=5, bandwidth=self.bandwidth) 83 | self.param = self.bandwidth 84 | # elif self.clustering_alg == 'hdbscan': 85 | # self.model = HDBSCAN(min_cluster_size=self.min_cluster_size, min_samples=self.min_samples 86 | # , metric='manhattan') 87 | # # , algorithm='generic', metric='cosine') 88 | elif self.clustering_alg == 'cc': 89 | self.model = 'cc' 90 | 91 | def cluster(self): 92 | 93 | if self.model is None or len(self.features_set) == 0: 94 | return False 95 | if self.clustering_alg == 'affcos': 96 | self.model.affinity = 'euclidean' 97 | self.model.preference = self.preference 98 | # print cosine_distances(self.features_set) 99 | self.model.fit(cosine_similarity(self.features_set)) 100 | # self.param = self.preference 101 | elif self.clustering_alg == 'dbscancos': 102 | # self.model.metric = 'euclidean' 103 | self.model.metric = 'precomputed' 104 | distances = cosine_distances(self.features_set) 105 | # distances[distances < (1 - self.param)] = 0 106 | self.model.fit(distances) 107 | # self.param = self.eps 108 | elif self.clustering_alg == 'hdbscan': 109 | self.model.fit(self.features_set) 110 | self.cluster_labels = self.model.labels_ 111 | elif self.clustering_alg == 'cc': 112 | # similarity = cosine_similarity(self.features_set) 113 | # similarity = self.get_cosine_similarity() 114 | similarity = self.get_similarity_sparse_input() 115 | adjacency_mask = similarity >= self.param 116 | del similarity 117 | del self.features_set 118 | nb_clusters, self.cluster_labels = connected_components(adjacency_mask, connection='strong') 119 | # print nb_clusters 120 | # print self.cluster_labels 121 | else: 122 | self.model.fit(self.features_set) 123 | return True 124 | 125 | def get_similarity_scipy(self): 126 | return 1 - pairwise_distances(self.features_set, metric="cosine") 127 | 128 | def get_similarity_sparse_input(self): 129 | # sparse_features = sparse.csr_matrix(self.features_set) 130 | # return cosine_similarity(sparse_features) 131 | return cosine_similarity(self.features_set) 132 | 133 | def similarity_cosine_by_chunk(self, len, start, end): 134 | if end > len: 135 | end = len 136 | return cosine_similarity(X=self.features_set[start:end], Y=self.features_set, dense_output=False) 137 | 138 | def get_cosine_similarity(self): 139 | chunk_size = int(self.arguments.cose_similarity_chunk_size) 140 | len = self.features_set.shape[0] 141 | # cosine_similarities = None 142 | # if len <= chunk_size: 143 | # cosine_similarities = cosine_similarity(self.features_set) 144 | # else: 145 | filesnames = [] 146 | similarity_files_dir = join(self.arguments.data_dir, self.arguments.datasets_dir, 147 | get_basename(self.project_dir)) 148 | for filename in get_files_in_dir(similarity_files_dir, start='tmp-sim-'): 149 | if os.path.exists(filename): 150 | 
inout.remove_file(filename) 151 | for chunk_start in xrange(0, len, chunk_size): 152 | print 'chunk start index', chunk_start 153 | filename = join(self.arguments.data_dir, self.arguments.datasets_dir, 154 | get_basename(self.project_dir), 'tmp-sim-{}.txt'.format(chunk_start)) 155 | sim_file = open(filename, "wb") 156 | cosine_similarity_chunk = self.similarity_cosine_by_chunk(len, chunk_start, chunk_start + chunk_size) 157 | np.savetxt(sim_file, cosine_similarity_chunk, fmt="%.2g", delimiter=',', newline='\n') 158 | del cosine_similarity_chunk 159 | sim_file.close() 160 | filesnames.append(filename) 161 | 162 | cosine_similarities = None 163 | for filename in filesnames: 164 | if cosine_similarities is None: 165 | cosine_similarities = np.genfromtxt(filename, delimiter=',') 166 | else: 167 | cosine_similarities = np.concatenate((cosine_similarities, np.genfromtxt(filename, delimiter=',')), 168 | axis=0) 169 | return cosine_similarities 170 | 171 | def get_clusters(self): 172 | 173 | if self.clustering_alg == 'dbscan' or self.clustering_alg == 'dbscancos': 174 | # print self.model.labels_ 175 | for i in range(len(self.model.labels_)): 176 | self.clusters_samples[self.model.labels_[i]].append(self.locations[i]) 177 | elif self.clustering_alg == 'lda': 178 | sample_cluster_distrib = self.model.transform(self.features_set) 179 | counter = 0 180 | for i in range(len(self.locations)): 181 | counter += 1 182 | sample_cluster = np.argmax(sample_cluster_distrib[i]) 183 | self.clusters_samples[sample_cluster].append(self.locations[i]) 184 | elif self.clustering_alg == 'aff' or self.clustering_alg == 'affcos': 185 | for i in range(len(self.model.labels_)): 186 | self.clusters_samples[self.model.labels_[i]].append(self.locations[i]) 187 | elif self.clustering_alg == 'means': 188 | for i in range(len(self.model.labels_)): 189 | self.clusters_samples[self.model.labels_[i]].append(self.locations[i]) 190 | elif self.clustering_alg == 'hdbscan': 191 | for i in range(len(self.cluster_labels)): 192 | self.clusters_samples[self.cluster_labels[i]].append(self.locations[i]) 193 | elif self.clustering_alg == 'cc': 194 | 195 | for i in range(len(self.cluster_labels)): 196 | self.clusters_samples[self.cluster_labels[i]].append(self.locations[i]) 197 | 198 | if self.node_features is not None: 199 | self.set_node_difference(i) 200 | self.sort_clusters() 201 | 202 | def set_node_difference(self, i): 203 | location_index = self.node_features_locations[self.locations[i]] 204 | node_features = self.node_features.iloc[location_index] 205 | node_features = node_features.iloc[node_features.to_numpy().nonzero()[0]].to_dict() 206 | self.node_differences[self.cluster_labels[i]].append(node_features) 207 | 208 | def sort_clusters(self): 209 | self.clusters_samples_len_sorted_keys = sorted(self.clusters_samples, 210 | key=lambda k: len(self.clusters_samples[k]), 211 | reverse=True) 212 | 213 | @staticmethod 214 | def split_dataset(dataset, feature_type): 215 | df = pd.read_csv(dataset, nrows=1) 216 | features = list(df.columns.values) 217 | features.remove('location') 218 | features_type = {'location': 'str'} 219 | if feature_type == 'NN': 220 | for feature in features: 221 | features_type[feature] = 'Int64' 222 | elif feature_type == 'G2v': 223 | for feature in features: 224 | features_type[feature] = 'float64' 225 | elif feature_type == '': 226 | print 'Feature Type is empty ...' 227 | # print 'Reading dataset ...' 
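        # Each row of the dataset CSV describes one construct: a 'location' string
        # identifying where it came from, plus its feature columns (integer counts for
        # 'NN' features, float graph2vec embeddings for 'G2v'). The dtype map built above
        # is handed to read_csv, and 'location' is then split off from the numeric
        # feature matrix that is returned to the clustering step.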
228 | dataframe = pd.read_csv(dataset, dtype=features_type) 229 | # dataframe = dd.read_csv(dataset, dtype=features_type, header=0, blocksize=int(5e5), sample=1e9) 230 | # print 'Converting dataframe to pandas ...' 231 | # dataframe.compute() 232 | # print 'Filling Null values with 0 ...' 233 | dataframe = dataframe.fillna(0) 234 | return dataframe.drop('location', axis=1), dataframe['location'].tolist() 235 | 236 | @staticmethod 237 | def make_dataset_binary(dataframe): 238 | columns = dataframe.columns.values.tolist() 239 | columns.remove('location') 240 | for column in columns: 241 | dataframe.ix[dataframe[column] > 0, column] = 1 242 | return dataframe 243 | 244 | def save_clusters(self, step=''): 245 | 246 | content = '\n' 247 | for key in self.clusters_samples_len_sorted_keys: 248 | dic_value = self.clusters_samples[key] 249 | content = '{} Cluster #{} :\n'.format(content, key) 250 | content = '{} # Items: {} \n'.format(content, len(dic_value)) 251 | for item in dic_value: 252 | content = '{} {}\n'.format(content, item) 253 | content = '{} {}\n'.format(content, '=' * 100) 254 | 255 | clusters_file_directory = get_parent_dir(get_filename_without_ext(self.dataset)) 256 | clustering_feature_name = str(get_basename(get_filename_without_ext(self.dataset))) 257 | clusters_file = '{}_{}.{}_{}.{}.clusters.txt'.format(step, 258 | clustering_feature_name.split('_')[1], 259 | clustering_feature_name.split('_')[0], 260 | str(self.param), self.module_name) 261 | clusters_file = join_path(clusters_file_directory, clusters_file) 262 | cluster_file_path = join_path(get_parent_dir(clusters_file), self.clustering_alg) 263 | make_dir_if_not_exist(cluster_file_path) 264 | clusters_file = join_path(cluster_file_path, get_basename(clusters_file)) 265 | write_file(clusters_file, content) 266 | -------------------------------------------------------------------------------- /learning/graph2vec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiS3-Lab/FICS/82c8abef52ca943946b7e82a16998cf67f1d2049/learning/graph2vec/__init__.py -------------------------------------------------------------------------------- /learning/graph2vec/corpus_parser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import logging 3 | from collections import Counter 4 | from random import shuffle 5 | 6 | import numpy as np 7 | 8 | from utils import get_files 9 | 10 | 11 | class Corpus(object): 12 | def __init__(self, fnames=None, extn='WL2', max_files=0): 13 | assert fnames != None, "please specify the corpus folder" 14 | self.fnames = fnames 15 | self.subgraph_index = 0 16 | self.graph_index = 0 17 | self.epoch_flag = 0 18 | self.max_files = max_files 19 | self.graph_ids_for_batch_traversal = [] 20 | self.extn = extn 21 | 22 | def scan_corpus(self): 23 | 24 | subgraphs = [] 25 | for fname in self.graph_fname_list: 26 | subgraphs.extend( 27 | [l.split()[0] for l in open(fname).xreadlines()]) # just take the first word of every sentence 28 | subgraphs.append('UNK') 29 | 30 | subgraph_to_freq_map = Counter(subgraphs) 31 | del subgraphs 32 | 33 | subgraph_to_id_map = {sg: i for i, sg in 34 | enumerate(subgraph_to_freq_map.iterkeys())} # output layer of the skipgram network 35 | 36 | self._subgraph_to_freq_map = subgraph_to_freq_map # to be used for negative sampling 37 | self._subgraph_to_id_map = subgraph_to_id_map 38 | self._id_to_subgraph_map = {v: k for k, v in subgraph_to_id_map.iteritems()} 39 
| self._subgraphcount = sum(subgraph_to_freq_map.values()) # total num subgraphs in all graphs 40 | 41 | self.num_graphs = len(self.graph_fname_list) # doc size 42 | self.num_subgraphs = len(subgraph_to_id_map) # vocab of word size 43 | 44 | self.subgraph_id_freq_map_as_list = [] # id of this list is the word id and value is the freq of word with corresponding word id 45 | for i in xrange(len(self._subgraph_to_freq_map)): 46 | self.subgraph_id_freq_map_as_list.append(self._subgraph_to_freq_map[self._id_to_subgraph_map[i]]) 47 | 48 | return self._subgraph_to_id_map 49 | 50 | def scan_and_load_corpus(self): 51 | 52 | self.graph_fname_list = get_files(self.fnames, extn=self.extn, max_files=self.max_files) 53 | self._graph_name_to_id_map = {g: i for i, g in 54 | enumerate(self.graph_fname_list)} # input layer of the skipgram network 55 | self._id_to_graph_name_map = {i: g for g, i in self._graph_name_to_id_map.iteritems()} 56 | subgraph_to_id_map = self.scan_corpus() 57 | 58 | logging.info('number of graphs: %d' % self.num_graphs) 59 | logging.info('subgraph vocabulary size: %d' % self.num_subgraphs) 60 | logging.info('total number of subgraphs to be trained: %d' % self._subgraphcount) 61 | 62 | self.graph_ids_for_batch_traversal = range(self.num_graphs) 63 | shuffle(self.graph_ids_for_batch_traversal) 64 | 65 | def generate_batch_from_file(self, batch_size): 66 | target_graph_ids = [] 67 | context_subgraph_ids = [] 68 | 69 | graph_name = self.graph_fname_list[self.graph_ids_for_batch_traversal[self.graph_index]] 70 | graph_contents = open(graph_name).readlines() 71 | while self.subgraph_index >= len(graph_contents): 72 | self.subgraph_index = 0 73 | self.graph_index += 1 74 | if self.graph_index == len(self.graph_fname_list): 75 | self.graph_index = 0 76 | np.random.shuffle(self.graph_ids_for_batch_traversal) 77 | self.epoch_flag = True 78 | graph_name = self.graph_fname_list[self.graph_ids_for_batch_traversal[self.graph_index]] 79 | graph_contents = open(graph_name).readlines() 80 | 81 | while len(context_subgraph_ids) < batch_size: 82 | line_id = self.subgraph_index 83 | context_subgraph = graph_contents[line_id].split()[0] 84 | target_graph = graph_name 85 | 86 | context_subgraph_ids.append(self._subgraph_to_id_map[context_subgraph]) 87 | target_graph_ids.append(self._graph_name_to_id_map[target_graph]) 88 | 89 | self.subgraph_index += 1 90 | while self.subgraph_index == len(graph_contents): 91 | self.subgraph_index = 0 92 | self.graph_index += 1 93 | if self.graph_index == len(self.graph_fname_list): 94 | self.graph_index = 0 95 | np.random.shuffle(self.graph_ids_for_batch_traversal) 96 | self.epoch_flag = True 97 | 98 | graph_name = self.graph_fname_list[self.graph_ids_for_batch_traversal[self.graph_index]] 99 | graph_contents = open(graph_name).readlines() 100 | 101 | target_context_pairs = zip(target_graph_ids, context_subgraph_ids) 102 | shuffle(target_context_pairs) 103 | target_graph_ids, context_subgraph_ids = zip(*target_context_pairs) 104 | 105 | target_graph_ids = np.array(target_graph_ids, dtype=np.int32) 106 | context_subgraph_ids = np.array(context_subgraph_ids, dtype=np.int32) 107 | 108 | contextword_outputs = np.reshape(context_subgraph_ids, [len(context_subgraph_ids), 1]) 109 | 110 | return target_graph_ids, contextword_outputs 111 | -------------------------------------------------------------------------------- /learning/graph2vec/graph2vec.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os 3 | from copy 
import deepcopy 4 | from time import time 5 | 6 | import networkx as nx 7 | import tensorflow as tf 8 | import logging 9 | 10 | from learning.graph2vec.train_utils import train_skipgram 11 | 12 | 13 | def get_int_node_label(x): 14 | return int(x.split('+')[-1]) 15 | 16 | 17 | class Graph2Vec: 18 | 19 | def __init__(self, project_dir, files_paths, arguments): 20 | # os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 21 | # os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3' 22 | # tf.logging.set_verbosity(tf.logging.INFO) 23 | # logging.getLogger('tensorflow').disabled = True 24 | # logging.getLogger('tensorflow').propagate = False 25 | 26 | # tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 27 | 28 | self.fnames = files_paths 29 | self.project_dir = project_dir 30 | self.graphs = None 31 | self.node_label_attr_name = 'label' 32 | self.label_to_compressed_label_map = {} 33 | self.wlk_h = 2 34 | self.learning_rate = 0.1 35 | self.embedding_size = 512 # 512 36 | self.num_negative_samples = 6 37 | self.epochs = 500 # 1000 38 | self.batch_size = 128 39 | self.wl_extn = 'g2v' + str(self.wlk_h) 40 | self.final_embeddings = None 41 | self.corpus = None 42 | self.arguments = arguments 43 | 44 | def run(self): 45 | t0 = time() 46 | self.wlk_relabel_and_dump_memory_version(self.fnames, max_h=self.wlk_h) 47 | print 'dumped sg2vec sentences in {} sec.'.format(time() - t0) 48 | t0 = time() 49 | 50 | self.corpus, self.final_embeddings = train_skipgram(self.fnames, self.wl_extn, self.learning_rate, 51 | self.embedding_size, 52 | self.num_negative_samples, self.epochs, self.batch_size, 53 | arguments=self.arguments) 54 | print 'Trained the skipgram model in {} sec.'.format(round(time() - t0, 2)) 55 | 56 | def load_graphs(self): 57 | self.graphs = [nx.drawing.nx_agraph.read_dot(file_path) for file_path in self.fnames] 58 | 59 | def wlk_relabel_and_dump_memory_version(self, fnames, max_h): 60 | 61 | t0 = time() 62 | self.load_graphs() 63 | print 'loaded all graphs in {} sec'.format(round(time() - t0, 2)) 64 | 65 | t0 = time() 66 | self.graphs = [self.initial_relabel(g) for g in self.graphs] 67 | print 'initial relabeling done in {} sec'.format(round(time() - t0, 2)) 68 | 69 | for it in xrange(1, max_h + 1): 70 | t0 = time() 71 | self.label_to_compressed_label_map = {} 72 | self.graphs = [self.wl_relabel(g, it) for g in self.graphs] 73 | print 'WL iteration {} done in {} sec.'.format(it, round(time() - t0, 2)) 74 | print 'num of WL rooted subgraphs in iter {} is {}'.format(it, len(self.label_to_compressed_label_map)) 75 | 76 | t0 = time() 77 | for fname, g in zip(fnames, self.graphs): 78 | self.dump_sg2vec_str(fname, max_h, g) 79 | print 'dumped sg2vec sentences in {} sec.'.format(round(time() - t0, 2)) 80 | 81 | def dump_sg2vec_str(self, fname, max_h, g=None): 82 | if not g: 83 | g = nx.read_gexf(fname + '.tmpg') 84 | new_g = deepcopy(g) 85 | for n in g.nodes(): 86 | del new_g.nodes[n]['relabel'] 87 | new_g.nodes[n]['relabel'] = ast.literal_eval(g.nodes[n]['relabel']) 88 | g = new_g 89 | 90 | opfname = fname + '.' 
+ self.wl_extn 91 | 92 | # if os.path.isfile(opfname): 93 | # return 94 | 95 | with open(opfname, 'w') as fh: 96 | for n, d in g.nodes(data=True): 97 | for i in xrange(0, max_h + 1): 98 | try: 99 | center = d['relabel'][i] 100 | except: 101 | continue 102 | neis_labels_prev_deg = [] 103 | neis_labels_next_deg = [] 104 | 105 | if i != 0: 106 | neis_labels_prev_deg = list( 107 | set([g.node[nei]['relabel'][i - 1] for nei in nx.all_neighbors(g, n)])) 108 | neis_labels_prev_deg.sort() 109 | NeisLabelsSameDeg = list(set([g.node[nei]['relabel'][i] for nei in nx.all_neighbors(g, n)])) 110 | if i != max_h: 111 | neis_labels_next_deg = list( 112 | set([g.node[nei]['relabel'][i + 1] for nei in nx.all_neighbors(g, n)])) 113 | neis_labels_next_deg.sort() 114 | 115 | nei_list = NeisLabelsSameDeg + neis_labels_prev_deg + neis_labels_next_deg 116 | nei_list = ' '.join(nei_list) 117 | 118 | sentence = center + ' ' + nei_list 119 | print>> fh, sentence 120 | 121 | if os.path.isfile(fname + '.tmpg'): 122 | os.system('rm ' + fname + '.tmpg') 123 | 124 | def wl_relabel(self, g, it): 125 | 126 | try: 127 | opfname = g + '.tmpg' 128 | g = nx.drawing.nx_agraph.read_dot(g + '.tmpg') 129 | new_g = deepcopy(g) 130 | for n in g.nodes(): 131 | new_g.nodes[n]['relabel'] = ast.literal_eval(g.nodes[n]['relabel']) 132 | g = new_g 133 | except: 134 | opfname = None 135 | pass 136 | 137 | prev_iter = it - 1 138 | for node in g.nodes(): 139 | prev_iter_node_label = get_int_node_label(g.nodes[node]['relabel'][prev_iter]) 140 | node_label = [prev_iter_node_label] 141 | neighbors = list(nx.all_neighbors(g, node)) 142 | neighborhood_label = sorted([get_int_node_label(g.nodes[nei]['relabel'][prev_iter]) for nei in neighbors]) 143 | node_neighborhood_label = tuple(node_label + neighborhood_label) 144 | if not self.label_to_compressed_label_map.has_key(node_neighborhood_label): 145 | compressed_label = len(self.label_to_compressed_label_map) + 1 146 | self.label_to_compressed_label_map[node_neighborhood_label] = compressed_label 147 | g.node[node]['relabel'][it] = str(it) + '+' + str(compressed_label) 148 | else: 149 | g.node[node]['relabel'][it] = str(it) + '+' + str( 150 | self.label_to_compressed_label_map[node_neighborhood_label]) 151 | 152 | if opfname: 153 | nx.drawing.nx_agraph.write_dot(g, opfname) 154 | else: 155 | return g 156 | 157 | def initial_relabel(self, g): 158 | 159 | try: 160 | opfname = g + '.tmpg' 161 | g = nx.drawing.nx_agraph.read_dot(g) 162 | except: 163 | opfname = None 164 | pass 165 | 166 | nx.convert_node_labels_to_integers(g, 167 | first_label=0) # this needs to be done for the initial interation only 168 | for node in g.nodes(): g.node[node]['relabel'] = {} 169 | 170 | for node in g.nodes(): 171 | try: 172 | label = g.node[node][self.node_label_attr_name] 173 | except: 174 | # no node label referred in 'node_label_attr_name' is present, hence assigning an invalid compressd label 175 | g.node[node]['relabel'][0] = '0+0' 176 | continue 177 | 178 | if not self.label_to_compressed_label_map.has_key(label): 179 | compressed_label = len( 180 | self.label_to_compressed_label_map) + 1 # starts with 1 and incremented every time a new node label is seen 181 | self.label_to_compressed_label_map[label] = compressed_label # inster the new label to the label map 182 | g.node[node]['relabel'][0] = '0+' + str(compressed_label) 183 | else: 184 | g.node[node]['relabel'][0] = '0+' + str(self.label_to_compressed_label_map[label]) 185 | 186 | if opfname: 187 | nx.drawing.nx_agraph.write_dot(g, opfname) 188 | else: 189 | 
return g 190 | -------------------------------------------------------------------------------- /learning/graph2vec/parallelgraph2vec.py: -------------------------------------------------------------------------------- 1 | import json 2 | import glob 3 | import hashlib 4 | import logging 5 | from collections import namedtuple 6 | 7 | import pandas as pd 8 | import networkx as nx 9 | # from nltk.cluster import cosine_distance 10 | # from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, cosine_distances 11 | from tqdm import tqdm 12 | from joblib import Parallel, delayed 13 | # from parser import parameter_parser 14 | # import numpy.distutils.system_info as sysinfo 15 | from gensim.models.doc2vec import Doc2Vec, TaggedDocument 16 | 17 | 18 | class WeisfeilerLehmanMachine: 19 | """ 20 | Weisfeiler Lehman feature extractor class. 21 | """ 22 | 23 | def __init__(self, graph, features, iterations): 24 | """ 25 | Initialization method which also executes feature extraction. 26 | :param graph: The Nx graph object. 27 | :param features: Feature hash table. 28 | :param iterations: Number of WL iterations. 29 | """ 30 | self.iterations = iterations 31 | self.graph = graph 32 | self.features = features 33 | self.nodes = self.graph.nodes() 34 | self.extracted_features = [str(v) for k, v in features.items()] 35 | self.do_recursions() 36 | 37 | def do_a_recursion(self): 38 | """ 39 | The method does a single WL recursion. 40 | :return new_features: The hash table with extracted WL features. 41 | """ 42 | new_features = {} 43 | for node in self.nodes: 44 | nebs = self.graph.neighbors(node) 45 | degs = [self.features[neb] for neb in nebs] 46 | features = "_".join([str(self.features[node])] + sorted([str(deg) for deg in degs])) 47 | hash_object = hashlib.md5(features.encode()) 48 | hashing = hash_object.hexdigest() 49 | new_features[node] = hashing 50 | self.extracted_features = self.extracted_features + list(new_features.values()) 51 | return new_features 52 | 53 | def do_recursions(self): 54 | """ 55 | The method does a series of WL recursions. 56 | """ 57 | for iteration in range(self.iterations): 58 | self.features = self.do_a_recursion() 59 | 60 | 61 | def feature_extractor(path, rounds): 62 | """ 63 | Function to extract WL features from a graph. 64 | :param path: The path to the graph json. 65 | :param rounds: Number of WL iterations. 66 | :return doc: Document collection object. 67 | """ 68 | graph, features, name, graph_len, graph_hash = dataset_reader(path) 69 | machine = WeisfeilerLehmanMachine(graph, features, rounds) 70 | doc = TaggedDocument(words=machine.extracted_features, tags=[name, str(graph_len), graph_hash]) 71 | return doc 72 | 73 | 74 | def dataset_reader(path): 75 | """ 76 | Function to read the graph and features from a json file. 77 | :param path: The path to the graph json. 78 | :return graph: The graph object. 79 | :return features: Features hash table. 80 | :return name: Name of the graph. 
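    :return graph_len: Number of nodes in the graph.
    :return graph_hash: SHA-1 hash of the graph's construct, used later to skip duplicate graphs.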
81 | """ 82 | # name = path.strip(".json").split("/")[-1] 83 | # data = json.load(open(path)) 84 | # graph = nx.from_edgelist(data["edges"]) 85 | name = path 86 | graph = nx.drawing.nx_agraph.read_dot(path) 87 | graph_len = 0 88 | graph_hash = extract_node_names_features(graph, name) 89 | features = {} 90 | for node in graph.nodes: 91 | features[node] = graph.nodes[node]['label'] 92 | graph_len += 1 93 | 94 | # if "features" in data.keys(): 95 | # features = data["features"] 96 | # else: 97 | # features = nx.degree(graph) 98 | # 99 | # features = {int(k): v for k, v, in features.items()} 100 | return graph, features, name, graph_len, graph_hash 101 | 102 | 103 | def extract_node_names_features(graph, name): 104 | lines_numbers = set() 105 | basic_block_ids = set() 106 | llvm_instructions = set() 107 | for node in graph.nodes: 108 | label = graph.nodes[node]['label'] 109 | if 'line' in graph.nodes[node]: 110 | line = graph.nodes[node]['line'] 111 | lines_numbers.add(line) 112 | if 'basic_block_id' in graph.nodes[node]: 113 | bb_id = graph.nodes[node]['basic_block_id'] 114 | basic_block_ids.add(bb_id) 115 | llvm_instructions.add(label) 116 | 117 | return compute_construct_hash(name, lines_numbers, basic_block_ids, llvm_instructions) 118 | 119 | 120 | def compute_construct_hash(name, lines_numbers, basic_block_ids, llvm_instructions): 121 | construct_string = '' 122 | # print self.file_info 123 | construct_string += name[:name.find('.c/pdg') + 2] 124 | # print construct_string 125 | for id in basic_block_ids: 126 | construct_string += str(id) 127 | for line in lines_numbers: 128 | construct_string += str(line) 129 | for llvm_instruction in llvm_instructions: 130 | construct_string += str(llvm_instruction) 131 | # print construct_string 132 | # self.construct_hash = int(hashlib.sha1(construct_string).hexdigest(), 16) % (10 ** 8) 133 | return hashlib.sha1(construct_string).hexdigest() 134 | 135 | 136 | def save_embedding(output_path, model, files, dimensions): 137 | """ 138 | Function to save the embedding. 139 | :param output_path: Path to the embedding csv. 140 | :param model: The embedding model object. 141 | :param files: The list of files. 142 | :param dimensions: The embedding dimension parameter. 
143 | """ 144 | out = [] 145 | for f in files: 146 | identifier = f.split("/")[-1].strip(".json") 147 | out.append([int(identifier)] + list(model.docvecs["g_" + identifier])) 148 | 149 | out = pd.DataFrame(out, columns=["type"] + ["x_" + str(dimension) for dimension in range(dimensions)]) 150 | out = out.sort_values(["type"]) 151 | out.to_csv(output_path, index=None) 152 | 153 | 154 | class Graph2Vec: 155 | 156 | def __init__(self, project_dir, files_paths, arguments=None): 157 | self.graph_files = files_paths 158 | self.project_dir = project_dir 159 | self.arguments = arguments 160 | self.graphs = None 161 | self.node_label_attr_name = 'label' 162 | 163 | self.wlk_h = 2 164 | self.wl_iterations = 5 165 | if self.arguments: 166 | self.workers = self.arguments.count_cpu 167 | else: 168 | self.workers = 4 169 | self.learning_rate = 0.1 170 | self.embedding_size = 1024 # 512 171 | self.num_negative_samples = 6 172 | self.epochs = 100 # 1000 173 | self.batch_size = 10 174 | self.final_embeddings = None 175 | self.corpus = None 176 | self.min_count = 0 177 | self.down_sampling = 0.0001 178 | 179 | def run(self): 180 | # print("\nFeature extraction started ...\n") 181 | document_collections = \ 182 | Parallel(n_jobs=self.workers)( 183 | delayed(feature_extractor)(g, self.wl_iterations) for g in self.graph_files) 184 | # print("\nOptimization started.\n") 185 | unique_hashes = set() 186 | docs = [] 187 | # analyzedDocument = namedtuple('AnalyzedDocument', 'words tags') 188 | for index, text in enumerate(document_collections): 189 | tags = text[1] 190 | graph_file = tags[0] 191 | graph_len = int(tags[1]) 192 | graph_hash = tags[2] 193 | if graph_len > 2 and graph_hash not in unique_hashes: 194 | docs.append(text) 195 | unique_hashes.add(graph_hash) 196 | else: 197 | self.graph_files.remove(graph_file) 198 | 199 | document_collections = docs 200 | model = Doc2Vec(document_collections, 201 | vector_size=self.embedding_size, 202 | window=0, 203 | min_count=self.min_count, 204 | dm=0, 205 | sample=self.down_sampling, 206 | workers=self.workers, 207 | epochs=self.epochs, 208 | alpha=self.learning_rate) 209 | out = [] 210 | for f in self.graph_files: 211 | out.append(list(model.docvecs[f])) 212 | self.final_embeddings = out 213 | -------------------------------------------------------------------------------- /learning/graph2vec/skipgram.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | from time import time 4 | 5 | import tensorflow as tf 6 | 7 | 8 | class skipgram(object): 9 | ''' 10 | skipgram model - refer Mikolov et al (2013) 11 | ''' 12 | 13 | def __init__(self, num_graphs, num_subgraphs, learning_rate, embedding_size, 14 | num_negsample, num_steps, corpus, arguments): 15 | self.num_graphs = num_graphs 16 | self.num_subgraphs = num_subgraphs 17 | self.embedding_size = embedding_size 18 | self.num_negsample = num_negsample 19 | self.learning_rate = learning_rate 20 | self.num_steps = num_steps 21 | self.corpus = corpus 22 | self.graph, self.batch_inputs, self.batch_labels, self.normalized_embeddings, \ 23 | self.loss, self.optimizer = self.trainer_initial() 24 | self.arguments = arguments 25 | 26 | def trainer_initial(self): 27 | graph = tf.Graph() 28 | with graph.as_default(): 29 | batch_inputs = tf.placeholder(tf.int32, shape=([None, ])) 30 | batch_labels = tf.placeholder(tf.int64, shape=([None, 1])) 31 | num_negsample = self.num_negsample 32 | if self.num_subgraphs < num_negsample: 33 | num_negsample = self.num_subgraphs 
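            # The fixed_unigram_candidate_sampler below draws `num_negsample` *unique* negative
            # classes out of `num_subgraphs` possible classes, so it can never request more samples
            # than the vocabulary contains; the clamp above keeps very small corpora from failing.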
34 | 35 | graph_embeddings = tf.Variable( 36 | tf.random_uniform([self.num_graphs, self.embedding_size], -0.5 / self.embedding_size, 37 | 0.5 / self.embedding_size)) 38 | 39 | batch_graph_embeddings = tf.nn.embedding_lookup(graph_embeddings, batch_inputs) # hidden layer 40 | 41 | weights = tf.Variable(tf.truncated_normal([self.num_subgraphs, self.embedding_size], 42 | stddev=1.0 / math.sqrt(self.embedding_size))) # output layer wt 43 | biases = tf.Variable(tf.zeros(self.num_subgraphs)) # output layer biases 44 | 45 | # negative sampling part 46 | loss = tf.reduce_mean( 47 | tf.nn.nce_loss(weights=weights, 48 | biases=biases, 49 | labels=batch_labels, 50 | inputs=batch_graph_embeddings, 51 | num_sampled=self.num_negsample, 52 | num_classes=self.num_subgraphs, 53 | sampled_values=tf.nn.fixed_unigram_candidate_sampler( 54 | true_classes=batch_labels, 55 | num_true=1, 56 | num_sampled=num_negsample, 57 | unique=True, 58 | range_max=self.num_subgraphs, 59 | distortion=0.75, 60 | unigrams=self.corpus.subgraph_id_freq_map_as_list) # word_id_freq_map_as_list is the 61 | # frequency of each word in vocabulary 62 | )) 63 | 64 | global_step = tf.Variable(0, trainable=False) 65 | learning_rate = tf.train.exponential_decay(self.learning_rate, 66 | global_step, 100000, 0.96, 67 | staircase=True) # linear decay over time 68 | 69 | learning_rate = tf.maximum(learning_rate, 70 | 0.001) # cannot go below 0.001 to ensure at least a minimal learning 71 | 72 | optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step) 73 | 74 | norm = tf.sqrt(tf.reduce_mean(tf.square(graph_embeddings), 1, keep_dims=True)) 75 | normalized_embeddings = graph_embeddings / norm 76 | 77 | return graph, batch_inputs, batch_labels, normalized_embeddings, loss, optimizer 78 | 79 | def train(self, corpus, batch_size): 80 | with tf.Session(graph=self.graph, 81 | config=tf.ConfigProto(log_device_placement=False, allow_soft_placement=True, 82 | intra_op_parallelism_threads=self.arguments.count_cpu, 83 | inter_op_parallelism_threads=self.arguments.count_cpu, 84 | device_count={'CPU': self.arguments.count_cpu})) as sess: 85 | 86 | init = tf.global_variables_initializer() 87 | sess.run(init) 88 | 89 | loss = 0 90 | 91 | for i in xrange(self.num_steps): 92 | t0 = time() 93 | step = 0 94 | while corpus.epoch_flag == False: 95 | batch_data, batch_labels = corpus.generate_batch_from_file( 96 | batch_size) # get (target,context) wordid tuples 97 | 98 | feed_dict = {self.batch_inputs: batch_data, self.batch_labels: batch_labels} 99 | _, loss_val = sess.run([self.optimizer, self.loss], feed_dict=feed_dict) 100 | 101 | loss += loss_val 102 | 103 | if step % 100 == 0: 104 | if step > 0: 105 | average_loss = loss / step 106 | logging.info('Epoch: %d : Average loss for step: %d : %f' % (i, step, average_loss)) 107 | step += 1 108 | 109 | corpus.epoch_flag = False 110 | epoch_time = time() - t0 111 | logging.info('######################### Epoch: %d : %f, %.2f sec. 
#####################' % ( 112 | i, loss / step, epoch_time)) 113 | loss = 0 114 | 115 | # done with training 116 | final_embeddings = self.normalized_embeddings.eval() 117 | return final_embeddings 118 | -------------------------------------------------------------------------------- /learning/graph2vec/train_utils.py: -------------------------------------------------------------------------------- 1 | from corpus_parser import Corpus 2 | from skipgram import skipgram 3 | 4 | 5 | def train_skipgram(fnames, extn, learning_rate, embedding_size, num_negsample, epochs, 6 | batch_size, arguments): # , output_dir): 7 | ''' 8 | 9 | :param corpus_dir: folder containing WL kernel relabeled files. All the files in this folder will be relabled 10 | according to WL relabeling strategy and the format of each line in these folders shall be: .... 11 | :param extn: Extension of the WL relabled file 12 | :param learning_rate: learning rate for the skipgram model (will involve a linear decay) 13 | :param embedding_size: number of dimensions to be used for learning subgraph representations 14 | :param num_negsample: number of negative samples to be used by the skipgram model 15 | :param epochs: number of iterations the dataset is traversed by the skipgram model 16 | :param batch_size: size of each batch for the skipgram model 17 | :param output_dir: the folder where embedding file will be stored 18 | :return: name of the file that contains the subgraph embeddings (in word2vec format proposed by Mikolov et al (2013)) 19 | ''' 20 | 21 | # op_fname = '_'.join([os.path.basename(corpus_dir), 'dims', str(embedding_size), 'epochs', 22 | # str(epochs),'lr',str(learning_rate),'embeddings.txt']) 23 | # op_fname = os.path.join(output_dir, op_fname) 24 | # if os.path.isfile(op_fname): 25 | # logging.info('The embedding file: {} is already present, hence NOT training skipgram model ' 26 | # 'for subgraph vectors'.format(op_fname)) 27 | # return op_fname 28 | 29 | print "Initializing SKIPGRAM..." 30 | corpus = Corpus(fnames, extn=extn, max_files=0) # just load 'max_files' files from this folder 31 | corpus.scan_and_load_corpus() 32 | 33 | model_skipgram = skipgram( 34 | num_graphs=corpus.num_graphs, 35 | num_subgraphs=corpus.num_subgraphs, 36 | learning_rate=learning_rate, 37 | embedding_size=embedding_size, 38 | num_negsample=num_negsample, 39 | num_steps=epochs, # no. 
of time the training set will be iterated through 40 | corpus=corpus, # data set of (target,context) tuples 41 | arguments=arguments 42 | ) 43 | 44 | final_embeddings = model_skipgram.train(corpus=corpus, batch_size=batch_size) 45 | 46 | # logging.info('Write the matrix to a word2vec format file') 47 | # save_graph_embeddings(corpus, final_embeddings, op_fname) 48 | # logging.info('Completed writing the final embeddings, pls check file: {} for the same'.format(op_fname)) 49 | # return op_fname 50 | return corpus, final_embeddings 51 | 52 | 53 | if __name__ == '__main__': 54 | pass 55 | -------------------------------------------------------------------------------- /learning/graph2vec/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | 5 | def get_files_from_dir(dirname, extn, max_files=0): 6 | all_files = [os.path.join(dirname, f) for f in os.listdir(dirname) if f.endswith(extn)] 7 | for root, dirs, files in os.walk(dirname): 8 | for f in files: 9 | if f.endswith(extn): 10 | all_files.append(os.path.join(root, f)) 11 | 12 | all_files = list(set(all_files)) 13 | all_files.sort() 14 | if max_files: 15 | return all_files[:max_files] 16 | else: 17 | return all_files 18 | 19 | 20 | def get_files(fnames, extn, max_files=0): 21 | all_files = ['{}.{}'.format(f, extn) for f in fnames] 22 | for file_name in all_files: 23 | if not os.path.isfile(file_name): 24 | print 'error, missing graph', file_name 25 | all_files = list(set(all_files)) 26 | all_files.sort() 27 | if max_files: 28 | return all_files[:max_files] 29 | else: 30 | return all_files 31 | 32 | 33 | def save_graph_embeddings(corpus, final_embeddings, opfname): 34 | dict_to_save = {} 35 | for i in range(len(final_embeddings)): 36 | graph_fname = corpus._id_to_graph_name_map[i] 37 | graph_embedding = final_embeddings[i, :].tolist() 38 | dict_to_save[graph_fname] = graph_embedding 39 | 40 | with open(opfname, 'w') as fh: 41 | json.dump(dict_to_save, fh, indent=4) 42 | 43 | 44 | def get_class_labels(graph_files, class_labels_fname): 45 | graph_to_class_label_map = {l.split()[0].split('.')[0]: int(l.split()[1].strip()) for l in open(class_labels_fname)} 46 | labels = [graph_to_class_label_map[os.path.basename(g).split('.')[0]] for g in graph_files] 47 | return labels 48 | 49 | 50 | if __name__ == '__main__': 51 | print 'nothing to do' 52 | -------------------------------------------------------------------------------- /learning/graphkernel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiS3-Lab/FICS/82c8abef52ca943946b7e82a16998cf67f1d2049/learning/graphkernel/__init__.py -------------------------------------------------------------------------------- /learning/graphkernel/weisfeiler_lehman.py: -------------------------------------------------------------------------------- 1 | """Weisfeiler_Lehman graph kernel. 2 | Python implementation based on: "Weisfeiler-Lehman Graph Kernels", by: 3 | Nino Shervashidze, Pascal Schweitzer, Erik J. van Leeuwen, Kurt 4 | Mehlhorn, Karsten M. Borgwardt, JMLR, 2012. 5 | http://jmlr.csail.mit.edu/papers/v12/shervashidze11a.html 6 | Author : Sandro Vega-Pons, Emanuele Olivetti 7 | """ 8 | 9 | import numpy as np 10 | import networkx as nx 11 | import copy 12 | 13 | import pandas as pd 14 | 15 | 16 | class GK_WL(): 17 | """ 18 | Weisfeiler_Lehman graph kernel. 
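    compare_list() computes the normalized kernel matrix for a list of graphs;
    compare() returns the kernel value for a single pair of graphs.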
19 | """ 20 | 21 | def compare_list(self, graph_list, h=1, node_label=True): 22 | """Compute the all-pairs kernel values for a list of graphs. 23 | This function can be used to directly compute the kernel 24 | matrix for a list of graphs. The direct computation of the 25 | kernel matrix is faster than the computation of all individual 26 | pairwise kernel values. 27 | Parameters 28 | ---------- 29 | graph_list: list 30 | A list of graphs (list of networkx graphs) 31 | h : interger 32 | Number of iterations. 33 | node_label : boolean 34 | Whether to use original node labels. True for using node labels 35 | saved in the attribute 'node_label'. False for using the node 36 | degree of each node as node attribute. 37 | Return 38 | ------ 39 | K: numpy.array, shape = (len(graph_list), len(graph_list)) 40 | The similarity matrix of all graphs in graph_list. 41 | """ 42 | graph_list = self.convert_node_names_to_int(graph_list) 43 | self.graphs = graph_list 44 | n = len(graph_list) 45 | lists = [0] * n 46 | k = [0] * (h + 1) 47 | n_nodes = 0 48 | n_max = 0 49 | 50 | # Compute adjacency lists and n_nodes, the total number of 51 | # nodes in the dataset. 52 | for i in range(n): 53 | self.get_adj_list(graph_list, i, lists) 54 | n_nodes = n_nodes + graph_list[i].number_of_nodes() 55 | 56 | # Computing the maximum number of nodes in the graphs. It 57 | # will be used in the computation of vectorial 58 | # representation. 59 | if (n_max < graph_list[i].number_of_nodes()): 60 | n_max = graph_list[i].number_of_nodes() 61 | 62 | phi = np.zeros((n_max, n), dtype=np.uint64) 63 | 64 | # INITIALIZATION: initialize the nodes labels for each graph 65 | # with their labels or with degrees (for unlabeled graphs) 66 | 67 | labels = [0] * n 68 | label_lookup = {} 69 | label_counter = 0 70 | 71 | # label_lookup is an associative array, which will contain the 72 | # mapping from multiset labels (strings) to short labels 73 | # (integers) 74 | 75 | if node_label is True: 76 | for i in range(n): 77 | l_aux = nx.get_node_attributes(graph_list[i], 78 | 'label').values() 79 | # It is assumed that the graph has an attribute 80 | # 'label' 81 | labels[i] = np.zeros(len(l_aux), dtype=np.int32) 82 | 83 | for j in range(len(l_aux)): 84 | if not (l_aux[j] in label_lookup): 85 | label_lookup[l_aux[j]] = label_counter 86 | labels[i][j] = label_counter 87 | label_counter += 1 88 | else: 89 | labels[i][j] = label_lookup[l_aux[j]] 90 | # labels are associated to a natural number 91 | # starting with 0. 92 | phi[labels[i][j], i] += 1 93 | else: 94 | for i in range(n): 95 | labels[i] = np.array(graph_list[i].degree().values()) 96 | for j in range(len(labels[i])): 97 | phi[labels[i][j], i] += 1 98 | 99 | # Simplified vectorial representation of graphs (just taking 100 | # the vectors before the kernel iterations), i.e., it is just 101 | # the original nodes degree. 
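        # phi is a (label x graph) count matrix; the base kernel is seeded below as phi^T * phi,
        # i.e. the pairwise dot products of the per-graph label histograms, and every WL
        # iteration adds its own phi^T * phi term before the final normalization.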
102 | self.vectors = np.copy(phi.transpose()) 103 | 104 | k = np.dot(phi.transpose(), phi) 105 | 106 | # MAIN LOOP 107 | it = 0 108 | new_labels = copy.deepcopy(labels) 109 | 110 | while it < h: 111 | # create an empty lookup table 112 | label_lookup = {} 113 | label_counter = 0 114 | 115 | phi = np.zeros((n_nodes, n), dtype=np.uint64) 116 | for i in range(n): 117 | for v in range(len(lists[i])): 118 | # form a multiset label of the node v of the i'th graph 119 | # and convert it to a string 120 | long_label = np.concatenate((np.array([labels[i][v]]), 121 | np.sort(labels[i] 122 | [lists[i][v]]))) 123 | long_label_string = str(long_label) 124 | # if the multiset label has not yet occurred, add it to the 125 | # lookup table and assign a number to it 126 | if not (long_label_string in label_lookup): 127 | label_lookup[long_label_string] = label_counter 128 | new_labels[i][v] = label_counter 129 | label_counter += 1 130 | else: 131 | new_labels[i][v] = label_lookup[long_label_string] 132 | # fill the column for i'th graph in phi 133 | aux = np.bincount(new_labels[i]) 134 | # phi[new_labels[i], i] += aux[new_labels[i]] 135 | np.add(phi[new_labels[i], i], aux[new_labels[i]], out=phi[new_labels[i], i], casting="unsafe") 136 | 137 | k += np.dot(phi.transpose(), phi) 138 | labels = copy.deepcopy(new_labels) 139 | it = it + 1 140 | 141 | # Compute the normalized version of the kernel 142 | k_norm = np.zeros(k.shape) 143 | for i in range(k.shape[0]): 144 | for j in range(k.shape[1]): 145 | k_norm[i, j] = k[i, j] / np.sqrt(k[i, i] * k[j, j]) 146 | 147 | return k_norm 148 | 149 | def convert_node_names_to_int(self, graph_list): 150 | graphs = [] 151 | for i in range(len(graph_list)): 152 | # nodes = list(graph.nodes) 153 | # mapping = zip(nodes, pd.Series(nodes).astype('category').cat.codes.values) 154 | graphs.append(nx.convert_node_labels_to_integers(graph_list[i])) 155 | # print graph_list[i].nodes 156 | return graphs 157 | 158 | def get_adj_list(self, graph_list, i, lists): 159 | adj_list = [] 160 | for n, nbrdict in graph_list[i].adjacency(): 161 | adj_list.append(nbrdict.keys()) 162 | print adj_list 163 | lists[i] = adj_list 164 | 165 | def compare(self, g_1, g_2, h=1, node_label=True): 166 | """Compute the kernel value (similarity) between two graphs. 167 | The kernel is normalized to [0,1] by the equation: 168 | k_norm(g1, g2) = k(g1, g2) / sqrt(k(g1,g1) * k(g2,g2)) 169 | Parameters 170 | ---------- 171 | g_1 : networkx.Graph 172 | First graph. 173 | g_2 : networkx.Graph 174 | Second graph. 175 | h : interger 176 | Number of iterations. 177 | node_label : boolean 178 | Whether to use the values under the graph attribute 'node_label' 179 | as node labels. If False, the degree of the nodes are used as 180 | labels. 181 | Returns 182 | ------- 183 | k : The similarity value between g1 and g2. 184 | """ 185 | gl = [g_1, g_2] 186 | return self.compare_list(gl, h, node_label)[0, 1] 187 | -------------------------------------------------------------------------------- /learning/node2vec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiS3-Lab/FICS/82c8abef52ca943946b7e82a16998cf67f1d2049/learning/node2vec/__init__.py -------------------------------------------------------------------------------- /learning/node2vec/main.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Reference implementation of node2vec. 
3 | 4 | Author: Aditya Grover 5 | 6 | For more details, refer to the paper: 7 | node2vec: Scalable Feature Learning for Networks 8 | Aditya Grover and Jure Leskovec 9 | Knowledge Discovery and Data Mining (KDD), 2016 10 | ''' 11 | 12 | import argparse 13 | import numpy as np 14 | import networkx as nx 15 | import node2vec 16 | from gensim.models import Word2Vec 17 | 18 | def parse_args(): 19 | ''' 20 | Parses the node2vec arguments. 21 | ''' 22 | parser = argparse.ArgumentParser(description="Run node2vec.") 23 | 24 | parser.add_argument('--input', nargs='?', default='graph/karate.edgelist', 25 | help='Input graph path') 26 | 27 | parser.add_argument('--output', nargs='?', default='emb/karate.emb', 28 | help='Embeddings path') 29 | 30 | parser.add_argument('--dimensions', type=int, default=128, 31 | help='Number of dimensions. Default is 128.') 32 | 33 | parser.add_argument('--walk-length', type=int, default=80, 34 | help='Length of walk per source. Default is 80.') 35 | 36 | parser.add_argument('--num-walks', type=int, default=10, 37 | help='Number of walks per source. Default is 10.') 38 | 39 | parser.add_argument('--window-size', type=int, default=10, 40 | help='Context size for optimization. Default is 10.') 41 | 42 | parser.add_argument('--iter', default=1, type=int, 43 | help='Number of epochs in SGD') 44 | 45 | parser.add_argument('--workers', type=int, default=8, 46 | help='Number of parallel workers. Default is 8.') 47 | 48 | parser.add_argument('--p', type=float, default=1, 49 | help='Return hyperparameter. Default is 1.') 50 | 51 | parser.add_argument('--q', type=float, default=1, 52 | help='Inout hyperparameter. Default is 1.') 53 | 54 | parser.add_argument('--weighted', dest='weighted', action='store_true', 55 | help='Boolean specifying (un)weighted. Default is unweighted.') 56 | parser.add_argument('--unweighted', dest='unweighted', action='store_false') 57 | parser.set_defaults(weighted=False) 58 | 59 | parser.add_argument('--directed', dest='directed', action='store_true', 60 | help='Graph is (un)directed. Default is undirected.') 61 | parser.add_argument('--undirected', dest='undirected', action='store_false') 62 | parser.set_defaults(directed=False) 63 | 64 | return parser.parse_args() 65 | 66 | def read_graph(): 67 | ''' 68 | Reads the input network in networkx. 69 | ''' 70 | if args.weighted: 71 | G = nx.read_edgelist(args.input, nodetype=int, data=(('weight',float),), create_using=nx.DiGraph()) 72 | else: 73 | G = nx.read_edgelist(args.input, nodetype=int, create_using=nx.DiGraph()) 74 | for edge in G.edges(): 75 | G[edge[0]][edge[1]]['weight'] = 1 76 | 77 | if not args.directed: 78 | G = G.to_undirected() 79 | 80 | return G 81 | 82 | def learn_embeddings(walks): 83 | ''' 84 | Learn embeddings by optimizing the Skipgram objective using SGD. 85 | ''' 86 | walks = [map(str, walk) for walk in walks] 87 | model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, sg=1, workers=args.workers, iter=args.iter) 88 | model.save_word2vec_format(args.output) 89 | 90 | return 91 | 92 | def main(args): 93 | ''' 94 | Pipeline for representational learning for all nodes in a graph. 
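    Reads the edge list, precomputes the transition probabilities, simulates the random
    walks, and learns the node embeddings with Word2Vec.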
95 | ''' 96 | nx_G = read_graph() 97 | G = node2vec.Graph(nx_G, args.directed, args.p, args.q) 98 | G.preprocess_transition_probs() 99 | walks = G.simulate_walks(args.num_walks, args.walk_length) 100 | learn_embeddings(walks) 101 | 102 | if __name__ == "__main__": 103 | args = parse_args() 104 | main(args) 105 | -------------------------------------------------------------------------------- /learning/node2vec/node2vec.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import networkx as nx 3 | import random 4 | 5 | 6 | class Graph(): 7 | def __init__(self, nx_G, is_directed, p, q): 8 | self.G = nx_G 9 | self.is_directed = is_directed 10 | self.p = p 11 | self.q = q 12 | 13 | def node2vec_walk(self, walk_length, start_node): 14 | ''' 15 | Simulate a random walk starting from start node. 16 | ''' 17 | G = self.G 18 | alias_nodes = self.alias_nodes 19 | alias_edges = self.alias_edges 20 | 21 | walk = [start_node] 22 | 23 | while len(walk) < walk_length: 24 | cur = walk[-1] 25 | cur_nbrs = sorted(G.neighbors(cur)) 26 | if len(cur_nbrs) > 0: 27 | if len(walk) == 1: 28 | walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])]) 29 | else: 30 | prev = walk[-2] 31 | next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0], 32 | alias_edges[(prev, cur)][1])] 33 | walk.append(next) 34 | else: 35 | break 36 | 37 | return walk 38 | 39 | def simulate_walks(self, num_walks, walk_length): 40 | ''' 41 | Repeatedly simulate random walks from each node. 42 | ''' 43 | G = self.G 44 | walks = [] 45 | nodes = list(G.nodes()) 46 | # print 'Walk iteration:' 47 | for walk_iter in range(num_walks): 48 | # print str(walk_iter + 1), '/', str(num_walks) 49 | random.shuffle(nodes) 50 | for node in nodes: 51 | walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node)) 52 | 53 | return walks 54 | 55 | def get_alias_edge(self, src, dst): 56 | ''' 57 | Get the alias edge setup lists for a given edge. 58 | ''' 59 | G = self.G 60 | p = self.p 61 | q = self.q 62 | 63 | unnormalized_probs = [] 64 | for dst_nbr in sorted(G.neighbors(dst)): 65 | if dst_nbr == src: 66 | unnormalized_probs.append(G[dst][dst_nbr]['weight'] / p) 67 | elif G.has_edge(dst_nbr, src): 68 | unnormalized_probs.append(G[dst][dst_nbr]['weight']) 69 | else: 70 | unnormalized_probs.append(G[dst][dst_nbr]['weight'] / q) 71 | norm_const = sum(unnormalized_probs) 72 | normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs] 73 | 74 | return alias_setup(normalized_probs) 75 | 76 | def preprocess_transition_probs(self): 77 | ''' 78 | Preprocessing of transition probabilities for guiding the random walks. 
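        Builds alias-sampling tables for every node and every edge so that each step of a
        biased walk can be drawn in O(1) time.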
79 | ''' 80 | G = self.G 81 | is_directed = self.is_directed 82 | 83 | alias_nodes = {} 84 | for node in G.nodes(): 85 | unnormalized_probs = [G[node][nbr]['weight'] for nbr in sorted(G.neighbors(node))] 86 | norm_const = sum(unnormalized_probs) 87 | normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs] 88 | alias_nodes[node] = alias_setup(normalized_probs) 89 | 90 | alias_edges = {} 91 | triads = {} 92 | 93 | if is_directed: 94 | for edge in G.edges(): 95 | alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) 96 | else: 97 | for edge in G.edges(): 98 | alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) 99 | alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0]) 100 | 101 | self.alias_nodes = alias_nodes 102 | self.alias_edges = alias_edges 103 | 104 | return 105 | 106 | 107 | def alias_setup(probs): 108 | ''' 109 | Compute utility lists for non-uniform sampling from discrete distributions. 110 | Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/ 111 | for details 112 | ''' 113 | K = len(probs) 114 | q = np.zeros(K) 115 | J = np.zeros(K, dtype=np.int) 116 | 117 | smaller = [] 118 | larger = [] 119 | for kk, prob in enumerate(probs): 120 | q[kk] = K * prob 121 | if q[kk] < 1.0: 122 | smaller.append(kk) 123 | else: 124 | larger.append(kk) 125 | 126 | while len(smaller) > 0 and len(larger) > 0: 127 | small = smaller.pop() 128 | large = larger.pop() 129 | 130 | J[small] = large 131 | q[large] = q[large] + q[small] - 1.0 132 | if q[large] < 1.0: 133 | smaller.append(large) 134 | else: 135 | larger.append(large) 136 | 137 | return J, q 138 | 139 | 140 | def alias_draw(J, q): 141 | ''' 142 | Draw sample from a non-uniform discrete distribution using alias sampling. 
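    J holds the alias indices and q the acceptance probabilities produced by alias_setup;
    a single draw costs O(1).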
143 | ''' 144 | K = len(J) 145 | 146 | kk = int(np.floor(np.random.rand() * K)) 147 | if np.random.rand() < q[kk]: 148 | return kk 149 | else: 150 | return J[kk] 151 | -------------------------------------------------------------------------------- /learning/similarity.py: -------------------------------------------------------------------------------- 1 | 2 | import math 3 | 4 | import networkx as nx 5 | 6 | 7 | def counter_cosine_similarity(c1, c2): 8 | terms = set(c1).union(c2) 9 | dotprod = sum(c1.get(k, 0) * c2.get(k, 0) for k in terms) 10 | magA = math.sqrt(sum(c1.get(k, 0) ** 2 for k in terms)) 11 | magB = math.sqrt(sum(c2.get(k, 0) ** 2 for k in terms)) 12 | return dotprod / (magA * magB) 13 | 14 | 15 | def get_graph_similarity(graph1, graph2): 16 | laplacian1 = nx.spectrum.laplacian_spectrum(graph1) 17 | laplacian2 = nx.spectrum.laplacian_spectrum(graph2) 18 | k1 = select_k(laplacian1) 19 | k2 = select_k(laplacian2) 20 | k = min(k1, k2) 21 | 22 | similarity = sum((laplacian1[:k] - laplacian2[:k]) ** 2) 23 | 24 | return similarity 25 | 26 | 27 | def select_k(spectrum, minimum_energy=1): 28 | running_total = 0.0 29 | total = sum(spectrum) 30 | if total == 0.0: 31 | return len(spectrum) 32 | for i in range(len(spectrum)): 33 | running_total += spectrum[i] 34 | if running_total / total >= minimum_energy: 35 | return i + 1 36 | return len(spectrum) 37 | -------------------------------------------------------------------------------- /learning/struc2vec/algorithms.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from time import time 3 | from collections import deque 4 | import numpy as np 5 | import math,random,logging 6 | from concurrent.futures import ProcessPoolExecutor, as_completed 7 | import multiprocessing as mp 8 | from collections import defaultdict 9 | 10 | from utils import * 11 | 12 | 13 | def generate_parameters_random_walk(workers): 14 | 15 | logging.info('Loading distances_nets from disk...') 16 | 17 | sum_weights = {} 18 | amount_edges = {} 19 | 20 | layer = 0 21 | while(isPickle('distances_nets_weights-layer-'+str(layer))): 22 | logging.info('Executing layer {}...'.format(layer)) 23 | weights = restoreVariableFromDisk('distances_nets_weights-layer-'+str(layer)) 24 | 25 | for k,list_weights in weights.iteritems(): 26 | if(layer not in sum_weights): 27 | sum_weights[layer] = 0 28 | if(layer not in amount_edges): 29 | amount_edges[layer] = 0 30 | 31 | for w in list_weights: 32 | sum_weights[layer] += w 33 | amount_edges[layer] += 1 34 | 35 | logging.info('Layer {} executed.'.format(layer)) 36 | layer += 1 37 | 38 | average_weight = {} 39 | for layer in sum_weights.keys(): 40 | average_weight[layer] = sum_weights[layer] / amount_edges[layer] 41 | 42 | logging.info("Saving average_weights on disk...") 43 | saveVariableOnDisk(average_weight,'average_weight') 44 | 45 | amount_neighbours = {} 46 | 47 | layer = 0 48 | while(isPickle('distances_nets_weights-layer-'+str(layer))): 49 | logging.info('Executing layer {}...'.format(layer)) 50 | weights = restoreVariableFromDisk('distances_nets_weights-layer-'+str(layer)) 51 | 52 | amount_neighbours[layer] = {} 53 | 54 | for k,list_weights in weights.iteritems(): 55 | cont_neighbours = 0 56 | for w in list_weights: 57 | if(w > average_weight[layer]): 58 | cont_neighbours += 1 59 | amount_neighbours[layer][k] = cont_neighbours 60 | 61 | logging.info('Layer {} executed.'.format(layer)) 62 | layer += 1 63 | 64 | logging.info("Saving amount_neighbours on disk...") 65 | 
saveVariableOnDisk(amount_neighbours,'amount_neighbours') 66 | 67 | def chooseNeighbor(v,graphs,alias_method_j,alias_method_q,layer): 68 | v_list = graphs[layer][v] 69 | 70 | idx = alias_draw(alias_method_j[layer][v],alias_method_q[layer][v]) 71 | v = v_list[idx] 72 | 73 | return v 74 | 75 | 76 | def exec_random_walk(graphs,alias_method_j,alias_method_q,v,walk_length,amount_neighbours): 77 | original_v = v 78 | t0 = time() 79 | initialLayer = 0 80 | layer = initialLayer 81 | 82 | 83 | path = deque() 84 | path.append(v) 85 | 86 | while len(path) < walk_length: 87 | r = random.random() 88 | 89 | if(r < 0.3): 90 | v = chooseNeighbor(v,graphs,alias_method_j,alias_method_q,layer) 91 | path.append(v) 92 | 93 | else: 94 | r = random.random() 95 | limiar_moveup = prob_moveup(amount_neighbours[layer][v]) 96 | if(r > limiar_moveup): 97 | if(layer > initialLayer): 98 | layer = layer - 1 99 | else: 100 | if((layer + 1) in graphs and v in graphs[layer + 1]): 101 | layer = layer + 1 102 | 103 | t1 = time() 104 | logging.info('RW - vertex {}. Time : {}s'.format(original_v,(t1-t0))) 105 | 106 | return path 107 | 108 | 109 | def exec_ramdom_walks_for_chunck(vertices,graphs,alias_method_j,alias_method_q,walk_length,amount_neighbours): 110 | walks = deque() 111 | for v in vertices: 112 | walks.append(exec_random_walk(graphs,alias_method_j,alias_method_q,v,walk_length,amount_neighbours)) 113 | return walks 114 | 115 | def generate_random_walks_large_graphs(num_walks,walk_length,workers,vertices): 116 | 117 | logging.info('Loading distances_nets from disk...') 118 | 119 | graphs = restoreVariableFromDisk('distances_nets_graphs') 120 | alias_method_j = restoreVariableFromDisk('nets_weights_alias_method_j') 121 | alias_method_q = restoreVariableFromDisk('nets_weights_alias_method_q') 122 | amount_neighbours = restoreVariableFromDisk('amount_neighbours') 123 | 124 | logging.info('Creating RWs...') 125 | t0 = time() 126 | 127 | walks = deque() 128 | initialLayer = 0 129 | 130 | parts = workers 131 | 132 | with ProcessPoolExecutor(max_workers=workers) as executor: 133 | 134 | for walk_iter in range(num_walks): 135 | random.shuffle(vertices) 136 | logging.info("Execution iteration {} ...".format(walk_iter)) 137 | walk = exec_ramdom_walks_for_chunck(vertices,graphs,alias_method_j,alias_method_q,walk_length,amount_neighbours) 138 | walks.extend(walk) 139 | logging.info("Iteration {} executed.".format(walk_iter)) 140 | 141 | 142 | 143 | t1 = time() 144 | logging.info('RWs created. 
Time : {}m'.format((t1-t0)/60)) 145 | logging.info("Saving Random Walks on disk...") 146 | save_random_walks(walks) 147 | 148 | def generate_random_walks(num_walks,walk_length,workers,vertices): 149 | 150 | logging.info('Loading distances_nets on disk...') 151 | 152 | graphs = restoreVariableFromDisk('distances_nets_graphs') 153 | alias_method_j = restoreVariableFromDisk('nets_weights_alias_method_j') 154 | alias_method_q = restoreVariableFromDisk('nets_weights_alias_method_q') 155 | amount_neighbours = restoreVariableFromDisk('amount_neighbours') 156 | 157 | logging.info('Creating RWs...') 158 | t0 = time() 159 | 160 | walks = deque() 161 | initialLayer = 0 162 | 163 | if(workers > num_walks): 164 | workers = num_walks 165 | 166 | with ProcessPoolExecutor(max_workers=workers) as executor: 167 | futures = {} 168 | for walk_iter in range(num_walks): 169 | random.shuffle(vertices) 170 | job = executor.submit(exec_ramdom_walks_for_chunck,vertices,graphs,alias_method_j,alias_method_q,walk_length,amount_neighbours) 171 | futures[job] = walk_iter 172 | #part += 1 173 | logging.info("Receiving results...") 174 | for job in as_completed(futures): 175 | walk = job.result() 176 | r = futures[job] 177 | logging.info("Iteration {} executed.".format(r)) 178 | walks.extend(walk) 179 | del futures[job] 180 | 181 | 182 | t1 = time() 183 | logging.info('RWs created. Time: {}m'.format((t1-t0)/60)) 184 | logging.info("Saving Random Walks on disk...") 185 | save_random_walks(walks) 186 | 187 | def save_random_walks(walks): 188 | with open('random_walks.txt', 'w') as file: 189 | for walk in walks: 190 | line = '' 191 | for v in walk: 192 | line += str(v)+' ' 193 | line += '\n' 194 | file.write(line) 195 | return 196 | 197 | def prob_moveup(amount_neighbours): 198 | x = math.log(amount_neighbours + math.e) 199 | p = (x / ( x + 1)) 200 | return p 201 | 202 | 203 | 204 | def alias_draw(J, q): 205 | ''' 206 | Draw sample from a non-uniform discrete distribution using alias sampling. 
207 | ''' 208 | K = len(J) 209 | 210 | kk = int(np.floor(np.random.rand()*K)) 211 | if np.random.rand() < q[kk]: 212 | return kk 213 | else: 214 | return J[kk] 215 | -------------------------------------------------------------------------------- /learning/struc2vec/graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Graph utilities.""" 5 | 6 | import logging 7 | import sys 8 | import math 9 | from io import open 10 | from os import path 11 | from time import time 12 | from glob import glob 13 | from six.moves import range, zip, zip_longest 14 | from six import iterkeys 15 | from collections import defaultdict, Iterable 16 | from multiprocessing import cpu_count 17 | import random 18 | from random import shuffle 19 | from itertools import product,permutations 20 | import collections 21 | 22 | from concurrent.futures import ProcessPoolExecutor 23 | 24 | from multiprocessing import Pool 25 | from multiprocessing import cpu_count 26 | 27 | #novas importações 28 | import numpy as np 29 | import operator 30 | 31 | 32 | class Graph(defaultdict): 33 | """Efficient basic implementation of nx `Graph' – Undirected graphs with self loops""" 34 | def __init__(self): 35 | super(Graph, self).__init__(list) 36 | 37 | def nodes(self): 38 | return self.keys() 39 | 40 | def adjacency_iter(self): 41 | return self.iteritems() 42 | 43 | def subgraph(self, nodes={}): 44 | subgraph = Graph() 45 | 46 | for n in nodes: 47 | if n in self: 48 | subgraph[n] = [x for x in self[n] if x in nodes] 49 | 50 | return subgraph 51 | 52 | def make_undirected(self): 53 | 54 | t0 = time() 55 | 56 | for v in self.keys(): 57 | for other in self[v]: 58 | if v != other: 59 | self[other].append(v) 60 | 61 | t1 = time() 62 | #logger.info('make_directed: added missing edges {}s'.format(t1-t0)) 63 | 64 | self.make_consistent() 65 | return self 66 | 67 | def make_consistent(self): 68 | t0 = time() 69 | for k in iterkeys(self): 70 | self[k] = list(sorted(set(self[k]))) 71 | 72 | t1 = time() 73 | #logger.info('make_consistent: made consistent in {}s'.format(t1-t0)) 74 | 75 | #self.remove_self_loops() 76 | 77 | return self 78 | 79 | def remove_self_loops(self): 80 | 81 | removed = 0 82 | t0 = time() 83 | 84 | for x in self: 85 | if x in self[x]: 86 | self[x].remove(x) 87 | removed += 1 88 | 89 | t1 = time() 90 | 91 | #logger.info('remove_self_loops: removed {} loops in {}s'.format(removed, (t1-t0))) 92 | return self 93 | 94 | def check_self_loops(self): 95 | for x in self: 96 | for y in self[x]: 97 | if x == y: 98 | return True 99 | 100 | return False 101 | 102 | def has_edge(self, v1, v2): 103 | if v2 in self[v1] or v1 in self[v2]: 104 | return True 105 | return False 106 | 107 | def degree(self, nodes=None): 108 | if isinstance(nodes, Iterable): 109 | return {v:len(self[v]) for v in nodes} 110 | else: 111 | return len(self[nodes]) 112 | 113 | def order(self): 114 | "Returns the number of nodes in the graph" 115 | return len(self) 116 | 117 | def number_of_edges(self): 118 | "Returns the number of nodes in the graph" 119 | return sum([self.degree(x) for x in self.keys()])/2 120 | 121 | def number_of_nodes(self): 122 | "Returns the number of nodes in the graph" 123 | return self.order() 124 | 125 | def gToDict(self): 126 | d = {} 127 | for k,v in self.iteritems(): 128 | d[k] = v 129 | return d 130 | 131 | def printAdjList(self): 132 | for key,value in self.iteritems(): 133 | print (key,":",value) 134 | 135 | 136 | 137 | def clique(size): 
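    """Return the complete graph on nodes 1..size, built from all permutations of the node list."""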
138 | return from_adjlist(permutations(range(1,size+1))) 139 | 140 | # http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python 141 | def grouper(n, iterable, padvalue=None): 142 | "grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')" 143 | return zip_longest(*[iter(iterable)]*n, fillvalue=padvalue) 144 | 145 | def parse_adjacencylist(f): 146 | adjlist = [] 147 | for l in f: 148 | if l and l[0] != "#": 149 | introw = [int(x) for x in l.strip().split()] 150 | row = [introw[0]] 151 | row.extend(set(sorted(introw[1:]))) 152 | adjlist.extend([row]) 153 | 154 | return adjlist 155 | 156 | def parse_adjacencylist_unchecked(f): 157 | adjlist = [] 158 | for l in f: 159 | if l and l[0] != "#": 160 | adjlist.extend([[int(x) for x in l.strip().split()]]) 161 | return adjlist 162 | 163 | def load_adjacencylist(file_, undirected=False, chunksize=10000, unchecked=True): 164 | 165 | if unchecked: 166 | parse_func = parse_adjacencylist_unchecked 167 | convert_func = from_adjlist_unchecked 168 | else: 169 | parse_func = parse_adjacencylist 170 | convert_func = from_adjlist 171 | 172 | adjlist = [] 173 | 174 | t0 = time() 175 | 176 | with open(file_) as f: 177 | with ProcessPoolExecutor(max_workers=cpu_count()) as executor: 178 | total = 0 179 | for idx, adj_chunk in enumerate(executor.map(parse_func, grouper(int(chunksize), f))): 180 | adjlist.extend(adj_chunk) 181 | total += len(adj_chunk) 182 | 183 | t1 = time() 184 | 185 | logging.info('Parsed {} edges with {} chunks in {}s'.format(total, idx, t1-t0)) 186 | 187 | t0 = time() 188 | G = convert_func(adjlist) 189 | t1 = time() 190 | 191 | logging.info('Converted edges to graph in {}s'.format(t1-t0)) 192 | 193 | if undirected: 194 | t0 = time() 195 | G = G.make_undirected() 196 | t1 = time() 197 | logging.info('Made graph undirected in {}s'.format(t1-t0)) 198 | 199 | return G 200 | 201 | 202 | def load_edgelist(file_, undirected=True): 203 | G = Graph() 204 | with open(file_) as f: 205 | for l in f: 206 | if(len(l.strip().split()[:2]) > 1): 207 | x, y = l.strip().split()[:2] 208 | x = int(x) 209 | y = int(y) 210 | G[x].append(y) 211 | if undirected: 212 | G[y].append(x) 213 | else: 214 | x = l.strip().split()[:2] 215 | x = int(x[0]) 216 | G[x] = [] 217 | 218 | G.make_consistent() 219 | return G 220 | 221 | 222 | def load_matfile(file_, variable_name="network", undirected=True): 223 | mat_varables = loadmat(file_) 224 | mat_matrix = mat_varables[variable_name] 225 | 226 | return from_numpy(mat_matrix, undirected) 227 | 228 | 229 | def from_networkx(G_input, undirected=True): 230 | G = Graph() 231 | 232 | for idx, x in enumerate(G_input.nodes_iter()): 233 | for y in iterkeys(G_input[x]): 234 | G[x].append(y) 235 | 236 | if undirected: 237 | G.make_undirected() 238 | 239 | return G 240 | 241 | 242 | def from_numpy(x, undirected=True): 243 | G = Graph() 244 | 245 | if issparse(x): 246 | cx = x.tocoo() 247 | for i,j,v in zip(cx.row, cx.col, cx.data): 248 | G[i].append(j) 249 | else: 250 | raise Exception("Dense matrices not yet supported.") 251 | 252 | if undirected: 253 | G.make_undirected() 254 | 255 | G.make_consistent() 256 | return G 257 | 258 | 259 | def from_adjlist(adjlist): 260 | G = Graph() 261 | 262 | for row in adjlist: 263 | node = row[0] 264 | neighbors = row[1:] 265 | G[node] = list(sorted(set(neighbors))) 266 | 267 | return G 268 | 269 | 270 | def from_adjlist_unchecked(adjlist): 271 | G = Graph() 272 | 273 | for row in adjlist: 274 | node = row[0] 275 | neighbors = row[1:] 276 | 
G[node] = neighbors 277 | 278 | return G 279 | 280 | 281 | def from_dict(d): 282 | G = Graph() 283 | for k,v in d.iteritems(): 284 | G[k] = v 285 | 286 | return G 287 | 288 | -------------------------------------------------------------------------------- /learning/struc2vec/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse, logging 5 | import numpy as np 6 | import struc2vec 7 | from gensim.models import Word2Vec 8 | from gensim.models.word2vec import LineSentence 9 | from time import time 10 | 11 | import graph 12 | 13 | logging.basicConfig(filename='struc2vec.log',filemode='w',level=logging.DEBUG,format='%(asctime)s %(message)s') 14 | 15 | def parse_args(): 16 | ''' 17 | Parses the struc2vec arguments. 18 | ''' 19 | parser = argparse.ArgumentParser(description="Run struc2vec.") 20 | 21 | parser.add_argument('--input', nargs='?', default='graph/karate.edgelist', 22 | help='Input graph path') 23 | 24 | parser.add_argument('--output', nargs='?', default='emb/karate.emb', 25 | help='Embeddings path') 26 | 27 | parser.add_argument('--dimensions', type=int, default=128, 28 | help='Number of dimensions. Default is 128.') 29 | 30 | parser.add_argument('--walk-length', type=int, default=80, 31 | help='Length of walk per source. Default is 80.') 32 | 33 | parser.add_argument('--num-walks', type=int, default=10, 34 | help='Number of walks per source. Default is 10.') 35 | 36 | parser.add_argument('--window-size', type=int, default=10, 37 | help='Context size for optimization. Default is 10.') 38 | 39 | parser.add_argument('--until-layer', type=int, default=None, 40 | help='Calculation until the layer.') 41 | 42 | parser.add_argument('--iter', default=5, type=int, 43 | help='Number of epochs in SGD') 44 | 45 | parser.add_argument('--workers', type=int, default=4, 46 | help='Number of parallel workers. Default is 8.') 47 | 48 | parser.add_argument('--weighted', dest='weighted', action='store_true', 49 | help='Boolean specifying (un)weighted. Default is unweighted.') 50 | parser.add_argument('--unweighted', dest='unweighted', action='store_false') 51 | parser.set_defaults(weighted=False) 52 | 53 | parser.add_argument('--directed', dest='directed', action='store_true', 54 | help='Graph is (un)directed. Default is undirected.') 55 | parser.add_argument('--undirected', dest='undirected', action='store_false') 56 | parser.set_defaults(directed=False) 57 | 58 | parser.add_argument('--OPT1', default=False, type=bool, 59 | help='optimization 1') 60 | parser.add_argument('--OPT2', default=False, type=bool, 61 | help='optimization 2') 62 | parser.add_argument('--OPT3', default=False, type=bool, 63 | help='optimization 3') 64 | return parser.parse_args() 65 | 66 | def read_graph(): 67 | ''' 68 | Reads the input network. 69 | ''' 70 | logging.info(" - Loading graph...") 71 | G = graph.load_edgelist(args.input,undirected=True) 72 | logging.info(" - Graph loaded.") 73 | return G 74 | 75 | def learn_embeddings(): 76 | ''' 77 | Learn embeddings by optimizing the Skipgram objective using SGD. 
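    (Added clarification, hedged against the code below: the walks produced in the
    struc2vec phase -- presumably one walk per line in random_walks.txt -- are read
    back with gensim's LineSentence and fed to Word2Vec with sg=1 (skip-gram) and
    hs=1 (hierarchical softmax); the resulting per-node vectors are saved to
    args.output in word2vec text format.)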
78 | ''' 79 | logging.info("Initializing creation of the representations...") 80 | walks = LineSentence('random_walks.txt') 81 | model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, hs=1, sg=1, workers=args.workers, iter=args.iter) 82 | model.wv.save_word2vec_format(args.output) 83 | logging.info("Representations created.") 84 | 85 | return 86 | 87 | def exec_struc2vec(args): 88 | ''' 89 | Pipeline for representational learning for all nodes in a graph. 90 | ''' 91 | if(args.OPT3): 92 | until_layer = args.until_layer 93 | else: 94 | until_layer = None 95 | 96 | G = read_graph() 97 | G = struc2vec.Graph(G, args.directed, args.workers, untilLayer = until_layer) 98 | 99 | if(args.OPT1): 100 | G.preprocess_neighbors_with_bfs_compact() 101 | else: 102 | G.preprocess_neighbors_with_bfs() 103 | 104 | if(args.OPT2): 105 | G.create_vectors() 106 | G.calc_distances(compactDegree = args.OPT1) 107 | else: 108 | G.calc_distances_all_vertices(compactDegree = args.OPT1) 109 | 110 | 111 | G.create_distances_network() 112 | G.preprocess_parameters_random_walk() 113 | 114 | G.simulate_walks(args.num_walks, args.walk_length) 115 | 116 | 117 | return G 118 | 119 | def main(args): 120 | 121 | G = exec_struc2vec(args) 122 | 123 | learn_embeddings() 124 | 125 | 126 | if __name__ == "__main__": 127 | args = parse_args() 128 | main(args) 129 | 130 | -------------------------------------------------------------------------------- /learning/struc2vec/struc2vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import random,sys,logging 5 | from concurrent.futures import ProcessPoolExecutor, as_completed 6 | from multiprocessing import Manager 7 | from time import time 8 | from collections import deque 9 | 10 | from utils import * 11 | from algorithms import * 12 | from algorithms_distances import * 13 | import graph 14 | 15 | 16 | class Graph(): 17 | def __init__(self, g, is_directed, workers, untilLayer = None): 18 | 19 | logging.info(" - Converting graph to dict...") 20 | self.G = g.gToDict() 21 | logging.info("Graph converted.") 22 | 23 | self.num_vertices = g.number_of_nodes() 24 | self.num_edges = g.number_of_edges() 25 | self.is_directed = is_directed 26 | self.workers = workers 27 | self.calcUntilLayer = untilLayer 28 | logging.info('Graph - Number of vertices: {}'.format(self.num_vertices)) 29 | logging.info('Graph - Number of edges: {}'.format(self.num_edges)) 30 | 31 | 32 | def preprocess_neighbors_with_bfs(self): 33 | 34 | with ProcessPoolExecutor(max_workers=self.workers) as executor: 35 | job = executor.submit(exec_bfs,self.G,self.workers,self.calcUntilLayer) 36 | 37 | job.result() 38 | 39 | return 40 | 41 | def preprocess_neighbors_with_bfs_compact(self): 42 | 43 | with ProcessPoolExecutor(max_workers=self.workers) as executor: 44 | job = executor.submit(exec_bfs_compact,self.G,self.workers,self.calcUntilLayer) 45 | 46 | job.result() 47 | 48 | return 49 | 50 | def preprocess_degree_lists(self): 51 | 52 | with ProcessPoolExecutor(max_workers=self.workers) as executor: 53 | job = executor.submit(preprocess_degreeLists) 54 | 55 | job.result() 56 | 57 | return 58 | 59 | 60 | def create_vectors(self): 61 | logging.info("Creating degree vectors...") 62 | degrees = {} 63 | degrees_sorted = set() 64 | G = self.G 65 | for v in G.keys(): 66 | degree = len(G[v]) 67 | degrees_sorted.add(degree) 68 | if(degree not in degrees): 69 | degrees[degree] = {} 70 | degrees[degree]['vertices'] = 
deque() 71 | degrees[degree]['vertices'].append(v) 72 | degrees_sorted = np.array(list(degrees_sorted),dtype='int') 73 | degrees_sorted = np.sort(degrees_sorted) 74 | 75 | l = len(degrees_sorted) 76 | for index, degree in enumerate(degrees_sorted): 77 | if(index > 0): 78 | degrees[degree]['before'] = degrees_sorted[index - 1] 79 | if(index < (l - 1)): 80 | degrees[degree]['after'] = degrees_sorted[index + 1] 81 | logging.info("Degree vectors created.") 82 | logging.info("Saving degree vectors...") 83 | saveVariableOnDisk(degrees,'degrees_vector') 84 | 85 | 86 | def calc_distances_all_vertices(self,compactDegree = False): 87 | 88 | logging.info("Using compactDegree: {}".format(compactDegree)) 89 | if(self.calcUntilLayer): 90 | logging.info("Calculations until layer: {}".format(self.calcUntilLayer)) 91 | 92 | futures = {} 93 | 94 | count_calc = 0 95 | 96 | vertices = list(reversed(sorted(self.G.keys()))) 97 | 98 | if(compactDegree): 99 | logging.info("Recovering degreeList from disk...") 100 | degreeList = restoreVariableFromDisk('compactDegreeList') 101 | else: 102 | logging.info("Recovering compactDegreeList from disk...") 103 | degreeList = restoreVariableFromDisk('degreeList') 104 | 105 | parts = self.workers 106 | chunks = partition(vertices,parts) 107 | 108 | t0 = time() 109 | 110 | with ProcessPoolExecutor(max_workers = self.workers) as executor: 111 | 112 | part = 1 113 | for c in chunks: 114 | logging.info("Executing part {}...".format(part)) 115 | list_v = [] 116 | for v in c: 117 | list_v.append([vd for vd in degreeList.keys() if vd > v]) 118 | job = executor.submit(calc_distances_all, c, list_v, degreeList,part, compactDegree = compactDegree) 119 | futures[job] = part 120 | part += 1 121 | 122 | 123 | logging.info("Receiving results...") 124 | 125 | for job in as_completed(futures): 126 | job.result() 127 | r = futures[job] 128 | logging.info("Part {} Completed.".format(r)) 129 | 130 | logging.info('Distances calculated.') 131 | t1 = time() 132 | logging.info('Time : {}m'.format((t1-t0)/60)) 133 | 134 | return 135 | 136 | 137 | def calc_distances(self, compactDegree = False): 138 | 139 | logging.info("Using compactDegree: {}".format(compactDegree)) 140 | if(self.calcUntilLayer): 141 | logging.info("Calculations until layer: {}".format(self.calcUntilLayer)) 142 | 143 | futures = {} 144 | #distances = {} 145 | 146 | count_calc = 0 147 | 148 | G = self.G 149 | vertices = G.keys() 150 | 151 | parts = self.workers 152 | chunks = partition(vertices,parts) 153 | 154 | with ProcessPoolExecutor(max_workers = 1) as executor: 155 | 156 | logging.info("Split degree List...") 157 | part = 1 158 | for c in chunks: 159 | job = executor.submit(splitDegreeList,part,c,G,compactDegree) 160 | job.result() 161 | logging.info("degreeList {} completed.".format(part)) 162 | part += 1 163 | 164 | 165 | with ProcessPoolExecutor(max_workers = self.workers) as executor: 166 | 167 | part = 1 168 | for c in chunks: 169 | logging.info("Executing part {}...".format(part)) 170 | job = executor.submit(calc_distances, part, compactDegree = compactDegree) 171 | futures[job] = part 172 | part += 1 173 | 174 | logging.info("Receiving results...") 175 | for job in as_completed(futures): 176 | job.result() 177 | r = futures[job] 178 | logging.info("Part {} completed.".format(r)) 179 | 180 | 181 | return 182 | 183 | def consolide_distances(self): 184 | 185 | distances = {} 186 | 187 | parts = self.workers 188 | for part in range(1,parts + 1): 189 | d = restoreVariableFromDisk('distances-'+str(part)) 190 | 
preprocess_consolides_distances(distances) 191 | distances.update(d) 192 | 193 | 194 | preprocess_consolides_distances(distances) 195 | saveVariableOnDisk(distances,'distances') 196 | 197 | 198 | def create_distances_network(self): 199 | 200 | with ProcessPoolExecutor(max_workers=1) as executor: 201 | job = executor.submit(generate_distances_network,self.workers) 202 | 203 | job.result() 204 | 205 | return 206 | 207 | def preprocess_parameters_random_walk(self): 208 | 209 | with ProcessPoolExecutor(max_workers=1) as executor: 210 | job = executor.submit(generate_parameters_random_walk,self.workers) 211 | 212 | job.result() 213 | 214 | return 215 | 216 | 217 | def simulate_walks(self,num_walks,walk_length): 218 | 219 | # for large graphs, it is serially executed, because of memory use. 220 | if(len(self.G) > 500000): 221 | 222 | with ProcessPoolExecutor(max_workers=1) as executor: 223 | job = executor.submit(generate_random_walks_large_graphs,num_walks,walk_length,self.workers,self.G.keys()) 224 | 225 | job.result() 226 | 227 | else: 228 | 229 | with ProcessPoolExecutor(max_workers=1) as executor: 230 | job = executor.submit(generate_random_walks,num_walks,walk_length,self.workers,self.G.keys()) 231 | 232 | job.result() 233 | 234 | 235 | return 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | -------------------------------------------------------------------------------- /learning/struc2vec/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from time import time 3 | import logging,inspect 4 | import cPickle as pickle 5 | from itertools import islice 6 | import os.path 7 | 8 | dir_f = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) 9 | folder_pickles = dir_f+"/../pickles/" 10 | 11 | def returnPathStruc2vec(): 12 | return dir_f 13 | 14 | def isPickle(fname): 15 | return os.path.isfile(dir_f+'/../pickles/'+fname+'.pickle') 16 | 17 | def chunks(data, SIZE=10000): 18 | it = iter(data) 19 | for i in xrange(0, len(data), SIZE): 20 | yield {k:data[k] for k in islice(it, SIZE)} 21 | 22 | def partition(lst, n): 23 | division = len(lst) / float(n) 24 | return [ lst[int(round(division * i)): int(round(division * (i + 1)))] for i in xrange(n) ] 25 | 26 | def restoreVariableFromDisk(name): 27 | logging.info('Recovering variable...') 28 | t0 = time() 29 | val = None 30 | with open(folder_pickles + name + '.pickle', 'rb') as handle: 31 | val = pickle.load(handle) 32 | t1 = time() 33 | logging.info('Variable recovered. Time: {}m'.format((t1-t0)/60)) 34 | 35 | return val 36 | 37 | def saveVariableOnDisk(f,name): 38 | logging.info('Saving variable on disk...') 39 | t0 = time() 40 | with open(folder_pickles + name + '.pickle', 'wb') as handle: 41 | pickle.dump(f, handle, protocol=pickle.HIGHEST_PROTOCOL) 42 | t1 = time() 43 | logging.info('Variable saved. 
Time: {}m'.format((t1-t0)/60)) 44 | 45 | return 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bcrypt==3.1.7 2 | boto3==1.15.1 3 | botocore==1.18.1 4 | bz2file==0.98 5 | certifi==2020.6.20 6 | cffi==1.14.2 7 | chardet==3.0.4 8 | cloudpickle==1.3.0 9 | cryptography==3.1 10 | dask==1.2.2 11 | decorator==4.4.2 12 | enum34==1.1.10 13 | funcsigs==1.0.2 14 | futures==3.3.0 15 | gensim==3.8.3 16 | graphviz==0.14.1 17 | idna==2.10 18 | ipaddress==1.0.23 19 | jmespath==0.10.0 20 | joblib==0.14.1 21 | locket==0.2.0 22 | matplotlib==1.4.3 23 | mock==3.0.5 24 | networkx==2.2 25 | nose==1.3.7 26 | numpy==1.16.1 27 | pandas==0.24.2 28 | paramiko==2.7.2 29 | partd==1.0.0 30 | pycparser==2.20 31 | pydot==1.0.28 32 | pygraphviz==1.5 33 | pymongo==3.11.0 34 | PyNaCl==1.4.0 35 | pyparsing==1.5.7 36 | python-dateutil==2.8.1 37 | pytz==2020.1 38 | requests==2.24.0 39 | s3transfer==0.3.3 40 | scikit-learn==0.20.4 41 | scipy==1.2.3 42 | scour==0.32 43 | six==1.10.0 44 | smart-open==1.10.1 45 | sshtunnel==0.1.5 46 | toolz==0.10.0 47 | tqdm==4.49.0 48 | urllib3==1.25.10 49 | -------------------------------------------------------------------------------- /sample/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiS3-Lab/FICS/82c8abef52ca943946b7e82a16998cf67f1d2049/sample/__init__.py -------------------------------------------------------------------------------- /sample/astfile.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import networkx as nx 3 | 4 | from learning.similarity import counter_cosine_similarity 5 | 6 | 7 | class ASTFile: 8 | 9 | def __init__(self, ast_file, arguments, ast=None, feature_type=''): 10 | self.ast_file = ast_file 11 | self.arguments = arguments 12 | self.ast = ast 13 | if self.ast is None: 14 | try: 15 | self.ast = nx.read_graphml(self.ast_file) 16 | # self.ast = self.index.read(self.ast_file) 17 | except Exception, e: 18 | print e 19 | print self.ast_file 20 | 21 | self.functions_root_nodes = [] 22 | self.features = [] 23 | self.functions_features_counters = [] 24 | self.function_names = [] 25 | self.feature_type = feature_type 26 | 27 | def extract_features(self): 28 | self.functions_root_nodes = [x for x, y in self.ast.nodes(data=True) 29 | if 'type' in y and y['type'] == '"FUNCTION_DECL"'] 30 | 31 | for root_node in self.functions_root_nodes: 32 | # print root_node 33 | self.extract_potential_features(root_node) 34 | self.functions_features_counters.append(Counter(self.features)) 35 | self.function_names.append(self.ast.node[root_node]['spelling'].replace('"', '')) 36 | 37 | def extract_potential_features(self, root_node): 38 | # self.print_graph(root_node) 39 | s = list(nx.dfs_preorder_nodes(self.ast, root_node)) 40 | self.features = [] 41 | 42 | feature_types = self.feature_type.split('+') 43 | for feature_type in feature_types: 44 | if feature_type == 'MR': 45 | self.extract_members(s) 46 | elif feature_type == 'C': 47 | self.extract_calls(s) 48 | elif feature_type == 'NT': 49 | self.extract_node_types(s) 50 | 51 | def extract_members(self, s): 52 | 53 | for item in s: 54 | node_type = self.ast.node[item]['type'].replace('"', '') 55 | if node_type == 'MEMBER_REF_EXPR' or node_type == 'MEMBER_REF': 56 | node_spelling = 
self.ast.node[item]['spelling'].replace('"', '') 57 | if node_spelling != '': 58 | self.features.append('{}_{}'.format(node_type, node_spelling)) 59 | 60 | def extract_calls(self, s): 61 | 62 | for item in s: 63 | node_type = self.ast.node[item]['type'].replace('"', '') 64 | if node_type == 'CALL_EXPR': 65 | node_spelling = self.ast.node[item]['spelling'].replace('"', '') 66 | if node_spelling != '': 67 | self.features.append('{}_{}'.format(node_type, node_spelling)) 68 | 69 | def extract_node_types(self, s): 70 | 71 | for item in s: 72 | node_type = self.ast.node[item]['type'].replace('"', '') 73 | self.features.append('NODE_TYPE_{}'.format(node_type)) 74 | 75 | def print_graph(self, root_node): 76 | if self.ast.node[root_node]['spelling'].replace('"', '') in \ 77 | ['X509v3_addr_get_afi', 'ssl3_get_record', 'aes_gcm_ctrl']: 78 | print root_node, self.ast.node[root_node]['spelling'] 79 | s = list(nx.dfs_preorder_nodes(self.ast, root_node)) 80 | for item in s: 81 | print self.ast.node[item] 82 | 83 | def compute_functions_similarities(self): 84 | functions_similarities = [] 85 | 86 | for i in range(len(self.functions_features_counters) - 1): 87 | for j in range(len(self.functions_features_counters)): 88 | if i == i + j: 89 | continue 90 | if i + j >= len(self.functions_features_counters): 91 | continue 92 | functions_similarities.append({'func1': self.ast.node[self.functions_root_nodes[i]]['spelling'], 93 | 'func2': self.ast.node[self.functions_root_nodes[i + j]]['spelling'], 94 | 'score': counter_cosine_similarity(self.functions_features_counters[i], 95 | self.functions_features_counters[i + 96 | j])}) 97 | 98 | return sorted(functions_similarities, key=lambda k: k['score'], reverse=True) 99 | 100 | def extract_backup_features(self, root_node): 101 | # self.print_graph(root_node) 102 | s = list(nx.dfs_preorder_nodes(self.ast, root_node)) 103 | features = [] 104 | for item in s: 105 | node_type = self.ast.node[item]['type'].replace('"', '') 106 | features.append(node_type) 107 | if node_type == 'MEMBER_REF_EXPR' or node_type == 'MEMBER_REF' or node_type =='TYPEDEF_DECL': 108 | node_spelling = self.ast.node[item]['spelling'].replace('"', '') 109 | if node_spelling != '': 110 | features.append(node_spelling) 111 | # print features 112 | return features 113 | -------------------------------------------------------------------------------- /sample/bitcodefile.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import subprocess 3 | from collections import Counter, defaultdict 4 | # from node2vec import Node2Vec 5 | from operator import itemgetter 6 | from subprocess import call 7 | from timeit import default_timer 8 | 9 | import networkx as nx 10 | import numpy as np 11 | from gensim.models import Word2Vec 12 | 13 | from learning.node2vec import node2vec 14 | from slicer import Slicer 15 | from utils.inout import * 16 | 17 | 18 | class BitCodeFile: 19 | 20 | def __init__(self, file_info, arguments, analysis_type='', feature_type=''): 21 | self.file_info = file_info 22 | self.arguments = arguments 23 | self.analysis_type = analysis_type 24 | self.feature_type = feature_type 25 | 26 | self.features = [] 27 | self.afs_features_counters_list = defaultdict(list) 28 | self.afs_features_counters = {} 29 | self.afs_graph = None 30 | self.graph_len = 0 31 | self.basic_block_ids = set() 32 | self.lines_numbers = set() 33 | self.llvm_instructions = set() 34 | self.construct_hash = None 35 | 36 | def analyze(self): 37 | if self.analysis_type == 
'pdg': 38 | return self.extract_pdg() 39 | elif self.analysis_type == 'as': 40 | return self.extract_as() 41 | else: 42 | return False 43 | 44 | def extract_pdg(self): 45 | pdg_dir = join_path(get_parent_dir(self.file_info), 'pdg') 46 | make_dir_if_not_exist(pdg_dir) 47 | for function_name in self.get_functions(): 48 | function_pdg_file = join_path(pdg_dir, '{}.pdg.dot'.format(function_name)) 49 | # llvm_pdg_log_file = join_path(pdg_dir, '{}.log.txt'.format(function)) 50 | time_file = join_path(pdg_dir, '{}.pdg.time.txt'.format(function_name)) 51 | # args = ['-entrypoint', function, '-nocfg', '>', function_pdg_file] 52 | parse_start_time = default_timer() 53 | try: 54 | OUTPUT = open(function_pdg_file, 'w') 55 | 56 | # set 60 seconds timeout for pdg extraction 57 | call(['timeout', '60', self.arguments.pdg_dumper, self.file_info, 58 | '-entrypoint', function_name, '-nocfg'], 59 | stdout=OUTPUT, stderr=subprocess.STDOUT, close_fds=True) 60 | OUTPUT.close() 61 | 62 | if not exist_file(function_pdg_file): 63 | print 'error in:', function_name, self.file_info 64 | # output = open(function_pdg_file, 'r') 65 | # output_lines = output.readlines() 66 | # for i in range(len(output_lines)): 67 | # if 'WARNING' in output_lines[i]: 68 | # # digraph "DependenceGraph" 69 | # print function_pdg_file 70 | 71 | parse_elapsed = default_timer() - parse_start_time 72 | write_file(time_file, '{}'.format(parse_elapsed)) 73 | except: 74 | print 'crash in pdg dumper', self.file_info, function_name 75 | return False 76 | 77 | return True 78 | 79 | def get_functions(self): 80 | functions = [] 81 | functions_file = join_path(get_parent_dir(self.file_info), 'functions.txt') 82 | try: 83 | for line in read_lines(functions_file): 84 | # Do not consider inlinehint functions 85 | if ' inlinehint ' not in line.split(':')[1]: 86 | functions.append(line.split('@')[1].split('(')[0]) 87 | except Exception, e: 88 | print functions_file, e 89 | return functions 90 | 91 | def extract_as(self): 92 | slicer = Slicer(pdg_graph_file=self.file_info, arguments=self.arguments) 93 | if not slicer.error: 94 | slicer.run() 95 | del slicer 96 | return True 97 | else: 98 | return False 99 | 100 | def extract_features(self): 101 | self.extract_potential_features() 102 | 103 | def extract_potential_features(self): 104 | self.afs_graph = nx.drawing.nx_agraph.read_dot(self.file_info) 105 | self.graph_len = len(self.afs_graph) 106 | if self.feature_type == 'NN': 107 | self.extract_node_names_features() 108 | self.afs_features_counters = Counter(self.features) 109 | elif self.feature_type == 'NNMD': 110 | self.extract_node_names_features() 111 | self.afs_features_counters = Counter(self.features) 112 | self.extract_metadata_features() 113 | elif self.feature_type == 'SM': 114 | self.extract_semantic_features() 115 | 116 | def extract_node_names_features(self): 117 | for node in self.afs_graph.nodes: 118 | label = self.afs_graph.nodes[node]['label'] 119 | if 'line' in self.afs_graph.nodes[node]: 120 | line = self.afs_graph.nodes[node]['line'] 121 | self.lines_numbers.add(line) 122 | if 'basic_block_id' in self.afs_graph.nodes[node]: 123 | bb_id = self.afs_graph.nodes[node]['basic_block_id'] 124 | self.basic_block_ids.add(bb_id) 125 | self.features.append(label) 126 | self.llvm_instructions.add(label) 127 | 128 | self.compute_construct_hash() 129 | 130 | def extract_metadata_features(self): 131 | self.afs_features_counters['metadata_num_edges'] = len(self.afs_graph.edges) 132 | self.afs_features_counters['metadata_num_nodes'] = 
len(self.afs_graph.nodes) 133 | # d = self.centrality_distribution(self.afs_graph) 134 | # self.afs_features_counters['metadata_entropy_centrality_distribution'] = self.entropy(d) 135 | 136 | def entropy(self, dist): 137 | """ 138 | Returns the entropy of `dist` in bits (base-2). 139 | 140 | """ 141 | dist = np.asarray(dist) 142 | ent = np.nansum(dist * np.log2(1 / dist)) 143 | return ent 144 | 145 | def centrality_distribution(self, G): 146 | """ 147 | Returns a centrality distribution. 148 | 149 | Each normalized centrality is divided by the sum of the normalized 150 | centralities. Note, this assumes the graph is simple. 151 | 152 | """ 153 | if len(G) == 1: 154 | print self.file_info 155 | centrality = nx.degree_centrality(G).values() 156 | centrality = np.asarray(centrality) 157 | centrality /= centrality.sum() 158 | return centrality 159 | 160 | def extract_semantic_features(self): 161 | # self.build_laplacian_features() 162 | self.build_node2vec_features_node_representation() 163 | # self.build_graph2vec_features_node_representation() 164 | 165 | def build_node2vec_features_node_representation(self): 166 | afs_graph = nx.DiGraph() 167 | for e in self.afs_graph.edges(): 168 | afs_graph.add_weighted_edges_from([(e[0], e[1], 1)]) 169 | walks = self.get_node2vec_walks(afs_graph) 170 | if len(walks): 171 | node2vec_ref = self.learn_embeddings(walks) 172 | else: 173 | node2vec_ref = {} 174 | 175 | data = [] 176 | for node in afs_graph.nodes: 177 | data.append(node2vec_ref.get_vector(node)) 178 | 179 | data = np.array(data) 180 | data = np.average(data, axis=0) 181 | 182 | for index, value in enumerate(data): 183 | feature_name = 'representation_{}'.format(index) 184 | self.afs_features_counters[feature_name] = value 185 | 186 | def build_node2vec_features_similar_nodes(self): 187 | afs_graph = nx.DiGraph() 188 | for e in self.afs_graph.edges(): 189 | afs_graph.add_weighted_edges_from([(e[0], e[1], 1)]) 190 | walks = self.get_node2vec_walks(afs_graph) 191 | if len(walks): 192 | node2vec_ref = self.learn_embeddings(walks) 193 | else: 194 | node2vec_ref = {} 195 | 196 | # for index, value in enumerate(node2vec_ref.get_vector(node)): 197 | for node in afs_graph.nodes: 198 | similar_nodes = sorted(node2vec_ref.wv.most_similar(node), key=itemgetter(1), reverse=True) 199 | sn_len = len(similar_nodes) 200 | for item in similar_nodes[0:min(sn_len, 5)]: 201 | 202 | similar_label = self.afs_graph.node[item[0]]['label'] 203 | node_label = self.afs_graph.node[node]['label'] 204 | # feature_name = '{}_{}'.format(similar_label, node_label) 205 | feature_name = '{}'.format(similar_label) 206 | if feature_name not in self.afs_features_counters_list.keys(): 207 | self.afs_features_counters[feature_name] = 1 208 | else: 209 | self.afs_features_counters[feature_name] += 1 210 | 211 | def build_node2vec_features_single_node(self): 212 | afs_graph = nx.DiGraph() 213 | for e in self.afs_graph.edges(): 214 | afs_graph.add_weighted_edges_from([(e[0], e[1], 1)]) 215 | walks = self.get_node2vec_walks(afs_graph) 216 | if len(walks): 217 | node2vec_ref = self.learn_embeddings(walks) 218 | else: 219 | node2vec_ref = {} 220 | 221 | for node in afs_graph.nodes: 222 | # print node, self.afs_graph.node[node]['label'] 223 | # print node2vec_ref.get_vector(node) 224 | 225 | for index, value in enumerate(node2vec_ref.get_vector(node)): 226 | feature_name = '{} ({})'.format(self.afs_graph.node[node]['label'], index) 227 | if feature_name not in self.afs_features_counters_list.keys(): 228 | 
self.afs_features_counters_list[feature_name].append(round(value, 2)) 229 | else: 230 | self.afs_features_counters_list[feature_name].append(round(value, 2)) 231 | for key, value in self.afs_features_counters_list.iteritems(): 232 | self.afs_features_counters[key] = round(np.average(value), 2) 233 | 234 | def get_node2vec_walks(self, afs_graph): 235 | num_walks = 10 236 | walk_length = 10 237 | p = 0.25 238 | q = 0.25 239 | node2vec_graph = node2vec.Graph(afs_graph, True, p, q) 240 | node2vec_graph.preprocess_transition_probs() 241 | walks = node2vec_graph.simulate_walks(num_walks, walk_length) 242 | return walks 243 | 244 | def learn_embeddings(self, walks): 245 | ''' 246 | Learn embeddings by optimizing the Skipgram objective using SGD. 247 | ''' 248 | dimensions = 128 249 | window_size = 10 250 | workers = 5 251 | iteration = 1 252 | output = '/Users/mansourahmadi/Desktop/aaa.out' 253 | walks = [map(str, walk) for walk in walks] 254 | model = Word2Vec(walks, size=dimensions, window=window_size, min_count=0, sg=1, workers=workers, iter=iteration) 255 | # print model 256 | return model.wv 257 | 258 | def compute_construct_hash(self): 259 | construct_string = '' 260 | # print self.file_info 261 | construct_string += self.file_info[:self.file_info.find('.c/pdg') + 2] 262 | # print construct_string 263 | for id in self.basic_block_ids: 264 | construct_string += str(id) 265 | for line in self.lines_numbers: 266 | construct_string += str(line) 267 | for llvm_instruction in self.llvm_instructions: 268 | construct_string += str(llvm_instruction) 269 | # print construct_string 270 | # self.construct_hash = int(hashlib.sha1(construct_string).hexdigest(), 16) % (10 ** 8) 271 | self.construct_hash = hashlib.sha1(construct_string).hexdigest() 272 | -------------------------------------------------------------------------------- /sample/languagetype.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class LanguageType(Enum): 5 | C = 'C' 6 | 7 | @staticmethod 8 | def get_names(): 9 | return [e.name for e in LanguageType] 10 | 11 | @staticmethod 12 | def get_detail(): 13 | return ['{}: {}'.format(e.name, e.value) for e in LanguageType] 14 | -------------------------------------------------------------------------------- /sample/sourcefile.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from pprint import pprint 3 | from subprocess import call 4 | from timeit import default_timer 5 | 6 | # import clang.cindex as cl 7 | import networkx as nx 8 | import pandas 9 | 10 | from utils.inout import * 11 | 12 | 13 | def get_diagnostics_info(diagnostics): 14 | return {'severity': diagnostics.severity, 15 | 'location': diagnostics.location, 16 | 'spelling': diagnostics.spelling 17 | # 'ranges': diagnostics.ranges, 18 | # 'fixits': diagnostics.fixits 19 | } 20 | 21 | 22 | class SourceFile: 23 | 24 | def __init__(self, source_file, arguments, project_dir='', compile_arguments=[], analysis_type=''): 25 | self.source_file = source_file 26 | self.source_file_ast_dir = '' 27 | self.source_file_bc_dir = '' 28 | self.project_dir = project_dir 29 | # self.project_includes = project_includes 30 | self.compile_arguments = compile_arguments 31 | self.arguments = arguments 32 | # if not cl.Config.loaded: 33 | # cl.Config.set_library_path(self.arguments.clang_lib_dir) 34 | # self.index = cl.Index.create() 35 | self.index = None 36 | self.ast = None 37 | self.root_nodes = [] 38 | 
self.function_count = 0 39 | self.function_name = '' 40 | self.file_ast_json = {} 41 | self.cursor_list = {} 42 | self.analysis_type = analysis_type 43 | # self.cursor_list = [] 44 | 45 | def emit_llvm_bc(self): 46 | parent_dir = get_parent_dir(self.source_file).replace(self.arguments.projects_dir, self.arguments.bcs_dir) 47 | make_dir_if_not_exist(parent_dir) 48 | self.source_file_bc_dir = join_path(parent_dir, get_basename(self.source_file)) 49 | make_dir_if_not_exist(self.source_file_bc_dir) 50 | bc_file = join_path(self.source_file_bc_dir, get_filename_without_ext(get_basename(self.source_file)) + '.bc') 51 | ll_file = join_path(self.source_file_bc_dir, get_filename_without_ext(get_basename(self.source_file)) + '.ll') 52 | llvm_log_file = join_path(self.source_file_bc_dir, 'llvm.log.txt') 53 | llvm_dis_log_file = join_path(self.source_file_bc_dir, 'llvm-dis.log.txt') 54 | function_names_file = join_path(self.source_file_bc_dir, 'functions.txt') 55 | time_file = join_path(self.source_file_bc_dir, 'bc.time.txt') 56 | 57 | args = [] 58 | # print self.compile_arguments 59 | if self.compile_arguments is None and not self.arguments.ignore_compile_commands: 60 | return True 61 | self.compile_arguments = [ca for ca in self.compile_arguments if not ca.startswith('-O')] + ['-O0'] 62 | if "-Wexpansion-to-defined" in self.compile_arguments: 63 | self.compile_arguments.remove("-Wexpansion-to-defined") 64 | # self.compile_arguments = [ca for ca in self.compile_arguments if ca.startswith('-I')] + ['-O0'] 65 | args += self.compile_arguments 66 | args += ['-I/usr/lib/clang/3.8.0/include/'] 67 | args += self.arguments.includes 68 | # print 'args', args 69 | 70 | cwd = get_current_directory() 71 | change_directory(self.project_dir) 72 | # print [self.arguments.clang, '-emit-llvm', '-c', '-g', self.source_file, '-o', bc_file] + args 73 | 74 | LOG = open(llvm_log_file, 'w') 75 | parse_start_time = default_timer() 76 | try: 77 | call([self.arguments.clang, '-emit-llvm', '-c', '-g', self.source_file, '-o', bc_file] + args, 78 | stdout=LOG, stderr=subprocess.STDOUT, close_fds=True) 79 | except: 80 | print 'crash in clang', self.source_file 81 | return True 82 | parse_elapsed = default_timer() - parse_start_time 83 | write_file(time_file, '{}'.format(parse_elapsed)) 84 | change_directory(cwd) 85 | # llvm_dis = join_path(self.arguments.llvm_config, 'llvm-dis') 86 | llvm_dis = 'llvm-dis-3.8' 87 | LOG = open(llvm_dis_log_file, 'w') 88 | try: 89 | if is_file(bc_file): 90 | call([llvm_dis, bc_file, '-o', ll_file], 91 | stdout=LOG, stderr=subprocess.STDOUT, close_fds=True) 92 | else: 93 | print 'No bc file:', bc_file 94 | print [arg for arg in args if arg.startswith('-I')] 95 | except: 96 | print llvm_dis, ' cannot be found' 97 | return True 98 | functions = open(function_names_file, 'w') 99 | lines = read_lines(ll_file) 100 | functions_name = [] 101 | prev_line = '' 102 | for line in lines: 103 | if line[0:6] == 'define': 104 | functions_name.append(line + prev_line + '\n') 105 | prev_line = line 106 | functions.writelines(functions_name) 107 | return True 108 | 109 | def emit_llvm_ll_and_functions(self, bitcode_file): 110 | bitcode_filename = get_basename(bitcode_file) 111 | sourcecode_filename = bitcode_filename[:-2] + 'c' 112 | ll_filename = bitcode_filename[:-2] + 'll' 113 | c_dir = join_path(get_parent_dir(bitcode_file), sourcecode_filename) 114 | make_dir_if_not_exist(c_dir) 115 | new_bitcode_file = join_path(c_dir, bitcode_filename) 116 | move_file(bitcode_file, new_bitcode_file) 117 | ll_file = 
join_path(c_dir, ll_filename) 118 | llvm_dis_log_file = join_path(c_dir, 'llvm-dis.log.txt') 119 | function_names_file = join_path(c_dir, 'functions.txt') 120 | 121 | llvm_dis = join_path(self.arguments.llvm_config, 'llvm-dis-3.8') 122 | LOG = open(llvm_dis_log_file, 'w') 123 | try: 124 | if is_file(new_bitcode_file): 125 | call([llvm_dis, new_bitcode_file, '-o', ll_file], 126 | stdout=LOG, stderr=subprocess.STDOUT, close_fds=True) 127 | else: 128 | print 'No bc file:', new_bitcode_file 129 | except: 130 | print llvm_dis, ' cannot be found' 131 | return True 132 | functions = open(function_names_file, 'w') 133 | lines = read_lines(ll_file) 134 | functions_name = [] 135 | prev_line = '' 136 | for line in lines: 137 | if line[0:6] == 'define': 138 | functions_name.append(line + prev_line + '\n') 139 | prev_line = line 140 | functions.writelines(functions_name) 141 | 142 | def emit_llvm_ast(self): 143 | # I commented the following lines to be able to run on conda 144 | # self.ast = ig.Graph(directed=True) 145 | # if not cl.Config.loaded: 146 | # cl.Config.set_library_path(self.arguments.clang_lib_dir) 147 | # self.index = cl.Index.create() 148 | self.ast = nx.DiGraph() 149 | include_args = [] 150 | parent_dir = get_parent_dir(self.source_file).replace(self.arguments.projects_dir, self.arguments.asts_dir) 151 | make_dir_if_not_exist(parent_dir) 152 | self.source_file_ast_dir = join_path(parent_dir, get_basename(self.source_file)) 153 | make_dir_if_not_exist(self.source_file_ast_dir) 154 | # for include in self.project_includes: 155 | # include_args.append('-I{}'.format(include)) 156 | 157 | args = [] 158 | # args += ['--no-standard-includes', '-nostdinc++', '-nobuiltininc'] 159 | # args += ['-nostdinc','-nostdinc++'] 160 | # args += include_args 161 | if len(self.compile_arguments) == 0 and not self.arguments.ignore_compile_commands: 162 | return True 163 | args += self.compile_arguments 164 | args += ['-I/usr/lib/clang/3.8.0/include/'] 165 | # args += ['-S', '-emit-llvm', '-c', '-o', 'xx.bc'] 166 | # args += ['-I/home/mansour/nfs/vulfinder/tools/clang+llvm-5.0.1-x86_64-linux-gnu-ubuntu-16.04/include/clang'] 167 | 168 | parse_start_time = default_timer() 169 | cwd = get_current_directory() 170 | change_directory(self.project_dir) 171 | translation_unit = self.index.parse(self.source_file, args=args) 172 | change_directory(cwd) 173 | parse_elapsed = default_timer() - parse_start_time 174 | diagnostics = map(get_diagnostics_info, translation_unit.diagnostics) 175 | diag_file = join_path(self.source_file_ast_dir, '{}.diag.txt'.format(get_basename(self.source_file))) 176 | pandas.DataFrame(diagnostics).to_csv(diag_file, index=False) 177 | iteration_start_time = default_timer() 178 | # self.add_node(translation_unit.cursor) 179 | if self.arguments.save_format == 'graph': 180 | self.get_info_graph(translation_unit.cursor) 181 | self.save_ast(self.function_name) 182 | elif self.arguments.save_format == 'json': 183 | ast_json = self.get_info_json(translation_unit.cursor) 184 | self.save_ast_json(ast_json) 185 | pprint(ast_json) 186 | elif self.arguments.save_format == 'ast': 187 | self.save_tu(translation_unit) 188 | iteration_elapsed = default_timer() - iteration_start_time 189 | time_file = join_path(self.source_file_ast_dir, '{}.time.txt'.format(get_basename(self.source_file))) 190 | write_file(time_file, 'Parsed Time: {} , Iteration Time: {}'.format(parse_elapsed, iteration_elapsed)) 191 | return True 192 | 193 | def analyze(self): 194 | if self.analysis_type == 'ast': 195 | return 
self.emit_llvm_ast() 196 | elif self.analysis_type == 'bc': 197 | return self.emit_llvm_bc() 198 | else: 199 | return False 200 | 201 | def get_info_graph(self, node, depth=0): 202 | 203 | flag = True if self.source_file in str(node.location) else False 204 | 205 | if flag: 206 | parent_vertex_id = self.add_node(node) 207 | 208 | # children_info = [] 209 | for c in node.get_children(): 210 | self.get_info_graph(c, depth + 1) 211 | if flag: 212 | child_vertex_id = self.add_node(c) 213 | if parent_vertex_id != child_vertex_id: 214 | self.ast.add_edge(parent_vertex_id, child_vertex_id) 215 | 216 | # Should be tested 217 | def get_info_json(self, node, depth=0): 218 | node_kind = str(node.kind).split('.')[1] 219 | flag = True if self.source_file in str(node.location) or node_kind == 'TRANSLATION_UNIT' else False 220 | 221 | if not flag: 222 | return None 223 | 224 | children = [self.get_info_json(c, depth + 1) for c in node.get_children()] 225 | children = [c for c in children if c is not None] 226 | 227 | return {'id': self.get_cursor_id(node), 228 | 'kind': node_kind, 229 | #'usr': node.get_usr(), 230 | 'spelling': node.spelling, 231 | 'location': str(node.location).split(',')[1:], 232 | #'extent.start': str(node.extent.start), 233 | #'extent.end': str(node.extent.end), 234 | 'is_definition': node.is_definition(), 235 | # 'definition id': get_cursor_id(node.get_definition()), 236 | 'children': children} 237 | 238 | def add_node(self, node): 239 | node_kind = str(node.kind).split('.')[1] 240 | if node_kind == 'FUNCTION_DECL': 241 | if self.function_count >= 1: 242 | self.save_ast(self.function_name) 243 | # self.ast = None 244 | self.ast = nx.DiGraph() 245 | self.cursor_list = {} 246 | # self.cursor_list = [] 247 | self.function_count += 1 248 | self.function_name = node.spelling 249 | 250 | # node_id = self.get_cursor_id(node) 251 | node_id = self.get_cursor_id(self.get_unique_hash(node)) 252 | self.ast.add_node(node_id, 253 | type='"{}"'.format(node_kind), 254 | usr='"{}"'.format(node.get_usr()), 255 | spelling=u'"{}"'.format(str(node.spelling).replace('"', '')), 256 | location='"{}"'.format(node.location), 257 | extent_start='"{}"'.format(node.extent.start), 258 | extent_end='"{}"'.format(node.extent.end), 259 | is_definition=node.is_definition() 260 | # definition_id = self.get_cursor_id(node.get_definition()) 261 | ) 262 | 263 | return node_id 264 | 265 | def get_cursor_id_bk(self, cursor_hash): 266 | 267 | if cursor_hash is None: 268 | return None 269 | 270 | self.cursor_list.append(cursor_hash) 271 | index = self.cursor_list.index(cursor_hash) 272 | return index - 1 273 | 274 | def get_cursor_id(self, cursor_hash): 275 | 276 | if cursor_hash is None: 277 | return None 278 | 279 | for key, value in self.cursor_list.iteritems(): 280 | if cursor_hash == value: 281 | return key 282 | len_cursor_list = len(self.cursor_list) 283 | self.cursor_list[len_cursor_list] = cursor_hash 284 | return len_cursor_list 285 | 286 | def get_unique_hash(self, cursor): 287 | return hash(('"{}"'.format(cursor.kind), 288 | '"{}"'.format(cursor.get_usr()), 289 | u'"{}"'.format(str(cursor.spelling).replace('"', '')), 290 | '"{}"'.format(cursor.location), 291 | '"{}"'.format(cursor.extent.start), 292 | '"{}"'.format(cursor.extent.end), 293 | cursor.is_definition() 294 | )) 295 | 296 | def save_ast(self, function_name): 297 | existing_files = get_files_in_dir(self.source_file_ast_dir, ext='{}.graphml'.format(function_name)) 298 | if len(existing_files) == 0: 299 | ast_file = join_path(self.source_file_ast_dir, 
'{}.graphml'.format(function_name)) 300 | else: 301 | ast_file = join_path(self.source_file_ast_dir, '+{}'.format(get_basename(existing_files[0]))) 302 | 303 | nx.write_graphml(self.ast, ast_file) 304 | 305 | def save_ast_json(self, ast_json): 306 | # pprint(ast_json) 307 | try: 308 | write_file_json(join_path(self.source_file_ast_dir, 'ast.json'), ast_json) 309 | except Exception as e: 310 | print e 311 | 312 | def save_tu(self, translation_unit): 313 | translation_unit.save(join_path(self.source_file_ast_dir, '{}.ast'.format(get_basename(self.source_file_ast_dir)))) 314 | 315 | -------------------------------------------------------------------------------- /scripts/get_inconsistencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | benchmark=$1 4 | threshold=$2 5 | granularity=$3 6 | preprocess=$4 7 | 8 | if [ -z "${benchmark}" ]; then 9 | echo "benchmark is unset or set to the empty string" 10 | exit 1; 11 | fi 12 | 13 | if [ -z "${preprocess}" ]; then 14 | echo "No preprocessing" 15 | preprocess="np" 16 | fi 17 | 18 | rm output 19 | 20 | if [ "${preprocess}" = "p" ]; then 21 | datasets=$(cat ./settings.py | grep "DATASETS_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 22 | bcs=$(cat ./settings.py | grep "BCS_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 23 | data=$(cat ./settings.py | grep "DATA_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 24 | 25 | rm -rf "$data/$datasets/$benchmark" 26 | rm -rf "$data/$bcs/$benchmark" 27 | python __init__.py -p=$benchmark -a=BC 28 | python __init__.py -p=$benchmark -a=PDG 29 | python __init__.py -p=$benchmark -a=AS 30 | python __init__.py -p=$benchmark -a=AS -cws=2 31 | python __init__.py -p=$benchmark -a=AS -hcf 32 | python __init__.py -p=$benchmark -a=AS -hcf -cws=2 33 | python __init__.py -p=$benchmark -a=FE -ft=afs_NN 34 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb1_NN 35 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb2_NN 36 | python __init__.py -p=$benchmark -a=FE -hcf -ft=afs_NN 37 | python __init__.py -p=$benchmark -a=FE -hcf -ft=afs.bb1_NN 38 | python __init__.py -p=$benchmark -a=FE -hcf -ft=afs.bb2_NN 39 | fi 40 | 41 | if [ "$threshold" = "all" ]; then 42 | if [ "$granularity" = "all" ]; then 43 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.70,cc_0.99 44 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.75,cc_0.99 45 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.80,cc_0.99 46 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.85,cc_0.99 47 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.90,cc_0.99 48 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.95,cc_0.99 49 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.70,cc_0.99 50 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.75,cc_0.99 51 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.80,cc_0.99 52 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.85,cc_0.99 53 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.90,cc_0.99 54 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.95,cc_0.99 55 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.70,cc_0.99 56 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v 
-ca=cc_0.75,cc_0.99 57 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.80,cc_0.99 58 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.85,cc_0.99 59 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.90,cc_0.99 60 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.95,cc_0.99 61 | 62 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.70,cc_0.99 63 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.75,cc_0.99 64 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.80,cc_0.99 65 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.85,cc_0.99 66 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.90,cc_0.99 67 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.95,cc_0.99 68 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.70,cc_0.99 69 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.75,cc_0.99 70 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.80,cc_0.99 71 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.85,cc_0.99 72 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.90,cc_0.99 73 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.95,cc_0.99 74 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.70,cc_0.99 75 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.75,cc_0.99 76 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.80,cc_0.99 77 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.85,cc_0.99 78 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.90,cc_0.99 79 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.95,cc_0.99 80 | elif [ "$granularity" = "afs" ]; then 81 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.70,cc_0.99 82 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.75,cc_0.99 83 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.80,cc_0.99 84 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.85,cc_0.99 85 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.90,cc_0.99 86 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.95,cc_0.99 87 | 88 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.70,cc_0.99 89 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.75,cc_0.99 90 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.80,cc_0.99 91 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.85,cc_0.99 92 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.90,cc_0.99 93 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.95,cc_0.99 94 | elif [ "$granularity" = "afs.bb1" ]; then 95 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.70,cc_0.99 96 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.75,cc_0.99 97 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.80,cc_0.99 98 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.85,cc_0.99 
99 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.90,cc_0.99 100 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.95,cc_0.99 101 | 102 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.70,cc_0.99 103 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.75,cc_0.99 104 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.80,cc_0.99 105 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.85,cc_0.99 106 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.90,cc_0.99 107 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.95,cc_0.99 108 | elif [ "$granularity" = "afs.bb2" ]; then 109 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.70,cc_0.99 110 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.75,cc_0.99 111 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.80,cc_0.99 112 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.85,cc_0.99 113 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.90,cc_0.99 114 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.95,cc_0.99 115 | 116 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.70,cc_0.99 117 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.75,cc_0.99 118 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.80,cc_0.99 119 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.85,cc_0.99 120 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.90,cc_0.99 121 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.95,cc_0.99 122 | fi 123 | 124 | elif [ "$threshold" = "most" ]; then 125 | if [ "$granularity" = "all" ]; then 126 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.95,cc_0.99 127 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.95,cc_0.99 128 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.95,cc_0.99 129 | 130 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.95,cc_0.99 131 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.95,cc_0.99 132 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.95,cc_0.99 133 | elif [ "$granularity" = "afs" ]; then 134 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.95,cc_0.99 135 | 136 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -hcf -ca=cc_0.95,cc_0.99 137 | elif [ "$granularity" = "afs.bb1" ]; then 138 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.95,cc_0.99 139 | 140 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -hcf -ca=cc_0.95,cc_0.99 141 | elif [ "$granularity" = "afs.bb2" ]; then 142 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.95,cc_0.99 143 | 144 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -hcf -ca=cc_0.95,cc_0.99 145 | fi 146 | fi 147 | 148 | -------------------------------------------------------------------------------- /scripts/get_inconsistencies_NN_G2v.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | benchmark=$1 4 | threshold=$2 5 | granularity=$3 6 | preprocess=$4 7 | 8 | if [ -z "${benchmark}" ]; then 9 | echo "benchmark is unset or set to the empty string" 10 | exit 1; 11 | fi 12 | 13 | if [ -z "${preprocess}" ]; then 14 | echo "No preprocessing" 15 | preprocess="np" 16 | fi 17 | 18 | rm output 19 | 20 | if [ "${preprocess}" = "p" ]; then 21 | datasets=$(cat ./settings.py | grep "DATASETS_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 22 | bcs=$(cat ./settings.py | grep "BCS_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 23 | data=$(cat ./settings.py | grep "DATA_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 24 | 25 | rm -rf "$data/$datasets/$benchmark" 26 | rm -rf "$data/$bcs/$benchmark" 27 | python __init__.py -p=$benchmark -a=BC 28 | python __init__.py -p=$benchmark -a=PDG 29 | python __init__.py -p=$benchmark -a=AS 30 | python __init__.py -p=$benchmark -a=AS -cws=2 31 | 32 | python __init__.py -p=$benchmark -a=FE -ft=afs_NN 33 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb1_NN 34 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb2_NN 35 | 36 | python __init__.py -p=$benchmark -a=FE -ft=afs_G2v 37 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb1_G2v 38 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb2_G2v 39 | fi 40 | 41 | if [ "$threshold" = "all" ]; then 42 | if [ "$granularity" = "all" ]; then 43 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.70,cc_0.98 44 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.75,cc_0.98 45 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.80,cc_0.98 46 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.85,cc_0.98 47 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.90,cc_0.98 48 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.95,cc_0.98 49 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.70,cc_0.98 50 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.75,cc_0.98 51 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.80,cc_0.98 52 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.85,cc_0.98 53 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.90,cc_0.98 54 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.95,cc_0.98 55 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.70,cc_0.98 56 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.75,cc_0.98 57 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.80,cc_0.98 58 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.85,cc_0.98 59 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.90,cc_0.98 60 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.95,cc_0.98 61 | 62 | elif [ "$granularity" = "afs" ]; then 63 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.70,cc_0.98 64 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.75,cc_0.98 65 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.80,cc_0.98 66 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.85,cc_0.98 67 | python __init__.py -p=$benchmark -a=MC 
-cf=afs_NN,afs_G2v -ca=cc_0.90,cc_0.98 68 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.95,cc_0.98 69 | 70 | elif [ "$granularity" = "afs.bb1" ]; then 71 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.70,cc_0.98 72 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.75,cc_0.98 73 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.80,cc_0.98 74 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.85,cc_0.98 75 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.90,cc_0.98 76 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.95,cc_0.98 77 | 78 | elif [ "$granularity" = "afs.bb2" ]; then 79 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.70,cc_0.98 80 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.75,cc_0.98 81 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.80,cc_0.98 82 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.85,cc_0.98 83 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.90,cc_0.98 84 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.95,cc_0.98 85 | 86 | fi 87 | 88 | elif [ "$threshold" = "most" ]; then 89 | if [ "$granularity" = "all" ]; then 90 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.95,cc_0.98 91 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.95,cc_0.98 92 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.95,cc_0.98 93 | 94 | elif [ "$granularity" = "afs" ]; then 95 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.95,cc_0.98 96 | 97 | elif [ "$granularity" = "afs.bb1" ]; then 98 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.95,cc_0.98 99 | 100 | elif [ "$granularity" = "afs.bb2" ]; then 101 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_NN,afs.bb2_G2v -ca=cc_0.95,cc_0.98 102 | 103 | fi 104 | fi 105 | 106 | -------------------------------------------------------------------------------- /scripts/get_inconsistencies_g2v.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | benchmark=$1 4 | threshold=$2 5 | granularity=$3 6 | preprocess=$4 7 | 8 | if [ -z "${benchmark}" ]; then 9 | echo "benchmark is unset or set to the empty string" 10 | exit 1; 11 | fi 12 | 13 | if [ -z "${preprocess}" ]; then 14 | echo "No preprocessing" 15 | preprocess="np" 16 | fi 17 | 18 | rm output 19 | 20 | if [ "${preprocess}" = "p" ]; then 21 | datasets=$(cat ./settings.py | grep "DATASETS_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 22 | bcs=$(cat ./settings.py | grep "BCS_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 23 | data=$(cat ./settings.py | grep "DATA_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 24 | 25 | rm -rf "$data/$datasets/$benchmark" 26 | rm -rf "$data/$bcs/$benchmark" 27 | python __init__.py -p=$benchmark -a=BC 28 | python __init__.py -p=$benchmark -a=PDG 29 | python __init__.py -p=$benchmark -a=AS 30 | python __init__.py -p=$benchmark -a=AS -cws=2 31 | 32 | python __init__.py -p=$benchmark -a=FE -ft=afs_NN 33 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb1_NN 34 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb2_NN 35 | python __init__.py -p=$benchmark 
-a=FE -hcf -ft=afs_NN 36 | python __init__.py -p=$benchmark -a=FE -hcf -ft=afs.bb1_NN 37 | python __init__.py -p=$benchmark -a=FE -hcf -ft=afs.bb2_NN 38 | 39 | python __init__.py -p=$benchmark -a=FE -ft=afs_G2v 40 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb1_G2v 41 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb2_G2v 42 | python __init__.py -p=$benchmark -a=FE -hcf -ft=afs_G2v 43 | python __init__.py -p=$benchmark -a=FE -hcf -ft=afs.bb1_G2v 44 | python __init__.py -p=$benchmark -a=FE -hcf -ft=afs.bb2_G2v 45 | fi 46 | 47 | if [ "$threshold" = "all" ]; then 48 | if [ "$granularity" = "all" ]; then 49 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.70,cc_0.99 50 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.75,cc_0.99 51 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.80,cc_0.99 52 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.85,cc_0.99 53 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.90,cc_0.99 54 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.95,cc_0.99 55 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.70,cc_0.99 56 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.75,cc_0.99 57 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.80,cc_0.99 58 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.85,cc_0.99 59 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.90,cc_0.99 60 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.95,cc_0.99 61 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.70,cc_0.99 62 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.75,cc_0.99 63 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.80,cc_0.99 64 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.85,cc_0.99 65 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.90,cc_0.99 66 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.95,cc_0.99 67 | 68 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.70,cc_0.99 69 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.75,cc_0.99 70 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.80,cc_0.99 71 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.85,cc_0.99 72 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.90,cc_0.99 73 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.95,cc_0.99 74 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.70,cc_0.99 75 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.75,cc_0.99 76 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.80,cc_0.99 77 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.85,cc_0.99 78 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.90,cc_0.99 79 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.95,cc_0.99 80 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.70,cc_0.99 81 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.75,cc_0.99 82 | python 
__init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.80,cc_0.99 83 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.85,cc_0.99 84 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.90,cc_0.99 85 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.95,cc_0.99 86 | elif [ "$granularity" = "afs" ]; then 87 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.70,cc_0.99 88 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.75,cc_0.99 89 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.80,cc_0.99 90 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.85,cc_0.99 91 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.90,cc_0.99 92 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.95,cc_0.99 93 | 94 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.70,cc_0.99 95 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.75,cc_0.99 96 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.80,cc_0.99 97 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.85,cc_0.99 98 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.90,cc_0.99 99 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.95,cc_0.99 100 | elif [ "$granularity" = "afs.bb1" ]; then 101 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.70,cc_0.99 102 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.75,cc_0.99 103 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.80,cc_0.99 104 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.85,cc_0.99 105 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.90,cc_0.99 106 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.95,cc_0.99 107 | 108 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.70,cc_0.99 109 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.75,cc_0.99 110 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.80,cc_0.99 111 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.85,cc_0.99 112 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.90,cc_0.99 113 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.95,cc_0.99 114 | elif [ "$granularity" = "afs.bb2" ]; then 115 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.70,cc_0.99 116 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.75,cc_0.99 117 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.80,cc_0.99 118 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.85,cc_0.99 119 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.90,cc_0.99 120 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.95,cc_0.99 121 | 122 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.70,cc_0.99 123 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.75,cc_0.99 124 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf 
-ca=cc_0.80,cc_0.99 125 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.85,cc_0.99 126 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.90,cc_0.99 127 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.95,cc_0.99 128 | fi 129 | 130 | elif [ "$threshold" = "most" ]; then 131 | if [ "$granularity" = "all" ]; then 132 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.95,cc_0.99 133 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.95,cc_0.99 134 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.95,cc_0.99 135 | 136 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.95,cc_0.99 137 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.95,cc_0.99 138 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.95,cc_0.99 139 | elif [ "$granularity" = "afs" ]; then 140 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -ca=cc_0.95,cc_0.99 141 | 142 | python __init__.py -p=$benchmark -a=MC -cf=afs_G2v,afs_G2v -hcf -ca=cc_0.95,cc_0.99 143 | elif [ "$granularity" = "afs.bb1" ]; then 144 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -ca=cc_0.95,cc_0.99 145 | 146 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_G2v,afs.bb1_G2v -hcf -ca=cc_0.95,cc_0.99 147 | elif [ "$granularity" = "afs.bb2" ]; then 148 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -ca=cc_0.95,cc_0.99 149 | 150 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb2_G2v,afs.bb2_G2v -hcf -ca=cc_0.95,cc_0.99 151 | fi 152 | fi 153 | 154 | -------------------------------------------------------------------------------- /scripts/get_inconsistencies_real_programs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | benchmark=$1 4 | preprocess=$2 5 | 6 | if [ -z "${benchmark}" ]; then 7 | echo "benchmark is unset or set to the empty string" 8 | exit 1; 9 | fi 10 | 11 | if [ -z "${preprocess}" ]; then 12 | echo "No preprocessing" 13 | preprocess="np" 14 | fi 15 | 16 | rm output 17 | 18 | if [ "${preprocess}" = "p" ]; then 19 | datasets=$(cat ./settings.py | grep "DATASETS_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 20 | bcs=$(cat ./settings.py | grep "BCS_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 21 | data=$(cat ./settings.py | grep "DATA_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 22 | 23 | rm -rf "$data/$datasets/$benchmark" 24 | rm -rf "$data/$bcs/$benchmark" 25 | python __init__.py -p=$benchmark -a=BC 26 | python __init__.py -p=$benchmark -a=PDG 27 | python __init__.py -p=$benchmark -a=AS 28 | python __init__.py -p=$benchmark -a=FE -ft=afs_NN 29 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb1_NN 30 | fi 31 | 32 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.95,cc_0.99 33 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.95,cc_0.99 34 | 35 | -------------------------------------------------------------------------------- /scripts/get_inconsistencies_real_programs_NN_G2v.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | benchmark=$1 4 | preprocess=$2 5 | split=$3 6 | 7 | if [ -z "${benchmark}" ]; then 8 | echo "benchmark is unset or set to the empty string" 9 | exit 1; 10 | fi 11 
| 12 | if [ -z "${preprocess}" ]; then 13 | echo "No preprocessing" 14 | preprocess="np" 15 | fi 16 | 17 | if [ "${preprocess}" = "p" ]; then 18 | datasets=$(cat ./settings.py | grep "DATASETS_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 19 | bcs=$(cat ./settings.py | grep "BCS_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 20 | data=$(cat ./settings.py | grep "DATA_DIR" | cut -d '=' -f 2 | cut -d '#' -f 1 | tr -d \'\" | tr -d '[:space:]') 21 | echo "Removing dataset folder of $benchmark" 22 | rm -rf "$data/$datasets/$benchmark" 23 | echo "Removing IR folder of $benchmark" 24 | rm -rf "$data/$bcs/$benchmark" 25 | python __init__.py -p=$benchmark -a=BC 26 | python __init__.py -p=$benchmark -a=PDG 27 | python __init__.py -p=$benchmark -a=AS 28 | if [ "${split}" = "ns" ]; then 29 | python __init__.py -p=$benchmark -a=FE -ft=afs_NN 30 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb1_NN 31 | else 32 | python __init__.py -p=$benchmark -a=FE -ft=afs_NN -s=True 33 | python __init__.py -p=$benchmark -a=FE -ft=afs.bb1_NN -s=True 34 | fi 35 | fi 36 | 37 | python __init__.py -p=$benchmark -a=MC -cf=afs_NN,afs_G2v -ca=cc_0.95,cc_0.98 -sc=online 38 | python __init__.py -p=$benchmark -a=MC -cf=afs.bb1_NN,afs.bb1_G2v -ca=cc_0.95,cc_0.98 -sc=online 39 | -------------------------------------------------------------------------------- /settings-bak.py: -------------------------------------------------------------------------------- 1 | ACTIONS = 'AS' 2 | LANGUAGES = 'C' 3 | DATA_DIR = '/Users/mansourahmadi/Bank/Work/NEU/vulfinder/data/' # Full path must be provided 4 | PROJECTS_DIR = 'projects' 5 | ASTS_DIR = 'asts' 6 | SAVE_FORMAT = 'graph' # json , graph , ast 7 | BCS_DIR = 'bcs' 8 | DATASETS_DIR = 'datasets' 9 | PLOTS_DIR = 'plots' 10 | CLUSTERING_ALGS = 'dbscancos_0.3,dbscancos_0.01' # aff_cos dbscan_cos means 11 | COSE_SIMILARITY_CHUNK_SIZE = 25000 # If there is 200G Ram 12 | CLUSTERING_FEAT = 'afs_NN,afs_G2v' 13 | SECOND_CLUSTERING = 'offline' # offline, online 14 | BIG_CLUSTERS_IGNORE = 200 # Size of big clusters that should be ignored from the first step clustering 15 | # as they might not contain useful data 16 | CHUNK_WINDOW_SIZE = 2 17 | SPLIT = True 18 | SEARCH_SPACES = [] # Empty means everywhere in the projects 19 | PROJECTS = 'juliet-test-suite' # 'openssl-41bff72' # 'ffmpeg-b2f0f37' # Empty means all projects 20 | IGNORE_COMPILE_COMMANDS = True 21 | # UNIFIED_PROJECTS = [['test']] 22 | FEATURE_TYPES = 'afs_NN' # afs_G2v, afs.bb2_NN 23 | # CLANG_LIB_DIR = '../../tools/clang+llvm-5.0.1-x86_64-linux-gnu-ubuntu-16.04/lib' 24 | CLANG_LIB_DIR = './../tools/clang+llvm-5.0.1/lib' 25 | LLVM_CONFIG = '/usr/local/Cellar/llvm/6.0.0/bin/' 26 | INCLUDES = '' 27 | CLANG = 'clang' 28 | PDG_DUMPER = '' 29 | STAT_TYPE = 'ST' # SS, SI, ST 30 | STAT_SIM_TYPES = 'NN,G2v' 31 | INCONSISTENCY_TYPE = 'check' # check, call, type, order 32 | SIMILARITY_THRESHOLD = 0.7 33 | GRANULARITY = 'afs,afs.bb1,afs.bb2' 34 | DEPENDENCY = '' # all , odd , cdd 35 | CALL_INCONSISTENCY = 'free,close,memset,clear,zero,remove,unlock,end,clean,cleanse,assert' 36 | TYPE_INCONSISTENCY = 'sext,trunc' # 'fptrunc,sext,zext,call zeroext,call signext,sitofp,uitofp,bitcast' 37 | STORE_INCONSISTENCY = 'null' 38 | INCONSISTENCY_QUERY_OPTIONS = 'top_10' 39 | COUNT_CPU = 10 40 | BENCHMARK_GROUNDTRUTH_PATH = '../iBench/' -------------------------------------------------------------------------------- /settings.py: 
-------------------------------------------------------------------------------- 1 | ACTIONS = '' 2 | LANGUAGES = 'C' 3 | DATA_DIR = '/home/mansour/bank/FICS/data/' # Full path must be provided 4 | PROJECTS_DIR = 'projects' 5 | ASTS_DIR = 'asts' 6 | SAVE_FORMAT = 'graph' # json , graph , ast 7 | BCS_DIR = 'bcs' 8 | DATASETS_DIR = 'datasets' 9 | PLOTS_DIR = 'plots' 10 | CLUSTERING_ALGS = 'dbscancos_0.3,dbscancos_0.02' 11 | CLUSTERING_FEAT = 'afs_NN,afs_G2v' 12 | SECOND_CLUSTERING = 'offline' 13 | COSE_SIMILARITY_CHUNK_SIZE = 200000 # BB 150000 # If there is 200G Ram 14 | BIG_CLUSTERS_IGNORE = 50 15 | CHUNK_WINDOW_SIZE = 1 16 | SPLIT = False 17 | SEARCH_SPACES = [] # Empty means everywhere in the projects 18 | PROJECTS = '' # 'ffmpeg-b2f0f37' # Empty means all projects 19 | IGNORE_COMPILE_COMMANDS = False 20 | FEATURE_TYPES = 'afs_NN' # afs_G2v, afs.bb2_NN 21 | # CLANG_LIB_DIR = '../../tools/clang+llvm-5.0.1-x86_64-linux-gnu-ubuntu-16.04/lib' 22 | CLANG_LIB_DIR = '/usr/lib/x86_64-linux-gnu' 23 | LLVM_CONFIG = '' 24 | INCLUDES = '' 25 | CLANG = 'clang-3.8' # 'clang-6.0' #'clang-3.8' 26 | PDG_DUMPER = './dg/tools/llvm-dg-dump' 27 | STAT_TYPE = 'ST' # SI, SS , ST 28 | STAT_SIM_TYPES = 'NN,G2v' 29 | INCONSISTENCY_TYPE = 'check' # check, call, type, order 30 | SIMILARITY_THRESHOLD = 0.7 31 | GRANULARITY = 'afs,afs.bb1,afs.bb2' 32 | DEPENDENCY = '' # all , odd , cdd 33 | CALL_INCONSISTENCY = 'free,close,memset,clear,bzero,remove,unlock,end,clean,cleanse,assert' 34 | TYPE_INCONSISTENCY = 'trunc' # 'fptrunc,sext,zext,call zeroext,call signext,sitofp,uitofp,bitcast' 35 | STORE_INCONSISTENCY = 'null' 36 | INCONSISTENCY_QUERY_OPTIONS = 'top_10' 37 | COUNT_CPU = 8 38 | BENCHMARK_GROUNDTRUTH_PATH = 'iBench/' 39 | -------------------------------------------------------------------------------- /ssh_private_key_password.py: -------------------------------------------------------------------------------- 1 | ip='' 2 | username='' 3 | password='' 4 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiS3-Lab/FICS/82c8abef52ca943946b7e82a16998cf67f1d2049/utils/__init__.py -------------------------------------------------------------------------------- /utils/computation.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | 3 | 4 | def is_number(num): 5 | try: 6 | if isinstance(float(num), numbers.Number): 7 | return True 8 | else: 9 | return False 10 | except: 11 | return False 12 | 13 | -------------------------------------------------------------------------------- /utils/inout.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | import sys 5 | from cgitb import grey 6 | from os.path import join, normpath 7 | from shutil import move, copy 8 | 9 | import pandas as pd 10 | 11 | 12 | def exist_file(file_path): 13 | return os.path.isfile(file_path) 14 | 15 | 16 | def exist_dir(path): 17 | return os.path.isdir(path) 18 | 19 | 20 | def make_dir_if_not_exist(path): 21 | if not exist_dir(path): 22 | os.makedirs(path) 23 | 24 | 25 | def get_basename(file_path): 26 | return os.path.basename(file_path) 27 | 28 | 29 | def join_path(*args): 30 | return normpath(join(*args)) 31 | 32 | 33 | def get_dataframe(file_path, columns=''): 34 | try: 35 | dataframe = pd.read_csv(file_path) 36 | except: 37 | with open(file_path, 'w') 
as f: 38 | f.write(columns) 39 | f.close() 40 | dataframe = pd.read_csv(file_path) 41 | return dataframe 42 | 43 | 44 | def show_error(message): 45 | print 46 | print '*' * 50 47 | print 'Error: {}'.format(message) 48 | sys.exit(2) 49 | 50 | 51 | def get_current_directory(): 52 | current_path = os.getcwd() 53 | return current_path 54 | 55 | 56 | def change_directory(directory): 57 | os.chdir(directory) 58 | 59 | 60 | def load_from_csv(modes_files_path, file_path, separated=False): 61 | mode_file_path = modes_files_path + '/' + file_path 62 | mode_file = pd.read_csv(mode_file_path, delimiter=',') 63 | if separated is True: 64 | mode_feature_vectors = mode_file.ix[:, :-1] 65 | mode_class_labels = mode_file.ix[:, -1] 66 | 67 | return mode_feature_vectors, mode_class_labels 68 | else: 69 | return mode_file 70 | 71 | 72 | def get_directories(directory_path): 73 | return [join_path(directory_path, i) for i in os.listdir(directory_path) if not i.startswith('.')] 74 | 75 | 76 | def remove_file(file_name): 77 | os.remove(file_name) 78 | 79 | 80 | def remove_directory(dir_name): 81 | if exist_dir(dir_name): 82 | shutil.rmtree(dir_name) 83 | 84 | 85 | def move_file(src_file, dst_file): 86 | move(src_file, dst_file) 87 | 88 | 89 | def copy_file(src_file, dst_file): 90 | copy(src_file, dst_file) 91 | 92 | 93 | def get_parent_dir(file_name): 94 | return os.path.dirname(file_name) 95 | 96 | 97 | def is_file(file_name): 98 | return os.path.isfile(file_name) 99 | 100 | 101 | def get_files_in_dir(dir, ext='', search_spaces=[], start=''): 102 | files = [] 103 | for path, sub_dirs, file_names in os.walk(dir): 104 | 105 | for file_name in file_names: 106 | if file_name.endswith(ext) and file_name.startswith(start): 107 | in_search_space = False 108 | if len(search_spaces) == 0: 109 | in_search_space = True 110 | for search_space in search_spaces: 111 | if search_space in path: 112 | in_search_space = True 113 | if not in_search_space: 114 | continue 115 | files.append(join_path(path, file_name)) 116 | 117 | return files 118 | 119 | 120 | def get_cfiles_compile_db(compile_db): 121 | cfiles = {} 122 | for json_values in compile_db: 123 | if 'file' in json_values: 124 | file_path = join_path(json_values['directory'], json_values['file']) 125 | # print file_path 126 | if file_path.endswith(".c"): 127 | final_args = [] 128 | if 'arguments' in json_values: 129 | args = list(json_values['arguments']) 130 | else: 131 | args = [item for item in json_values['command'].split() if not item.endswith('.o') and not item.endswith('.c')] 132 | remove_items = ('-c', 'cc', '-o', '-g') 133 | for remove_item in remove_items: 134 | if remove_item in args: 135 | args.remove(remove_item) 136 | 137 | for i in range(len(args)): 138 | arg = args[i] 139 | if arg.startswith('-I.') or arg.startswith('-I..'): 140 | arg = '-I' + join_path(json_values['directory'], arg[2:]) 141 | if arg == '.' 
or arg == '..': 142 | arg = join_path(json_values['directory'], arg) 143 | final_args.append(arg) 144 | 145 | cfiles[file_path] = final_args 146 | # print file_path, final_args 147 | else: 148 | print "Not a C file:", file_path 149 | 150 | return cfiles 151 | 152 | 153 | def read_file(file_name): 154 | with open(file_name, 'r') as f: 155 | return f.read() 156 | 157 | 158 | def write_file(file_name, content): 159 | with open(file_name, 'w') as f: 160 | f.write(content) 161 | f.close() 162 | 163 | 164 | def write_file_json(file_name, content): 165 | with open(file_name, 'w') as f: 166 | json.dump(content, f) 167 | f.close() 168 | 169 | 170 | def get_filename_without_ext(file_name): 171 | return os.path.splitext(file_name)[0] 172 | 173 | 174 | def load_json(file_path): 175 | return json.load(open(file_path)) 176 | 177 | 178 | def load_json_file(file_path): 179 | with open(file_path) as f: 180 | json_content = eval(f.read()) 181 | return json_content 182 | 183 | 184 | def get_arguments(file_path, json_data): 185 | for json_values in json_data: 186 | if 'file' in json_values: 187 | if join_path(json_values['directory'], json_values['file']) == file_path: 188 | args = list(json_values['arguments']) 189 | for i in range(len(args)): 190 | arg = args[i] 191 | if arg.startswith('-I./'): 192 | arg = '-I{}'.format(arg[4:]) 193 | if arg.startswith('-I..'): 194 | parent_include = json_values['directory'] 195 | include_path = arg[2:] 196 | for item in range(arg.count('..')): 197 | parent_include = get_parent_dir(parent_include) 198 | include_path = include_path[3:] 199 | parent_include = join_path(parent_include, include_path) 200 | args[i] = '-I{}'.format(parent_include) 201 | remove_items = ('-c', 'cc', '-o') 202 | for item in remove_items: 203 | if item in args: 204 | args.remove(item) 205 | return args 206 | 207 | # print file_path 208 | return None 209 | 210 | 211 | def check_missing_files(c_files, json_data): 212 | for json_values in json_data: 213 | flag = 0 214 | for c_file in c_files: 215 | if 'file' in json_values: 216 | if join_path(json_values['directory'], json_values['file']) == c_file: 217 | flag = 1 218 | if flag == 0: 219 | if 'file' in json_values: 220 | print join_path(json_values['directory'], json_values['file']) 221 | elif 'files' in json_values: 222 | print json_values['directory'], json_values['files'] 223 | 224 | 225 | def read_lines(file_path): 226 | with open(file_path) as f: 227 | content = f.readlines() 228 | return [x.strip() for x in content] 229 | 230 | 231 | def read_csv_header(file_path): 232 | first_row = pd.read_csv(file_path, index_col=0, nrows=1) 233 | return first_row.columns.values 234 | 235 | 236 | class bcolors: 237 | HEADER = '\033[95m' 238 | OKBLUE = '\033[94m' 239 | OKGREEN = '\033[92m' 240 | WARNING = '\033[93m' 241 | FAIL = '\033[91m' 242 | ENDC = '\033[0m' 243 | BOLD = '\033[1m' 244 | UNDERLINE = '\033[4m' 245 | GREY = '\033[90m' 246 | -------------------------------------------------------------------------------- /utils/progress.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | class Progress: 5 | def __init__(self, total, message='progress'): 6 | self.message = message 7 | self.total = total 8 | self.current = 0 9 | 10 | def next(self, step=1, extra_message=''): 11 | bar_length, status = 20, "" 12 | self.current += step 13 | progress = float(self.current) / float(self.total) 14 | if progress >= 1.: 15 | progress = 1 16 | block = int(round(bar_length * progress)) 17 | text = "\r{} [{}] {:.0f}% 
{} {}".format(self.message, 18 | "#" * block + "-" * (bar_length - block), round(progress * 100, 0), 19 | status, extra_message) 20 | sys.stdout.write(text) 21 | sys.stdout.flush() 22 | 23 | @staticmethod 24 | def finish(): 25 | sys.stdout.write("\n") 26 | sys.stdout.flush() 27 | 28 | @staticmethod 29 | def print_counter(counter, message): 30 | text = "\r{}: {}".format(message, counter) 31 | sys.stdout.write(text) 32 | sys.stdout.flush() 33 | --------------------------------------------------------------------------------