├── LICENSE ├── README.md ├── centrifuge ├── __pycache__ │ ├── binfile.cpython-36.pyc │ ├── binfile.cpython-37.pyc │ ├── dataID.cpython-37.pyc │ └── datablock.cpython-37.pyc ├── binfile.py ├── datablock.py ├── distributions │ ├── cpu_architectures │ │ ├── AMD64_reference │ │ ├── ARM64_reference │ │ ├── ARMEL_reference │ │ ├── MIPS64EL_reference │ │ ├── MIPSEL_reference │ │ ├── PPC64_reference │ │ ├── PowerPC_reference │ │ ├── SH4_reference │ │ └── i386_reference │ └── data_types │ │ ├── archive │ │ └── readme.txt │ │ ├── machine_code │ │ ├── max_entropy │ │ └── utf8_english └── utils │ └── plotutils.py ├── gallery ├── 1.png ├── 10.png ├── 11.png ├── 12.png ├── 13.png ├── 14.png ├── 15.png ├── 16.png ├── 17.png ├── 18.png ├── 19.png ├── 2.png ├── 3.png ├── 4.png ├── 5.png ├── 6.png ├── 7.png ├── 8.png └── 9.png ├── images ├── approach.png ├── approach_2.png └── approach_3.png ├── notebooks ├── Analyzing Firmware with Centrifuge Example 2.ipynb ├── Analyzing Firmware with Centrifuge.ipynb ├── Analyzing Machine Code Targeting an Usupported Architecture.ipynb ├── CPU Architecture Reference Distributions │ ├── Comparing CPU Architecture Reference Distributions.ipynb │ ├── Exploring Machine Code Byte Value Distributions.ipynb │ └── architectures │ │ ├── AMD64 reference distribution construction.ipynb │ │ ├── ARM64 reference distribution construction.ipynb │ │ ├── ARMEL reference distribution construction.ipynb │ │ ├── MIPS64EL reference distribution construction.ipynb │ │ ├── MIPSEL reference distribution construction.ipynb │ │ ├── PPC64 reference distribution construction.ipynb │ │ ├── PowerPC reference distribution construction.ipynb │ │ ├── SH4 reference distribution construction.ipynb │ │ └── i386 reference distribution construction.ipynb ├── Data Type Reference Distributions │ ├── The Machine Code Reference Distribution.ipynb │ ├── The Max Entropy Reference Distribution.ipynb │ └── The UTF-8 (English) Reference Distribution.ipynb ├── Introduction to Centrifuge.ipynb 
├── Using DBSCAN to Cluster File Data.ipynb └── archive │ ├── Analyzing Executable Binaries with DBSCAN.ipynb │ └── readme.txt └── scripts ├── basic_DBSCAN_clustering.py ├── entropy_plot.py ├── entropy_plot_text_section.py ├── identify_clusters.py ├── plot_all_variables.py ├── plot_cluster_cdfs.py ├── plot_two_variables.py ├── readme.txt └── small_elf.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Julian Daeumer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Centrifuge 2 | 3 | Centrifuge makes it easy to use visualization, statistics and machine learning to analyze information in binary files. 4 | 5 |
6 | 7 | This tool implements two new approaches to analysis of file data: 8 | 9 | 1. [DBSCAN](https://scikit-learn.org/stable/modules/clustering.html#dbscan), an unsupervised machine learning algorithm, is used to find clusters of byte sequences based on their statistical properties (features). Byte sequences that encode the same data type, e.g. machine code, typically have similar properties. As a result, clusters are often representative of a specific data type. Each cluster can be extracted and analyzed further. 10 | 11 | 2. The specific data type of a cluster can often be identified without using machine learning by measuring the [Wasserstein distance](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wasserstein_distance.html) between its byte value distribution and a data type *reference distribution*. If this distance is less than a set threshold for a particular data type, that cluster will be identified as that data type. Currently, reference distributions exist for high entropy data, UTF-8 English, and machine code targeting various CPU architectures. 12 | 13 | These two approaches are used together in sequence: first DBSCAN finds clusters, then the Wasserstein distances between the clusters' data and the reference distributions are measured to identify their data type. To identify the target CPU of any machine code discovered in the file, Centrifuge uses [ISAdetect](https://github.com/kairis/isadetect). 14 | 15 | ## Required Libraries 16 | 17 | All required libraries come bundled with [Anaconda](https://www.anaconda.com/products/individual). 18 | 19 | *Developed in a Linux environment. Not tested on Windows or macOS. 20 | 21 | ## Usage 22 | 23 | Detailed walkthroughs can be found in the [notebooks](https://github.com/BinaryResearch/centrifuge/tree/master/notebooks). Code snippets are located in the [scripts](https://github.com/BinaryResearch/centrifuge-toolkit/tree/master/scripts) folder. 
24 | 25 | - [Introduction to Centrifuge](https://github.com/BinaryResearch/centrifuge/blob/master/notebooks/Introduction%20to%20Centrifuge.ipynb) provides an overview of Centrifuge's features and a demonstration of how the tool works. 26 | - [Using DBSCAN to Cluster File Data](https://github.com/BinaryResearch/centrifuge-toolkit/blob/master/notebooks/Using%20DBSCAN%20to%20Cluster%20File%20Data.ipynb) shows examples of how to adjust DBSCAN's `eps` and `min_samples` parameters to get the best results. 27 | - [Analyzing Firmware with Centrifuge](https://github.com/BinaryResearch/centrifuge-toolkit/blob/master/notebooks/Analyzing%20Firmware%20with%20Centrifuge.ipynb) and [Analyzing Firmware with Centrifuge Example 2](https://github.com/BinaryResearch/centrifuge-toolkit/blob/master/notebooks/Analyzing%20Firmware%20with%20Centrifuge%20Example%202.ipynb) provide tutorials for analyzing firmware binaries. 28 | - [Analyzing Machine Code Targeting an Unsupported Architecture](https://github.com/BinaryResearch/centrifuge-toolkit/blob/master/notebooks/Analyzing%20Machine%20Code%20Targeting%20an%20Usupported%20Architecture.ipynb) discusses what may occur when an executable binary contains machine code targeting a CPU architecture for which there is no matching reference distribution and ISAdetect does not correctly classify it. 29 | 30 | ## Overview of the Approach 31 | 32 | The first step is file partitioning and feature measurement. 33 | 34 | 35 | 36 | DBSCAN can then be used to find clusters in the file data. 37 | 38 | 39 | 40 | Once clusters have been found, the data in the clusters can be identified. 41 | 42 | 43 | 44 | The feature observations of each cluster are stored in a separate data frame, one for each cluster (e.g., if 6 clusters are found, there will be 6 data frames, 1 per cluster). The output of DBSCAN is also saved in a data frame. This means custom analysis of any/all clusters can easily be performed any time after DBSCAN identifies clusters in the file data. 
45 | 46 | ## Example Output 47 | 48 | Output of `bash.identify_cluster_data_types()`, as seen in [Introduction to Centrifuge](https://github.com/BinaryResearch/centrifuge/blob/master/notebooks/Introduction%20to%20Centrifuge.ipynb): 49 | 50 | ``` 51 | Searching for machine code 52 | -------------------------------------------------------------------- 53 | 54 | [+] Checking Cluster 4 for possible match 55 | [+] Closely matching CPU architecture reference(s) found for Cluster 4 56 | [+] Sending sample to https://isadetect.com/ 57 | [+] response: 58 | 59 | { 60 | "prediction": { 61 | "architecture": "amd64", 62 | "endianness": "little", 63 | "wordsize": 64 64 | }, 65 | "prediction_probability": 1.0 66 | } 67 | 68 | 69 | Searching for utf8-english data 70 | ------------------------------------------------------------------- 71 | 72 | [+] UTF-8 (english) detected in Cluster 3 73 | Wasserstein distance to reference: 16.337275669642857 74 | 75 | [+] UTF-8 (english) detected in Cluster 5 76 | Wasserstein distance to reference: 11.878225097656252 77 | 78 | 79 | Searching for high entropy data 80 | ------------------------------------------------------------------- 81 | 82 | [+] High entropy data found in Cluster 1 83 | Wasserstein distance to reference: 0.48854199218749983 84 | [*] This distance suggests the data in this cluster could be 85 | a) encrypted 86 | b) compressed via LZMA with maximum compression level 87 | c) something else that is random or close to random. 88 | ``` 89 | 90 | ## File Data Visualization 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | More pictures can be found in the [gallery](https://github.com/BinaryResearch/centrifuge-toolkit/tree/master/gallery). 
105 | 106 | ## Example Use Cases 107 | 108 | - **Determining whether a file contains a particular type of data.** 109 | 110 | An entropy scan is useful for discovering compressed or encrypted data, but what about other data types such as machine code, symbol tables, sections of hardcoded ASCII strings, etc? Centrifuge takes advantage of the fact that in binary files, information encoded in a particular way is stored contiguously and uses scikit-learn's implementation of DBSCAN to locate these regions. 111 | - **Analyzing files with no metadata such as magic numbers, headers or other format information.** 112 | 113 | This includes most firmware, as well as corrupt files. Centrifuge does not depend on metadata or signatures of any kind. 114 | - **Investigating differences between different types of data using statistical methods or machine learning, or building a model or "profile" of a specific data type.** 115 | 116 | Does machine code differ in a systematic way from other types of information encoded in binary files? Can compressed data be distinguished from encrypted data? These questions can be investigated in an empirical way using Centrifuge. 117 | - **Visualizing information in files using Python libraries such as Seaborn, Matplotlib and Altair** 118 | 119 | Rather than generate elaborate 2D or 3D visual representations of file contents using space-filling curves or cylindrical coordinate systems, Centrifuge creates data frames that contain the feature measurements of each cluster. The information in these data frames can be easily visualized with boxplots, violin plots, pairplots, histograms, density plots, scatterplots, barplots, cumulative distribution function (CDF) plots, etc. 120 | 121 | ## Dataset 122 | 123 | The [ISAdetect dataset](https://etsin.fairdata.fi/dataset/9f6203f5-2360-426f-b9df-052f3f936ed2/data) was used to create the i386, AMD64, MIPSEL, MIPS64EL, ARM64, ARMEL, PowerPC, PPC64, and SH4 reference distributions. 
124 | 125 | ## Todo 126 | 127 | - Adding the ability to use [OPTICS](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.OPTICS.html#sklearn.cluster.OPTICS) for automatic clustering. It would be nice to automate the entire workflow, going straight from an input file to data type identification. Currently this is not possible because `eps` and `min_samples` need to be adjusted manually in order to ensure meaningful results when using DBSCAN. 128 | - Improving the UTF-8 English data reference distribution. Rather than derive it from text extracted from an ebook, samples should be drawn from hard-coded text data in executable binaries. 129 | - Creating reference distributions for AVR and Xtensa 130 | - Updating the code with docstrings and comments 131 | -------------------------------------------------------------------------------- /centrifuge/__pycache__/binfile.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/centrifuge/__pycache__/binfile.cpython-36.pyc -------------------------------------------------------------------------------- /centrifuge/__pycache__/binfile.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/centrifuge/__pycache__/binfile.cpython-37.pyc -------------------------------------------------------------------------------- /centrifuge/__pycache__/dataID.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/centrifuge/__pycache__/dataID.cpython-37.pyc -------------------------------------------------------------------------------- /centrifuge/__pycache__/datablock.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/centrifuge/__pycache__/datablock.cpython-37.pyc -------------------------------------------------------------------------------- /centrifuge/binfile.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pickle 4 | import requests 5 | import matplotlib.pyplot as plt 6 | from scipy.stats import entropy, gamma, relfreq, wasserstein_distance 7 | import numpy as np 8 | import pandas as pd 9 | from pandas.plotting import scatter_matrix 10 | import seaborn as sns; sns.set() 11 | from sklearn.cluster import DBSCAN 12 | from sklearn import metrics 13 | from sklearn.preprocessing import StandardScaler 14 | from sklearn.neighbors import NearestNeighbors 15 | from math import ceil, sqrt 16 | 17 | from centrifuge.datablock import DataBlock 18 | 19 | 20 | 21 | 22 | def find_optimal_eps(matrix, k, epsilon): 23 | """ 24 | Uses kNN distances to plot a curve. This curve can then be used 25 | to choose an optimal value of eps for DBSCAN 26 | """ 27 | nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(matrix) 28 | distances, indices = nbrs.kneighbors(matrix) 29 | sorted_distances = np.sort(np.concatenate(distances[:, -1:])) 30 | 31 | plt.axhline(y=epsilon, color='red') 32 | plt.text(0, epsilon+0.25, "eps = " + str(epsilon)) 33 | 34 | plt.plot(np.arange(len(sorted_distances)), sorted_distances) 35 | plt.title("K-nearest neighbor distances. 
Use this plot to choose an optimal epsilon value") 36 | plt.xlabel("index") 37 | plt.ylabel("kNN distances for k = " + str(k)) 38 | plt.ylim(0) 39 | plt.show() 40 | 41 | 42 | 43 | 44 | class BinFile: 45 | def __init__(self, file_handle): 46 | self.file_handle = file_handle 47 | self.pathname = self.file_handle.name 48 | self.block_size = 1024 # default; will be scaled based on file size, and may be updated manually 49 | 50 | self.file_handle.seek(0) 51 | #self.data = self.file_handle.read() # read full file into memory 52 | self.file_handle.seek(0,2) 53 | self.size = self.file_handle.tell() 54 | self.file_handle.seek(0) 55 | 56 | self.debug_level = 0 # default; may be overridden via set_debug_level() 57 | 58 | # 59 | self.blocks = [] 60 | #self.block_offsets = [] 61 | self.block_offsets = None 62 | 63 | self.block_entropy_levels = None # uses log base 2, not log base 10 64 | self.block_zeroes_ratios = None # % bytes in chunk that are 0 65 | self.block_ascii_ratios = None # % bytes in chunk that fall between 32 and 126 inclusive 66 | self.block_byteval_std_dev = None # 67 | self.block_byteval_std_dev_counts = None # std dev of the counts of each byte value, not the values themselves 68 | self.block_byteval_mean = None # 69 | self.block_byteval_median = None # 70 | self.file_data_frame = None # data frame built from np arrays of file chunk information stats 71 | 72 | # DBSCAN results stored in this variable 73 | self.db = None # refers to DBSCAN output 74 | self.dbscan_data_frame = None # file data frame + cluster labels 75 | 76 | ############################### 77 | # methods 78 | 79 | 80 | def seek(self, offset): 81 | self.file_handle.seek(offset) 82 | 83 | 84 | # manually set block size 85 | def set_block_size(self, num_bytes): 86 | self.block_size = num_bytes 87 | 88 | # manually set size of file 89 | def set_size(self, num_bytes): 90 | self.size = num_bytes 91 | 92 | 93 | def slice_file(self): 94 | # number of blocks = file size / block size 95 | 96 | # 
initialize np arrays 97 | self.block_offsets = np.empty(ceil(self.size / self.block_size), dtype='int64') 98 | self.block_entropy_levels = np.empty(ceil(self.size / self.block_size), dtype='float64') 99 | self.block_zeroes_ratios = np.empty(ceil(self.size / self.block_size), dtype='float64') 100 | self.block_ascii_ratios = np.empty(ceil(self.size / self.block_size), dtype='float64') 101 | self.block_byteval_std_dev = np.empty(ceil(self.size / self.block_size), dtype='float64') 102 | self.block_byteval_std_dev_counts = np.empty(ceil(self.size / self.block_size), dtype='float64') 103 | self.block_byteval_mean = np.empty(ceil(self.size / self.block_size), dtype='float64') 104 | self.block_byteval_median = np.empty(ceil(self.size / self.block_size), dtype='int64') 105 | 106 | offset = 0 # tracks block offsets 107 | for i in range(0, (ceil(self.size / self.block_size))): 108 | new_block = DataBlock(self.pathname, 109 | self.file_handle.read(self.block_size), 110 | self.block_size, 111 | offset) 112 | self.blocks.append(new_block) 113 | 114 | self.block_offsets[i] = new_block.offset 115 | self.block_entropy_levels[i] = new_block.entropy 116 | self.block_zeroes_ratios[i] = new_block.zeroes_ratio 117 | self.block_ascii_ratios[i] = new_block.ascii_ratio 118 | self.block_byteval_std_dev[i] = new_block.byteval_std_dev 119 | self.block_byteval_std_dev_counts[i] = new_block.byteval_std_dev_counts 120 | self.block_byteval_mean[i] = new_block.byteval_mean 121 | self.block_byteval_median[i] = new_block.byteval_median 122 | 123 | offset += self.block_size 124 | 125 | # now that the np arrays have been created, build data frame 126 | self.create_data_frame() 127 | 128 | 129 | # should create a dataframe out of these arrays 130 | def create_data_frame(self): 131 | '''create data frame from lists''' 132 | if self.debug_level > 0: 133 | print("[+] creating data frame") 134 | 135 | self.file_data_frame = pd.DataFrame({'entropy': self.block_entropy_levels, 136 | 'zeroes ratios': 
self.block_zeroes_ratios, 137 | 'ascii ratios': self.block_ascii_ratios, 138 | 'byte value std dev': self.block_byteval_std_dev, 139 | 'byte value counts std dev': self.block_byteval_std_dev_counts, 140 | 'byte value mean': self.block_byteval_mean, 141 | 'byte value median': self.block_byteval_median}) 142 | 143 | 144 | def show_scatter_matrix(self): 145 | '''plots all columns against each other''' 146 | pd.plotting.scatter_matrix(self.file_data_frame, alpha=0.3, figsize=(20,20), diagonal='kde') 147 | plt.show() 148 | 149 | 150 | 151 | 152 | # TODO: add docstring 153 | #def entropy_vs_zeroes_ratios_quickplot(self): 154 | # 155 | # plt.scatter(self.block_zeroes_ratios, 156 | # self.block_entropy_levels, 157 | # alpha=0.15, 158 | # color='purple') 159 | # 160 | # plt.title('Entropy vs. Ratio of 0x00 Byte Values') 161 | # plt.xlabel('0x00 Byte Ratio') 162 | # plt.ylabel('Entropy') 163 | # plt.xlim(-0.05, 1) 164 | # plt.ylim(0, 8.5) 165 | # 166 | # plt.show() 167 | 168 | 169 | 170 | def plot_variables_by_range(self, x, y, start, end, target_data_marker=None, other_data_marker=None, target_data_color=None, other_data_color=None, title=None, xlabel=None, ylabel=None): 171 | plt.title('test') 172 | 173 | within_range_mask = np.logical_and(self.block_offsets >= start, self.block_offsets <= end) 174 | out_of_range_mask = np.logical_xor(self.block_offsets >= start, self.block_offsets <= end) 175 | 176 | offsets_within_range = self.block_offsets[within_range_mask] 177 | x_within_range = x[within_range_mask] 178 | y_within_range = y[within_range_mask] 179 | 180 | if target_data_marker is None: 181 | target_data_marker = 's' 182 | 183 | if other_data_marker is None: 184 | other_data_marker = 'o' 185 | 186 | if target_data_color is None: 187 | target_data_color = 'red' 188 | 189 | if other_data_color is None: 190 | other_data_color = 'black' 191 | 192 | plt.plot(x_within_range, y_within_range, target_data_marker, color = target_data_color, alpha=0.3) 193 | 194 | 
offsets_out_of_range = self.block_offsets[out_of_range_mask] 195 | x_out_of_range = x[out_of_range_mask] 196 | y_out_of_range = y[out_of_range_mask] 197 | 198 | plt.plot(x_out_of_range, y_out_of_range, other_data_marker, color = other_data_color, alpha=0.3) 199 | 200 | plt.title(title) 201 | plt.xlabel(xlabel) 202 | plt.ylabel(ylabel) 203 | 204 | plt.show() 205 | 206 | 207 | 208 | 209 | def plot_file_entropy(self, start=None, end=None): 210 | ''' 211 | start and none are numbers representing offsets within the file. 212 | Can be decimal or hexadecimal. 213 | ''' 214 | 215 | try: 216 | plt.axvline(x=start, color='red') 217 | plt.axvline(x=end, color='red') 218 | except TypeError: 219 | pass 220 | 221 | plt.plot(self.block_offsets, 222 | self.block_entropy_levels, 223 | linewidth=0.8, 224 | color='blue') 225 | 226 | plt.title("Entropy of " + self.pathname.split('/')[-1:][0]) 227 | plt.xlabel('Offset') 228 | plt.ylabel('Entropy') 229 | plt.ylim(-0.5, 8.25) 230 | 231 | plt.show() 232 | 233 | 234 | def plot_file_feature(self, feature, color=None, start=None, end=None): 235 | ''' 236 | generalized version of plot_file_entropy 237 | ''' 238 | try: 239 | plt.axvline(x=start, color='red') 240 | plt.axvline(x=end, color='red') 241 | except TypeError: 242 | pass 243 | 244 | 245 | features = {"mean":self.block_byteval_mean, 246 | "median":self.block_byteval_median, 247 | "std_dev":self.block_byteval_std_dev, 248 | "std_dev_counts":self.block_byteval_std_dev_counts, 249 | "entropy":self.block_entropy_levels, 250 | "ascii":self.block_ascii_ratios, 251 | "zeroes":self.block_zeroes_ratios} 252 | 253 | if color is None: 254 | color = "black" 255 | 256 | if feature in features: 257 | plt.plot(self.block_offsets, 258 | features[feature], 259 | linewidth=1.1, 260 | color=color) 261 | else: 262 | print("Feature argument must be one of the following: ") 263 | for feature in features: 264 | print(feature) 265 | 266 | plt.title(self.pathname.split('/')[-1:][0]) 267 | plt.xlabel('Offset') 
268 | plt.ylabel(feature) 269 | plt.show() 270 | 271 | 272 | # 273 | def set_debug_level(self, level): 274 | self.debug_level = level 275 | 276 | if self.debug_level > 0: 277 | print("[+]\tDebug level set to %d" % self.debug_level) 278 | 279 | 280 | ##################################################################################### 281 | # Clustering with DBSCAN 282 | ##################################################################################### 283 | 284 | # eps=0.4 and min_sample=10 perform well in general, but 285 | # eps needs to be increased to 0.7 or higher for files smaller than ~100KB 286 | # min_sample needs to be increased to 20, 30 or higher for larger (~3MB+) files 287 | # try finding optimal value of eps using kNN distances 288 | 289 | def cluster_DBSCAN(self, epsilon, minimum_samples, find_optimal_epsilon=True): 290 | """ 291 | return a data frame containing data from clustering results and the data blocks 292 | """ 293 | 294 | X = StandardScaler().fit_transform(self.file_data_frame) # standardize and scale data frame. 
Using scikit-learn nc 295 | 296 | if (find_optimal_epsilon==True): 297 | find_optimal_eps(X, minimum_samples, epsilon) 298 | 299 | self.db = DBSCAN(eps=epsilon, min_samples=minimum_samples).fit(X) 300 | self.db.n_clusters_ = len(set(self.db.labels_)) - (1 if -1 in self.db.labels_ else 0) 301 | 302 | if self.debug_level > 0: 303 | print("Set of clusters found by DBSCAN: " + str(set(self.db.labels_))) 304 | 305 | core_samples_mask = np.zeros_like(self.db.labels_, dtype=bool) 306 | core_samples_mask[self.db.core_sample_indices_] = True 307 | 308 | # create data frame 309 | db_data_frame = self.file_data_frame.copy(deep=True) 310 | db_data_frame['core samples mask'] = core_samples_mask 311 | db_data_frame['cluster labels'] = self.db.labels_ 312 | 313 | #return db_data_frame 314 | self.dbscan_data_frame = db_data_frame 315 | # 316 | # labels = self.db.labels_ 317 | # n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) 318 | # 319 | # print("Number of clusters found via DBSCAN: " + str(n_clusters_)) 320 | 321 | #def cluster_DBSCAN(self, epsilon, minimum_samples): 322 | # """ 323 | # Returns DBSCAN object 324 | # """ 325 | # self.db = ClusterDBSCAN(epsilon, minimum_samples, self.file_data_frame) 326 | 327 | 328 | # TODO: outliers (cluster -1) need to be black 329 | def plot_DBSCAN_results(self): 330 | 331 | """ 332 | Refactored. 
333 | """ 334 | 335 | #grid = plt.GridSpec(2, 4, wspace=0.4, hspace=0.3) # <-------------- 336 | #plt.subplot(grid[0, 0:]) 337 | #plt.scatter(np.random.random(100), np.random.random(100)) 338 | #plt.subplot(grid[1, 2:]) 339 | #plt.subplot(grid[1, :2]); 340 | 341 | # rainbow_r, prism, Spectral 342 | #plt.subplot(grid[0, 0:]) # <----------- 343 | #plt.plot(self.block_offsets, 344 | # self.block_entropy_levels, 345 | # linewidth=0.3, 346 | # color='black') 347 | #plt.ylim(-0.25, 8.5) 348 | 349 | labels = self.db.labels_ 350 | unique_labels = set(labels) 351 | colors = [plt.cm.rainbow_r(each) 352 | for each in np.linspace(0, 1, len(unique_labels))] 353 | shapes = ['H','D', 's', 'd', 'o', 'v', 'p', 'h', '^', '>', '<', '.'] 354 | 355 | cluster_dfs, _ = self.extract_clusters() 356 | 357 | if cluster_dfs is None: 358 | print("[!] No clusters to plot. Exiting.") 359 | return 360 | 361 | for cluster_id in sorted(cluster_dfs.keys()): 362 | if cluster_id == -1: 363 | color = "black" 364 | else: 365 | color = colors[cluster_id] 366 | 367 | plt.scatter(list(cluster_dfs[cluster_id]["entropy"].index * self.block_size), 368 | cluster_dfs[cluster_id]["entropy"], 369 | edgecolors="k", 370 | marker=shapes[cluster_id], 371 | color=color, 372 | alpha=1) 373 | 374 | plt.ylim(-0.25, 8.25) 375 | plt.title(self.pathname.split('/')[-1:][0]) 376 | plt.xlabel("Block Offset") 377 | plt.ylabel("Block Entropy") 378 | plt.show() 379 | 380 | 381 | 382 | for cluster_id in sorted(cluster_dfs.keys()): 383 | if cluster_id == -1: 384 | color = "black" 385 | else: 386 | color = colors[cluster_id] 387 | 388 | plt.scatter(cluster_dfs[cluster_id]["byte value std dev"], 389 | cluster_dfs[cluster_id]["entropy"], 390 | edgecolors="k", 391 | marker=shapes[cluster_id], 392 | color=color, 393 | alpha=1) 394 | 395 | plt.xlim(-5) 396 | plt.ylim(-0.25, 8.25) 397 | plt.title(self.pathname.split('/')[-1:][0]) 398 | plt.xlabel("Block Byte Value Standard Deviation") 399 | plt.ylabel("Block Entropy") 400 | plt.show() 401 
| 402 | 403 | 404 | for cluster_id in sorted(cluster_dfs.keys()): 405 | if cluster_id == -1: 406 | color = "black" 407 | else: 408 | color = colors[cluster_id] 409 | 410 | plt.scatter(cluster_dfs[cluster_id]["byte value median"], 411 | cluster_dfs[cluster_id]["zeroes ratios"], 412 | edgecolors="k", 413 | marker=shapes[cluster_id], 414 | color=color, 415 | alpha=1) 416 | 417 | plt.xlabel("Block Printable ASCII Ratio") 418 | plt.title(self.pathname.split('/')[-1:][0]) 419 | plt.xlabel("Block Median") 420 | plt.ylabel("Block Zeroes Ratio") 421 | plt.show() 422 | 423 | 424 | 425 | def plot_two_features_with_cluster_labels(self, feature_1, feature_2, with_noise=True): 426 | if feature_1 not in self.dbscan_data_frame.columns or feature_2 not in self.dbscan_data_frame.columns: 427 | print("Arguments must be 2 of the following: ") 428 | for feature in self.dbscan_data_frame.columns[0:-2]: 429 | print(feature) 430 | return 431 | 432 | labels = self.db.labels_ 433 | unique_labels = set(labels) 434 | colors = [plt.cm.rainbow_r(each) 435 | for each in np.linspace(0, 1, len(unique_labels))] 436 | shapes = ['H','D', 's', 'd', 'o', 'v', 'p', 'h', '^', '>', '<', '.'] 437 | 438 | cluster_dfs, _ = self.extract_clusters() 439 | if with_noise is False: 440 | cluster_dfs.pop(-1) 441 | 442 | if cluster_dfs is None: 443 | print("[!] No clusters to plot. 
Exiting.") 444 | return 445 | 446 | 447 | for cluster_id in sorted(cluster_dfs.keys()): 448 | if cluster_id == -1: 449 | color = "black" 450 | else: 451 | color = colors[cluster_id] 452 | 453 | plt.scatter(cluster_dfs[cluster_id][feature_1], 454 | cluster_dfs[cluster_id][feature_2], 455 | edgecolors="k", 456 | marker=shapes[cluster_id], 457 | color=color, 458 | alpha=1) 459 | 460 | plt.xlabel(feature_1) 461 | plt.ylabel(feature_2) 462 | plt.title(self.pathname.split('/')[-1:][0] + " clusters") 463 | plt.show() 464 | 465 | 466 | 467 | 468 | def extract_clusters(self): 469 | cluster_dataframes = {} # key = cluster ID, value = that cluster's data frame 470 | cluster_bytes = {} # key = cluster ID, value = list of all bytes in cluster 471 | 472 | if self.dbscan_data_frame is not None: 473 | cluster_labels = list(set(self.dbscan_data_frame["cluster labels"])) # example output: [0, 1, 2, -1] 474 | for label in cluster_labels: 475 | 476 | # extract data frame 477 | cluster_df = self.dbscan_data_frame[self.dbscan_data_frame["cluster labels"] == label] 478 | cluster_dataframes[label] = cluster_df 479 | 480 | # extract data/bytes of all blocks in cluster 481 | bytes = [] 482 | blocks = [self.blocks[i] for i in cluster_df.index] 483 | 484 | for block in blocks: 485 | bytes += block.data 486 | 487 | cluster_bytes[label] = bytes 488 | 489 | else: 490 | print("[!] 
No cluster data frames to extract\n") 491 | return None, None 492 | 493 | return cluster_dataframes, cluster_bytes 494 | 495 | 496 | 497 | 498 | def load_data_type_distributions(self): 499 | distributions = {} 500 | base_directory = os.path.dirname(__file__) 501 | load_path = base_directory + "/distributions/data_types/" 502 | 503 | for file in os.listdir(load_path): 504 | if os.path.isdir(load_path + file): 505 | continue 506 | with open(load_path + file, "rb") as f: 507 | try: 508 | distributions[file] = pickle.load(f) 509 | except: 510 | continue 511 | 512 | return distributions 513 | 514 | 515 | 516 | def load_machine_code_distributions(self): 517 | distributions = {} 518 | base_directory = os.path.dirname(__file__) 519 | load_path = base_directory + "/distributions/cpu_architectures/" 520 | 521 | for file in os.listdir(load_path): 522 | if os.path.isdir(load_path + file): 523 | continue 524 | with open(load_path + file, "rb") as f: 525 | try: 526 | distributions[file] = pickle.load(f) 527 | except: 528 | continue 529 | 530 | return distributions 531 | 532 | 533 | 534 | def id_code_clusters(self, cluster_dfs, cluster_bytes, reference_dist): 535 | 536 | distances = {} # store initial distance measurements between clusters and data type distributions 537 | closely_matching_arch_ref = False 538 | arch_classification = None 539 | for id, bytes in cluster_bytes.items(): 540 | id_string = "Cluster " + str(id) 541 | initial_d = wasserstein_distance(reference_dist, bytes) 542 | distances[id_string] = initial_d 543 | in_code_range = False 544 | if (cluster_dfs[id]["entropy"].mean() > 5.2 and cluster_dfs[id]["entropy"].mean() < 6.8): # Initial cutoff. 545 | in_code_range = True 546 | arch_distances = {} # store distance measurements between a cluster and each CPU arch. ref. dist. 
547 | mc_reference_distributions = self.load_machine_code_distributions() 548 | print("[+] Checking %s for possible match" % id_string) 549 | for arch, ref_bytes in mc_reference_distributions.items(): 550 | code_d = wasserstein_distance(ref_bytes, bytes) 551 | arch_distances[arch] = code_d 552 | if code_d <= 10: # second cutoff. Looking for close matches 553 | closely_matching_arch_ref = True 554 | 555 | if closely_matching_arch_ref is True: 556 | print("[+] Closely matching CPU architecture reference(s) found for %s" % id_string) 557 | arch_classification = self.get_arch_ID(bytes) 558 | else: 559 | if in_code_range is True: 560 | print("[X] No closely matching CPU architecture reference found.\n\n") 561 | 562 | distances[id_string] = [distances[id_string], arch_distances] 563 | closely_matching_arch_ref = False 564 | 565 | if arch_classification is None: 566 | print("[X] No machine code cluster detected\n\n") 567 | 568 | #distances = json.dumps(distances, indent = 4) 569 | return distances, arch_classification 570 | 571 | 572 | 573 | 574 | def id_utf8_en_clusters(self, cluster_bytes, reference_dist): 575 | distances = {} 576 | match_found = False 577 | for id, bytes in cluster_bytes.items(): 578 | id_string = "Cluster " + str(id) 579 | d = wasserstein_distance(reference_dist, bytes) 580 | distances[id_string] = d 581 | if d < 30: # initial cutoff 582 | print("[+] UTF-8 (english) detected in %s\n Wasserstein distance to reference: %s\n" % (id_string, d)) 583 | match_found = True 584 | 585 | if match_found is False: 586 | print("[X] No UTF-8 (english) cluster detected.\n") 587 | 588 | #distances = json.dumps(distances, indent = 4) 589 | #print(distances) 590 | return distances 591 | 592 | 593 | 594 | def id_high_entropy_clusters(self, cluster_dfs, cluster_bytes, reference_dist): 595 | distances = {} 596 | match_found = False 597 | for id, bytes in cluster_bytes.items(): 598 | id_string = "Cluster " + str(id) 599 | d = wasserstein_distance(reference_dist, bytes) 600 
| distances[id_string] = d 601 | if d < 10: # initial cutoff 602 | print("[+] High entropy data found in %s\n Wasserstein distance to reference: %s" % (id_string, d)) 603 | match_found = True 604 | if d < 1: 605 | print("[*] This distance suggests the data in this cluster could be\n" \ 606 | " a) encrypted\n" \ 607 | " b) compressed via LZMA with maximum compression level\n" \ 608 | " c) something else that is random or close to random.") 609 | else: 610 | print("[*] This distance suggests the data in this cluster is compressed\n") 611 | 612 | if match_found is False: 613 | print("[X] No high entropy data cluster detected.\n") 614 | 615 | #distances = json.dumps(distances, indent = 4) 616 | #print(distances) 617 | return distances 618 | 619 | 620 | 621 | 622 | def identify_cluster_data_types(self, show_all=False): 623 | cluster_dfs, cluster_bytes = self.extract_clusters() 624 | cluster_bytes.pop(-1, None) # get rid of noise 625 | reference_distributions = self.load_data_type_distributions() 626 | 627 | print("Searching for machine code\n--------------------------------------------------------------------\n") 628 | code_distances, arch_classification = self.id_code_clusters(cluster_dfs, cluster_bytes, reference_distributions["machine_code"]) 629 | 630 | print("\nSearching for utf8-english data\n-------------------------------------------------------------------\n") 631 | utf8_en_distances = self.id_utf8_en_clusters(cluster_bytes, reference_distributions["utf8_english"]) 632 | 633 | print("\nSearching for high entropy data\n-------------------------------------------------------------------\n") 634 | high_entropy_distances = self.id_high_entropy_clusters(cluster_dfs, cluster_bytes, reference_distributions["max_entropy"]) 635 | 636 | full_results = {"machine code": [code_distances, arch_classification], 637 | "utf8_en": utf8_en_distances, 638 | "high entropy": high_entropy_distances} 639 | 640 | if show_all is True: 641 | print("\n\nFull results: \n") 642 | 
print(json.dumps(full_results, indent=4)) 643 | 644 | return full_results 645 | 646 | 647 | 648 | 649 | def get_arch_ID(self, data): 650 | print("[+] Sending sample to https://isadetect.com/") 651 | req = requests.post("https://isadetect.com/binary/", 652 | files = { "binary":bytes(data) }, 653 | data = {"type": "code"}) 654 | print("[+] response:\n") 655 | response = json.dumps(req.json(), indent=4, sort_keys=True) 656 | print(response + "\n") 657 | 658 | return req.json() 659 | 660 | 661 | 662 | def plot_cluster_cdfs(self): 663 | #colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', 664 | # '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] 665 | colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 666 | 'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan'] 667 | counter = 0 668 | 669 | _, cluster_bytes = self.extract_clusters() 670 | cluster_bytes.pop(-1, None) 671 | for cluster_id, bytes in cluster_bytes.items(): 672 | sns.distplot(bytes, 673 | norm_hist=True, 674 | kde=False, 675 | hist_kws={'histtype':'step', 'cumulative': True, 'linewidth':2, 'alpha':1}, 676 | kde_kws={'cumulative': True}, 677 | bins=256, 678 | color=colors[counter % len(colors)]) # wrap around 679 | counter += 1 680 | plt.title("CDF of Cluster %d" % cluster_id) 681 | plt.xlim(-10, 265) 682 | plt.show() 683 | 684 | 685 | 686 | def plot_cluster_histograms(self): 687 | colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 688 | 'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan'] 689 | counter = 0 690 | 691 | _, cluster_bytes = self.extract_clusters() 692 | cluster_bytes.pop(-1, None) 693 | for cluster_id, bytes in cluster_bytes.items(): 694 | sns.distplot(bytes, 695 | kde=False, 696 | hist_kws={'alpha':1}, 697 | bins=256, 698 | color=colors[counter % len(colors)]) # wrap around 699 | counter += 1 700 | plt.title("Byte Value Histogram of Cluster %d" % cluster_id) 701 | plt.xlim(-10, 265) 702 | plt.show() 703 | 704 | 705 
def cluster_scatterplot_matrix(self):
    """Draw a seaborn pair plot of the DBSCAN data frame, colored by cluster label.

    Rows labelled -1 and the "core samples mask" column are left out of the plot.
    NOTE(review): this and the next two functions take `self`; the enclosing
    class header lies outside this chunk.
    """
    labelled = self.dbscan_data_frame[self.dbscan_data_frame["cluster labels"] != -1]
    sns.pairplot(labelled.drop(["core samples mask"], axis=1), hue="cluster labels")


def violinplot_cluster_by_feature(self, feature):
    """Show a violin plot of `feature` per cluster (rows labelled -1 excluded).

    If `feature` is not a column of the DBSCAN data frame, the selectable
    columns are printed and nothing is plotted.
    """
    if feature not in self.dbscan_data_frame.columns:
        print("[!] feature must be one of the following:\n")
        for column in self.dbscan_data_frame.drop(["core samples mask", "cluster labels"], axis=1).columns:
            print(column)
        return

    with sns.axes_style("whitegrid"):
        # NOTE(review): the returned palette is discarded; confirm whether
        # sns.set_palette("rainbow") was intended.
        sns.color_palette("rainbow")
        sns.violinplot(x="cluster labels",
                       y=feature,
                       data=self.dbscan_data_frame[self.dbscan_data_frame["cluster labels"] != -1])
        plt.title(self.pathname.split('/')[-1] + " clusters")
        plt.show()


def boxplot_cluster_by_feature(self, feature):
    """Show a box plot of `feature` per cluster (rows labelled -1 excluded).

    If `feature` is not a column of the DBSCAN data frame, the selectable
    columns are printed and nothing is plotted.
    """
    if feature not in self.dbscan_data_frame.columns:
        print("[!] feature must be one of the following:\n")
        for column in self.dbscan_data_frame.drop(["core samples mask", "cluster labels"], axis=1).columns:
            print(column)
        return

    with sns.axes_style("whitegrid"):
        sns.boxplot(x="cluster labels",
                    y=feature,
                    data=self.dbscan_data_frame[self.dbscan_data_frame["cluster labels"] != -1])
        plt.title(self.pathname.split('/')[-1] + " clusters")
        plt.show()


# --------------------------------------------------------------------------------
# /centrifuge/datablock.py:
# --------------------------------------------------------------------------------

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
from scipy.stats import entropy


def to_byte_dict(data):
    """Count how often each byte value occurs in `data`.

    Returns a dict holding every key 0..255; values that never occur map to 0.
    Iterating `data` must yield integers in that range (e.g. a bytes object).
    """
    byte_dict = dict.fromkeys(range(256), 0)
    for value in data:
        byte_dict[value] += 1
    return byte_dict


def count_ascii(byte_dict):
    """Return how many bytes have a value in the printable ASCII range 32..126.

    `byte_dict` maps byte value -> occurrence count, as built by to_byte_dict.
    """
    # The range test applies to the byte value (the key), not to its count;
    # the earlier version compared the count against 32..126.
    return sum(byte_dict[value] for value in range(32, 127))


class DataBlock:
    """A chunk of a file together with byte-level statistics computed from it."""

    def __init__(self, pathname, data_block, block_size, file_offset):
        """Store the block and compute its features.

        Args:
            pathname: path of the source file (used in plot titles).
            data_block: the block's content; iterating it must yield ints 0..255.
            block_size: divisor for the ratio features.
                NOTE(review): presumably len(data_block) -- confirm against the caller.
            file_offset: offset of the block in the file (used in plot titles).
        """
        self.path = pathname
        self.data = data_block
        self.size = block_size
        self.offset = file_offset

        # The module-level helper is also exposed as an instance attribute,
        # as in the original implementation.
        self.to_byte_dict = to_byte_dict
        self.byte_dict = to_byte_dict(self.data)

        counts = list(self.byte_dict.values())
        values = list(data_block)

        # scipy normalizes the counts to probabilities before computing entropy.
        self.entropy = entropy(counts, base=2)

        self.zeroes_ratio = self.byte_dict[0] / block_size                # share of bytes equal to 0
        self.ascii_ratio = count_ascii(self.byte_dict) / block_size       # share of bytes in 32..126

        self.byteval_std_dev_counts = np.std(counts)
        self.byteval_std_dev = np.std(values)

        self.byteval_mean = np.mean(values)
        self.byteval_median = np.median(values)

    def plot_relative_frequency_distribution(self):
        """Plot a normalized 256-bin histogram of the block's byte values."""
        ax = sns.distplot(np.array(list(self.data)), bins=256, kde=False, norm_hist=True, color='purple')
        ax.set(xlabel='Byte Value (base 10)',
               ylabel='Frequency',
               title='Byte Value Distribution at offset ' + str(self.offset) + ' in ' + self.path)
        # control x axis range
        ax.set_xlim(-10, 260)
        plt.show()

    def plot_cdf(self):
        """Plot the cumulative distribution of the block's byte values."""
        ax = sns.distplot(np.array(list(self.data)),
                          bins=256,
                          kde=False,
                          hist_kws={'histtype': 'step', 'cumulative': True, 'linewidth': 1, 'alpha': 1},
                          kde_kws={'cumulative': True},
                          norm_hist=True,
                          color='red')
        ax.set(xlabel='Byte Value (base 10)',
               ylabel='Probability',
               title='CDF of byte values at offset ' + str(self.offset) + ' in ' + self.path)
        # control x axis range
        ax.set_xlim(-10, 260)

        plt.show()


# --------------------------------------------------------------------------------
# /centrifuge/distributions/cpu_architectures/AMD64_reference:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/centrifuge/distributions/cpu_architectures/AMD64_reference
# --------------------------------------------------------------------------------
# /centrifuge/distributions/cpu_architectures/ARM64_reference:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/centrifuge/distributions/cpu_architectures/ARM64_reference
# --------------------------------------------------------------------------------
# /centrifuge/distributions/cpu_architectures/ARMEL_reference:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/centrifuge/distributions/cpu_architectures/ARMEL_reference
# --------------------------------------------------------------------------------
# /centrifuge/distributions/cpu_architectures/MIPS64EL_reference:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/centrifuge/distributions/cpu_architectures/MIPS64EL_reference
# --------------------------------------------------------------------------------
# /centrifuge/distributions/cpu_architectures/MIPSEL_reference:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/centrifuge/distributions/cpu_architectures/MIPSEL_reference -------------------------------------------------------------------------------- /centrifuge/distributions/cpu_architectures/PPC64_reference: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/centrifuge/distributions/cpu_architectures/PPC64_reference -------------------------------------------------------------------------------- /centrifuge/distributions/cpu_architectures/PowerPC_reference: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/centrifuge/distributions/cpu_architectures/PowerPC_reference -------------------------------------------------------------------------------- /centrifuge/distributions/cpu_architectures/SH4_reference: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/centrifuge/distributions/cpu_architectures/SH4_reference -------------------------------------------------------------------------------- /centrifuge/distributions/cpu_architectures/i386_reference: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/centrifuge/distributions/cpu_architectures/i386_reference -------------------------------------------------------------------------------- /centrifuge/distributions/data_types/archive/readme.txt: 
-------------------------------------------------------------------------------- 1 | Examples of data types are machine code, ASCII text, compressed data, encrypted data, ELF debug info, etc. 2 | After DBSCAN performs clustering, it will compare the byte distribution of each cluster to a data type reference. 3 | This way no non-machine code information is compared with the reference distributions of the CPU architectures. 4 | In other words, only if the cluster is identified to be data type "code" will an attempt to identify the 5 | architecture be made. 6 | 7 | 8 | 9 | To create the machine code reference, take a sample of size N bytes from all the architectures, merge them, then 10 | build. 11 | 12 | 13 | -------------------------------------------------------------------------------- /centrifuge/distributions/data_types/machine_code: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/centrifuge/distributions/data_types/machine_code -------------------------------------------------------------------------------- /centrifuge/distributions/data_types/max_entropy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/centrifuge/distributions/data_types/max_entropy -------------------------------------------------------------------------------- /centrifuge/distributions/data_types/utf8_english: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/centrifuge/distributions/data_types/utf8_english -------------------------------------------------------------------------------- /centrifuge/utils/plotutils.py: 
# --------------------------------------------------------------------------------

# This module calls plt.* throughout but did not import it.
import matplotlib.pyplot as plt


def plot_file_entropy(bf, start=None, end=None):
    """Plot block entropy against block offset for a file.

    Args:
        bf: object exposing ``block_offsets``, ``block_entropy_levels`` and
            ``pathname`` (an instance of the BinFile class, per the original docstring).
        start, end: optional numbers representing offsets within the file;
            each one that is given is marked with a red vertical line.
    """
    # Handle each marker on its own so a missing `start` no longer
    # suppresses the `end` line.
    for marker in (start, end):
        if marker is not None:
            plt.axvline(x=marker, color='red')

    plt.plot(bf.block_offsets,
             bf.block_entropy_levels,
             linewidth=0.8,
             color='blue')

    plt.title("Entropy of " + str(bf.pathname))
    plt.xlabel('Offset')
    plt.ylabel('Entropy')
    plt.ylim(-0.5, 8)

    plt.show()


def quickplot(x, y, c, title, xl, yl):
    """Show a translucent scatter plot of `y` against `x`.

    Args:
        x, y: data passed straight to ``plt.scatter``.
        c: marker color.
        title, xl, yl: plot title and x / y axis labels.
    """
    plt.scatter(x, y, alpha=0.15, color=c)

    plt.title(title)
    plt.xlabel(xl)
    plt.ylabel(yl)

    plt.show()


# --------------------------------------------------------------------------------
# /gallery/1.png:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/gallery/1.png
# --------------------------------------------------------------------------------
# /gallery/10.png:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/gallery/10.png
# --------------------------------------------------------------------------------
# /gallery/11.png:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/gallery/11.png
# --------------------------------------------------------------------------------
# /gallery/12.png:
# --------------------------------------------------------------------------------
https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/gallery/12.png -------------------------------------------------------------------------------- /gallery/13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/gallery/13.png -------------------------------------------------------------------------------- /gallery/14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/gallery/14.png -------------------------------------------------------------------------------- /gallery/15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/gallery/15.png -------------------------------------------------------------------------------- /gallery/16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/gallery/16.png -------------------------------------------------------------------------------- /gallery/17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/gallery/17.png -------------------------------------------------------------------------------- /gallery/18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/gallery/18.png 
-------------------------------------------------------------------------------- /gallery/19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/gallery/19.png -------------------------------------------------------------------------------- /gallery/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/gallery/2.png -------------------------------------------------------------------------------- /gallery/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/gallery/3.png -------------------------------------------------------------------------------- /gallery/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/gallery/4.png -------------------------------------------------------------------------------- /gallery/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/gallery/5.png -------------------------------------------------------------------------------- /gallery/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/gallery/6.png -------------------------------------------------------------------------------- /gallery/7.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/gallery/7.png -------------------------------------------------------------------------------- /gallery/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/gallery/8.png -------------------------------------------------------------------------------- /gallery/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/gallery/9.png -------------------------------------------------------------------------------- /images/approach.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/images/approach.png -------------------------------------------------------------------------------- /images/approach_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/images/approach_2.png -------------------------------------------------------------------------------- /images/approach_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BinaryResearch/centrifuge-toolkit/d68a0ba7df8ab2ef9cd8cd34a1dfa8183a641285/images/approach_3.png -------------------------------------------------------------------------------- /notebooks/Data Type Reference Distributions/The Machine Code Reference Distribution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | 
"execution_count": 62, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import pickle\n", 11 | "import pandas as pd\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import seaborn as sns\n", 14 | "plt.rcParams['figure.figsize'] = [16, 9]\n", 15 | "sns.set_style(\"whitegrid\")\n", 16 | "\n", 17 | "from scipy import stats" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 3, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "path = \"../centrifuge/distributions/cpu_architectures/\"\n", 27 | "archs = os.listdir(path)\n", 28 | "\n", 29 | "reference_dict = {}\n", 30 | "\n", 31 | "for file in archs:\n", 32 | " with open(path + file, \"rb\") as f:\n", 33 | " reference_dict[file] = pickle.load(f)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "reference_df = pd.DataFrame(index = reference_dict.keys(),\n", 43 | " columns = [i for i in range(1000)])\n", 44 | "\n", 45 | "for file, code in reference_dict.items():\n", 46 | " reference_df.loc[file] = code" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 63, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "image/png": "\n", 57 | "text/plain": [ 58 | "
" 59 | ] 60 | }, 61 | "metadata": {}, 62 | "output_type": "display_data" 63 | } 64 | ], 65 | "source": [ 66 | "for code in reference_dict.values():\n", 67 | " sns.distplot(code,\n", 68 | " norm_hist=True, \n", 69 | " kde=False,\n", 70 | " hist_kws={'histtype':'step', 'cumulative': True, 'linewidth':2, 'alpha':1},\n", 71 | " kde_kws={'cumulative': True}, \n", 72 | " bins=256)\n", 73 | "\n", 74 | "sns.distplot(reference_df.mean(axis = 0), \n", 75 | " norm_hist=True, \n", 76 | " kde=False, \n", 77 | " hist_kws={'histtype':'step', 'cumulative': True, 'linewidth':10, 'alpha':0.5},\n", 78 | " kde_kws={'cumulative': True}, \n", 79 | " bins=256,\n", 80 | " color=\"black\")\n", 81 | "\n", 82 | "sns.distplot(reference_df.median(axis = 0), \n", 83 | " norm_hist=True, \n", 84 | " kde=False, \n", 85 | " hist_kws={'histtype':'step', 'cumulative': True, 'linewidth':3, 'alpha':0.8},\n", 86 | " kde_kws={'cumulative': True}, \n", 87 | " bins=256,\n", 88 | " color=\"red\")\n", 89 | "\n", 90 | "plt.legend(list(reference_dict.keys())+[\"mean\"]+[\"median\"], loc=\"upper left\")\n", 91 | "plt.title(\"Mean of reference distributions vs. All reference distributions\")\n", 92 | "plt.show()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 7, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/html": [ 103 | "
\n", 104 | "\n", 117 | "\n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | "
Wasserstein Distance (Mean)Wasserstein Distance (Median)
PPC64_reference5.254332.46349
SH4_reference6.86794.77315
ARM64_reference6.969694.47539
AMD64_reference8.021075.39967
i386_reference11.33169.2355
PowerPC_reference12.470814.5362
MIPS64EL_reference12.896714.7124
ARMEL_reference17.38716.1934
MIPSEL_reference23.809925.8836
\n", 173 | "
" 174 | ], 175 | "text/plain": [ 176 | " Wasserstein Distance (Mean) Wasserstein Distance (Median)\n", 177 | "PPC64_reference 5.25433 2.46349\n", 178 | "SH4_reference 6.8679 4.77315\n", 179 | "ARM64_reference 6.96969 4.47539\n", 180 | "AMD64_reference 8.02107 5.39967\n", 181 | "i386_reference 11.3316 9.2355\n", 182 | "PowerPC_reference 12.4708 14.5362\n", 183 | "MIPS64EL_reference 12.8967 14.7124\n", 184 | "ARMEL_reference 17.387 16.1934\n", 185 | "MIPSEL_reference 23.8099 25.8836" 186 | ] 187 | }, 188 | "execution_count": 7, 189 | "metadata": {}, 190 | "output_type": "execute_result" 191 | } 192 | ], 193 | "source": [ 194 | "distance_df = pd.DataFrame(index = reference_dict.keys(),\n", 195 | " columns = [\"Wasserstein Distance (Mean)\",\n", 196 | " \"Wasserstein Distance (Median)\"])\n", 197 | "\n", 198 | "for file, code in reference_dict.items():\n", 199 | " distance_df.loc[file][\"Wasserstein Distance (Mean)\"] = stats.wasserstein_distance(reference_df.mean(axis = 0), code)\n", 200 | " distance_df.loc[file][\"Wasserstein Distance (Median)\"] = stats.wasserstein_distance(reference_df.median(axis = 0), code)\n", 201 | "\n", 202 | "distance_df.sort_values(\"Wasserstein Distance (Mean)\")" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "distance_df.mean(axis=0)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "with open(\"../distributions/data_types/machine_code\", \"wb\") as f:\n", 221 | " pickle.dump(reference_df.mean(axis=0), f)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "Though using the medians minimizes distance overall, using the means might be a better choice. The more distant distributions (MIPSEL, MIPS64EL, ARMEL, PowerPC) are less distant to the means.\n", 229 | "\n", 230 | "
\n", 231 | "\n", 232 | "Since the above distributions are so diverse, an entropy cutoff was used to identify clusters containing code, rather than using Wasserstein distance between the distribution of byte values in the cluster and this reference distribution. Currently, this metric is still measured, but not used." 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 33, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "def plot_dist_cdf(df):\n", 242 | " for entry in list(df.index):\n", 243 | " sns.distplot(df.loc[entry],\n", 244 | " norm_hist=True, \n", 245 | " kde=False,\n", 246 | " hist_kws={'histtype':'step', 'cumulative': True, 'linewidth':2, 'alpha':1},\n", 247 | " kde_kws={'cumulative': True}, \n", 248 | " bins=256)\n", 249 | " plt.legend(list(df.index), loc=\"upper left\")\n", 250 | " plt.show()" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 46, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "def create_family_ref(mc_ref_dist, dist_1, dist_2):\n", 260 | " family_df = pd.DataFrame(index=[dist_1, dist_2],\n", 261 | " columns=[i for i in range(0,1000)])\n", 262 | " family_df.loc[dist_1] = mc_ref_dist.loc[dist_1]\n", 263 | " family_df.loc[dist_2] = mc_ref_dist.loc[dist_2]\n", 264 | " family_df_median = family_df.median(axis = 0)\n", 265 | " family_df = family_df.append(pd.Series(family_df_median, name=\"medians\"))\n", 266 | " \n", 267 | " return family_df, family_df_median" 268 | ] 269 | } 270 | ], 271 | "metadata": { 272 | "kernelspec": { 273 | "display_name": "Python 3", 274 | "language": "python", 275 | "name": "python3" 276 | }, 277 | "language_info": { 278 | "codemirror_mode": { 279 | "name": "ipython", 280 | "version": 3 281 | }, 282 | "file_extension": ".py", 283 | "mimetype": "text/x-python", 284 | "name": "python", 285 | "nbconvert_exporter": "python", 286 | "pygments_lexer": "ipython3", 287 | "version": "3.7.6" 288 | } 289 | }, 290 | "nbformat": 4, 291 | 
"nbformat_minor": 2 292 | } 293 | -------------------------------------------------------------------------------- /notebooks/Data Type Reference Distributions/The UTF-8 (English) Reference Distribution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 8, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import pickle\n", 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "import seaborn as sns\n", 15 | "from tqdm import tqdm\n", 16 | "from scipy import stats\n", 17 | "\n", 18 | "plt.rcParams['figure.figsize'] = [16, 9]\n", 19 | "sns.set_style(\"whitegrid\")" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/plain": [ 30 | "['62786-0.txt', '62819.txt', '62875-0.txt']" 31 | ] 32 | }, 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "output_type": "execute_result" 36 | } 37 | ], 38 | "source": [ 39 | "path = \"../files/utf8-english/\"\n", 40 | "files = os.listdir(path)\n", 41 | "files.remove('README.txt')\n", 42 | "files" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "text_data = {}\n", 52 | "\n", 53 | "for file in files:\n", 54 | " with open(path + file, \"rb\") as f:\n", 55 | " f.seek(10000)\n", 56 | " text_data[file] = list(f.read(50000))" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "name": "stderr", 66 | "output_type": "stream", 67 | "text": [ 68 | "100%|██████████| 1000/1000 [00:47<00:00, 20.97it/s]\n", 69 | "100%|██████████| 1000/1000 [00:48<00:00, 20.77it/s]\n", 70 | "100%|██████████| 1000/1000 [00:48<00:00, 20.69it/s]\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "file_means = {}\n", 76 | 
"file_medians = {}\n", 77 | "\n", 78 | "for file, text in text_data.items():\n", 79 | " df = pd.DataFrame(index=[i for i in range(0,1000)],\n", 80 | " columns=[i for i in range(0,1000)])\n", 81 | " for i in tqdm(range(0, 1000)):\n", 82 | " df.iloc[i] = sorted(np.random.choice(text, size=1000, replace=True))\n", 83 | " \n", 84 | " file_means[file] = df.mean(axis = 0)\n", 85 | " file_medians[file] = df.median(axis = 0)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 22, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "image/png": "\n", 96 | "text/plain": [ 97 | "
" 98 | ] 99 | }, 100 | "metadata": {}, 101 | "output_type": "display_data" 102 | } 103 | ], 104 | "source": [ 105 | "for file, sample in file_means.items():\n", 106 | " sns.distplot(round(sample),\n", 107 | " norm_hist=True, \n", 108 | " kde=False,\n", 109 | " hist_kws={'histtype':'step', 'cumulative': True, 'linewidth':2, 'alpha':1},\n", 110 | " kde_kws={'cumulative': True}, \n", 111 | " bins=256)\n", 112 | " \n", 113 | "plt.legend(file_means.keys(), loc=\"upper left\")\n", 114 | "plt.show()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 20, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "def plot_dist_hist(data):\n", 124 | " sns.distplot(data,\n", 125 | " kde=False,\n", 126 | " hist_kws={'alpha':1},\n", 127 | " bins=256)\n", 128 | " plt.show()" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 35, 134 | "metadata": {}, 135 | "outputs": [ 136 | { 137 | "data": { 138 | "image/png": "\n", 139 | "text/plain": [ 140 | "
" 141 | ] 142 | }, 143 | "metadata": {}, 144 | "output_type": "display_data" 145 | } 146 | ], 147 | "source": [ 148 | "utf8_reference = round(file_means['62875-0.txt'])\n", 149 | "plot_dist_hist(utf8_reference)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 45, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "# replace \" \" with 0\n", 159 | "\n", 160 | "for index, value in enumerate(utf8_reference):\n", 161 | " if value == 32:\n", 162 | " utf8_reference[index] = 0" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 46, 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "data": { 172 | "image/png": "\n", 173 | "text/plain": [ 174 | "
" 175 | ] 176 | }, 177 | "metadata": {}, 178 | "output_type": "display_data" 179 | } 180 | ], 181 | "source": [ 182 | "plot_dist_hist(utf8_reference)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 44, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "data": { 192 | "text/plain": [ 193 | "" 194 | ] 195 | }, 196 | "execution_count": 44, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | }, 200 | { 201 | "data": { 202 | "image/png": "\n", 203 | "text/plain": [ 204 | "
" 205 | ] 206 | }, 207 | "metadata": {}, 208 | "output_type": "display_data" 209 | } 210 | ], 211 | "source": [ 212 | "sns.distplot(utf8_reference,\n", 213 | " norm_hist=True, \n", 214 | " kde=False,\n", 215 | " hist_kws={'histtype':'step', 'cumulative': True, 'linewidth':2, 'alpha':1},\n", 216 | " kde_kws={'cumulative': True}, \n", 217 | " bins=256)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 43, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "with open(\"utf8_english\", \"wb\") as f:\n", 227 | " pickle.dump(utf8_reference, f)" 228 | ] 229 | } 230 | ], 231 | "metadata": { 232 | "kernelspec": { 233 | "display_name": "Python 3", 234 | "language": "python", 235 | "name": "python3" 236 | }, 237 | "language_info": { 238 | "codemirror_mode": { 239 | "name": "ipython", 240 | "version": 3 241 | }, 242 | "file_extension": ".py", 243 | "mimetype": "text/x-python", 244 | "name": "python", 245 | "nbconvert_exporter": "python", 246 | "pygments_lexer": "ipython3", 247 | "version": "3.7.6" 248 | } 249 | }, 250 | "nbformat": 4, 251 | "nbformat_minor": 2 252 | } 253 | -------------------------------------------------------------------------------- /notebooks/archive/readme.txt: -------------------------------------------------------------------------------- 1 | The notebooks in this directory have not been updated to reflect changes made to the code. 
2 | -------------------------------------------------------------------------------- /scripts/basic_DBSCAN_clustering.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path[0:0] = ['.', '..'] 3 | 4 | from centrifuge.binfile import BinFile 5 | 6 | def cluster(): 7 | with open("/bin/ls", "rb") as f: 8 | binfile = BinFile(f) 9 | binfile.slice_file() 10 | 11 | binfile.cluster_DBSCAN(0.9, 10, find_optimal_epsilon=True) 12 | binfile.plot_DBSCAN_results() 13 | 14 | if __name__=="__main__": 15 | cluster() 16 | -------------------------------------------------------------------------------- /scripts/entropy_plot.py: -------------------------------------------------------------------------------- 1 | 2 | #!~/anaconda3/bin/python3 3 | 4 | 5 | import sys 6 | sys.path[0:0] = ['.', '..'] 7 | 8 | from centrifuge.binfile import BinFile 9 | 10 | def visualize_file_entropy(): 11 | with open("/bin/bash", "rb") as f: 12 | binfile = BinFile(f) 13 | binfile.slice_file() 14 | binfile.plot_file_entropy() 15 | 16 | if __name__ == "__main__": 17 | visualize_file_entropy() 18 | -------------------------------------------------------------------------------- /scripts/entropy_plot_text_section.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path[0:0] = ['.', '..'] 3 | 4 | from centrifuge.binfile import BinFile 5 | 6 | def label_text_section(): 7 | with open("/bin/bash", "rb") as f: 8 | binfile = BinFile(f) 9 | binfile.slice_file() 10 | binfile.plot_file_entropy(0x2cbc0, 0x2cbc0 + 0xa2c02) 11 | 12 | if __name__ == "__main__": 13 | label_text_section() 14 | 15 | -------------------------------------------------------------------------------- /scripts/identify_clusters.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path[0:0] = ['.', '..'] 3 | 4 | from centrifuge.binfile import BinFile 5 | 6 | def cluster(): 7 | with 
open("/bin/ls", "rb") as f: 8 | binfile = BinFile(f) 9 | binfile.slice_file() 10 | binfile.cluster_DBSCAN(0.9, 10, find_optimal_epsilon=False) 11 | results = binfile.identify_cluster_data_types() 12 | print(results) 13 | 14 | if __name__=="__main__": 15 | cluster() 16 | 17 | -------------------------------------------------------------------------------- /scripts/plot_all_variables.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path[0:0] = ['.', '..'] 3 | 4 | from centrifuge.binfile import BinFile 5 | 6 | 7 | def plot_all(): 8 | with open("/bin/bash", "rb") as f: 9 | binfile = BinFile(f) 10 | binfile.slice_file() 11 | 12 | binfile.show_scatter_matrix() 13 | print(binfile.file_data_frame) 14 | 15 | 16 | if __name__ == "__main__": 17 | plot_all() 18 | 19 | -------------------------------------------------------------------------------- /scripts/plot_cluster_cdfs.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | sns.set_style("whitegrid") 4 | 5 | import sys 6 | sys.path[0:0] = ['.', '..'] 7 | 8 | from centrifuge.binfile import BinFile 9 | 10 | def cluster(): 11 | with open("/bin/ls", "rb") as f: 12 | binfile = BinFile(f) 13 | binfile.slice_file() 14 | binfile.cluster_DBSCAN(0.9, 5, find_optimal_epsilon=False) 15 | #binfile.plot_DBSCAN_results() 16 | 17 | binfile.plot_cluster_cdfs() 18 | 19 | 20 | if __name__=="__main__": 21 | cluster() 22 | -------------------------------------------------------------------------------- /scripts/plot_two_variables.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path[0:0] = ['.', '..'] 3 | 4 | from centrifuge.binfile import BinFile 5 | 6 | def plot(): 7 | with open("/bin/bash", "rb") as f: 8 | binfile = BinFile(f) 9 | binfile.slice_file() 10 | 11 | binfile.plot_variables_by_range(binfile.block_entropy_levels, 12 | 
binfile.block_byteval_std_dev, 13 | 0x2cbc0, 0x2cbc0 + 0xa2c02, 14 | title="bash", 15 | xlabel="entropy", 16 | ylabel="byte value standard deviation") 17 | 18 | if __name__ == "__main__": 19 | plot() 20 | -------------------------------------------------------------------------------- /scripts/readme.txt: -------------------------------------------------------------------------------- 1 | example: 2 | 3 | $ python3 plot_all_variables.py 4 | -------------------------------------------------------------------------------- /scripts/small_elf.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | sns.set_style("whitegrid") 4 | 5 | import sys 6 | sys.path[0:0] = ['.', '..'] 7 | 8 | from centrifuge.binfile import BinFile 9 | 10 | def cluster(): 11 | with open("/bin/cat", "rb") as f: 12 | cat = BinFile(f) 13 | cat.slice_file() 14 | cat.cluster_DBSCAN(1, 3, find_optimal_epsilon=True) 15 | cat.plot_DBSCAN_results() 16 | 17 | cat.identify_cluster_data_types() 18 | 19 | if __name__=="__main__": 20 | cluster() 21 | --------------------------------------------------------------------------------