├── redwood ├── __init__.py ├── io │ ├── __init__.py │ └── csv_importer.py ├── helpers │ ├── __init__.py │ ├── visual.py │ └── core.py ├── shell │ ├── __init__.py │ ├── controller.py │ └── modes.py ├── connection │ ├── __init__.py │ └── connect.py ├── foundation │ ├── __init__.py │ ├── aggregator.py │ ├── report.py │ └── prevalence.py └── filters │ ├── __init__.py │ └── redwood_filter.py ├── .gitignore ├── CHANGELOG.md ├── MAINTAINERS.md ├── CONTRIBUTING.md ├── images ├── clustering.png ├── discovery.png ├── histogram0.png ├── redwood_0.png └── logo │ ├── favicon.ico │ ├── redwood_logo.png │ ├── redwood_logo.xcf │ └── license │ ├── README.txt │ ├── clker_tos.txt │ └── SIL_open_font_license.txt ├── reports └── resources │ ├── images │ ├── bot_left.png │ ├── top_left.png │ ├── bot_right.png │ ├── top_right.png │ ├── redwood_logo.png │ ├── bot_left_light.png │ ├── bot_right_light.png │ ├── top_left_light.png │ └── top_right_light.png │ └── css │ └── style.css ├── MANIFEST.in ├── AUTHORS ├── sql ├── filewalk.sh ├── create_redwood_db.sql ├── synthesize_data.sh ├── create_redwood_sp.sql └── filewalk.py ├── LICENSE.txt ├── bin └── redwood ├── setup.py ├── Filters ├── filenames.py ├── locality_uniqueness.py └── filter_prevalence.py ├── docs └── conf.py └── README.md /redwood/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /redwood/io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /redwood/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /redwood/shell/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | -------------------------------------------------------------------------------- /redwood/connection/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /redwood/foundation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | Current release is 0.1.0 2 | -------------------------------------------------------------------------------- /MAINTAINERS.md: -------------------------------------------------------------------------------- 1 | Paul M 2 | -------------------------------------------------------------------------------- /redwood/filters/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | filter_list = list() 3 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/CONTRIBUTING.md -------------------------------------------------------------------------------- /images/clustering.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/images/clustering.png -------------------------------------------------------------------------------- /images/discovery.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/images/discovery.png -------------------------------------------------------------------------------- /images/histogram0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/images/histogram0.png -------------------------------------------------------------------------------- /images/redwood_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/images/redwood_0.png -------------------------------------------------------------------------------- /images/logo/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/images/logo/favicon.ico -------------------------------------------------------------------------------- /images/logo/redwood_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/images/logo/redwood_logo.png -------------------------------------------------------------------------------- /images/logo/redwood_logo.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/images/logo/redwood_logo.xcf -------------------------------------------------------------------------------- /reports/resources/images/bot_left.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/reports/resources/images/bot_left.png -------------------------------------------------------------------------------- /reports/resources/images/top_left.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/reports/resources/images/top_left.png -------------------------------------------------------------------------------- /reports/resources/images/bot_right.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/reports/resources/images/bot_right.png -------------------------------------------------------------------------------- /reports/resources/images/top_right.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/reports/resources/images/top_right.png -------------------------------------------------------------------------------- /reports/resources/images/redwood_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/reports/resources/images/redwood_logo.png -------------------------------------------------------------------------------- /reports/resources/images/bot_left_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/reports/resources/images/bot_left_light.png 
-------------------------------------------------------------------------------- /reports/resources/images/bot_right_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/reports/resources/images/bot_right_light.png -------------------------------------------------------------------------------- /reports/resources/images/top_left_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/reports/resources/images/top_left_light.png -------------------------------------------------------------------------------- /reports/resources/images/top_right_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/reports/resources/images/top_right_light.png -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS 2 | include *.md 3 | recursive-include docs *.md *.png *.rst *.bat *.pickle *.doctree *.html *.inv *.js *.txt *.gif *.css *.1 *.py 4 | recursive-include sql *.sh *.sql 5 | include docs/Makefile 6 | -------------------------------------------------------------------------------- /images/logo/license/README.txt: -------------------------------------------------------------------------------- 1 | Images in logo are modified versions of public domain clipart from clker.com: 2 | - "Tree Silhouettes" by stephen foley 3 | - "Tree" by Zeta 4 | 5 | Logo font is "Lobster Two," by Pablo Impallari, from Google Webfonts and licensed under the SIL Open Font License. 6 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | # This file lists all individuals having contributed content to the repository. 2 | # If you're submitting a patch, please add your name here in alphabetical order as part of the patch. 3 | # 4 | # For a list of active project maintainers, see the MAINTAINERS file. 5 | # 6 | Charlie Lewis 7 | Paul M 8 | -------------------------------------------------------------------------------- /sql/filewalk.sh: -------------------------------------------------------------------------------- 1 | echo "contents_hash,dirname,basename,inode,device,permissions,user_owner,group_owner,last_accessed,last_modified,last_changed,inode_birth,user_flags,links_to_file,size" > filewalk; sudo find / -type f -exec sh -c 'A=$(shasum "$0" | cut -d" " -f1-2 | tr -d " ") ; DIR="$(dirname "$0")/"; BASE=$(basename "$0"); B=$(stat -f "%i,%d,%p,%Su,%Sg,%a,%m,%c,%B,%f,%l,%z" "$0"); echo $A,$DIR,$BASE,$B >> filewalk ; ' {} \; 2 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /bin/redwood: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import exceptions 4 | import cmd 5 | import getopt 6 | import multiprocessing 7 | import os 8 | import string 9 | import sys 10 | import ConfigParser 11 | import MySQLdb 12 | import redwood.filters 13 | import redwood.helpers.core as core 14 | import redwood.connection.connect as rconn 15 | 16 | from redwood.shell.controller import SessionController 17 | 18 | 19 | def main(argv): 20 | 21 | if(len(argv) != 1 and len(argv) != 2): 22 | print "Please provide database configuration file" 23 | sys.exit(1) 24 | 25 | print '\033[1;31m\n\n#################################\nWelcome to Redwood\n#################################\n\033[1;m' 26 | 27 | print "Establishing connection to database...\n", 28 | print "...running with {} cores".format(multiprocessing.cpu_count()) 29 | 30 | cnx = rconn.connect_with_config(argv[0]) 31 | 32 | 33 | #import the filters 34 | if(len(argv) == 2): 35 | core.import_filters(argv[1], cnx) 36 | 37 | sc = SessionController(cnx) 38 | sc.preloop() 39 | sc.cmdloop() 40 | 41 | cnx.close() 42 | 43 | if __name__ == "__main__": 44 | main(sys.argv[1:]) 45 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup 4 | from setuptools.command.install import install 5 | import os 6 | 7 | class MyInstall(install): 8 | 9 | def run(self): 10 | install.run(self) 11 | print "Installing dependencies" 12 | 13 | setup( 14 | name='RedwoodUtility', 15 | version='0.1.0', 16 | author='Lab41', 17 | author_email='paulm@lab41.org', 18 | description='A project that implements statistical methods for identifying anomalous files.', 19 | url='http://lab41.github.io/Redwood', 20 | packages=['redwood', 'redwood.filters', 'redwood.shell','redwood.io','redwood.helpers', 'redwood.connection', 'redwood.foundation'], 21 | scripts=['bin/redwood'], 22 | license='LICENSE.txt', 23 | long_description=open('README.md').read(), 24 | keywords='redwood stats statistics anomalies'.split(), 25 | cmdclass={'install': MyInstall}, 26 | classifiers=[ 27 | 'Programming Language :: Python', 28 | 'Operating System :: POSIX :: Linux', 29 | 'Topic :: Software Development :: Libraries :: Application Frameworks', 30 | 'Environment :: Other Environment' 31 | ], 32 | data_files=[ 33 | ('', ['LICENSE.txt']) 34 | ], 35 | install_requires=[ 36 | 'numpy', 37 | 'scipy', 38 | ] 39 | ) 40 | -------------------------------------------------------------------------------- /redwood/connection/connect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | """
19 | Created on 19 October 2013
20 | @author: Lab41
21 |
22 | This package provides connection functionality to a redwood MySQL db
23 | """
24 |
25 |
26 | import sys
27 | import os
28 | import getopt
29 | import string
30 | import MySQLdb
31 | import exceptions
32 | import ConfigParser
33 |
34 |
35 | def connect_with_config(config_path):
36 |     """
37 |     Given a path, returns a connection object
38 |
39 |     :param config_path: path to the configuration file
40 |
41 |     :return: MySQL connection object, or None on failure
42 |     """
43 |
44 |     cnx = None
45 |
46 |     if config_path is None:
47 |         print "Error: A config file must be provided"
48 |         return cnx
49 |
50 |     try:
51 |         with open(config_path): pass
52 |     except IOError:
53 |         print ('Error: Configuration file \'{}\' not found'.format(config_path))
54 |         return cnx
55 |
56 |     config = ConfigParser.RawConfigParser()
57 |     config.read(config_path)
58 |     user = config.get("mysqld", "username")
59 |     password = config.get("mysqld", "password")
60 |     host = config.get("mysqld", "host")
61 |     database = config.get("mysqld", "database")
62 |     try:
63 |         port = int(config.get("mysqld", "port"))
64 |     except (ConfigParser.Error, ValueError):
65 |         port = 3306
66 |
67 |     try:
68 |
69 |         cnx = MySQLdb.connect(host=host, user=user, passwd=password, db=database, port=port, local_infile=1)
70 |     except MySQLdb.Error as e:
71 |         print(e)
72 |         return None
73 |
74 |     if cnx is None:
75 |         print "Error: Unable to connect to database"
76 |         return None
77 |
78 |     return cnx
79 |
--------------------------------------------------------------------------------
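connect_with_config reads an INI-style file with a [mysqld] section. A minimal example of such a config (values are placeholders; port is optional and falls back to 3306):

    [mysqld]
    username = redwood
    password = secret
    host = localhost
    database = redwood_db
    port = 3306
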
"{}".format(f.name) 55 | else: 56 | print "No filters found" 57 | 58 | def do_load_csv(self, line): 59 | '''[*] load_csv 60 | |-[path] - path where csv files exist or a path to a csv file 61 | ''' 62 | try: 63 | csv_load.run(self.cnx, line) 64 | except Exception as e: 65 | print "Error occurred {}".format(e) 66 | return 67 | 68 | def do_quit(self, line): 69 | '''quit: Exit the redwood console''' 70 | if self.cnx != None: 71 | self.cnx.close() 72 | sys.stdout.write('\n') 73 | print "quitting" 74 | sys.exit(0) 75 | return True 76 | -------------------------------------------------------------------------------- /redwood/helpers/visual.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """ 18 | Created on 19 October 2013 19 | @author: Lab41 20 | 21 | Helper functions for creating visualizations 22 | """ 23 | import array 24 | import matplotlib.pyplot as plt 25 | import numpy as np 26 | import matplotlib 27 | 28 | 29 | def visualize_scatter(counts, codes, data, codebook, num_clusters, xlabel="", ylabel="", title=""): 30 | """ 31 | Generates a 2-d scatter plot visualization of two feature data for 32 | 33 | :param counts: dictionary of counts for the number of observations pairs for 34 | each cluster 35 | :param codes: list of codes for each observation row in the order returned by the original query 36 | :param data: list of observations returned from query in their original order 37 | :param codebook: the coordinates of the centroids 38 | :param num_clusters: number of specified clusters up to 8 39 | :param xlabel: a label for the x axis (Default: None) 40 | :param ylabel: a label for the y axis (Default: None) 41 | """ 42 | if num_clusters > 8: 43 | print "Visualize scatter only supports up to 8 clusters" 44 | return 45 | 46 | num_features = 2 47 | list_arrays = list() 48 | list_arr_idx = array.array("I", [0, 0, 0]) 49 | 50 | for idx in range(num_clusters): 51 | list_arrays.append(np.zeros((counts[idx], num_features))) 52 | 53 | 54 | for i, j in zip(codes, data): 55 | 56 | list_arrays[i][list_arr_idx[i]][0] = j[0] 57 | list_arrays[i][list_arr_idx[i]][1] = j[1] 58 | list_arr_idx[i] += 1 59 | 60 | #plot the clusters first as relatively larger circles 61 | plt.scatter(codebook[:,0], codebook[:,1], color='orange', s=260) 62 | 63 | colors = ['red', 'blue', 'green', 'purple', 'cyan', 'black', 'brown', 'grey'] 64 | 65 | for idx in range(num_clusters): 66 | plt.scatter(list_arrays[idx][:,0], list_arrays[idx][:,1], c=colors[idx]) 67 | 68 | plt.title(title) 69 | plt.ylabel(ylabel) 70 | plt.xlabel(xlabel) 71 | plt.show() 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /redwood/foundation/aggregator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # 
Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | """
19 | Created on 19 October 2013
20 | @author: Lab41
21 | """
22 |
23 |
24 | class Aggregator():
25 |
26 |     def __init__(self, cnx):
27 |         self.cnx = cnx
28 |
29 |
30 |     def aggregate(self, filter_list, dist_str=None):
31 |         '''
32 |         dist_str should come in as a:x, b:y, c:z, etc., where x+y+z = 100 and a-c are filter ids;
33 |         the standard aggregate is equally weighted
34 |         '''
35 |
36 |         weights = list()
37 |         #TODO: make the dup_list a dict
38 |         dup_list = list()
39 |
40 |         if dist_str is not None:
41 |             if len(dist_str) != len(filter_list):
42 |                 print "The number of loaded filters ({}) does not equal the number of provided weights ({})".format(len(filter_list), len(dist_str))
43 |                 return
44 |             try:
45 |                 for s in dist_str:
46 |                     p = s.split(':')
47 |                     filter_id = int(p[0])
48 |                     percent = float(p[1])
49 |
50 |                     if filter_id in dup_list:
51 |                         print "Error: Multiple weights entered for filter with id {}".format(filter_id)
52 |                         return
53 |                     dup_list.append(filter_id)
54 |                     weights.append((filter_id, percent / float(100)))
55 |
56 |                 if abs(sum([w[1] for w in weights]) - 1.0) > 1e-9:  #tolerance avoids float rounding issues
57 |                     print "The filter weights must total 100"
58 |                     return
59 |             except:
60 |                 print "There was an error with your syntax, try again"
61 |                 return
62 |         else:
63 |             i = 0
64 |             even_split = 1 / float(len(filter_list))
65 |             for f in filter_list:
66 |                 weights.append((i, even_split))
67 |                 i += 1
68 |
69 |         query = "UPDATE unique_file\n"
70 |
71 |         #now create the query
72 |
73 |         for w in weights:
74 |             fltr = filter_list[w[0]]
75 |             print "{} weight -> {}".format(fltr.name, w[1])
76 |             query += "LEFT JOIN " + fltr.score_table + " ON " + fltr.score_table + ".id = unique_file.id\n"
77 |
78 |         query += "SET unique_file.reputation = ("
79 |
80 |         for filter_id, weight in weights:
81 |             fltr = filter_list[filter_id]
82 |             query += "{} * {}.score + ".format(weight, fltr.score_table)
83 |
84 |         #remove the last +
85 |         query = query[0:len(query)-3]
86 |         query += ")"
87 |         cursor = self.cnx.cursor()
88 |         cursor.execute(query)
89 |         self.cnx.commit()
90 |         cursor.close()
91 |
--------------------------------------------------------------------------------
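For two filters weighted 50/50, the aggregate() method above builds and executes SQL along these lines (the score-table names are illustrative):

    UPDATE unique_file
    LEFT JOIN filter_a_scores ON filter_a_scores.id = unique_file.id
    LEFT JOIN filter_b_scores ON filter_b_scores.id = unique_file.id
    SET unique_file.reputation = (0.5 * filter_a_scores.score + 0.5 * filter_b_scores.score)
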
/sql/create_redwood_db.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE IF NOT EXISTS os (
2 |     id INT UNSIGNED NOT NULL AUTO_INCREMENT,
3 |     name VARCHAR(150) NOT NULL,
4 |     PRIMARY KEY (id),
5 |     UNIQUE INDEX name_UNIQUE (name ASC)
6 | ) ENGINE=InnoDB;
7 |
8 | CREATE TABLE IF NOT EXISTS media_source (
9 |     id INT UNSIGNED NOT NULL AUTO_INCREMENT,
10 |     reputation INT NULL,
11 |     name VARCHAR(150) NULL,
12 |     date_acquired DATETIME NULL,
13 |     os_id INT UNSIGNED NOT NULL,
14 |     PRIMARY KEY (id),
15 |     UNIQUE INDEX name_UNIQUE (name ASC),
16 |     CONSTRAINT fk_os_id FOREIGN KEY (os_id)
17 |         REFERENCES os (id)
18 |         ON DELETE NO ACTION ON UPDATE NO ACTION
19 | ) ENGINE=InnoDB;
20 |
21 | CREATE TABLE IF NOT EXISTS unique_file (
22 |     id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
23 |     hash CHAR(40) NOT NULL,
24 |     reputation DOUBLE NOT NULL DEFAULT .5,
25 |     status INT DEFAULT 0,
26 |     PRIMARY KEY (id),
27 |     INDEX file_reputation (reputation ASC),
28 |     UNIQUE INDEX hash_UNIQUE (hash ASC)
29 | ) ENGINE=InnoDB;
30 |
31 |
32 | CREATE TABLE IF NOT EXISTS unique_path (
33 |     id INT UNSIGNED NOT NULL AUTO_INCREMENT,
34 |     full_path VARCHAR(4096) NOT NULL,
35 |     path_hash CHAR(40) NULL,
36 |     PRIMARY KEY (id),
37 |     UNIQUE INDEX path_hash_UNIQUE (path_hash ASC)
38 | ) ENGINE=InnoDB;
39 |
40 |
41 | CREATE TABLE IF NOT EXISTS file_metadata (
42 |     id BIGINT UNSIGNED UNIQUE NOT NULL,
43 |     unique_file_id BIGINT UNSIGNED NULL,
44 |     source_id INT UNSIGNED NOT NULL,
45 |     unique_path_id INT UNSIGNED NOT NULL,
46 |     parent_id BIGINT UNSIGNED NULL,
47 |     file_name VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,
48 |     filesystem_id INT UNSIGNED NULL DEFAULT NULL,
49 |     device_id INT NULL DEFAULT NULL,
50 |     attributes INT NULL DEFAULT NULL,
51 |     user_owner VARCHAR(45) NULL DEFAULT NULL,
52 |     group_owner VARCHAR(45) NULL DEFAULT NULL,
53 |     size INT UNSIGNED NULL DEFAULT NULL,
54 |     created DATETIME NULL DEFAULT NULL,
55 |     last_accessed DATETIME NULL DEFAULT NULL,
56 |     last_modified DATETIME NULL DEFAULT NULL,
57 |     last_changed DATETIME NULL DEFAULT NULL,
58 |     user_flags INT NULL DEFAULT NULL,
59 |     links_to_file INT NULL DEFAULT NULL,
60 |     disk_offset BIGINT NULL,
61 |     entropy TINYINT NULL,
62 |     file_content_status TINYINT NULL,
63 |     extension VARCHAR(32) NULL,
64 |     file_type VARCHAR(64) NULL,
65 |     os_id INT UNSIGNED NOT NULL,
66 |     INDEX fk_source_id_idx USING BTREE (source_id ASC),
67 |     CONSTRAINT fk_source_id FOREIGN KEY (source_id)
68 |         REFERENCES media_source (id)
69 |         ON DELETE NO ACTION ON UPDATE NO ACTION,
70 |     INDEX fk_unique_file_id_idx (unique_file_id ASC),
71 |     CONSTRAINT fk_unique_file_id FOREIGN KEY (unique_file_id)
72 |         REFERENCES unique_file (id)
73 |         ON DELETE NO ACTION ON UPDATE NO ACTION,
74 |     INDEX fk_unique_path_idx (unique_path_id ASC),
75 |     CONSTRAINT fk_unique_path FOREIGN KEY (unique_path_id)
76 |         REFERENCES unique_path (id)
77 |         ON DELETE NO ACTION ON UPDATE NO ACTION,
78 |     UNIQUE INDEX source_id_unique_path_id_file_name_idx USING BTREE (unique_path_id ASC , file_name ASC , source_id ASC),
79 |     INDEX fk_os_id_idx (os_id ASC),
80 |     CONSTRAINT fk_os_id2 FOREIGN KEY (os_id)
81 |         REFERENCES os (id)
82 |         ON DELETE NO ACTION ON UPDATE NO ACTION,
83 |     INDEX parent_id_idx USING BTREE (parent_id ASC),
84 |     INDEX file_name_idx USING BTREE (file_name ASC)
85 | ) ENGINE=InnoDB;
86 |
87 |
--------------------------------------------------------------------------------
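A typical read against this schema joins the three core tables to resolve a file's reputation, path, and name; for example, listing the lowest-reputation files for a source (a sketch; the source id is arbitrary):

    SELECT uf.reputation, up.full_path, fm.file_name
    FROM unique_file uf
    LEFT JOIN file_metadata fm ON fm.unique_file_id = uf.id
    LEFT JOIN unique_path up ON up.id = fm.unique_path_id
    WHERE fm.source_id = 1
    ORDER BY uf.reputation ASC
    LIMIT 10;
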
/sql/synthesize_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SOURCES_ROOT=sources
4 | DIR_BASE=Home
5 | DIR_TEMPLATE=${DIR_BASE}_0
6 | NUM_SUBDIRS=10
7 | NUM_BASE_FILES=30
8 | NUM_SOURCES=20
9 | NUM_SUBDIR_FILES=10
10 |
11 |
12 |
13 |
14 |
15 |
16 | function genAnomalyPrevalence {
17 |
18 |     ########## PREVALENCE ANOMALIES ################
19 |
20 |     #now add the anomalies
21 |     NUM_PREVALENCE_ANOMALIES=5
22 |
23 |     i=0
24 |
25 |     while [ $i -lt $NUM_PREVALENCE_ANOMALIES ]; do
26 |         source_name=$((RANDOM % NUM_SOURCES))
27 |         sub_dir=$((RANDOM % NUM_SUBDIRS))
28 |         dd bs=100 count=10 if=/dev/urandom of=${SOURCES_ROOT}/${DIR_BASE}_${source_name}/${sub_dir}A_DIR/anom_${i} &>/dev/null
29 |         i=$[$i+1]
30 |     done
31 |
32 | }
33 |
34 |
35 | function genAnomalyLocality {
36 |
37 |     echo "Generating anomalies for Locality Uniqueness"
38 |
39 |     ############ LOCALITY UNIQUENESS ANOMALIES ########################
40 |
41 |     #since loc unq is currently using time
42 |     sleep 3
43 |
44 |     NUM_LOCUNQ_ANOMALIES=5
45 |
46 |     i=0
47 |
48 |     while [ $i -lt $NUM_LOCUNQ_ANOMALIES ]; do
49 |         source_name=$((RANDOM % NUM_SOURCES))
50 |         sub_dir=$((RANDOM % NUM_SUBDIRS))
51 |         dd bs=100 count=10 if=/dev/urandom of=${SOURCES_ROOT}/${DIR_BASE}_${source_name}/${sub_dir}A_DIR/newer_anom_${i} &>/dev/null
52 |         i=$[$i+1]
53 |     done
54 | }
55 |
56 |
57 | function genAnomalyFileName {
58 |
59 |     echo "Generating anomalies for file name"
60 |
61 |     ############# FILE NAME ANOMALY ###############################
62 |
63 |     i=0
64 |     NUM_NAME_ANOMALIES=3
65 |
66 |     while [ $i -lt $NUM_NAME_ANOMALIES ]; do
67 |
68 |         #anomaly 1
69 |         source_id=$((RANDOM % NUM_SOURCES))
70 |         base_file=$((RANDOM % NUM_BASE_FILES))
71 |
72 |
73 |         mv ${SOURCES_ROOT}/${DIR_BASE}_${source_id}/file_${base_file} ${SOURCES_ROOT}/${DIR_BASE}_${source_id}/diff_name_${base_file}
74 |         i=$[$i+1]
75 |
76 |     done
77 |
78 | }
79 |
80 |
81 |
82 |
83 |
84 | rm -rf $SOURCES_ROOT
85 | mkdir $SOURCES_ROOT
86 | mkdir ${SOURCES_ROOT}/${DIR_TEMPLATE}
87 |
88 | i=0
89 |
90 | #make subdir level A
91 | while [ $i -lt ${NUM_SUBDIRS} ]; do
92 |     mkdir "${SOURCES_ROOT}/${DIR_TEMPLATE}/${i}A_DIR"
93 |     i=$[$i+1]
94 | done
95 |
96 |
97 | i=0
98 |
99 | while [ $i -lt $NUM_BASE_FILES ]; do
100 |     dd bs=32 count=10 if=/dev/urandom of="${SOURCES_ROOT}/${DIR_TEMPLATE}/file_${i}" &>/dev/null
101 |     i=$[$i+1]
102 | done
103 |
104 | i=0
105 |
106 | while [ $i -lt $NUM_SUBDIR_FILES ]; do
107 |     num=$((RANDOM % NUM_SUBDIRS))
108 |     dd bs=50 count=10 if=/dev/urandom of="${SOURCES_ROOT}/${DIR_TEMPLATE}/${num}A_DIR/f_${i}" &>/dev/null
109 |     i=$[$i+1]
110 | done
111 |
112 |
113 | i=1
114 |
115 | while [ $i -lt $(( NUM_SOURCES )) ]; do
116 |
117 |     cp -rf ${SOURCES_ROOT}/${DIR_TEMPLATE} ${SOURCES_ROOT}/${DIR_BASE}_${i}
118 |     i=$[$i+1]
119 | done
120 |
121 |
122 | #now create the anomalies
123 | genAnomalyFileName
124 | genAnomalyLocality
125 | genAnomalyPrevalence
126 |
127 | i=0
128 |
129 |
130 | #go ahead and create the csvs
131 | if [ -e filewalk.py ]; then
132 |
133 |     rm -rf csv
134 |     mkdir csv
135 |
136 |     while [ $i -lt ${NUM_SOURCES} ]; do
137 |         loc="${SOURCES_ROOT}/${DIR_BASE}_${i}"
138 |         python filewalk.py $loc test_os source_${i} csv
139 |         i=$[$i+1]
140 |     done
141 | fi
142 |
143 | echo "done"
144 |
145 |
--------------------------------------------------------------------------------
/reports/resources/css/style.css:
--------------------------------------------------------------------------------
1 | .redwood-title
2 | {
3 |     font-size: 40px;
4 |     text-indent: 5px;
5 | }
6 | .redwood-header
7 | {
8 |     background-color: orange;
9 |     text-indent: 5px;
10 | }
11 |
12 | #redwood-table
13 | {
14 |     font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif;
15 |     font-size: 12px;
16 |     margin: 15px;
17 |     width: 480px;
18 |     text-align: left;
19 |     border-collapse: collapse;
20 |     border-style: hidden;
21 | }
22 | #redwood-table caption.caption
23 | {
24 |     text-align: left;
25 |     font-size: 25px;
26 |     font-weight: bold;
27 | }
28 | #redwood-table thead th.rounded-head-left
29 | {
30 |     background: orange url('../images/top_left.png') left -1px no-repeat;
31 | }
32 | #redwood-table thead th.rounded-head-left-light
33 | {
34 |     background: #FFE0B2 url('../images/top_left_light.png') left -1px no-repeat;
35 | }
36 | #redwood-table thead th.rounded-head-right
37 | {
38 |     background: orange url('../images/top_right.png') right -1px no-repeat;
39 | }
40 | #redwood-table thead th.rounded-head-right-light
41 | {
42 |     background: #FFE0B2 url('../images/top_right_light.png') right -1px no-repeat;
43 | }
44 | #redwood-table th
45 | {
46 |     padding: 8px;
47 |     font-weight: normal;
48 |     font-size: 13px;
49 |     color: black;
50 |     background: orange;
51 | }
52 | #redwood-table th.score-divider
53 | {
54 |     background: orange;
55 | }
56 | #redwood-table th.count-divider
57 | {
58 |     /*border-right: 2px solid;*/
59 |     background: #FFE0B2;
60 | }
61 | #redwood-table td
62 | {
63 |     padding: 8px;
64 |     background: #FFE0B2;
65 |     border-top: 1px solid #fff;
66 |     color: black;
67 | }
68 | #redwood-table td.score-divider
69 | {
70 |     background: orange;
71 | }
72 | #redwood-table td.count-divider
73 | {
74 |     /*border-right: 2px solid;*/
75 | }
76 | #redwood-table tfoot td.rounded-foot-left
77 | {
78 |     background: orange url('../images/bot_left.png') left bottom no-repeat;
79 | }
80 | #redwood-table tfoot td.rounded-foot-left-light
81 | {
82 |     background: #FFE0B2 url('../images/bot_left_light.png') left bottom no-repeat;
83 | }
84 | #redwood-table tfoot td.rounded-foot-right
85 | {
86 |     background: orange url('../images/bot_right.png') right bottom no-repeat;
87 | }
88 | #redwood-table tfoot td.rounded-foot-right-light
89 | {
90 |     background: #FFE0B2 url('../images/bot_right_light.png') right bottom no-repeat;
91 | }
92 | #redwood-table tbody tr:hover td
93 | {
94 |     background: #FFC266;
95 | }
96 | div#top
97 | {
98 |     margin-left: 225px;
99 | }
100 | div#navigation
101 | {
102 |     display: inline-block;
103 |     position: absolute;
104 |     /*float: left/right;*/
105 |     width: 200px;
106 |     height: 97.9%;
107 |     top: 15%;
108 |     right: auto;
109 |     bottom: 100px;
110 |     left: 0;
111 |     color: #ffffff;
112 |     background-color: #FFE0B2;
113 |     background-image: url('../images/top_left_light.png'), url('../images/top_right_light.png'), url('../images/bot_right_light.png'), url('../images/bot_left_light.png');
114 |     background-position: left top, right top, right bottom, left bottom;
115 |     background-repeat: no-repeat;
116 |     padding: 5px;
117 | }
118 | div#content
119 | {
120 |     margin-left: 210px;
121 | }
122 | div#navigation .center
123 | {
124 |     width: 150px;
125 |     height: 150px;
126 |     display: block;
127 |     margin-left: auto;
128 |     margin-right: auto;
129 | }
130 | div.a
131 | {
132 |     display: block;
133 |     width: 60px;
134 | }
135 | dl.list
136 | {
137 |     text-align: center;
138 | }
139 | .button
140 | {
141 |     width: 180px;
142 |     height: 25px;
143 |     background: orange;
144 |     padding: 10px;
145 |     border-radius: 5px;
146 |     color: black;
147 |     font-size: 20px;
148 |     font-weight: bold;
149 |     margin-bottom: 5px;
150 | }
151 | ul#navigation
152 | {
153 |     margin: 0;
154 |     margin-top: 10px;
155 |     padding: 0;
156 |     width: 300px;
157 | }
158 | ul#navigation li
159 | {
160 |     list-style-type: none;
161 | }
162 | a
163 | {
164 |     display: block;
165 |     padding: 10px 20px;
166 |     width: 60px;
167 |     text-decoration: none;
168 | }
169 |
--------------------------------------------------------------------------------
/sql/create_redwood_sp.sql:
--------------------------------------------------------------------------------
1 | DROP PROCEDURE IF EXISTS map_staging_table;
2 |
3 | DELIMITER //
4 | CREATE PROCEDURE map_staging_table(IN source_id INT, IN os_id INT)
5 | BEGIN
6 |     INSERT INTO `unique_file` (hash)
7 |         SELECT DISTINCT contents_hash
8 |         FROM `staging_table` where basename != "/" and LENGTH(contents_hash) > 0
9 |     ON DUPLICATE KEY UPDATE hash =
hash; 10 | INSERT IGNORE INTO `unique_path` (full_path, path_hash) 11 | SELECT dirname, dirname_hash 12 | FROM `staging_table`; 13 | INSERT IGNORE INTO `file_metadata` 14 | (id, 15 | unique_file_id, 16 | source_id, 17 | unique_path_id, 18 | parent_id, 19 | file_name, 20 | filesystem_id, 21 | device_id, 22 | attributes, 23 | user_owner, 24 | group_owner, 25 | size, 26 | created, 27 | last_accessed, 28 | last_modified, 29 | last_changed, 30 | user_flags, 31 | links_to_file, 32 | disk_offset, 33 | entropy, 34 | file_content_status, 35 | extension, 36 | file_type, 37 | os_id) 38 | SELECT 39 | staging_table.global_file_id, 40 | unique_file.id, 41 | source_id, 42 | unique_path.id, 43 | staging_table.parent_id, 44 | staging_table.basename, 45 | staging_table.filesystem_id, 46 | staging_table.device_id, 47 | staging_table.attributes, 48 | staging_table.user_owner, 49 | staging_table.group_owner, 50 | staging_table.size, 51 | staging_table.created, 52 | staging_table.last_accessed, 53 | staging_table.last_modified, 54 | staging_table.last_changed, 55 | staging_table.user_flags, 56 | staging_table.links_to_file, 57 | staging_table.disk_offset, 58 | staging_table.entropy, 59 | staging_table.file_content_status, 60 | staging_table.extension, 61 | staging_table.file_type, 62 | os_id 63 | FROM `staging_table` 64 | LEFT JOIN `unique_file` 65 | ON (staging_table.contents_hash = unique_file.hash) 66 | LEFT JOIN `unique_path` 67 | ON (staging_table.dirname_hash = unique_path.path_hash); 68 | END // 69 | DELIMITER ; 70 | 71 | 72 | DROP VIEW IF EXISTS joined_file_metadata; 73 | 74 | CREATE VIEW `joined_file_metadata` AS 75 | SELECT 76 | `file_metadata`.id AS file_metadata_id, 77 | unique_file_id, 78 | source_id, 79 | unique_path_id, 80 | file_name, 81 | parent_id, 82 | filesystem_id, 83 | device_id, 84 | attributes, 85 | user_owner, 86 | group_owner, 87 | size, 88 | created, 89 | last_accessed, 90 | last_modified, 91 | last_changed, 92 | user_flags, 93 | links_to_file, 94 | disk_offset, 95 | entropy, 96 | file_content_status, 97 | extension, 98 | file_type, 99 | hash, 100 | reputation, 101 | full_path, 102 | path_hash 103 | FROM 104 | file_metadata 105 | LEFT JOIN 106 | unique_file ON `file_metadata`.unique_file_id = `unique_file`.id 107 | LEFT JOIN 108 | unique_path ON `unique_path`.id = `file_metadata`.unique_path_id; 109 | 110 | 111 | -------------------------------------------------------------------------------- /images/logo/license/clker_tos.txt: -------------------------------------------------------------------------------- 1 | Terms of use 2 | Clker.com is owned by Rolera LLC, an Illinois Limited Liability Corporation. Clker and Clker.com are trademarks of Rolera LLC. 3 | 4 | Clker.com is an online sharing service where users share free public domain vector cliparts, or share public domain photos and derive vector cliparts from those photos using clker's online tracer. 5 | 6 | Users who upload shared cliparts and photos on Clker.com shall certify they are in public domain, as it is shown on the upload page. Please flag any content suspected otherwise. We have no other information concerning the status of the uploaded pictures and cliparts. 7 | 8 | Using clker.com 9 | You are allowed to use clker.com and any content provided by clker.com if you are 18 years or older or if your parent or guardian who is 18 years or older reads the entire terms listed on this page including disclaimers and agrees to all of them. 
If you or your parent or guardian do not agree to the entire terms listed on this page then you shall not use clker.com or any content provided by clker.com. 10 | 11 | You shall not use clker.com to draw any image, or download images from clker.com that will be used or characterized as: 12 | 13 | derogatory, humliating or condescending towards any person, group of people, associations, organizations or corporations. 14 | expressing hate towards any one or group. 15 | porn or advertises sexual activities even if it was legal in your state or country. 16 | violating any US laws. 17 | The terms listed here are subject to change without notice. If you download content from, upload or draw on clker.com, you hereby agree that it is your responsibility to continuously check the terms of use for updates. In the event that the terms change and you do not agree with the new terms, you shall cease using any content downloaded or delete any content drawn or uploaded from or to clker.com upon publishing those newer terms. 18 | 19 | DISCLAIMER & NO WARRANTY 20 | BECAUSE CLKER.COM AND ITS CONTENTS ARE FREE OF CHARGE, WE PROVIDE ABSOLUTELY NO WARRANTY, TO THE EXTENT PERMITTED BY APPLICABLE STATE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING, CLKER.COM AND ITS OWNERS PROVIDE THE CONTENT AND IMAGES 'AS IS' WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE CONTENT IS WITH YOU. SHOULD ANY PART OF CLKER.COM OR ITS CONTENT PROVE DEFECTIVE, OR NOT PUBLIC DOMAIN YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR, CORRECTION AND OTHER COSTS THAT MIGHT HAPPEN TO YOU OR YOUR PRODUCT OR CLIENTS OR CUSTOMERS FROM USING CONTENT OR IMAGES FROM CLKER.COM OR ANY DERIVATIVES OF YOUR WORK THAT INCLUDED OR WAS DERIVED FROM OUR CONTENT. 21 | 22 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW WILL CLKER.COM, IT'S OWNERS, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY LOST PROFITS, LOST MONIES, OR OTHER SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY THIRD PARTIES OR A FAILURE OF THE PICTURES/IMAGES/SOFTWARE TO OPERATE WITH ANY PROGRAMS) THE SITE OR ITS CONTENTS, EVEN IF YOU HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES, OR FOR ANY CLAIM BY ANY OTHER PARTY. 23 | 24 | Uploading content 25 | By uploading content you hereby declare your responsibility for what you upload and release clker.com from any liabilities and responibilities towards your upload. Simply, clker.com is just hosting your upload. 26 | 27 | By uploading content, you certify that it is free from any copyrights and trademarks, and in case you are the copyright holder you hereby release it under the lastest version of the creative commons CC0 public domain dedication found here. All uploads must not a. Contain profanity words, or imply profanity by using gestures b. Must not contain porn, adult content or not safe for work images c. Photo pictures of tatto, body piercing and other forms of body art are not allowed in raster format. However, tattoo designs are allowed in vector SVG format d. Must not show hatred or imply hatered to any group or ethnicity e. Must not be discriminatory in any way f. Must not violate any US laws including copyrights. 
28 | 29 | You may not upload any photos that contain identifiable living or recently deceased people unless the photo and context cannot be protected by personal privacy or publicity rights. For example, picture of prominant members of governments while doing their job are allowed, but pictures of your family and yourself are not allowed if people can be identified in the photos. 30 | 31 | Clker.com reserves the right to delete any content deemed unacceptable, and reservers the right to determine what content is acceptable even if you uploaded content that you believe complies with the upload policy. Any content that turns out to be copyright protected, will be deleted as soon as Clker.com learns of it. 32 | 33 | Clker.com reserves the right to terminate any user account for repeated policy violations or for uploading copyright protected images. 34 | 35 | DMCA notices 36 | Greiman, Rome & Griesmeyer is our DMCA agent. Please do not email or call them for support questions.. For support questions please email support at clker dot com. 37 | 38 | Please send all DMCA notices to: 39 | Brian J. Pleviak, Attorney 40 | Ginsberg Jacobs LLC 41 | 300 South Wacker Drive, Suite 2450 42 | Chicago Illinois 60606 43 | Phone: (312) 660-9626 44 | Fax: (312) 660-9612 45 | -------------------------------------------------------------------------------- /sql/filewalk.py: -------------------------------------------------------------------------------- 1 | import binascii 2 | import datetime 3 | import hashlib 4 | import mimetypes 5 | import os 6 | import re 7 | import struct 8 | import subprocess 9 | import sys 10 | import time 11 | import urllib 12 | import csv 13 | from Queue import Queue 14 | 15 | # 8 byte unique ID generator give a path. 16 | # - first five bytes are first five from sha1 of path name 17 | # - last 3 are the first three from the current time 18 | # Returns a long 19 | def generateUniqueId(path): 20 | 21 | m = hashlib.md5() 22 | m.update(path) 23 | first_five = m.digest()[:5] 24 | last_three = struct.pack("I", int(time.time()))[:3] 25 | combined = first_five + last_three 26 | return long(binascii.hexlify(combined), 16) 27 | 28 | 29 | def write_stat_info(basename, dirname, file_id, parent_id, dirname_digest, csv_writer): 30 | 31 | #need to escape commas from base name and dirname since we are creating a csv 32 | 33 | 34 | path = os.path.join(dirname, basename) 35 | 36 | try: 37 | stat_obj = os.stat(path) 38 | except Exception: 39 | # print "Error trying to stat {}".format(path) 40 | return 41 | 42 | url = urllib.pathname2url(path) 43 | file_type = mimetypes.guess_type(url)[0] 44 | hash_val = hash_file(path, file_type) 45 | 46 | #file_id, parent_id,dirname,basename,hash,fs_id,device,permissions,uid,gid,size,create_time,access_time,mod_time,metadata_change_time,user_flags,links,disk_offset,entropy,file_content_status,extensions,file_type 47 | 48 | csv_writer.writerow([file_id, parent_id, dirname, basename, hash_val, dirname_digest, stat_obj.st_ino, stat_obj.st_dev, 49 | str(oct(stat_obj.st_mode)), stat_obj.st_uid, stat_obj.st_gid, stat_obj.st_size, long(os.path.getctime(path)), 50 | long(stat_obj.st_atime), long(stat_obj.st_mtime), long(stat_obj.st_ctime), "", stat_obj.st_nlink, "", "", "", 51 | os.path.splitext(basename)[1], file_type]) 52 | 53 | 54 | BUFFER = 4096 55 | 56 | def hash_file(path, file_type): 57 | 58 | ret = "" 59 | # some files you can't hash 60 | if(file_type == 'inode/chardevice' \ 61 | or file_type == 'inode/symlink' \ 62 | or file_type == 'inode/socket' \ 63 | or file_type 
== 'inode/blockdevice' \
64 |        or file_type == 'inode/x-empty' \
65 |        or file_type == 'application/x-coredump' \
66 |        or file_type == 'inode/directory'):
67 |         ret = "0"
68 |         return ret
69 |
70 |     fd = None
71 |     try:
72 |         h = hashlib.sha1()
73 |         fd = os.open(path, os.O_RDONLY | getattr(os, 'O_NONBLOCK', 0) | os.O_NONBLOCK)
74 |         data = os.read(fd, BUFFER)
75 |         while(len(data)>0):
76 |             h.update(data)
77 |             data = os.read(fd, BUFFER)
78 |         ret = h.hexdigest()
79 |     except Exception, err:
80 |         # print "Hash Error: {} on file {} with type {}".format(err, path,
81 |         # file_type)
82 |         pass
83 |     finally:
84 |         if(fd != None):
85 |             os.close(fd)
86 |     return ret
87 |
88 |
89 | omitted_dirs = ['/dev', '/proc', '/sys', '/Volumes', '/mnt', '/net']
90 |
91 |
92 | def main(argv):
93 |
94 |     if(len(argv) != 5):
95 |         print "filewalk.py <start_dir> <os_name> <source_name> <output_dir>"
96 |         return
97 |
98 |
99 |     #make sure output dir exists
100 |     if os.path.exists(argv[4]) is False:
101 |         print "Output dir {} does not exist".format(argv[4])
102 |         return
103 |
104 |     today = datetime.date.today()
105 |     str_date = today.strftime('%Y-%m-%d')
106 |     out_file = os.path.join(argv[4], "{}--{}--{}".format(str_date, argv[2], argv[3]))
107 |     start_dir = argv[1]
108 |
109 |
110 |     stack = list()
111 |
112 |     with open(out_file, "w") as file_handle:
113 |
114 |         csv_writer = csv.writer(file_handle)
115 |         csv_writer.writerow(["file_id","parent_id","dirname","basename","contents_hash", "dirname_hash", "fs_id","device","permissions",
116 |             "uid","gid","size","create_time","access_time","mod_time","metadata_change_time",
117 |             "user_flags","links","disk_offset","entropy","file_content_status","extensions","file_type"])
118 |
119 |         # start the queue with a 0 value
120 |         stack.append(0L)
121 |
122 |         for root, dirs, files in os.walk(start_dir):
123 |             # We want to have a nice, dynamic output that doesn't flood the
124 |             # terminal with lines of text. So we'll write a line, then flush it
125 |             # with '\r'. In order to do this properly, we need to first measure
126 |             # the width of the terminal.
127 |             # We're also going to put it inside the loop in case the window
128 |             # gets resized while it's running
129 |             rows,columns = os.popen('stty size', 'r').read().split()
130 |             rows = int(rows)
131 |             columns = int(columns)
132 |
133 |             parent_id = stack.pop()
134 |
135 |             #some directories we will ignore as so
136 |             if root in omitted_dirs:
137 |                 del dirs[:]
138 |                 continue
139 |
140 |             sys.stdout.write('\r')
141 |             sys.stdout.write(' ' * columns)
142 |             sys.stdout.write('\r')
143 |             sys.stdout.write('processing {}'.format(root[:columns-12]))
144 |             sys.stdout.flush()
145 |
146 |             new_parent_id = generateUniqueId(root)
147 |
148 |             # for each of the child dirs, add the parent id. This assumes a BFS
149 |             # search
150 |             for d in dirs:
151 |                 stack.append(new_parent_id)
152 |
153 |             h = hashlib.sha1()
154 |             h.update(root)
155 |             root_digest = h.hexdigest()
156 |
157 |             # write the parent directory
158 |             write_stat_info("/", root, new_parent_id, parent_id, root_digest, csv_writer)
159 |             for f in files:
160 |                 _id = generateUniqueId(os.path.join(root, f))
161 |                 write_stat_info(f, root, _id, new_parent_id, root_digest, csv_writer)
162 |             file_handle.flush()
163 |
164 | if __name__=="__main__":
165 |     main(sys.argv)
166 |
--------------------------------------------------------------------------------
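Taken together with synthesize_data.sh above, a typical acquisition pass runs the walker against a directory and then loads the resulting CSV from the redwood shell (the paths, OS name, source name, and config file here are examples):

    $ python sql/filewalk.py /mnt/image some_os some_source csv
    $ ./bin/redwood my_config.cfg
    redwood$ load_csv csv
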
/redwood/filters/redwood_filter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | """
19 | Created on 19 October 2013
20 | @author: Lab41
21 | """
22 | from collections import namedtuple
23 | import inspect
24 |
25 | class RedwoodFilter(object):
26 |     """
27 |     Base class for Filter creation
28 |
29 |     :ivar name: Name of the filter. This should be one word, lower case, with underscores if needed
30 |     :ivar cnx: connection instance to the database
31 |     :ivar score_table: name of the table containing reputation scores. The table must have exactly two columns (id, score)
32 |     """
33 |     def __init__(self):
34 |         self.name = "generic"
35 |         self.cnx = None
36 |         self.score_table = None
37 |     def clean(self):
38 |         """
39 |         Deletes all required tables for this filter (method must be overridden)
40 |         """
41 |         raise NotImplementedError
42 |
43 |     def update(self, source):
44 |         """
45 |         Updates filter tables with new data from the given source (method must be overridden)
46 |
47 |         :param source: name of the media source
48 |         """
49 |         raise NotImplementedError
50 |
51 |     def rebuild(self):
52 |         """
53 |         Deletes all tables for this filter, recreates them, then rebuilds data for them from the datastore
54 |         """
55 |         self.clean()
56 |         self.build()
57 |
58 |         #get a list of the sources
59 |         query = """
60 |         SELECT media_source.name FROM media_source
61 |         """
62 |
63 |         cursor = self.cnx.cursor()
64 |         cursor.execute(query)
65 |
66 |         print "...Rebuild process started"
67 |         for source in cursor.fetchall():
68 |             print "rebuilding for source: {}".format(source[0])
69 |             self.update(source[0])
70 |
71 |     def show_results(self, direction, count, source, out=None):
72 |         """
73 |         Displays the scores for a given source in ranked order
74 |
75 |         :param direction: either [top] or [bottom]
76 |         :param count: number of rows to retrieve from the direction
77 |         :param out: file to write results to
78 |         """
79 |
80 |         print "[+] Running show_results..."
81 | cursor = self.cnx.cursor() 82 | dir_val = ("desc" if direction == "top" else "asc") 83 | 84 | if direction == "top": 85 | dir_val = "desc" 86 | elif direction == "bottom": 87 | dir_val = "asc" 88 | else: 89 | print "Error: direction must be \"top\" or \"bottom\"" 90 | return 91 | 92 | 93 | print "Fetching {} results from {} for filter {}".format(direction, source, self.name) 94 | 95 | query = """ 96 | SELECT {}.score, unique_path.full_path, file_metadata.file_name 97 | FROM {} LEFT JOIN file_metadata ON {}.id = file_metadata.unique_file_id 98 | LEFT JOIN unique_path ON file_metadata.unique_path_id = unique_path.id 99 | WHERE file_metadata.source_id = (SELECT media_source.id FROM media_source WHERE media_source.name = "{}") 100 | ORDER BY {}.score {} LIMIT 0, {} 101 | """.format(self.score_table, self.score_table, self.score_table, source, self.score_table, dir_val, count) 102 | 103 | cursor.execute(query) 104 | 105 | if out is None: 106 | results = cursor.fetchall() 107 | i = 0 108 | for r in results: 109 | print "{}:\t{}\t{}/{}".format(i, r[0], r[1], r[2]) 110 | i+=1 111 | return results 112 | else: 113 | 114 | with open (out, "w") as f: 115 | v = 0 116 | for x in cursor.fetchall(): 117 | f.write("{}:\t{}\t{}/{}\n".format(v, x[0], x[1], x[2])) 118 | v += 1 119 | 120 | cursor.close() 121 | 122 | 123 | def build(self): 124 | """ 125 | Builds necessary tables for the filter. This function must create the scores table. The standard practice 126 | is to create a table called "filter_name"_scores that has two columns (id, double score). As an example for a 127 | filter called "woohoo", you would want to add the following create table:: 128 | 129 | CREATE TABLE IF NOT EXISTS `woohoo_scores` ( 130 | id BIGINT unsigned NOT NULL, 131 | score double DEFAULT NULL, 132 | PRIMARY KEY(id), 133 | CONSTRAINT `fk_unique_file_woohoo_id` FOREIGN KEY (`id`) 134 | REFERENCES `unique_file` (`id`) ON DELETE NO ACTION ON UPDATE NO ACTION 135 | ) ENGINE=InnoDB 136 | """ 137 | 138 | raise NotImplementedError 139 | 140 | def run_survey(self, source_name): 141 | """ 142 | Given a source name, this function will create an html file summarizing its analysis. The survey should be an 143 | html file named "survey.html", and it should be located in a directory called "survey_[your file name]_[source name]. 
144 |         The survey directory should also contain a resources directory where html resources such as images will be saved::
145 |
146 |             survey_filtername__sourcename
147 |                 |- survey.html
148 |                 |- resources
149 |
150 |         :param source_name: name of the source
151 |
152 |         :return: path to the survey directory
153 |         """
154 |
155 |         raise NotImplementedError
156 |
157 |     def run_func(self, func_name, *args):
158 |         """
159 |         Helper function that runs the named discover function for this filter
160 |
161 |         :param func_name: name of the function to run
162 |         :param args: list of arguments to run with the function
163 |         """
164 |         func = getattr(self, 'discover_' + func_name, None)
165 |         if not func:
166 |             return False
167 |
168 |         ret = inspect.getargspec(func)
169 |         #subtract one for the "self"
170 |         upper_num_args = len(ret.args) - 1
171 |
172 |         if ret.defaults is not None:
173 |             lower_num_args = upper_num_args - len(ret.defaults)
174 |         else:
175 |             lower_num_args = upper_num_args
176 |
177 |         actual_args = len(args)
178 |
179 |         if actual_args > upper_num_args or actual_args < lower_num_args:
180 |             print "Error: Incorrect number of args"
181 |             return False
182 |
183 |         func(*args)
184 |         return True
185 |
186 |     def do_help(self, cmd):
187 |         "Get help on a command. Usage: help command"
188 |         if cmd:
189 |             func = getattr(self, 'discover_' + cmd, None)
190 |             if func:
191 |                 print func.__doc__
192 |                 return True
193 |         return False
194 |
--------------------------------------------------------------------------------
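A skeletal subclass following the conventions documented above (the table name and scoring logic are placeholders, and the shell assigns cnx after import; compare the real example in Filters/filenames.py below):

    from redwood.filters.redwood_filter import RedwoodFilter

    class WoohooFilter(RedwoodFilter):

        def __init__(self):
            self.name = "woohoo"
            self.score_table = "woohoo_scores"

        def build(self):
            #create the required (id, score) scores table
            cursor = self.cnx.cursor()
            cursor.execute("""CREATE TABLE IF NOT EXISTS `woohoo_scores` (
                id BIGINT unsigned NOT NULL,
                score double DEFAULT NULL,
                PRIMARY KEY(id)) ENGINE=InnoDB""")
            self.cnx.commit()
            cursor.close()

        def clean(self):
            cursor = self.cnx.cursor()
            cursor.execute("DROP TABLE IF EXISTS woohoo_scores")
            self.cnx.commit()
            cursor.close()

        def update(self, source):
            self.build()
            #score computation for the given media source would go here
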
70 |         if self.cnx != None:
71 |             self.cnx.close()
72 |         sys.stdout.write('\n')
73 |         sys.exit(0)
74 | 
75 | class SubInterpreterFilter(cmd.Cmd):
76 |     prompt = '\033[1;32mredwood-filter$ \033[1;m'
77 | 
78 |     def __init__(self, cnx):
79 |         cmd.Cmd.__init__(self)
80 |         self.cnx = cnx
81 | 
82 |     def do_quit(self, line):
83 |         '''quit: Exit the redwood console'''
84 |         if self.cnx != None:
85 |             self.cnx.close()
86 |         sys.stdout.write('\n')
87 |         sys.exit(0)
88 | 
89 |     def default(self, line):
90 |         if line == 'EOF' or line == 'exit' or line == 'quit':
91 |             self.do_back(line)
92 |             return True
93 |         else:
94 |             print "*** Command not recognized, try 'help'"
95 | 
96 |     def emptyline(self):
97 |         pass
98 | 
99 |     def help_help(self):
100 |         self.do_help('')
101 | 
102 |     def do_back(self, line):
103 |         '''Go back a level in the shell'''
104 |         return True
105 | 
106 |     def do_discover(self, line):
107 |         '''
108 |         discover <filter-id>
109 | 
110 |         activates discover mode for the given filter with id "filter-id"
111 |         '''
112 |         if line:
113 |             v = SubInterpreterFilter.validateFilterId(line)
114 |             if v >= 0:
115 |                 sub_cmd = SubInterpreterDiscover(self.cnx, line)
116 |                 sub_cmd.cmdloop()
117 |         else:
118 |             print "Error: Filter Id required"
119 | 
120 |     def do_show_results(self, line):
121 |         '''
122 |         show_results <filter-id> <direction> <count> <source> [out]
123 | 
124 |         shows the results for the given filter's score table
125 | 
126 |         filter-id - id of filter
127 |         direction - top or bottom
128 |         count - items to display
129 |         source - source name
130 |         out - file to write output to (optional)
131 |         '''
132 |         args = line.split()
133 |         if len(args) != 5 and len(args) != 4:
134 |             print "Error: incorrect number of arguments"
135 |             return
136 |         v = self.validateFilterId(args[0])
137 |         #bail out on an invalid filter id rather than indexing the list with -1
138 |         if v < 0:
139 |             return
140 |         plugin = filter_list[v]
141 |         plugin.show_results(*args[1:])
142 | 
143 | 
144 |     def do_rerun(self, line):
145 |         '''
146 |         rerun <filter-id>
147 | 
148 |         Reruns a filter on all sources
149 |         '''
150 |         args = line.split()
151 |         if(len(args) != 1):
152 |             print "Error: Filter Id required"
153 |             return
154 | 
155 |         v = self.validateFilterId(args[0])
156 |         if v < 0:
157 |             return
158 |         plugin = filter_list[v]
159 | 
160 |         print "Deleting old data in filter storage"
161 |         plugin.clean()
162 | 
163 |         sources = core.get_all_sources(self.cnx)
164 | 
165 |         print "Creating new data"
166 |         for src_info in sources:
167 |             print "Running filter on source: {}".format(src_info.source_name)
168 |             plugin.update(src_info.source_name)
169 | 
170 |         print "Rerun complete"
171 | 
172 |     def do_list(self, line):
173 |         '''list: lists the available filters'''
174 |         print "Available Filters"
175 |         i = 0
176 |         for plugin in filter_list:
177 |             print "{}............{}".format(i, plugin.name)
178 |             i += 1
179 | 
180 |     def do_aggregate_scores(self, line):
181 |         '''
182 |         aggregate_scores filter_id:weight filter_id:weight ...
183 | 
184 |         Aggregates the reputations of all files using the list of filters and weights provided. If no list is
185 |         provided, all filters are weighted equally. The "filter_id" is the numeric id of the filter. The "weight"
186 |         is a percentage between 0-100, such that the total of all specified weights is 100.
187 | 
188 |         For example, if you have 3 filters loaded, and you want to aggregate the scores such that the distribution of weights
189 |         is 50, 30, 20 respectively, then you would run the following command
190 | 
191 |         Example
192 | 
193 |             aggregate_scores 0:50 1:30 2:20
194 |         '''
195 | 
196 |         print "Aggregating Scores"
197 |         args = line.split()
198 |         ag = Aggregator(self.cnx)
199 |         if args:
200 |             ag.aggregate(filter_list, args)
201 |         else:
202 |             ag.aggregate(filter_list)
203 | 
204 |     def do_run_survey(self, line):
205 |         '''
206 |         run_survey <source_name> [filter_id:weight ...]
207 | 
208 |         runs the survey function for the given source
209 | 
210 |         source_name - name of the source to process
211 |         [filter_id:weight ...] - optional weights passed to the score aggregation (see aggregate_scores)
212 |         '''
213 | 
214 |         args = shlex.split(line)
215 | 
216 |         if len(args) < 1:
217 |             print "Error: Incorrect # of arguments"
218 |             return
219 | 
220 |         src_obj = core.get_source_info(self.cnx, args[0])
221 | 
222 |         if src_obj is None:
223 |             print "Error: Unable to find source {}".format(args[0])
224 |             return
225 |         else:
226 |             rpt = Report(self.cnx, src_obj)
227 |             if len(args) > 1:
228 |                 rpt.run(args[1:])
229 |             else:
230 |                 rpt.run(None)
231 | 
232 |     @staticmethod
233 |     def validateFilterId(str_val):
234 | 
235 |         try:
236 |             value = int(str_val)
237 |         except exceptions.ValueError:
238 |             print "Error: \'{}\' is not a number".format(str_val)
239 |             return -1
240 | 
241 |         if(value < 0 or value >= len(filter_list)):
242 |             print "Error: no plugin exists for that number"
243 |             return -1
244 | 
245 |         return value
246 | 
--------------------------------------------------------------------------------
/Filters/filenames.py:
--------------------------------------------------------------------------------
1 | from redwood.filters.redwood_filter import RedwoodFilter
2 | import redwood.helpers.core as core
3 | import time
4 | import os
5 | import shutil
6 | 
7 | class FileNameFilter(RedwoodFilter):
8 | 
9 |     def __init__(self):
10 |         self.name = "FileNameFilter"
11 |         self.score_table = "FileNameFilter_scores"
12 | 
13 |     def clean(self):
14 |         """
15 |         Cleans all tables associated with this filter
16 |         """
17 |         cursor = self.cnx.cursor()
18 |         cursor.execute("DROP TABLE IF EXISTS FileNameFilter_scores")
19 |         cursor.execute("DROP TABLE IF EXISTS FileNameFilter_unique_name")
20 |         self.cnx.commit()
21 |         cursor.close()
22 | 
23 |     def build(self):
24 |         """
25 |         Builds all persistent tables associated with this filter
26 |         """
27 |         cursor = self.cnx.cursor()
28 |         query = """
29 |             CREATE TABLE IF NOT EXISTS `FileNameFilter_scores` (
30 |             id BIGINT unsigned NOT NULL,
31 |             score double DEFAULT NULL,
32 |             PRIMARY KEY(id),
33 |             CONSTRAINT `FNF_unique_file1_id` FOREIGN KEY (`id`)
34 |             REFERENCES `unique_file` (`id`) ON DELETE NO ACTION ON UPDATE NO ACTION
35 |             ) ENGINE=InnoDB
36 |         """
37 |         cursor.execute(query)
38 |         self.cnx.commit()
39 | 
40 |         query = """
41 |             CREATE TABLE IF NOT EXISTS FileNameFilter_unique_name (
42 |             id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
43 |             file_name VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,
44 |             unique_path_id INT(10) NOT NULL,
45 |             count INT DEFAULT 1,
46 |             PRIMARY KEY (id),
47 |             UNIQUE INDEX file_path_idx USING BTREE (file_name ASC, unique_path_id),
48 |             INDEX file_name_idx USING BTREE (file_name ASC)
49 |             ) ENGINE=InnoDB;
50 |         """
51 |         cursor.execute(query)
52 |         self.cnx.commit()
53 |         cursor.close()
54 | 
55 |     def update(self, source):
56 |         print "[+] FileName Filter running on {} ".format(source)
57 | 
58 |         #creates the basic tables if they do not exist
59 |         self.build()
60 | 
61 |         cursor = self.cnx.cursor()
62 | 
63 |         src_info = core.get_source_info(self.cnx, source)
64 | 
65 |         if src_info is None:
66 |             print "Error: Source {} not found".format(source)
67 |             return
68 | 
69 |         now = time.time()
70 | 
71 |         # self.cnx.autocommit(False)
72 |         #query = """
73 |         #    INSERT INTO FileNameFilter_unique_name
74 |         #    (file_name, unique_path_id)
75 |         #    (SELECT file_name, unique_path_id FROM file_metadata WHERE file_name != "/" and source_id = {})
76 |         #    ON DUPLICATE KEY UPDATE count = count + 1;
77 |         #""".format(src_info.source_id)
78 |         cursor.execute("""
79 |             INSERT INTO FileNameFilter_unique_name
80 |             (file_name, unique_path_id)
81 |             (SELECT file_name, unique_path_id
82 |             FROM file_metadata
83 |             WHERE file_name != "/" and source_id = %s)
84 |             ON DUPLICATE KEY UPDATE count = count + 1;
85 |         """, (src_info.source_id,))
86 |         # self.cnx.autocommit(True)
87 | 
88 |         later = time.time()
89 | 
90 |         #print "Updated counts in {} secs\nUpdating Scores".format(later - now)
91 | 
92 |         cursor.execute("SELECT MAX(count) FROM FileNameFilter_unique_name")
93 |         (max_count,) = cursor.fetchone()
94 | 
95 |         now = time.time()
96 |         #query = """
97 |         #    INSERT INTO FileNameFilter_scores
98 |         #    (id, score)
99 |         #    (
100 |         #        SELECT
101 |         #        fm.unique_file_id, MIN(fnfun.count / {})
102 |         #        FROM FileNameFilter_unique_name fnfun
103 |         #        LEFT JOIN file_metadata fm
104 |         #        ON fnfun.file_name = fm.file_name
105 |         #        AND fnfun.unique_path_id = fm.unique_path_id
106 |         #        WHERE not isnull(fm.unique_file_id)
107 |         #        GROUP BY fm.unique_file_id
108 |         #    )
109 |         #    ON DUPLICATE KEY UPDATE score = score
110 |         #    """.format(max_count)
111 |         cursor.execute("""
112 |             INSERT INTO FileNameFilter_scores
113 |             (id, score)
114 |             (SELECT fm.unique_file_id, MIN(fnfun.count / %s)
115 |             FROM FileNameFilter_unique_name fnfun
116 |             LEFT JOIN file_metadata fm
117 |             ON fnfun.file_name = fm.file_name
118 |             AND fnfun.unique_path_id = fm.unique_path_id
119 |             WHERE not isnull(fm.unique_file_id)
120 |             GROUP BY fm.unique_file_id)
121 |             ON DUPLICATE KEY UPDATE score = score
122 |         """, (max_count,))
123 |         self.cnx.commit()
124 |         later = time.time()
125 |         #print "Scores updated in {} secs".format(later - now)
126 |         cursor.close()
127 | 
128 |     def discover_unique_names(self, source):
129 |         """usage: unique_names source_name"""
130 | 
131 |         data = self.get_unique_names(source)
132 | 
133 |         if data is not None:
134 |             for (file, dir) in data:
135 |                 print "Unique file %s %s" % (file, dir)
136 | 
137 | 
138 |     def get_unique_names(self, source):
139 |         """Returns (file_name, full_path) rows for files whose name appears exactly once on the given source"""
140 | 
141 |         #creates the basic tables if they do not exist
142 |         self.build()
143 | 
144 |         cursor = self.cnx.cursor()
145 | 
146 |         src_info = core.get_source_info(self.cnx, source)
147 | 
148 |         if src_info is None:
149 |             print "Error: Source {} not found".format(source)
150 |             return
151 | 
152 |         #query = """
153 |         #    SELECT fm.file_name, up.full_path
154 |         #    FROM file_metadata fm
155 |         #    LEFT JOIN FileNameFilter_unique_name fnfun
156 |         #    ON fnfun.file_name = fm.file_name
157 |         #    AND fnfun.unique_path_id = fm.unique_path_id
158 |         #    LEFT JOIN unique_path up
159 |         #    ON up.id = fm.unique_path_id
160 |         #    WHERE not isnull(fm.unique_file_id)
161 |         #    AND fnfun.count = 1
162 |         #    AND fm.source_id = {}
163 |         #""".format(src_info.source_id)
164 |         cursor.execute("""
165 |             SELECT fm.file_name, up.full_path
166 |             FROM file_metadata fm
167 |             LEFT JOIN FileNameFilter_unique_name fnfun
168 |             ON fnfun.file_name = fm.file_name
169 |             AND fnfun.unique_path_id = fm.unique_path_id
170 |             LEFT JOIN unique_path up
171 |             ON up.id = fm.unique_path_id
172 |             WHERE not isnull(fm.unique_file_id)
173 |             AND fnfun.count = 1
174 |             AND fm.source_id = %s
175 |         """, (src_info.source_id,))
176 |         data = cursor.fetchall()
177 | 
178 |         cursor.close()
179 |         return data
180 | 
181 |     def run_survey(self, source_name):
182 | 
183 |         resources = "resources"
184 |         survey_file = "survey.html"
185 |         survey_dir = "survey_{}_{}".format(self.name, source_name)
186 | 
187 |         resource_dir = os.path.join(survey_dir, resources)
188 |         html_file = os.path.join(survey_dir, survey_file)
189 | 
190 |         try:
191 |             shutil.rmtree(survey_dir)
192 |         except:
193 |             pass
194 | 
195 |         os.mkdir(survey_dir)
196 |         os.mkdir(resource_dir)
197 | 
198 |         results = self.get_unique_names(source_name)
199 | 
200 |         with open(html_file, 'w') as f:
201 | 
202 |             f.write("""
203 |                 <html>
204 |                 <head>
205 |                 <title>FileNameFilter Survey</title>
206 |                 </head>
207 |                 <body>
208 |                 <h1>FileNameFilter Snapshot</h1>
209 |             """)
210 |             f.write("<h2>One Timers in Directories</h2>")
211 |             f.write("<table>")
212 |             f.write("<tr>")
213 |             f.write("<th>Parent Path</th>")
214 |             f.write("<th>Filename</th></tr>")
215 |             i = 0
216 |             lr = len(results)
217 |             for (b,a) in results:
218 |                 if i == lr - 1:
219 |                     f.write("<tr>")
220 |                     f.write("<td>{}</td><td>{}</td></tr>".format(a, b))
221 |                 else:
222 |                     f.write("<tr><td>{}</td><td>{}</td></tr>".format(a, b))
223 |                 i += 1
224 |             f.write("</table>")
225 |             f.write("</body></html>")
226 | 
227 |         return survey_dir
228 | 
--------------------------------------------------------------------------------
/redwood/helpers/core.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #    http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | 
18 | """
19 | Created on 19 October 2013
20 | @author: Lab41
21 | 
22 | This module contains core helper functions for Redwood
23 | """
24 | 
25 | import sys
26 | import os
27 | import inspect
28 | import time
29 | from collections import namedtuple
30 | from redwood.filters.redwood_filter import RedwoodFilter
31 | from redwood.filters import filter_list
32 | from redwood.foundation.prevalence import PrevalenceAnalyzer
33 | 
34 | SourceInfo = namedtuple('SourceInfo', 'source_id source_name os_id os_name date_acquired')
35 | 
36 | 
37 | def get_filter_by_name(filter_name):
38 |     """
39 |     Fetches an instance of a loaded filter by its name
40 | 
41 |     :param filter_name: the name of the filter
42 | 
43 |     :return an instance of a loaded filter with name filter_name
44 |     """
45 |     for f in filter_list:
46 |         if f.name == filter_name:
47 |             return f
48 | 
49 |     return None
50 | 
51 | def import_filters(path, cnx):
52 |     """
53 |     Imports filters from an external directory at runtime. Imported filters will be added
54 |     to the global filter_list
55 | 
56 |     :param path: path where the modules reside
57 |     :param cnx: an instance of the connection
58 | 
59 |     :return list of newly added filter instances
60 |     """
61 | 
62 |     new_filters = list()
63 | 
64 | 
65 |     print "Importing specified filters from {}".format(path)
66 | 
67 |     #make sure path exists
68 |     if not os.path.isdir(path):
69 |         print "Error: path {} does not exist".format(path)
70 |         return None
71 | 
72 |     #add the path to the PYTHONPATH
73 |     sys.path.append(path)
74 | 
75 |     #acquire list of files in the path
76 |     mod_list = os.listdir(path)
77 | 
78 |     for f in mod_list:
79 | 
80 |         #continue if it is not a python file
81 |         if f[-3:] != '.py':
82 |             continue
83 | 
84 |         #get module name by removing extension
85 |         mod_name = os.path.basename(f)[:-3]
86 | 
87 |         #import the module
88 |         module = __import__(mod_name, globals(), locals())
89 |         for name,cls in inspect.getmembers(module):
90 |             #check the name comparison too since RedwoodFilter is a subclass of itself
91 |             if inspect.isclass(cls) and issubclass(cls, RedwoodFilter) and name != "RedwoodFilter":
92 |                 instance = cls()
93 |                 #append an instance of the class to the filter_list
94 |                 instance.cnx = cnx
95 |                 filter_list.append(instance)
96 |                 new_filters.append(instance)
97 |                 print name
98 | 
99 |     return new_filters
100 | 
101 | def get_source_info(cnx, source_name):
102 |     """
103 |     Retrieves a SourceInfo instance given a source name
104 | 
105 |     :param cnx: an instance of the connection
106 |     :param source_name: name of the media source
107 | 
108 |     :return SourceInfo instance or None if not found
109 |     """
110 |     cursor = cnx.cursor()
111 | 
112 |     #query = """
113 |     #    SELECT media_source.id as source_id,
114 |     #           media_source.name as source_name,
115 |     #           os.id as os_id, os.name as os_name,
116 |     #           media_source.date_acquired as date_acquired
117 |     #    FROM media_source
118 |     #    LEFT JOIN os
119 |     #    ON media_source.os_id = os.id
120 |     #    WHERE media_source.name = "{}";""".format(source_name)
121 | 
122 |     cursor.execute("""
123 |         SELECT media_source.id as source_id,
124 |                media_source.name as source_name,
125 |                os.id as os_id, os.name as os_name,
126 |                media_source.date_acquired as date_acquired
127 |         FROM media_source
128 |         LEFT JOIN os
129 |         ON media_source.os_id = os.id
130 |         WHERE media_source.name = %s;""", (source_name,))
131 |     r = cursor.fetchone()
132 | 
133 |     if r is None:
134 |         return r
135 | 
136 |     return SourceInfo(r[0], r[1], r[2], r[3], r[4])
137 | 
138 | def get_malware_reputation_threshold(cnx):
139 |     """
140 |     Retrieves the average reputation of all confirmed malware
141 | 
142 |     :param cnx: mysql connection instance
143 | 
144 |     :return average reputation score
145 |     """
146 | 
147 |     cursor = cnx.cursor()
148 | 
149 |     query = """
150 |         select AVG(unique_file.reputation)
151 |         from validator_0 left join unique_file on validator_0.id=unique_file.id
152 |         LEFT JOIN file_metadata ON file_metadata.unique_file_id=unique_file.id where validator_0.status=3;
153 |     """
154 | 
155 |     cursor.execute(query)
156 | 
157 |     r = cursor.fetchone()
158 | 
159 |     if r is None:
160 |         return r
161 | 
162 |     return r[0]
163 | 
164 | def get_num_systems(cnx, os_name_or_id):
165 |     """
166 |     Retrieves the number of unique media sources for a given os
167 | 
168 |     :param cnx: mysql connection instance
169 |     :param os_name_or_id: os name or os id
170 | 
171 |     :return the number of systems found or None if the os does not exist
172 |     """
173 | 
174 |     cursor = cnx.cursor()
175 | 
176 | 
177 |     try:
178 |         val = int(os_name_or_id)
179 | 
180 |         cursor.execute("""
cursor.execute(""" 181 | SELECT COUNT(media_source.id) FROM os 182 | LEFT JOIN media_source ON os.id = media_source.os_id 183 | WHERE os.id = %s 184 | GROUP BY os.id 185 | """, (val,)) 186 | 187 | except Exception as e: 188 | cursor.execute(""" 189 | SELECT COUNT(media_source.id) FROM os 190 | LEFT JOIN media_source ON os.id = media_source.os_id 191 | WHERE os.id = (SELECT DISTINCT os.id from os where os.name = %s) GROUP BY os.id""", (os_name_or_id,)) 192 | 193 | r = cursor.fetchone() 194 | 195 | if r is None: 196 | return None 197 | 198 | return r[0] 199 | 200 | 201 | def update_analyzers(cnx, sources): 202 | """ 203 | Runs Analyzers and Filters against each source in the source_os_list, updating the 204 | approriate tables 205 | 206 | :param sources: list of SourceInfo instances 207 | """ 208 | print "...Beginning Analyzers and Filters for inputted sources" 209 | 210 | start_time = time.time() 211 | 212 | #now let's run the prevalence analyzer 213 | pu = PrevalenceAnalyzer(cnx) 214 | pu.update(sources) 215 | 216 | elapsed_time = time.time() - start_time 217 | print "...completed analyzers on inputed sources in {}".format(elapsed_time) 218 | 219 | 220 | def update_filters(cnx, sources): 221 | 222 | start_time = time.time() 223 | 224 | #set the cnx for each plugin 225 | for p in filter_list: 226 | p.cnx = cnx 227 | 228 | for source in sources: 229 | #for source in sources: 230 | print "==== Beginning filter analysis of {} ====".format(source.source_name) 231 | for p in filter_list: 232 | p.update(source.source_name) 233 | 234 | elapsed_time = time.time() - start_time 235 | print "...completed filter analysis on inputted sources in {}".format(elapsed_time) 236 | 237 | 238 | 239 | def table_exists(cnx, name): 240 | """ 241 | Checks if the mysql table with exists 242 | 243 | :param cnx: mysql connection instance 244 | :param name: table name 245 | 246 | :return True if exists, else False 247 | """ 248 | cursor = cnx.cursor() 249 | result = None 250 | try: 251 | cursor.execute("""select COUNT(id) from %s""", (name,)) 252 | result = cursor.fetchone() 253 | cursor.close() 254 | except Exception as err: 255 | print err 256 | pass 257 | 258 | 259 | if(result == None or result[0] == 0): 260 | return False 261 | else: 262 | return True 263 | 264 | def get_all_sources(cnx): 265 | """ 266 | Returns a list of all sources currently loaded into Redwood 267 | 268 | :param cnx: mysql connection instance 269 | """ 270 | 271 | cursor = cnx.cursor() 272 | result = list() 273 | try: 274 | cursor.execute("""SELECT media_source.id, media_source.name, os.id, os.name, date_acquired FROM media_source 275 | INNER JOIN os 276 | ON media_source.os_id = os.id 277 | """) 278 | result = cursor.fetchall() 279 | cursor.close() 280 | except Exception as err: 281 | print err 282 | return None 283 | 284 | sources = list() 285 | for r in result: 286 | sources.append(SourceInfo(r[0],r[1], r[2],r[3],r[4])) 287 | 288 | return sources 289 | 290 | def get_reputation_by_source(cnx, source_name): 291 | """ 292 | Returns a list of scores for every file on the source 293 | 294 | :param cnx: myqsl connection instance 295 | """ 296 | 297 | cursor = cnx.cursor() 298 | result = list() 299 | 300 | try: 301 | cursor.execute("""SELECT ROUND(unique_file.reputation, 2), 302 | COUNT(DISTINCT unique_file.id) FROM unique_file 303 | INNER JOIN file_metadata 304 | ON unique_file.id = file_metadata.unique_file_id 305 | INNER JOIN media_source 306 | ON file_metadata.source_id = media_source.id 307 | WHERE media_source.name = %s 308 | GROUP BY 
310 |             """, (source_name,))
311 |         result = cursor.fetchall()
312 |         cursor.close()
313 |     except Exception as err:
314 |         print err
315 |         return None
316 | 
317 |     return result
318 | 
--------------------------------------------------------------------------------
/redwood/io/csv_importer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #    http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | 
18 | """
19 | Created on 19 October 2013
20 | @author: Lab41
21 | """
22 | 
23 | 
24 | import sys
25 | import os
26 | import shutil
27 | import getopt
28 | import string
29 | import time
30 | from datetime import datetime
31 | import MySQLdb
32 | from redwood.helpers.core import SourceInfo
33 | from redwood.foundation.prevalence import PrevalenceAnalyzer
34 | from redwood.filters import filter_list
35 | import redwood.helpers.core as core
36 | from redwood.foundation.report import Report
37 | 
38 | def db_load_file(connection, path):
39 |     """
40 |     Loads the file located at the given path into the database
41 | 
42 |     :param connection: connection object for the database
43 |     :param path: path where the file is located
44 | 
45 |     :return SourceInfo representing the inputted source
46 |     """
47 | 
48 |     try:
49 |         with open(path): pass
50 |     except IOError:
51 |         print '*** Error: File \'{}\' does not exist'.format(path)
52 |         return
53 | 
54 | 
55 |     filename = os.path.basename(path)
56 |     fields = string.split(filename, '--')
57 | 
58 |     if(len(fields) != 3):
59 |         print "*** Error: Improper naming scheme with {} fields".format(len(fields))
60 |         print path
61 |         print fields
62 |         return
63 | 
64 |     cursor = connection.cursor()
65 |     os_id = None
66 | 
67 |     source_name = fields[2]
68 |     os_name = fields[1]
69 | 
70 |     print "=== Loading \"{}\" into database ===".format(source_name)
71 |     #transaction for adding to media and os tables. Both succeed or both fail
72 |     try:
73 | 
74 |         data_os = {
75 |             'name':os_name,
76 |         }
77 | 
78 |         #add os
79 |         add_os = ("INSERT INTO `os` (name) VALUES('%(name)s') ON DUPLICATE KEY UPDATE id=id") % data_os
80 |         cursor.execute(add_os)
81 |         connection.commit()
82 | 
83 |     except MySQLdb.Error, e:
84 |         if connection:
85 |             connection.rollback()
86 |         print "*** Error %d: %s" % (e.args[0],e.args[1])
87 |         return
88 | 
89 |     #now get the os_id for the os_name
90 |     #query = "SELECT os.id FROM os WHERE os.name = \"{}\"".format(os_name)
91 |     cursor.execute("""SELECT os.id FROM os WHERE os.name = %s""", (os_name,))
92 |     r = cursor.fetchone()
93 | 
94 |     #check the row before indexing into it, in case the os lookup failed
95 |     if r is None:
96 |         print "*** Error: Unable to find corresponding os"
97 |         return
98 | 
99 |     os_id = r[0]
100 | 
101 |     try:
102 |         date_object = datetime.strptime(fields[0], '%Y-%m-%d')
103 | 
104 |         data_media_source = {
105 | 
106 |             'name':fields[2],
107 |             'date_acquired':date_object.isoformat(),
108 |             'os_id':os_id,
109 |         }
110 | 
111 |         #add the media source
112 |         add_media_source = ("INSERT INTO `media_source` (reputation, name, date_acquired, os_id) "
113 |                             "VALUES(0, '%(name)s', '%(date_acquired)s', '%(os_id)s') ") % data_media_source
114 | 
115 |         cursor.execute(add_media_source)
116 |         connection.commit()
117 |         source_id = cursor.lastrowid
118 | 
119 |     except MySQLdb.Error, e:
120 |         if connection:
121 |             connection.rollback()
122 |         print "*** Error %d: %s" % (e.args[0],e.args[1])
123 |         return
124 | 
125 |     #the media source id was already captured right after the insert above
126 |     media_source_id = source_id
127 | 
128 |     path = path.replace('\\','\\\\')
129 |     #load raw csv into the staging table from the client
130 |     #add_staging_table = ("""LOAD DATA LOCAL INFILE '{}' INTO TABLE `staging_table`
131 |     #    FIELDS TERMINATED BY ',' ENCLOSED BY '\"' LINES TERMINATED BY '\\n'
132 |     #    IGNORE 1 LINES
133 |     #    (global_file_id, parent_id, dirname, basename,contents_hash,dirname_hash,filesystem_id,device_id,
134 |     #    attributes,user_owner,group_owner,size,@created_param,@accessed_param,@modified_param,@changed_param,
135 |     #    @user_flags,links_to_file, @disk_offset, @entropy, @file_content_status, @extension, file_type)
136 |     #    SET created = FROM_UNIXTIME(@created_param),
137 |     #    last_accessed = FROM_UNIXTIME(@accessed_param),
138 |     #    last_modified = FROM_UNIXTIME(@modified_param),
139 |     #    last_changed = FROM_UNIXTIME(@changed_param),
140 |     #    user_flags = nullif(@user_flags,''), disk_offset = nullif(@disk_offset,''),
141 |     #    entropy=nullif(@entropy,''), file_content_status=nullif(@file_content_status,''),
142 |     #    extension = nullif(@extension,'');""").format(path)
143 | 
144 |     try:
145 | 
146 |         #create the staging table
147 |         query = """
148 |             CREATE TABLE IF NOT EXISTS staging_table (
149 |             global_file_id LONG NOT NULL,
150 |             parent_id LONG NULL,
151 |             dirname VARCHAR(4096) NULL,
152 |             basename VARCHAR(255) NULL,
153 |             contents_hash CHAR(40) NULL,
154 |             dirname_hash CHAR(40) NULL,
155 |             filesystem_id INT UNSIGNED NULL,
156 |             device_id INT NULL,
157 |             attributes INT NULL,
158 |             user_owner INT NULL,
159 |             group_owner INT NULL,
160 |             size INT UNSIGNED NULL,
161 |             created DATETIME NULL,
162 |             last_accessed DATETIME NULL,
163 |             last_modified DATETIME NULL,
164 |             last_changed DATETIME NULL,
165 |             user_flags INT NULL DEFAULT NULL,
166 |             links_to_file INT NULL,
167 |             disk_offset BIGINT NULL,
168 |             entropy TINYINT NULL,
169 |             file_content_status TINYINT NULL,
170 |             extension VARCHAR(32) NULL,
171 |             file_type VARCHAR(64) NULL,
172 |             INDEX contents_hash_idx (contents_hash ASC),
173 |             INDEX dirname_hash_idx (dirname_hash ASC)
174 |             ) ENGINE=InnoDB;
175 |         """
176 | 
177 |         cursor.execute(query)
178 |         connection.commit()
179 | 
180 |         start_time = time.time()
181 |         cursor.execute("""
182 |             LOAD DATA LOCAL INFILE %s INTO TABLE `staging_table`
183 |             FIELDS TERMINATED BY ','
184 |             ENCLOSED BY '\"' LINES TERMINATED BY '\\n'
185 |             IGNORE 1 LINES
186 |             (global_file_id, parent_id, dirname, basename,contents_hash,dirname_hash,filesystem_id,device_id,
187 |             attributes,user_owner,group_owner,size,@created_param,@accessed_param,@modified_param,@changed_param,
188 |             @user_flags,links_to_file, @disk_offset, @entropy, @file_content_status, @extension, file_type)
189 |             SET created = FROM_UNIXTIME(@created_param),
190 |             last_accessed = FROM_UNIXTIME(@accessed_param),
191 |             last_modified = FROM_UNIXTIME(@modified_param),
192 |             last_changed = FROM_UNIXTIME(@changed_param),
193 |             user_flags = nullif(@user_flags,''),
194 |             disk_offset = nullif(@disk_offset,''),
195 |             entropy=nullif(@entropy,''),
196 |             file_content_status=nullif(@file_content_status,''),
197 |             extension = nullif(@extension,'');""", (path,))
198 |         connection.commit()
199 |         print "...data transfer to staging table in {}".format(time.time() - start_time)
200 |         start_time = time.time()
201 | 
202 |         cursor.callproc('map_staging_table', (media_source_id, os_id))
203 |         cursor.execute("DROP TABLE `staging_table`;")
204 |         connection.commit()
205 |         print "...data written from staging table to main tables in {}".format(time.time() - start_time)
206 |     except Exception as err:
207 |         print "Exception occurred: {}".format(err)
208 |         cursor.close()
209 |         sys.exit(1)
210 | 
211 |     total_time = time.time() - start_time
212 |     print "...completed in {}".format(total_time)
213 |     cursor.close()
214 |     #TODO: just call get source info here
215 |     return SourceInfo(source_id, source_name, os_id, os_name, None)
216 | 
217 | def run(cnx, path):
218 |     """
219 |     Loads all csv files from the path into the database
220 | 
221 |     :param cnx: mysql connection object
222 |     :param path: directory containing csv files or the full path to a csv file
223 |     """
224 |     src_os_list = list()
225 | 
226 |     if path is None:
227 |         print "*** Error: Path is required"
228 |         return
229 | 
230 |     if(os.path.isfile(path)):
231 |         info = db_load_file(cnx, path)
232 |         if info is not None:
233 |             src_os_list.append(info)
234 |     elif(os.path.isdir(path)):
235 |         for r, d, f in os.walk(path):
236 |             #clear the directory list so the walk does not descend into subdirectories
237 |             while len(d) > 0:
238 |                 d.pop()
239 |             for file in f:
240 |                 if not file.startswith('.'):
241 |                     #load each csv using its absolute path
242 |                     info = db_load_file(cnx, os.path.abspath(os.path.join(r, file)))
243 |                     if info is not None:
244 |                         src_os_list.append(info)
245 |     else:
246 |         print 'Please input a valid file or a directory for import'
247 |         return
248 | 
249 |     #update the analyzers and filters
250 |     core.update_analyzers(cnx, src_os_list)
251 |     core.update_filters(cnx, src_os_list)
252 | 
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Redwood documentation build configuration file, created by
4 | # sphinx-quickstart on Mon Sep  9 22:31:44 2013.
5 | #
6 | # This file is execfile()d with the current directory set to its containing dir.
7 | #
8 | # Note that not all possible configuration values are present in this
9 | # autogenerated file.
10 | #
11 | # All configuration values have a default; values that are commented out
12 | # serve to show the default.
13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | sys.path.insert(0, os.path.abspath('../')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | #needs_sphinx = '1.0' 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 28 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'sphinx.ext.doctest', 'sphinx.ext.coverage'] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['_templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | project = u'Redwood' 44 | copyright = u'2013 In-Q-Tel, Inc/Lab41, All Rights Reserved.' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = '0.1.0' 52 | # The full version, including alpha/beta/rc tags. 53 | release = '0.1.0' 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 57 | #language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | #today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | #today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | exclude_patterns = ['_build'] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | #add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | #show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 87 | #modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | html_theme = 'default' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | #html_theme_options = {} 100 | 101 | # Add any paths that contain custom themes here, relative to this directory. 
102 | #html_theme_path = []
103 | 
104 | # The name for this set of Sphinx documents.  If None, it defaults to
105 | # "<project> v<release> documentation".
106 | #html_title = None
107 | 
108 | # A shorter title for the navigation bar.  Default is the same as html_title.
109 | #html_short_title = None
110 | 
111 | # The name of an image file (relative to this directory) to place at the top
112 | # of the sidebar.
113 | #html_logo = None
114 | 
115 | # The name of an image file (within the static path) to use as favicon of the
116 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
117 | # pixels large.
118 | #html_favicon = None
119 | 
120 | # Add any paths that contain custom static files (such as style sheets) here,
121 | # relative to this directory. They are copied after the builtin static files,
122 | # so a file named "default.css" will overwrite the builtin "default.css".
123 | html_static_path = ['_static']
124 | 
125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
126 | # using the given strftime format.
127 | #html_last_updated_fmt = '%b %d, %Y'
128 | 
129 | # If true, SmartyPants will be used to convert quotes and dashes to
130 | # typographically correct entities.
131 | #html_use_smartypants = True
132 | 
133 | # Custom sidebar templates, maps document names to template names.
134 | #html_sidebars = {}
135 | 
136 | # Additional templates that should be rendered to pages, maps page names to
137 | # template names.
138 | #html_additional_pages = {}
139 | 
140 | # If false, no module index is generated.
141 | #html_domain_indices = True
142 | 
143 | # If false, no index is generated.
144 | #html_use_index = True
145 | 
146 | # If true, the index is split into individual pages for each letter.
147 | #html_split_index = False
148 | 
149 | # If true, links to the reST sources are added to the pages.
150 | html_show_sourcelink = True
151 | 
152 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
153 | #html_show_sphinx = True
154 | 
155 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
156 | #html_show_copyright = True
157 | 
158 | # If true, an OpenSearch description file will be output, and all pages will
159 | # contain a <link> tag referring to it.  The value of this option must be the
160 | # base URL from which the finished HTML is served.
161 | #html_use_opensearch = ''
162 | 
163 | # This is the file name suffix for HTML files (e.g. ".xhtml").
164 | #html_file_suffix = None
165 | 
166 | # Output file base name for HTML help builder.
167 | htmlhelp_basename = 'Redwooddoc'
168 | 
169 | 
170 | # -- Options for LaTeX output --------------------------------------------------
171 | 
172 | latex_elements = {
173 | # The paper size ('letterpaper' or 'a4paper').
174 | #'papersize': 'letterpaper',
175 | 
176 | # The font size ('10pt', '11pt' or '12pt').
177 | #'pointsize': '10pt',
178 | 
179 | # Additional stuff for the LaTeX preamble.
180 | #'preamble': '',
181 | }
182 | 
183 | # Grouping the document tree into LaTeX files. List of tuples
184 | # (source start file, target name, title, author, documentclass [howto/manual]).
185 | latex_documents = [
186 |   ('index', 'Redwood.tex', u'Redwood Documentation',
187 |    u'Paul M', 'manual'),
188 | ]
189 | 
190 | # The name of an image file (relative to this directory) to place at the top of
191 | # the title page.
192 | #latex_logo = None
193 | 
194 | # For "manual" documents, if this is true, then toplevel headings are parts,
195 | # not chapters.
196 | #latex_use_parts = False
197 | 
198 | # If true, show page references after internal links.
199 | #latex_show_pagerefs = False
200 | 
201 | # If true, show URL addresses after external links.
202 | #latex_show_urls = False
203 | 
204 | # Documents to append as an appendix to all manuals.
205 | #latex_appendices = []
206 | 
207 | # If false, no module index is generated.
208 | #latex_domain_indices = True
209 | 
210 | 
211 | # -- Options for manual page output --------------------------------------------
212 | 
213 | # One entry per manual page. List of tuples
214 | # (source start file, name, description, authors, manual section).
215 | man_pages = [
216 |     ('index', 'redwood', u'Redwood Documentation',
217 |      [u'Paul M'], 1)
218 | ]
219 | 
220 | # If true, show URL addresses after external links.
221 | #man_show_urls = False
222 | 
223 | 
224 | # -- Options for Texinfo output ------------------------------------------------
225 | 
226 | # Grouping the document tree into Texinfo files. List of tuples
227 | # (source start file, target name, title, author,
228 | #  dir menu entry, description, category)
229 | texinfo_documents = [
230 |   ('index', 'Redwood', u'Redwood Documentation',
231 |    u'Paul M', 'Redwood', 'A project that implements statistical methods for identifying anomalous files.',
232 |    'Miscellaneous'),
233 | ]
234 | 
235 | # Documents to append as an appendix to all manuals.
236 | #texinfo_appendices = []
237 | 
238 | # If false, no module index is generated.
239 | #texinfo_domain_indices = True
240 | 
241 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
242 | #texinfo_show_urls = 'footnote'
243 | 
244 | # If true, do not generate a @detailmenu in the "Top" node's menu.
245 | #texinfo_no_detailmenu = False
246 | 
247 | 
248 | # -- Options for Epub output ---------------------------------------------------
249 | 
250 | # Bibliographic Dublin Core info.
251 | epub_title = u'Redwood'
252 | epub_author = u'Paul M'
253 | epub_publisher = u'Paul M'
254 | epub_copyright = u'2013 In-Q-Tel, Inc/Lab41, All Rights Reserved.'
255 | 
256 | # The language of the text. It defaults to the language option
257 | # or en if the language is not set.
258 | #epub_language = ''
259 | 
260 | # The scheme of the identifier. Typical schemes are ISBN or URL.
261 | #epub_scheme = ''
262 | 
263 | # The unique identifier of the text. This can be a ISBN number
264 | # or the project homepage.
265 | #epub_identifier = ''
266 | 
267 | # A unique identification for the text.
268 | #epub_uid = ''
269 | 
270 | # A tuple containing the cover image and cover page html template filenames.
271 | #epub_cover = ()
272 | 
273 | # A sequence of (type, uri, title) tuples for the guide element of content.opf.
274 | #epub_guide = ()
275 | 
276 | # HTML files that should be inserted before the pages created by sphinx.
277 | # The format is a list of tuples containing the path and title.
278 | #epub_pre_files = []
279 | 
280 | # HTML files that should be inserted after the pages created by sphinx.
281 | # The format is a list of tuples containing the path and title.
282 | #epub_post_files = []
283 | 
284 | # A list of files that should not be packed into the epub file.
285 | #epub_exclude_files = []
286 | 
287 | # The depth of the table of contents in toc.ncx.
288 | #epub_tocdepth = 3
289 | 
290 | # Allow duplicate toc entries.
291 | #epub_tocdup = True
292 | 
293 | # Fix unsupported image types using the PIL.
294 | #epub_fix_images = False
295 | 
296 | # Scale large images.
297 | #epub_max_image_width = 0 298 | 299 | # If 'no', URL addresses will not be shown. 300 | #epub_show_urls = 'inline' 301 | 302 | # If false, no index is generated. 303 | #epub_use_index = True 304 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #Association-Based Data Reduction (REDWOOD) 2 | 3 | Finding the Tree in the Forest 4 | 5 | ![Redwood](https://raw.github.com/Lab41/Redwood/master/images/logo/redwood_logo.png "Redwood") 6 | 7 | 8 |

Redwood is a Python framework intended to identify anomalous files by analyzing the file metadata of a collection of media. Each file analyzed is assigned a score that signals its reputation relative to other files in the system -- the lower a file's reputation score, the more likely that the file is anomalous. The final reputation score of a given file is based on an aggregation of the scores assigned to it by modules that we call "Filters".

9 |

A Filter is a plugin whose functionality is only limited by the creativity of the developer. Redwood can support any number of Filters, so long as a Filter extends the RedwoodFilter class and produces a table assigning a reputation score to each unique file in the system. Much of the Redwood framework is aimed at making the process of adding new Filters to the system as frictionless as possible (see the Filter section below for more information).

10 |

In addition to the Filters, Redwood also provides an effective data model for analyzing and storing file metadata, an API for interacting with that data, a simple shell for executing Redwood commands, and two example Filters (a "prevalence" Filter and a "locality uniqueness" Filter). Though sample Filters are included in the project, ultimately the effectiveness of Redwood will be based on the Filters that you write for the particular anomaly that you are looking for. To that end, Redwood is nothing more than a simple framework for connecting Filters to a well-formed data model.
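
To get a feel for the workflow before diving into setup, here is a hypothetical session in the Redwood Shell's filter mode (the filter indices, filter names, and source name are illustrative -- your loaded filters and sources will differ):

```
redwood-filter$ list
Available Filters
0............prevalence
1............locality_uniqueness
redwood-filter$ show_results 0 top 5 my_source
Fetching top results from my_source for filter prevalence
redwood-filter$ aggregate_scores 0:60 1:40
Aggregating Scores
```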

11 | 
12 | ##Quick Setup
13 | The instructions that follow should get you up and running quickly. Redwood has been tested on OS X and Linux. Windows will likely work with a few changes.
14 | 
15 | #### Stuff to Download
16 | 1. Python 2.7
17 | 2. Python packages
18 |    * SciPy, Matplotlib, and MySQLdb
19 | 3. MySQL Client for your client OS
20 | 4. MySQL Server for the server hosting the DB
21 | 
22 | #### Prep the Database
23 | Redwood uses a MySQL database to store metadata. In order to use Redwood, you will need to first set up your own MySQL DB, then run the following two SQL scripts to create the required tables and subroutines.
24 | 
25 | ```bash
26 | mysql -uyour_db_user -pyour_password -hyour_host -Dyour_database < sql/create_redwood_db.sql
27 | mysql -uyour_db_user -pyour_password -hyour_host -Dyour_database < sql/create_redwood_sp.sql
28 | ```
29 | 
30 | #### Create a config
31 | 
32 | Create a file containing the following configuration information specific to your database
33 | 
34 | ```
35 | [mysqld]
36 | database:your_db_name
37 | host:your_host
38 | username:your_username
39 | password:your_password
40 | ```
41 | 
42 | ## Run Redwood
43 | 
44 | There are two ways that you can run Redwood. If you just want to play with the tool, and maybe create a couple of filters, the "Redwood Shell" method is probably the best choice. If you want to make modifications to the core package and/or create your own UI, then you probably want to use the API. Examples of how to do both are below:
45 | 
46 | #### Using the Redwood Shell
47 | 
48 | ```bash
49 | #append the Redwood directory to the python path
50 | export PYTHONPATH=/path/to/Redwood
51 | #from the Redwood directory run
52 | python bin/redwood /path/to/config
53 | ```
54 | 
55 | #### Using the API to create your Application
56 | This is a brief example of how to use the API to load a media source into the database and then run specific filter functions on that source
57 | 
58 | ```python
59 | import redwood.connection.connect as connect
60 | import redwood.io.csv_importer as loader
61 | import redwood.helpers.core as core
62 | 
63 | #connect to the database
64 | cnx = connect.connect_with_config("my_db.cfg")
65 | 
66 | #load a directory of csv data pulls into the database
67 | loader.run(cnx, "directory_containing_csv_data_pulls")
68 | 
69 | core.import_filters("./Filters", cnx)
70 | 
71 | #grab instances of two specific filters
72 | fp = core.get_filter_by_name("prevalence")
73 | lu = core.get_filter_by_name("locality_uniqueness")
74 | 
75 | #generate a histogram to see distribution of files for that source
76 | fp.discover_histogram_by_source("some_source")
77 | 
78 | #run a survey for a particular source
79 | fp.run_survey("some_source")
80 | ```
81 | 
82 | 
83 | ##Documentation
84 | From the root project directory, run the following:
85 | ```bash
86 | sphinx-apidoc -o docs redwood -F; pushd docs; make html; make man; popd
87 | ```
88 | 
89 | ###Data
90 | 
91 | Redwood currently only loads data from a CSV file with the fields below.
Information about these fields can typically be found in a stat struct or the output of the stat command.
92 | 
93 | |Field Name | Field Description|
94 | |-----------|------------------|
95 | |file_id| Unique id of the file |
96 | |parent_id| file_id of the parent |
97 | |dirname| path excluding filename |
98 | |basename| filename |
99 | |hash| SHA-1 of file contents |
100 | |fs_id| Inode on Linux, or the non-Linux equivalent |
101 | |device| Device node identifier |
102 | |permissions| Permissions of the file |
103 | |uid| User owner of the file |
104 | |gid| Group owner of the file |
105 | |size| Size in bytes |
106 | |create_time | file creation time in seconds since epoch |
107 | |access_time| file last accessed in seconds since epoch |
108 | |mod_time| file last modified in seconds since epoch |
109 | |metadata_change_time| file metadata last changed in seconds since epoch |
110 | |user_flags| user flags |
111 | |links| links to the file |
112 | |disk_offset| disk offset |
113 | |entropy| entropy of the file |
114 | |file_content_status|file content status|
115 | |extensions| file extension if available |
116 | |file_type| file type if auto discovered |
117 | 
118 | 
119 | The **sql/filewalk.py** script will walk an HFS+ file system and (perhaps) other Unix/Linux file systems, collecting the relevant metadata using the stat command. The output will be in the appropriate format for the load_csv command. Note that the importer derives the acquisition date, operating system, and source name from the CSV file name, which must consist of three fields separated by "--", with the date formatted as YYYY-MM-DD (e.g. 2013-10-19--osx--my_laptop). Also note that this script has been optimized for Linux/OS X; it will not work on a Windows system (updates welcome).
120 | 
121 | 
122 | 
123 | ##Redwood Architecture
124 | 
125 | Redwood is composed of 5 core engines, all backed by a MySQL DB
126 | 
127 | 1. Ingestion Engine
128 |   - The ingestion engine is responsible for importing data into the datastore from a metadata source file (currently only supporting csv).
129 | 2. Global Analytics Engine
130 |   - The Global Analytics Engine is responsible for performing analytics on a global scale against all metadata and then providing those results to all filters for subsequent computation in the form of queryable tables. This engine typically conducts time-intensive queries that you only want to perform once per new source. Currently, the only Global Analytics Engine is the "Prevalence" analyzer. This is not to be confused with the prevalence filter, which leverages the tables produced by the Prevalence analyzer.
131 | 3. Filter Engine
132 |   - The Filter Engine has two main responsibilities. The first is to create a table for the reputation scores that it has calculated for each unique file in the database. The second is to optionally provide a series of "Discovery" functions that are associated with the filter scoring yet can be used independently by the end user or developer to discover in more detail why a file has a particular score. For more information, please refer to the "All About Filters" section.
133 | 4. Aggregation Engine
134 |   - The Aggregation Engine is responsible for two main duties: (1) aggregating the scores of each filter into a single reputation score based on some aggregation algorithm, and (2) freezing global reputation scores if the engine deems them either definitely high or definitely low.
135 | 5. Reporting Engine
136 |   - The Reporting Engine is responsible for generating a comprehensive report highlighting user-specified information about the data.
137 | 
138 | 
139 | ##All About Filters
140 | 
141 | ####Summary
142 | Filters are the foundation of file scoring in Redwood. A Filter's central purpose is to create a score for each unique file in the system.
After Redwood runs all of the filters, each unique file should have a score from each filter. Redwood then combines these scores using an aggregation function, so that each unique file ends up with a single reputation score in the unique_file table. Keep in mind that numerous filters can exist in a Redwood project.
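
As a rough illustration of that weighted combination (a simplified sketch, not the shipped Aggregator -- see redwood/foundation/aggregator.py for the real logic), using the percentage weights described under the shell's aggregate_scores command:

```python
#Simplified sketch of weighted score aggregation -- not the shipped Aggregator.
#Weights follow the aggregate_scores convention: percentages that sum to 100.
def aggregate_sketch(filter_scores, weights):
    """filter_scores: one dict per filter mapping unique_file_id -> score;
    weights: one percentage (0-100) per filter."""
    reputation = {}
    for scores, weight in zip(filter_scores, weights):
        for file_id, score in scores.items():
            reputation[file_id] = reputation.get(file_id, 0.0) + score * weight / 100.0
    return reputation

#Three filters weighted 50/30/20, mirroring "aggregate_scores 0:50 1:30 2:20":
#a file scored 0.2, 0.5, and 0.9 ends up with a reputation of 0.43
print aggregate_sketch([{1: 0.2}, {1: 0.5}, {1: 0.9}], [50, 30, 20])
```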
143 | In addition to generating a score for each file, a Filter can optionally create one or more "Discovery" functions. A Discovery function is a function that allows the user of the Filter to explore the data beyond just deriving a score. It is common for a Discovery function to also be used in the calculations for file scoring -- the Redwood model just provides a structured way for the developer to make that function available to the end user.
144 | 
145 | ####Writing your own Filter
146 | Your filter should inherit from the base class RedwoodFilter in redwood.filters.redwood_filter. You must override those functions that raise a "NotImplementedError". To assist in writing your own filter, look at the sample filters (locality_uniqueness and filter_prevalence) in the Filters directory.
147 | 
148 | - If you are using the Redwood Shell, any Filter placed in the Filters directory will be automatically imported into the application.
149 | - All discovery functions should be prefixed with "discover_" in their name so that during introspection a developer knows which functions are intended for discovery
150 | - A Filter is free to create any tables in the database. This can become necessary for efficiently calculating the reputation scores
151 | - The update function must produce (or update if it exists) a table called self.score_table with two columns (id, score) where the id is the unique_file.id of the given file and the score is the calculated score
152 | - The self.cnx instance variable must be set prior to running any of the functions of the filter. The self.cnx is a mysql connection object. Redwood will set the cnx instance if you use its import functions.
153 | 
154 | 
155 | ```python
156 | class YourFilterName(RedwoodFilter):
157 | 
158 |     def __init__(self):
159 |         self.name = "YourFilterName"
160 |         self.score_table = "YourScoreTableName"
161 |         self.cnx = None
162 | 
163 |     def usage(self):
164 |         print "Your usage statement"
165 | 
166 |     def update(self, source_name):
167 |         #code to update all filter tables with source_name data
168 | 
169 |     #survey function
170 |     def run_survey(self, source):
171 |         your code
172 | 
173 |     #build
174 |     def build(self):
175 |         your code
176 | 
177 |     #clean
178 |     def clean(self):
179 |         your code
180 | 
181 |     #discovery functions
182 |     def discover_your_discover_func0(self, arg0, ..., argN):
183 |         your code
184 |     ...
185 |     def discover_your_discover_funcM(self, arg0, ..., argN):
186 |         your code
187 | 
188 | ```
189 | 
190 | ##Screen Shots
191 | Screenshot of the Sample Shell
191 | ![Shell](https://raw.github.com/Lab41/Redwood/master/images/redwood_0.png "Redwood Shell") 192 |
Screenshot of the Filter Options
193 | ![Shell](https://raw.github.com/Lab41/Redwood/master/images/discovery.png "Filter Options") 194 |
Screenshot of the File Distribution discovery function for Filter Prevalence
195 | ![Shell](https://raw.github.com/Lab41/Redwood/master/images/histogram0.png "Prevalence Filter file distribution") 196 |
Screenshot of the discovery function for Locality Uniqueness
197 | ![Clustering](https://raw.github.com/Lab41/Redwood/master/images/clustering.png "Locality Uniqueness Clustering")
198 | 
199 | 
200 | 
201 | 
202 | 
203 | ##Optimizing MySQL Notes
204 | bulk_insert_buffer_size: 8G (set in the [mysqld] section of the MySQL server configuration)
205 | 
206 | [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/Lab41/redwood/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
207 | 
--------------------------------------------------------------------------------
/redwood/foundation/report.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #    http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | 
18 | """
19 | Created on 19 October 2013
20 | @author: Lab41
21 | """
22 | 
23 | import os
24 | import shutil
25 | import math
26 | from redwood.filters import filter_list
27 | from redwood.helpers import core
28 | import matplotlib.pylab as plt
29 | from redwood.foundation.aggregator import Aggregator
30 | 
31 | 
32 | class Report():
33 |     def __init__(self, cnx, source_info):
34 |         self.report_dir = "reports"
35 |         self.cnx = cnx
36 |         self.source = source_info
37 | 
38 |     def run(self, agg_weights=None):
39 | 
40 |         print "Running report survey for: " + self.source.source_name
41 |         print "... aggregating most recent filter scores"
aggregating most recent filter scores" 42 | ag = Aggregator(self.cnx) 43 | ag.aggregate(filter_list, agg_weights) 44 | self.run_filter_survey() 45 | self.generate_report() 46 | 47 | #collects survey reports from each filter and aggregates the results into one central report 48 | def run_filter_survey(self): 49 | print "...Generating Report" 50 | for f in filter_list: 51 | f.cnx = self.cnx 52 | print f.name 53 | path = f.run_survey(self.source.source_name) 54 | try: 55 | shutil.rmtree(self.report_dir + "/" + self.source.source_name + "/filters/" + f.name) 56 | except: 57 | pass 58 | 59 | if path == None: 60 | continue 61 | 62 | shutil.move(path, self.report_dir + "/" + self.source.source_name + "/filters/" + f.name) 63 | 64 | def generate_report(self): 65 | report_dir = "reports/" + self.source.source_name 66 | report_file = self.source.source_name + "_report.html" 67 | html_file = os.path.join(report_dir, report_file) 68 | 69 | score_counts = core.get_reputation_by_source(self.cnx, self.source.source_name) 70 | 71 | bins = [.05,.1,.15,.2,.25,.30,.35,.40,.45,.50,.55,.60,.65,.70,.75,.80,.85,.90,.95,1.00] 72 | scores, counts = zip(*score_counts) 73 | fig = plt.figure() 74 | ax = fig.add_subplot(111, title="Reputation Distribution") 75 | ax.hist(scores, weights=counts, bins = bins) 76 | ax.set_xlabel("Reputation Score") 77 | ax.set_ylabel("File Occurrences") 78 | 79 | threshold = None 80 | #TODO: if you have a truth source, use it here 81 | #if you have a validation engine, use the line below 82 | #threshold = core.get_malware_reputation_threshold(self.cnx) 83 | #print "thres: {}".format(threshold) 84 | #if threshold is not None: 85 | # plt.axvline(x=threshold, color="r", ls='--') 86 | #plt.xticks(bins) 87 | 88 | #for tick in ax.xaxis.get_major_ticks(): 89 | # tick.label.set_fontsize(8) 90 | 91 | hist_reputation = os.path.join(report_dir, "rep.png") 92 | plt.savefig(hist_reputation) 93 | 94 | 95 | table_height = int(math.ceil(len(score_counts) / float(3))) 96 | file_count = 0 97 | for s in score_counts: 98 | file_count += s[1] 99 | 100 | with open(html_file, 'w') as f: 101 | f.write(""" 102 | 103 | 104 | 105 | 106 | 107 | 119 |
120 |

Report for {}

\n""".format(self.source.source_name)) 121 | f.write("\t\t

Source Information

\n") 122 | f.write("\t\t
\n") 123 | f.write("\t\t\t
Acquisition Date: {}
\n".format(self.source.date_acquired)) 124 | f.write("\t\t\t
Operating System: {}
\n".format(self.source.os_name)) 125 | f.write("\t\t\t
File Count: {}
\n".format(file_count)) 126 | f.write("\t\t
\n\t\t
\n") 127 | f.write("\t\t
\n") 128 | f.write("\t\t\n") 129 | f.write("\t\t\t\n") 130 | f.write("\t\t\t\n") 131 | f.write(""" 132 | 133 | 134 | """) 135 | f.write(""" 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | """) 147 | for i in range(0, table_height): 148 | if len(score_counts) == 1: 149 | f.write(""" 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | """.format(score_counts[i][0], score_counts[i][1])) 161 | elif table_height * 2 + i >= len(score_counts): 162 | if i == table_height - 1: 163 | f.write(""" 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | """.format(score_counts[i][0], score_counts[i][1], 175 | score_counts[table_height + i][0], score_counts[table_height + i][1])) 176 | else: 177 | f.write(""" 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | """.format(score_counts[i][0], score_counts[i][1], 186 | score_counts[table_height + i][0], score_counts[table_height + i][1])) 187 | else: 188 | f.write(""" 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | """.format(score_counts[i][0], score_counts[i][1], 197 | score_counts[table_height + i][0], score_counts[table_height + i][1], 198 | score_counts[table_height * 2 + i][0], score_counts[table_height * 2 + i][1])) 199 | f.write(""" 200 |
File Score DistributionReputation Distribution
ScoreCountScoreCountScoreCount
{}{}
{}{}{}{}
{}{}{}{}
{}{}{}{}{}{}
201 |
202 | """) 203 | #This is what the query should look like in production 204 | cursor = self.cnx.cursor() 205 | #cursor.execute(""" 206 | # SELECT file_metadata.file_name AS Filename, 207 | # unique_file.reputation AS Reputation, 208 | # unique_path.full_path As Path, 209 | # unique_file.hash AS Hash 210 | # FROM file_metadata 211 | # INNER JOIN unique_file 212 | # ON file_metadata.unique_file_id = unique_file.id 213 | # INNER JOIN unique_path 214 | # ON file_metadata.unique_path_id = unique_path.id 215 | # WHERE source_id = {} 216 | # ORDER BY unique_file.reputation ASC 217 | # LIMIT 0, 100 218 | # """.format(source.os_id)) 219 | #Use this query if unique_file.reputation is no indexed 220 | cursor.execute(""" 221 | SELECT file_metadata.file_name AS Filename, 222 | unique_file.reputation AS Reputation, 223 | unique_path.full_path As Path, 224 | unique_file.hash AS Hash 225 | FROM file_metadata 226 | INNER JOIN unique_file 227 | ON file_metadata.unique_file_id = unique_file.id 228 | INNER JOIN unique_path 229 | ON file_metadata.unique_path_id = unique_path.id 230 | WHERE source_id = %s 231 | ORDER BY unique_file.reputation ASC 232 | LIMIT 0, 100 233 | """, (self.source.source_id,)) 234 | col_length = len(cursor.description) 235 | field_names = cursor.description 236 | results = cursor.fetchall() 237 | f.write("\t\t
\n") 238 | f.write("\t\t\n") 239 | f.write("\t\t\t\n") 240 | f.write(""" 241 | 242 | """) 243 | for i in range(0, col_length): 244 | if i == 0: 245 | f.write("".format(field_names[i][0])) 246 | elif i == col_length - 1: 247 | f.write("".format(field_names[i][0])) 250 | f.write(""" 251 | 252 | 253 | """) 254 | for row in results: 255 | f.write("") 256 | for l in row: 257 | f.write("".format(l)) 258 | f.write("\n") 259 | f.write("
Lowest Reputation Files (100)
{}{}".format(field_names[i][0])) 248 | else: 249 | f.write("{}
{}
") 260 | f.write("") 261 | f.close() 262 | -------------------------------------------------------------------------------- /redwood/foundation/prevalence.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | """ 19 | Created on 19 October 2013 20 | @author: Lab41 21 | """ 22 | 23 | 24 | class PrevalenceAnalyzer(): 25 | """ 26 | The PrevalenceAnalyzer is a core component of Redwood for determining prevalence 27 | analytics that can then be made available to all filters. 28 | """ 29 | 30 | def __init__(self, cnx): 31 | self.cnx = cnx 32 | 33 | def update(self, sources): 34 | """ 35 | Analyzes all sources from the source_os_list, storing results in the global tables 36 | for prevalence 37 | 38 | :param sources: a list of SourceInfo instances containing information about the sources. 39 | """ 40 | self.build() 41 | 42 | print "[+] Conducting global analysis for prevalence" 43 | 44 | cursor = self.cnx.cursor() 45 | #iterate through each of the new sources, updating the prevalence table accordingly 46 | for source in sources: 47 | print source.source_name 48 | #will need to fetch the number of systems first for the given os 49 | #query = """ 50 | # select COUNT(os.name) from os LEFT JOIN media_source ON(os.id = media_source.os_id) 51 | # where os.id = {} GROUP BY os.name 52 | #""".format(source.os_id) 53 | 54 | cursor.execute(""" 55 | select COUNT(os.name) from os 56 | LEFT JOIN media_source 57 | ON(os.id = media_source.os_id) 58 | where os.id = %s GROUP BY os.name 59 | """, (source.os_id,)) 60 | num_systems = cursor.fetchone()[0] 61 | 62 | #this query will either insert a new entry into the table or update an existing ones 63 | #This will only get prevalence of files, NOT directories since all directories have the same zero 64 | #contents hash. 
65 |             #query = """
66 |             #    INSERT INTO global_file_prevalence(unique_file_id, count, num_systems, os_id)
67 |             #    SELECT t.unique_file_id, COUNT(unique_file_id) as count, t.num_systems, t.os_idd from
68 |             #    (SELECT DISTINCT unique_file_id, media_source.id as src, s.os_idd, num_systems
69 |             #    from file_metadata JOIN media_source ON (file_metadata.source_id = media_source.id)
70 |             #    LEFT JOIN( select os.id as os_idd, os.name as os, COUNT(os.name) as num_systems
71 |             #    from os LEFT JOIN media_source ON(os.id = media_source.os_id)
72 |             #    WHERE os.id = {} GROUP BY os.name ) s
73 |             #    ON (s.os_idd = file_metadata.os_id) where media_source.id = {} AND file_metadata.unique_file_id is not null) t
74 |             #    GROUP BY t.os_idd, t.unique_file_id
75 |             #    ON DUPLICATE KEY UPDATE count=count+1
76 |             #""".format(source.os_id, source.source_id)
77 | 
78 |             cursor.execute("""
79 |                 INSERT INTO global_file_prevalence(unique_file_id, count, num_systems, os_id)
80 |                 SELECT t.unique_file_id, COUNT(unique_file_id)
81 |                 as count, t.num_systems, t.os_idd from
82 |                 (SELECT DISTINCT unique_file_id, media_source.id
83 |                 as src, s.os_idd, num_systems
84 |                 from file_metadata JOIN media_source
85 |                 ON (file_metadata.source_id = media_source.id)
86 |                 LEFT JOIN(select os.id as os_idd, os.name
87 |                 as os, COUNT(os.name) as num_systems
88 |                 from os LEFT JOIN media_source
89 |                 ON(os.id = media_source.os_id)
90 |                 WHERE os.id = %s GROUP BY os.name) s
91 |                 ON (s.os_idd = file_metadata.os_id)
92 |                 where media_source.id = %s
93 |                 AND file_metadata.unique_file_id is not null) t
94 |                 GROUP BY t.os_idd, t.unique_file_id
95 |                 ON DUPLICATE KEY UPDATE count=count+1
96 |                 """, (source.os_id, source.source_id,))
97 | 
98 |             #TODO: use a local variable for num_systems
99 |             #query = """
100 |             #    UPDATE global_file_prevalence SET num_systems = {}, average = (SELECT count/num_systems) where os_id = {}
101 |             #""".format(num_systems, source.os_id)
102 | 
103 |             cursor.execute("""
104 |                 UPDATE global_file_prevalence
105 |                 SET num_systems = %s, average =
106 |                 (SELECT count/num_systems) where os_id = %s
107 |                 """, (num_systems, source.os_id,))
108 | 
109 |             #get the prevalence of directories
110 |             #query = """
111 |             #    INSERT INTO global_dir_prevalence (unique_path_id, count, num_systems, os_id)
112 |             #    SELECT unique_path.id as path_id, COUNT(file_metadata.id) as count, t.num_systems, file_metadata.os_id
113 |             #    from unique_path LEFT JOIN file_metadata
114 |             #    ON file_metadata.unique_path_id = unique_path.id LEFT JOIN
115 |             #    (SELECT os.id as os_i, COUNT(media_source.id) as num_systems from os
116 |             #    LEFT JOIN media_source ON os.id = media_source.os_id
117 |             #    GROUP BY os.id) as t ON (file_metadata.os_id = t.os_i)
118 |             #    where file_metadata.file_name = '/' AND file_metadata.source_id = {}
119 |             #    GROUP BY file_metadata.os_id, unique_path.id
120 |             #    ON DUPLICATE KEY UPDATE count=count+1
121 |             #""".format(source.source_id)
122 | 
123 |             cursor.execute("""
124 |                 INSERT INTO global_dir_prevalence (unique_path_id, count, num_systems, os_id)
125 |                 SELECT unique_path.id as path_id,
126 |                 COUNT(file_metadata.id)
127 |                 as count, t.num_systems, file_metadata.os_id
128 |                 from unique_path LEFT JOIN file_metadata
129 |                 ON file_metadata.unique_path_id = unique_path.id
130 |                 LEFT JOIN (SELECT os.id as os_i,
131 |                 COUNT(media_source.id)
132 |                 as num_systems from os
133 |                 LEFT JOIN media_source
134 |                 ON os.id = media_source.os_id
135 |                 GROUP BY os.id)
136 |                 as t ON (file_metadata.os_id = 
t.os_i)
137 |                 where file_metadata.file_name = '/'
138 |                 AND file_metadata.source_id = %s
139 |                 GROUP BY file_metadata.os_id, unique_path.id
140 |                 ON DUPLICATE KEY UPDATE count=count+1
141 |                 """, (source.source_id,))
142 | 
143 |             #query = """
144 |             #    UPDATE global_dir_prevalence SET num_systems = {}, average = (SELECT count/num_systems) where os_id = {}
145 |             #""".format(num_systems, source.os_id)
146 | 
147 |             cursor.execute("""
148 |                 UPDATE global_dir_prevalence
149 |                 SET num_systems = %s,
150 |                 average = (SELECT count/num_systems)
151 |                 where os_id = %s
152 |                 """, (num_systems, source.os_id,))
153 | 
154 |         self.cnx.commit()
155 | 
156 |         #TODO: There should be a better way to do this
157 |         print "[+] Rebuilding the aggregated prevalence table for directories"
158 | 
159 |         if len(sources) == 0:
160 |             return
161 | 
162 |         cursor.execute("DROP TABLE IF EXISTS global_dir_combined_prevalence")
163 | 
164 |         self.cnx.commit()
165 | 
166 |         query = """
167 |             CREATE TABLE IF NOT EXISTS global_dir_combined_prevalence (
168 |             unique_path_id INT UNSIGNED NOT NULL,
169 |             average DOUBLE NOT NULL DEFAULT .5,
170 |             PRIMARY KEY(unique_path_id),
171 |             CONSTRAINT fk_unique_path_idx3 FOREIGN KEY(unique_path_id)
172 |             REFERENCES unique_path (id)
173 |             ON DELETE NO ACTION ON UPDATE NO ACTION
174 |             ) ENGINE = InnoDB;
175 |         """
176 | 
177 |         cursor.execute(query)
178 |         self.cnx.commit()
179 | 
180 |         query = """
181 |             INSERT INTO global_dir_combined_prevalence
182 |             SELECT unique_path_id, avg(average) FROM file_metadata
183 |             INNER JOIN global_file_prevalence ON file_metadata.unique_file_id = global_file_prevalence.unique_file_id
184 |             where file_metadata.file_name != '/' GROUP BY unique_path_id
185 |         """
186 | 
187 |         cursor.execute(query)
188 |         self.cnx.commit()
189 | 
190 |     def clean(self):
191 |         """
192 |         Removes all tables created by this analyzer
193 |         """
194 | 
195 |         cursor = self.cnx.cursor()
196 |         cursor.execute("DROP TABLE IF EXISTS global_file_prevalence")
197 |         cursor.execute("DROP TABLE IF EXISTS global_dir_prevalence")
198 |         cursor.execute("DROP TABLE IF EXISTS global_dir_combined_prevalence")
199 |         self.cnx.commit()
200 | 
201 | 
202 |     def build(self):
203 |         """
204 |         Builds all required tables
205 |         """
206 |         cursor = self.cnx.cursor()
207 | 
208 |         query = """
209 |             CREATE TABLE IF NOT EXISTS global_file_prevalence (
210 |             unique_file_id BIGINT UNSIGNED NOT NULL,
211 |             average DOUBLE NOT NULL DEFAULT .5,
212 |             count INT NOT NULL DEFAULT 0,
213 |             num_systems INT NOT NULL DEFAULT 0,
214 |             os_id INT UNSIGNED NOT NULL,
215 |             PRIMARY KEY(unique_file_id, os_id),
216 |             INDEX idx_fp_average (average) USING BTREE,
217 |             INDEX fk_unique_file_idx1 (unique_file_id),
218 |             INDEX fk_os_id_idx1 (os_id),
219 |             CONSTRAINT fk_unique_file_idx1 FOREIGN KEY(unique_file_id)
220 |             REFERENCES unique_file (id)
221 |             ON DELETE NO ACTION ON UPDATE NO ACTION,
222 |             CONSTRAINT fk_os_id_idx1 FOREIGN KEY(os_id)
223 |             REFERENCES os (id)
224 |             ON DELETE NO ACTION ON UPDATE NO ACTION
225 |             ) ENGINE = InnoDB;
226 |         """
227 | 
228 |         cursor.execute(query)
229 | 
230 |         query = """
231 |             CREATE TABLE IF NOT EXISTS global_dir_prevalence (
232 |             unique_path_id INT UNSIGNED NOT NULL,
233 |             average DOUBLE NOT NULL DEFAULT .5,
234 |             count INT NOT NULL DEFAULT 0,
235 |             num_systems INT NOT NULL DEFAULT 0,
236 |             os_id INT UNSIGNED NOT NULL,
237 |             PRIMARY KEY(unique_path_id, os_id),
238 |             INDEX fk_unique_path_idx1 (unique_path_id),
239 |             INDEX fk_os_id_idx2 (os_id),
240 |             CONSTRAINT fk_unique_path_idx2 FOREIGN KEY(unique_path_id)
241 |             REFERENCES unique_path (id)
242 |             ON 
DELETE NO ACTION ON UPDATE NO ACTION, 243 | CONSTRAINT fk_os_id_idx2 FOREIGN KEY(os_id) 244 | REFERENCES os (id) 245 | ON DELETE NO ACTION ON UPDATE NO ACTION 246 | ) ENGINE = InnoDB; 247 | """ 248 | 249 | cursor.execute(query) 250 | 251 | self.cnx.commit() 252 | cursor.close() 253 | -------------------------------------------------------------------------------- /images/logo/license/SIL_open_font_license.txt: -------------------------------------------------------------------------------- 1 | SIL Open Font License (OFL) 2 | 3 | General Information 4 | 5 | Text of the SIL Open Font License 6 | 7 | The OFL FAQ 8 | 9 | Fonts licensed via the OFL 10 | 11 | OFL Graphics 12 | 13 | 14 | Overview 15 | Documents 16 | Current version - 1.1 17 | Translations 18 | Using the OFL 19 | History 20 | Community review 21 | OFL fonts 22 | Details and rationale 23 | FLOSS-friendliness 24 | The 4 FSF Freedoms 25 | DFSG compatibility 26 | OSD compatibility 27 | "Human readable" version and visual representation 28 | Terminology 29 | Visual representation 30 | Attribution 31 | Share Alike 32 | Embedding 33 | DerivativeRenaming 34 | BundlingWhenSelling 35 | 36 | 37 | 38 | New version of the OFL-FAQ available: version 1.1-update3 39 | 40 | There is a new version of the OFL-FAQ (version 1.1-update3) available based on feedback from the wider open font design community. Various sections have been clarified but the main changes are mostly related to web font use and modification. There is also a separate discussion paper on Web Font and Reserved Font Names. 41 | 42 | Overview 43 | 44 | 45 | 46 | The SIL Open Font License (OFL) is a free, libre and open source license specifically designed for fonts and related software based on our experience in font design and linguistic software engineering. 47 | 48 | The OFL provides a legal framework and infrastructure for worldwide development, sharing and improvement of fonts and related software in a collaborative manner. It enables font authors to release their work under a common license that allows use, bundling, modification and redistribution. It encourages shared value, is not limited to any specific computing platform or environment, and can be used by other organizations or individuals. 49 | 50 | The OFL meets the specific needs of typographic design and engineering as well as the gold standards of the FLOSS (Free/Libre and Open Source Software) community, namely the cultural values and guidelines from the FSF 1, the Debian Free Software Guidelines2, as well as the Open Source Definition3. It draws inspiration from concepts and elements found in other licenses, but our improvements in the specific area of fonts have made the licensing model work better than other approaches currently in use. 51 | 52 | SIL International serves language communities worldwide, building their capacity for sustainable language development, by means of research, translation, training and materials development. We have been thinking about more open and participative models for a while, for example through our partnerships with UNESCO (Initiative B@bel) and our work on the Gentium typeface. See www.sil.org/resources/software_fonts for a detailed list of free/libre and open source software resources provided by SIL. 
53 | 54 | We want to: 55 | 56 | enable others to participate in our projects 57 | enable others to cater to needs for which we don't have the resources 58 | share our wealth of knowledge and experience in the area of writing systems and pass on our tools 59 | equip the community to meet its font needs 60 | We serve the peoples of the world without regard to their material wealth, so we are grateful to those that do fund our work. Please visit Donate to SIL International for information on supporting our efforts. 61 | 62 | Documents 63 | 64 | We have gone through a lot of effort to make our license readable and easily understood by users, designers and software developers as well as package maintainers and distributors. To make the OFL even more human-readable, we have provided a FAQ (Frequently Asked Questions) to help everyone understand the intent and the practical aspects of using the license itself. Although it already covers many items, the FAQ will grow as needed. Please let us know if you have more questions. 65 | 66 | Current version - 1.1 67 | We recommend all authors use version 1.1 of the OFL, but version 1.0 is given here for reference. A full list of changes from 1.0 to 1.1 can be found on the OFL Review page. The most important change for authors is that no font names are reserved by default. Reserved Font Names must be explicitly listed alongside the copyright statement in the OFL header. 68 | 69 | Format OFL OFL-FAQ 70 | web (html) OFL 1.1 OFL-FAQ 1.1-update3 71 | plain text 72 | OFL Plaintext 73 | Nicolas Spalinger & Victor Gaultney, 2007-02-26 74 | Download "OFL.txt", Text document, 5KB [45809 downloads] 75 | OFL-FAQ Plaintext (1.1-update3) 76 | Nicolas Spalinger & Victor Gaultney, 2013-09-19 77 | Download "OFL-FAQ.txt", Text document, 57KB [19412 downloads] 78 | OFL 1.1 Documents 79 | 80 | Format OFL OFL-FAQ 81 | web (html) OFL 1.0 OFL-FAQ 1.0 82 | plain text 83 | OFL 1.0 Plaintext 84 | Nicolas Spalinger & Victor Gaultney, 2005-11-22 85 | Download "OFL10.txt", Text document, 4KB [10968 downloads] 86 | OFL-FAQ 1.0 Plaintext 87 | Nicolas Spalinger & Victor Gaultney, 2005-11-22 88 | Download "ofl-faq10.txt", Text document, 18KB [8591 downloads] 89 | OFL 1.0 Documents (for reference only) 90 | 91 | Translations 92 | We also recognise the need for people who are not familiar with English to be able to understand the OFL and this FAQ better - in their own language. If you are an experienced translator, you are very welcome to help by translating the OFL and its FAQ so that designers and users in your language community can understand the license better. But only the original English version of the license has legal value and has been approved by the community. Translations do not count as legal substitutes and should only serve as a way to explain the original license. SIL - as the author and steward of the license for the community at large - does not approve any translation of the OFL as legally valid because even small translation ambiguities could be abused and create problems. 93 | 94 | We give permission to publish unofficial translations into other languages provided that they comply with the following guidelines: 95 | 96 | 1) Put the following disclaimer in both English and the target language stating clearly that the translation is unofficial: 97 | 98 | "This is an unofficial translation of the SIL Open Font License into $language. It was not published by SIL International, and does not legally state the distribution terms for fonts that use the OFL. 
A release under the OFL is only valid when using the original English text. 99 | 100 | However, we recognize that this unofficial translation will help users and designers not familiar with English to understand the SIL OFL better and make it easier to use and release font families under this collaborative font design model. We encourage designers who consider releasing their creation under the OFL to read the FAQ in their own language if it is available. 101 | Please go to http://scripts.sil.org/OFL for the official version of the license and the accompanying FAQ." 102 | 103 | 2) Keep your unofficial translation current and update it at our request if needed, for example, if there is any ambiguity which could lead to confusion. 104 | 105 | If you start such a unofficial translation effort of the OFL and its accompanying FAQ please let us know, thank you. 106 | 107 | Using the OFL 108 | 109 | It is relatively simple to use the OFL for your own font project. If you are the copyright owner you only need to do the following: 110 | 111 | Put your copyright and Reserved Font Names information at the beginning of the main OFL.txt file in place of the dedicated placeholders (marked with the <> characters). Include this file in your release package. 112 | Put your copyright and the OFL text with your chosen Reserved Font Name(s) into your font files (the copyright and license fields). A link to the OFL text on the OFL web site is an acceptable (but not recommended) alternative. Also add this information to any other components (build scripts, glyph databases, documentation, test files, etc). Accurate metadata in your font files is beneficial to you as an increasing number of applications are exposing this information to the user. For example, clickable links can bring users back to your website and let them know about other work you have done or services you provide. Depending on the format of your fonts and sources, you can use template human-readable headers or machine-readable metadata. You should also double-check that there is no conflicting metadata in the font itself contradicting the license, such as the fstype bits in the os2 table or fields in the name table. 113 | Write an initial FONTLOG.txt for your font and include it in the release package (see Section 6 and Appendix A of the OFL-FAQ for details including a template). 114 | Include the relevant practical documentation on the license by adding the current OFL-FAQ.txt file in your package. 115 | If you wish, you can use the OFL Graphics on your web page. 116 | More information can be found in the OFL-FAQ. 117 | 118 | History 119 | 120 | Current version: 1.1 121 | 122 | 2013-09-19 - OFL-FAQ 1.1-update3. 123 | 124 | 2013-05-17 - OFL-FAQ 1.1-update3-draft and discussion paper on Web Font and Reserved Font Names available for review and comment. 125 | 126 | 2010-08-23 - OFL-FAQ 1.1-update2. 127 | 128 | 2009-04-06 - OFL recognized as compliant with the OSD (Open Source Definition) by the OSI board and placed on their list of approved licenses. 129 | 130 | 2007-02-26 - Version 1.1 released. 131 | 132 | 2006-03-18 - A minor revision of the OFL entered the review phase. OFL-1.1-review1 was followed by OFL-1.1-review2 a few months later. 133 | 134 | 2006-01-23 - OFL recognized as a free license by the FSF (Free Software Foundation) on their License List. 135 | 136 | 2005-11-22 - Version 1.0 released. 137 | 138 | 2005-11-07 - Version 1.0-review2 submitted to ofl-discuss. 
139 | 140 | 2005-09-07 - Version 1.0-review1 submitted to the first round of public reviewers. 141 | 142 | Community review 143 | Between November 2005 and January 2007 the OFL was in a public review stage, with efforts going towards version 1.1. We selected a number of reviewers we felt were the relevant experts and sought their input. We submitted our draft for review and received very insightful feedback. 144 | 145 | The review period is over and even though we feel version 1.1 will likely meet the needs for open font licensing for quite some time, we remain open to community feedback. Please contact us with your queries and suggestions. 146 | 147 | Various font-related BoFs (Birds of a Feather meetings) have taken place at FLOSS conference (like Libre Graphics Meeting, Ubuntu Summit, GUADEC, DebConf, TextLayoutSummit among others) to discuss what would be needed to improve the font landscape. One key aspect was appropriate licensing of the fonts, flexibility to maintain and branch fonts without breaking rendering, interoperability across distributions, and the definition of a core set of fonts with recognized glyph quality, sufficient Unicode coverage and a good community-recognized license. The OFL has been recognised by many contributors to these discussion as a good solution for these issues. 148 | 149 | The goals of the OFL and its methodology have been presented and discussed at major conferences from the type industry like AtypI. 150 | 151 | Open font-related presentation have also been made at TUG (TeX User Group conferences). 152 | 153 | There is a campaign with support from various key organisations in the FLOSS community (Unifont.org, Freedesktop.org, the GNOME foundation, KDE e.V., the Linux Foundation and the Free Software Foundation) to encourage more designers and supporting institutions to consider choosing the OFL for their font projects. Visit Unifont.org/go_for_ofl for more details and ways you can participate. 154 | 155 | The OFL is now well-established as the most widely used licensing model for releasing and developing unrestricted font software. It is being used successfully by various organisations, both for-profit and not-for-profit, to release fonts of varying levels of scope and complexity. A number of institutions have now made the OFL their default recommended license for fonts. 156 | 157 | OFL fonts 158 | We intend to use the OFL for all our future font releases, and will re-release our existing and older font packages under the OFL as we have personnel time. The priority of older packages will depend on demand. 159 | 160 | If you release (or intend to release) your font(s) under the OFL, let us know and we may place a link to the fonts on our OFL fonts page. 161 | 162 | Details and rationale 163 | 164 | FLOSS-friendliness 165 | The OFL is designed to be in tune with the FLOSS (Free/Libre and Open Source Software) culture. It builds upon good ideas already in existence in some free/libre and open projects but by bringing our extensive font design experience and linguistic software engineering know-how into the mix, we have produced a font-specific license which is simpler, more human-readable, neutral and reusable and dedicated to the needs of font creators. 166 | 167 | The OFL authors were inspired by the partnership between GNOME and Bitstream for the Vera family of fonts and the licensing model which was chosen. They have also studied the community impact and some of the difficulties faced by this model. 
168 | 169 | The 4 FSF Freedoms 170 | The OFL is listed and recognized as a valid Free Software license on the FSF License List. It complies with the Free Software Definition and its four foundational freedoms as defined by the Free Software Foundation for the GNU project: 171 | 172 | Use: the freedom to use font software for any purpose. (freedom 0) 173 | Study and adaptation: the freedom to study how font software works, and adapt it to your needs (freedom 1). Access and rights to the source code is a precondition for this. 174 | Redistribution: the freedom to redistribute copies of the font software so you can help your neighbor (freedom 2). 175 | Improvement and redistribution of modifications: the freedom to improve the font software and release your improvements (freedom 3), so that the community benefits. Access and rights to the source code is a precondition for this. 176 | DFSG compatibility 177 | Font Software released under the OFL complies with the Debian Free Software Guidelines: 178 | 179 | reselling: DFSG #1 180 | source code redistribution: DFSG #2 181 | derivatives: DFSG #3 182 | "compromise" clause permitting name change: DFSG #4 (this is very important for font derivatives for artistic integrity and anti-collision purposes) 183 | no discrimination against people/groups: DFSG #5 184 | no discrimination against fields of endeavour: DFSG #6 185 | license distribution: DFSG #7 186 | non-Debian specific: DFSG #8 187 | no contamination of other software: DFSG #9 188 | Various font families under OFL have been accepted in the main archive of Debian (as well as Ubuntu) by the ftp-masters. An increasing number of Debian and Ubuntu developers are maintaining font packages under the OFL in main (the component of the archive which only holds Free/Libre and Open Source software). 189 | 190 | OSD compatibility 191 | The OFL complies with the Open Source Definition: 192 | 193 | free redistribution: #1 194 | source code: #2 195 | derived works: #3 196 | integrity of the author(s) source code: #4 (with the possibility of requiring a name change) 197 | no discrimination against persons or groups: #5 198 | no discrimination against fields of endeavour: #6 199 | distribution of license: #7 200 | license must not be specific to a product: #8 201 | license must not restrict other software: #9 202 | license must be technology-neutral: #10 203 | The OSI (Open Source Initiative) has recognized the OFL's compliance with the OSD and placed it on their list of approved licenses. 204 | 205 | "Human readable" version and visual representation 206 | The spirit and working model of the OFL can be expressed in human-readable Creative Commons-like 4 terminology using the following permits / requires elements and visual representations: 207 | 208 | Please note that this terminology and visual representation is simply an expression of the working model of the license and has no legal value in itself. It is designed to help you understand and use the Open Font License in a similar way to the OFL FAQ. It is always intended to link back to the full license text of the OFL. Please note that although the terminology and visual representation of the OFL is based on work by Creative Commons, the OFL is not officially affiliated with Creative Commons. 
209 | 210 | Terminology 211 | permits 212 | Distribution, Reproduction, Embedding, DerivativeWorks 213 | 214 | requires 215 | Attribution, Notice, ShareAlike, DerivativeRenaming, BundlingWhenSelling 216 | 217 | Visual representation 218 | 219 | Human-readable representation 220 | 221 | 222 | 223 | (the Distribution, Reproduction, DerivativeWorks and Notice elements are implied and not represented as icons). 224 | 225 | This is what each icon means: 226 | 227 | Attribution 228 | 229 | requirement 230 | 231 | The icon shows a person and represents the author(s). 232 | The requirement is for proper attribution of the author(s): name(s) and notice(s) must be preserved and abuse of the name(s) and reputation of the author(s) is forbidden. 233 | See condition 2) and 4) of the OFL. 234 | 235 | Share Alike 236 | 237 | requirement 238 | 239 | The icon shows a cycle and represents the way font software can be re-used by all under equivalent terms. 240 | The requirement is for derivative works to remain under the same license to encourage fair collaboration and prevent anyone from locking away contributions. 241 | See condition 5) of the OFL 242 | 243 | Embedding 244 | 245 | permission 246 | 247 | The icon shows a letter on a piece of paper and represents a font placed inside a document. 248 | The permission is for fonts to be embedded in any kind of document. This does not affect the licensing status of the document but makes it easier for documents to be used in different environments. 249 | See the first paragraph of the Permission and Conditions section as well as section 5) of the OFL. 250 | 251 | DerivativeRenaming 252 | 253 | requirement 254 | 255 | The icon shows letters A and B close to each other representing a font (A) from which another font (B) of a different shape is derived. It refers to a derivative branched from the original font and bearing a new name. 256 | The requirement is for derivative fonts to be renamed to allow branching while retaining artistic integrity. 257 | See condition 3) of the OFL 258 | 259 | BundlingWhenSelling 260 | 261 | requirement 262 | 263 | The icon shows a dollar sign between parentheses. The dollar sign represents money (although there are many other currencies in the world) and the parentheses refers to the bundling. 264 | The requirement is for fonts to be bundled with software when they are sold. Fonts cannot be sold on their own. Redistribution without selling is not restricted. 265 | See condition 1) of the OFL. 266 | 267 | 268 | 269 | 1 The Free Software Foundation Licensing Lab: www.fsf.org/licensing 270 | 2 The Debian Free Software Guidelines: www.debian.org/social_contract 271 | 3 The Open Source Definition: opensource.org/docs/definition.php 272 | 4 Creative Commons: http://creativecommons.org/about/licenses/ 273 | © 2003-2013 SIL International, all rights reserved, unless otherwise noted elsewhere on this page. 274 | Provided by SIL's Non-Roman Script Initiative. Contact us at nrsi@sil.org. 275 | -------------------------------------------------------------------------------- /Filters/locality_uniqueness.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at
8 | #
9 | #    http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | 
18 | """
19 | Created on 19 October 2013
20 | @author: Lab41
21 | """
22 | 
23 | 
24 | import time
25 | import operator
26 | import os
27 | import numpy as np
28 | import matplotlib.pyplot as plt
29 | from collections import namedtuple, defaultdict
30 | from hashlib import sha1
31 | from scipy.cluster.vq import vq, kmeans, whiten
32 | from redwood.filters.redwood_filter import RedwoodFilter
33 | import calendar
34 | import random
35 | import warnings
36 | from multiprocessing import Pool, Manager
37 | import Queue
38 | import redwood.helpers.core as core
39 | import redwood.helpers.visual as visual
40 | import shutil
41 | 
42 | warnings.filterwarnings('ignore')
43 | 
44 | #NOTE: the find_anomalies and do_eval functions are outside the class so that we
45 | #can run them in parallel using the apply_async function for process pools
46 | 
47 | SMALL_CLUSTERS_SCORE = .3
48 | DEFAULT_NUM_CLUSTERS = 3
49 | 
50 | def find_anomalies(rows, sorted_results, code_count_dict):
51 |     """
52 |     Helper function that given a list of results from kmeans will assign
53 |     scores to each file given their distance from their centroid
54 | 
55 |     :param rows: output rows to append to
56 |     :param sorted_results: results from kmeans sorted by their cluster code
57 |     :param code_count_dict: count of observations per cluster code
58 |     """
59 |     #definitely want to adjust these distance thresholds
60 |     distance_threshold0 = 1.0
61 |     distance_threshold1 = 2.0
62 |     distance_threshold2 = 5.0
63 |     distance_threshold3 = 10.0
64 | 
65 |     #assign scores based on distance
66 |     for c, d, r in sorted_results:
67 | 
68 |         #get code count
69 |         code_count = code_count_dict[c]
70 | 
71 |         #if a file belongs to a cluster with fewer than three elements, we automatically assign a lower score
72 |         if code_count < 3:
73 |             score = SMALL_CLUSTERS_SCORE
74 |         elif d > distance_threshold3:
75 |             score = .1
76 |         elif d > distance_threshold2:
77 |             score = .2
78 |         elif d > distance_threshold1:
79 |             score = .3
80 |         elif d > distance_threshold0:
81 |             score = .4
82 |         else:
83 |             score = .8
84 |         file_metadata_id = r[0]
85 |         rows.put((file_metadata_id, score))
86 | 
87 | 
88 | def do_eval(rows, full_path, files, num_clusters, num_features):
89 |     """
90 |     Helper function that analyzes a directory, looking for outliers in clusters based on the input features.
91 |     Currently, only two static features are analyzed, however future versions could allow
92 |     for a selectable set of features
93 | 
94 |     :param rows: output variable to append results to
95 |     :param full_path: the path that is being analyzed
96 |     :param files: meta data for files in the directory
97 |     :param num_clusters: number of clusters to specify for kmeans
98 |     :param num_features: number of features included
99 | 
100 |     :return: nothing... results are appended to the rows queue
101 |     """
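    #(Illustrative aside, not part of the original module.) The body below is the
    #standard scipy k-means pipeline: whiten() rescales each feature to unit
    #variance, kmeans() finds the centroids, and vq() assigns every observation to
    #its nearest centroid along with the distance that find_anomalies() turns into
    #a score. A condensed, self-contained sketch of that pipeline on toy data:
    #
    #    import numpy as np
    #    from scipy.cluster.vq import vq, kmeans, whiten
    #
    #    obs = np.array([[1.0, 9.0], [1.1, 9.2], [8.0, 2.0], [8.2, 2.1]])
    #    whitened = whiten(obs)               # scale each column to unit variance
    #    codebook, _ = kmeans(whitened, 2)    # two centroids
    #    code, dist = vq(whitened, codebook)  # cluster id + distance per observation
    #    # large dist -> outlier -> low score; clusters of < 3 files get a flat .3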
102 | 
103 |     num_obs = len(files)
104 | 
105 |     #if the number of observations is less than the num_clusters we do not cluster
106 |     #but rather give each file SMALL_CLUSTERS_SCORE
107 |     if(num_obs < num_clusters):
108 |         for f in files:
109 |             rows.put((f[0], SMALL_CLUSTERS_SCORE))
110 |         return
111 | 
112 |     #zero out the two dimensional array
113 |     observations = np.zeros((num_obs, num_features))
114 | 
115 |     i = 0
116 | 
117 |     #transfer the observations to the numpy array
118 |     for file_metadata_id, mod_date, full_path, file_name, inode, parent_id, in files:
119 |         seconds = calendar.timegm(mod_date.utctimetuple())
120 |         observations[i] = (inode, seconds)
121 |         i += 1
122 | 
123 |     #normalize the observations
124 |     whitened = whiten(observations)
125 | 
126 |     #get the centroids (aka codebook)
127 |     codebook,_ = kmeans(whitened, num_clusters)
128 | 
129 |     #sometimes if all observations for a given feature are the same
130 |     #the centroids will not be found. In that case we give a neutral score
131 |     if len(codebook) != num_clusters:
132 |         for f in files:
133 |             rows.put((f[0], .5))
134 |         return
135 | 
136 |     #calculate the distances
137 |     code, dist = vq(whitened, codebook)
138 | 
139 | 
140 |     d = defaultdict(int)
141 | 
142 |     #quick way to get count of cluster sizes
143 |     for c in code:
144 |         d[c] += 1
145 | 
146 |     #combine the results with the original data, then sort by the code
147 |     combined = zip(code, dist, files)
148 |     sorted_results = sorted(combined, key=lambda tup: tup[0])
149 | 
150 |     find_anomalies(rows, sorted_results, d)
151 | 
152 | 
153 | 
154 | 
155 | class LocalityUniqueness(RedwoodFilter):
156 |     """
157 |     LocalityUniqueness seeks to identify anomalies through clustering of file features in a given directory. The
158 |     assumption is that files of interest are those that are different than most of their neighbors in a given
159 |     domain -- in this case the directory. As a result, this filter is responsible for giving outliers of clusters
160 |     lower reputation scores than those files closer to the centroid
161 |     """
162 | 
163 |     def __init__(self, cnx=None):
164 |         self.score_table = "lu_scores"
165 |         self.name = "Locality_Uniqueness"
166 |         self.cnx = cnx
167 | 
168 |     def usage(self):
169 |         """
170 |         Prints the usage statements for the discovery functions
171 |         """
172 |         print "[*] evaluate_dir [full_path] [source] [clusters]"
173 |         print "\t|- runs kmeans and shows scatter plot"
174 |         print "\t| [full_path] - path to analyze"
175 |         print "\t| [source] - source where the path exists"
176 |         print "\t| [clusters] - number of clusters to use"
177 | 
178 |     def update(self, source):
179 |         """
180 |         Applies the Locality Uniqueness filter to the given source, updating existing data
181 |         analyzed from previous sources. Currently the update function uses 3 clusters for clustering
182 |         analysis. This will be dynamic in future versions.
183 | 
184 |         :param source: media source name
185 |         """
186 |         print "[+] Locality Uniqueness Filter running on {}".format(source)
187 |         self.build()
188 |         self.evaluate_source(source)
189 | 
190 |     def evaluate_source(self, source_name, num_clusters=DEFAULT_NUM_CLUSTERS):
191 |         """
192 |         Evaluates and scores a given source with a specified number of clusters for kmeans. Currently
193 |         this function uses two set features as inputs (modification time and inode number), however
194 |         future versions will allow for dynamic feature inputs
195 | 
196 |         :param source_name: media source name
197 |         :param num_clusters: number of clusters to input into kmeans (Default: 3)
198 |         """
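        #(Illustrative aside, not part of the original module.) The loop below walks
        #a cursor ordered by parent_id and flushes one batch per directory: whenever
        #parent_id changes, the files collected so far are handed to do_eval() as a
        #single clustering unit. The grouping pattern in isolation, over hypothetical
        #(parent_id, name) pairs and a hypothetical per-directory handler process():
        #
        #    rows = [(1, "a"), (1, "b"), (2, "c")]
        #    batch, prev = [], None
        #    for parent_id, name in rows:
        #        if prev is not None and parent_id != prev:
        #            process(batch)     # directory finished
        #            batch = []
        #        batch.append(name)
        #        prev = parent_id
        #    if batch:
        #        process(batch)         # flush the final directory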
199 | 
200 |         cursor = self.cnx.cursor()
201 |         src_info = core.get_source_info(self.cnx, source_name)
202 | 
203 |         #returns all files sorted by directory for the given source
204 |         #query = """
205 |         #    SELECT file_metadata_id, last_modified, full_path, file_name, filesystem_id, parent_id, hash
206 |         #    FROM joined_file_metadata
207 |         #    where source_id = {} order by parent_id asc
208 |         #    """.format(src_info.source_id)
209 | 
210 |         cursor.execute("""
211 |             SELECT file_metadata_id, last_modified, full_path, file_name, filesystem_id, parent_id, hash
212 |             FROM joined_file_metadata
213 |             where source_id = %s order by parent_id asc
214 |             """, (src_info.source_id,))
215 | 
216 |         files = list()
217 | 
218 |         print "...Beginning clustering analysis"
219 |         pool = Pool(processes=4)  # start 4 worker processes
220 |         manager = Manager()
221 |         rows = manager.Queue()
222 |         is_first = True
223 | 
224 |         parent_id_prev = None
225 |         #should iterate by dir of a given source at this point
226 |         for(file_metadata_id, last_modified, full_path, file_name, filesystem_id, parent_id, hash_val) in cursor:
227 | 
228 |             if is_first:
229 |                 is_first = False
230 |                 parent_id_prev = parent_id
231 | 
232 |             #if parent_id is diff than previous, we are in new directory, so pack it up for analysis
233 |             if parent_id_prev != parent_id:
234 | 
235 |                 parent_id_prev = parent_id
236 | 
237 |                 if len(files) > 0:
238 |                     pool.apply_async(do_eval, [rows, full_path, files, num_clusters, 2])
239 |                     files = list()
240 | 
241 |             #make sure to omit directories from the clustering analysis
242 |             if file_name != '/' and hash_val != "":
243 |                 files.append((file_metadata_id, last_modified, full_path, file_name, filesystem_id, parent_id))
244 | 
245 |         if len(files) > 0:
246 |             pool.apply_async(do_eval, [rows, full_path, files, num_clusters, 2])
247 | 
248 |         pool.close()
249 |         pool.join()
250 | 
251 |         input_rows = []
252 |         count = 0
253 |         while not rows.empty():
254 |             curr = rows.get()
255 |             input_rows.append(curr)
256 |             count += 1
257 |             if count % 50000 == 0:
258 |                 print "...sending {} results to server".format(len(input_rows))
259 |                 cursor.executemany("""REPLACE INTO locality_uniqueness(file_metadata_id, score) values(%s, %s)""", input_rows)
260 |                 input_rows = []
261 |                 count = 0
262 |         print "...sending {} results to server".format(len(input_rows))
263 | 
264 |         cursor.executemany("""REPLACE INTO locality_uniqueness(file_metadata_id, score) values(%s, %s)""", input_rows)
265 |         self.cnx.commit()
266 |         #need to drop the lu_scores and recalculate
267 |         cursor.execute("drop table if exists lu_scores")
268 | 
269 |         query = ("""CREATE TABLE IF NOT EXISTS `lu_scores` (
270 |             `id` bigint(20) unsigned NOT NULL,
271 |             `score` double DEFAULT NULL,
272 |             KEY `fk_unique_file0_id` (`id`),
273 |             CONSTRAINT `fk_unique_file0_id` FOREIGN KEY (`id`)
274 |             REFERENCES `unique_file` (`id`) ON DELETE NO ACTION ON UPDATE NO ACTION
275 |             ) ENGINE=InnoDB""")
276 | 
277 |         cursor.execute(query)
278 | 
279 |         print "...updating scores on the server"
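        #(Illustrative aside, not part of the original module.) locality_uniqueness
        #holds one score per file *instance*; the INSERT below rolls them up to one
        #score per unique file by averaging every instance of the same
        #unique_file_id. The same rollup in plain Python over hypothetical
        #(unique_file_id, score) pairs:
        #
        #    from collections import defaultdict
        #    sums = defaultdict(lambda: [0.0, 0])
        #    for uid, score in [(7, .4), (7, .8), (9, .3)]:
        #        sums[uid][0] += score
        #        sums[uid][1] += 1
        #    lu = dict((uid, s / n) for uid, (s, n) in sums.items())  # {7: 0.6, 9: 0.3}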
280 |         query = """
281 |             INSERT INTO lu_scores
282 |             (SELECT file_metadata.unique_file_id, avg(locality_uniqueness.score) FROM
283 |             locality_uniqueness LEFT JOIN file_metadata on (locality_uniqueness.file_metadata_id = file_metadata.id)
284 |             WHERE file_metadata.unique_file_id is not null
285 |             GROUP BY file_metadata.unique_file_id)
286 |             """
287 | 
288 |         cursor.execute(query)
289 |         self.cnx.commit()
290 | 
291 | 
292 |     def clean(self):
293 |         """
294 |         Removes all tables associated with this filter
295 |         """
296 | 
297 |         cursor = self.cnx.cursor()
298 |         cursor.execute("DROP TABLE IF EXISTS lu_scores")
299 |         cursor.execute("DROP TABLE IF EXISTS locality_uniqueness")
300 |         self.cnx.commit()
301 | 
302 | 
303 |     def build(self):
304 |         """
305 |         Build all persistent tables associated with this filter
306 |         """
307 |         cursor = self.cnx.cursor()
308 | 
309 |         query = """
310 |             CREATE table IF NOT EXISTS locality_uniqueness (
311 |             file_metadata_id BIGINT unsigned unique,
312 |             score DOUBLE NOT NULL,
313 |             PRIMARY KEY(file_metadata_id),
314 |             INDEX lu_score (score ASC),
315 |             CONSTRAINT fk_file_metadata11 FOREIGN KEY (file_metadata_id)
316 |             REFERENCES file_metadata (id)
317 |             ON DELETE NO ACTION ON UPDATE NO ACTION
318 |             ) ENGINE = InnoDB;
319 |         """
320 | 
321 |         cursor.execute(query)
322 | 
323 |         self.cnx.commit()
324 | 
325 |     ##################################################
326 |     #
327 |     # DISCOVERY FUNCTIONS
328 |     #
329 |     ##################################################
330 | 
331 |     def discover_evaluate_dir(self, dir_name, source, num_clusters=DEFAULT_NUM_CLUSTERS):
332 |         """
333 |         Discovery function that applies kmeans clustering to a specified directory, displays
334 |         the resulting scatter plot with the clusters, and then prints out an ordered list of
335 |         the files by their distance from their respective centroid. Currently,
336 |         this function uses two static features of "modification date" and "inode number" but
337 |         future versions will allow for dynamic feature inputs.
338 | 
339 |         :param dir_name: directory name to be analyzed (Required)
340 |         :param source: source name to be analyzed (Required)
341 |         :param num_clusters: specified number of clusters to use for kmeans (Default: 3)
342 |         """
343 | 
344 |         num_features = 2
345 |         num_clusters = int(num_clusters)
346 |         cursor = self.cnx.cursor()
347 | 
348 |         if(dir_name.endswith('/')):
349 |             dir_name = dir_name[:-1]
350 | 
351 |         print "...Running discovery function on source {} at directory {}".format(source, dir_name)
352 | 
353 |         src_info = core.get_source_info(self.cnx, source)
354 |         if src_info is None:
355 |             print "Error: Source {} does not exist".format(source)
356 |             return
357 | 
358 |         #grab all files for a particular directory from a specific source
359 |         hash_val = sha1(dir_name).hexdigest()
360 | 
361 |         #query = """
362 |         #    SELECT file_name, file_metadata_id, filesystem_id, last_modified
363 |         #    FROM joined_file_metadata
364 |         #    WHERE source_id ='{}' AND path_hash = '{}' AND file_name !='/'
365 |         #    """.format(src_info.source_id, hash_val)
366 | 
367 |         cursor.execute("""
368 |             SELECT file_name, file_metadata_id, filesystem_id, last_modified
369 |             FROM joined_file_metadata
370 |             WHERE source_id = %s AND path_hash = %s AND file_name !='/'
371 |             """, (src_info.source_id, hash_val,))
372 | 
373 |         #bring all results into memory
374 |         sql_results = cursor.fetchall()
375 | 
376 |         if(len(sql_results) == 0):
377 |             return
378 | 
379 |         print "...Found {} files in specified directory".format(len(sql_results))
380 |         print "...Will form into {} clusters".format(num_clusters)
381 |         if num_clusters > len(sql_results):
382 |             print "Number of clusters ({}) exceeds number of files ({})".format(num_clusters, len(sql_results))
383 |             num_clusters = len(sql_results)
384 |             print "Number of clusters is now: 
{}".format(num_clusters) 385 | 386 | 387 | #zero out the array that will contain the inodes 388 | filesystem_id_arr = np.zeros((len(sql_results), num_features)) 389 | 390 | i = 0 391 | for _, _,inode, mod_date in sql_results: 392 | seconds = calendar.timegm(mod_date.utctimetuple()) 393 | filesystem_id_arr[i] = (inode, seconds) 394 | i += 1 395 | whitened = whiten(filesystem_id_arr) 396 | #get the centroids 397 | codebook,_ = kmeans(whitened, num_clusters) 398 | code, dist = vq(whitened, codebook) 399 | d = defaultdict(int) 400 | 401 | #quick way to get count of cluster sizes 402 | for c in code: 403 | d[c] += 1 404 | 405 | #sorts the codes and sql_results together as pairs 406 | combined = zip(dist, code, sql_results) 407 | 408 | #sort results by distances from centroid 409 | sorted_results = sorted(combined, key=lambda tup: tup[0]) 410 | 411 | for dist_val, c, r in sorted_results: 412 | print "Dist: {} Cluster: {} Data: {}".format(dist_val,c,r) 413 | 414 | 415 | if codebook is None or len(codebook) == 0: 416 | print "Data is not suitable for visualization" 417 | return 418 | 419 | visual.visualize_scatter(d, code, whitened, codebook, num_clusters, "inode number", "modification datetime", dir_name) 420 | 421 | 422 | ################################################## 423 | # 424 | # SURVEY 425 | # 426 | ################################################## 427 | 428 | def run_survey(self, source_name): 429 | """ 430 | Runs survey for this filter capturing discovery functions and reputation score results 431 | 432 | :param source_name: name of the source to survey 433 | :return survey_dir: location where survey results were saved 434 | """ 435 | 436 | print "...running survey for {}".format(self.name) 437 | 438 | resources = "resources" 439 | survey_file = "survey.html" 440 | survey_dir = "survey_{}_{}".format(self.name, source_name) 441 | 442 | resource_dir = os.path.join(survey_dir, resources) 443 | html_file = os.path.join(survey_dir, survey_file) 444 | 445 | try: 446 | shutil.rmtree(survey_dir) 447 | except: 448 | pass 449 | 450 | os.mkdir(survey_dir) 451 | os.mkdir(resource_dir) 452 | 453 | results = self.show_results("bottom", 100, source_name, None) 454 | 455 | with open(html_file, 'w') as f: 456 | 457 | f.write(""" 458 | 459 | 460 | 461 | 462 | 463 |

Locality Uniqueness Snapshot

464 | """) 465 | f.write("

The lowest 100 reputations for this filter

") 466 | f.write("") 467 | f.write("") 468 | f.write("") 469 | f.write("") 470 | i = 0 471 | lr = len(results) 472 | for r in results: 473 | if i == lr - 1: 474 | f.write("") 475 | f.write("".format(r[0], r[1], r[2])) 476 | else: 477 | f.write("".format(r[0], r[1], r[2])) 478 | i += 1 479 | f.write("
ScoreParent PathFilename
{}{}{}
{}{}{}
") 480 | 481 | f.write("") 482 | return survey_dir 483 | -------------------------------------------------------------------------------- /Filters/filter_prevalence.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | """ 19 | 20 | Created on 19 October 2013 21 | @author: Lab41 22 | """ 23 | 24 | import os 25 | import numpy as np 26 | import matplotlib.pyplot as plt 27 | import redwood.helpers.core as core 28 | import shutil 29 | 30 | from redwood.filters.redwood_filter import RedwoodFilter 31 | 32 | class FilterPrevalence(RedwoodFilter): 33 | """ 34 | This filter provides analysis and scoring based on the prevalence of files and directories across sources. The general idea is that a file with a higher prevalence would have a higher reputation than a file that occurs less often. 35 | """ 36 | 37 | def __init__(self, cnx=None): 38 | self.name = "Prevalence" 39 | self.score_table = "fp_scores" 40 | self.cnx = cnx 41 | 42 | def usage(self): 43 | """ 44 | Prints the usage statement 45 | """ 46 | 47 | print "[+] histogram_by_source " 48 | print "---view histogram of file distribution for a single source with name " 49 | print "\t- source_name: name of the source" 50 | print "[+] histogram_by_os " 51 | print "---view file distribution for an os" 52 | print "\t- os_name: name of the os" 53 | print "[+] detect_anomalies " 54 | print "---view the top anomalies for the given source" 55 | print "\t-out_file: file to write results to" 56 | 57 | 58 | def update(self, source): 59 | """ 60 | Updates the scores of the fp_scores table with the new data from the inputted source 61 | 62 | :param source: identifier for the source to be updated 63 | """ 64 | 65 | print "[+] Prevalence Filter running on {} ".format(source) 66 | 67 | #creates the basic tables if they do not exist 68 | self.build() 69 | 70 | cursor = self.cnx.cursor() 71 | 72 | src_info = core.get_source_info(self.cnx, source) 73 | 74 | if src_info is None: 75 | print "Error: Source {} not found".format(source) 76 | return 77 | 78 | #initial insert 79 | #query = """ 80 | # INSERT INTO fp_scores(id, score) 81 | # SELECT global_file_prevalence.unique_file_id, IF(num_systems < 3, .5, average) 82 | # FROM global_file_prevalence JOIN file_metadata 83 | # ON file_metadata.unique_file_id = global_file_prevalence.unique_file_id 84 | # where file_metadata.source_id = {} 85 | # ON DUPLICATE KEY UPDATE score = IF(num_systems < 3, .5, average) 86 | #""".format(src_info.source_id) 87 | 88 | cursor.execute(""" 89 | INSERT INTO fp_scores(id, score) 90 | SELECT global_file_prevalence.unique_file_id, 91 | IF(num_systems < 3, .5, average) 92 | FROM global_file_prevalence JOIN file_metadata 93 | ON file_metadata.unique_file_id = global_file_prevalence.unique_file_id 94 | where file_metadata.source_id = %s 95 | ON DUPLICATE KEY UPDATE 
score =
96 |             IF(num_systems < 3, .5, average)
97 |             """, (src_info.source_id,))
98 |         self.cnx.commit()
99 | 
100 |         #adjustment for low outliers in high prevalent directories... This could probably be done better by taking the std dev of each
101 |         #dir, but this will have to work for now.
102 |         #query = """
103 |         #    UPDATE global_file_prevalence left join file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
104 |         #    LEFT JOIN global_dir_prevalence on file_metadata.unique_path_id = global_dir_prevalence.unique_path_id
105 |         #    LEFT JOIN global_dir_combined_prevalence on file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id
106 |         #    LEFT JOIN fp_scores ON fp_scores.id = global_file_prevalence.unique_file_id
107 |         #    SET fp_scores.score = fp_scores.score * .5
108 |         #    where file_metadata.source_id = {} AND global_file_prevalence.count = 1 and global_file_prevalence.num_systems > 2
109 |         #    and global_dir_combined_prevalence.average - global_file_prevalence.average > .6
110 |         #""".format(src_info.source_id)
111 | 
112 |         cursor.execute("""
113 |             UPDATE global_file_prevalence left join file_metadata
114 |             ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
115 |             LEFT JOIN global_dir_prevalence
116 |             on file_metadata.unique_path_id = global_dir_prevalence.unique_path_id
117 |             LEFT JOIN global_dir_combined_prevalence
118 |             on file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id
119 |             LEFT JOIN fp_scores
120 |             ON fp_scores.id = global_file_prevalence.unique_file_id
121 |             SET fp_scores.score = fp_scores.score * .5
122 |             where file_metadata.source_id = %s
123 |             AND global_file_prevalence.count = 1
124 |             and global_file_prevalence.num_systems > 2
125 |             and global_dir_combined_prevalence.average - global_file_prevalence.average > .6
126 |             """, (src_info.source_id,))
127 |         self.cnx.commit()
128 | 
129 |         #adjustments for low prevalent scored directories which occur often... hopefully this will exclude the caches
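        #(Illustrative aside, not part of the original module.) Taken together, the
        #three statements in update() form a small scoring pipeline: start from the
        #prevalence average (neutral .5 when fewer than 3 systems are loaded), halve
        #the score of one-off files sitting in otherwise highly prevalent
        #directories, and nudge scores toward 1 for files in directories that are
        #common but whose individual contents churn (e.g. caches). The same
        #arithmetic in plain Python:
        #
        #    def base_score(average, num_systems):
        #        return .5 if num_systems < 3 else average
        #
        #    def dampen_outlier(score):   # lone file in a common directory
        #        return score * .5
        #
        #    def boost_churn(score):      # common dir, individually rare files
        #        return (1 - score) * .25 + score   # boost_churn(.2) -> .4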
130 |         #query = """
131 |         #    UPDATE file_metadata
132 |         #    LEFT JOIN global_dir_prevalence ON file_metadata.unique_path_id = global_dir_prevalence.unique_path_id
133 |         #    LEFT JOIN global_dir_combined_prevalence ON global_dir_combined_prevalence.unique_path_id = global_dir_prevalence.unique_path_id
134 |         #    LEFT JOIN fp_scores ON file_metadata.unique_file_id = fp_scores.id
135 |         #    SET fp_scores.score = (1 - fp_scores.score) * .25 + fp_scores.score
136 |         #    where file_metadata.source_id = {} AND global_dir_prevalence.average > .8 AND global_dir_combined_prevalence.average < .5
137 |         #""".format(src_info.source_id)
138 | 
139 |         cursor.execute("""
140 |             UPDATE file_metadata
141 |             LEFT JOIN global_dir_prevalence
142 |             ON file_metadata.unique_path_id = global_dir_prevalence.unique_path_id
143 |             LEFT JOIN global_dir_combined_prevalence
144 |             ON global_dir_combined_prevalence.unique_path_id = global_dir_prevalence.unique_path_id
145 |             LEFT JOIN fp_scores
146 |             ON file_metadata.unique_file_id = fp_scores.id
147 |             SET fp_scores.score = (1 - fp_scores.score) * .25 + fp_scores.score
148 |             where file_metadata.source_id = %s
149 |             AND global_dir_prevalence.average > .8
150 |             AND global_dir_combined_prevalence.average < .5
151 |             """, (src_info.source_id,))
152 |         self.cnx.commit()
153 |         cursor.close()
154 | 
155 |     def clean(self):
156 |         """
157 |         Cleans all tables associated with this filter
158 |         """
159 |         cursor = self.cnx.cursor()
160 |         cursor.execute("DROP TABLE IF EXISTS fp_scores")
161 |         self.cnx.commit()
162 | 
163 |     def build(self):
164 |         """
165 |         Builds all persistent tables associated with this filter
166 |         """
167 | 
168 |         cursor = self.cnx.cursor()
169 | 
170 |         query = """
171 |             CREATE TABLE IF NOT EXISTS `fp_scores` (
172 |             id BIGINT UNSIGNED NOT NULL,
173 |             score double DEFAULT NULL,
174 |             PRIMARY KEY(id),
175 |             CONSTRAINT `fk_unique_file1_id` FOREIGN KEY (`id`)
176 |             REFERENCES `unique_file` (`id`) ON DELETE NO ACTION ON UPDATE NO ACTION
177 |             ) ENGINE=InnoDB
178 |         """
179 | 
180 |         cursor.execute(query)
181 |         self.cnx.commit()
182 |         cursor.close()
183 | 
184 | 
185 | 
186 |     ##################################################
187 |     #
188 |     # DISCOVERY FUNCTIONS
189 |     #
190 |     ##################################################
191 | 
192 |     def discover_histogram_by_os(self, os_name, output=None):
193 |         """
194 |         Displays a histogram of the file distributions across all systems
195 |         of the specified OS
196 | 
197 |         :param os_name: name of the operating system
198 |         :param output: (optional) output filename in PNG format
199 |         """
200 | 
201 |         print '[+] Running \"Histogram by OS\"...'
202 |         cursor = self.cnx.cursor()
203 | 
204 |         num_systems = core.get_num_systems(self.cnx, os_name)
205 | 
206 |         print "NUM: {}".format(num_systems)
207 |         if num_systems is None or num_systems == 0:
208 |             print "Error: OS {} does not exist".format(os_name)
209 |             return
210 | 
211 |         bins = range(1, num_systems+2)
212 | 
213 |         #query = """
214 |         #    SELECT COUNT(file_metadata.os_id), global_file_prevalence.count FROM global_file_prevalence
215 |         #    LEFT JOIN file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
216 |         #    WHERE file_metadata.os_id = (SELECT os.id FROM os WHERE os.name = "{}")
217 |         #    GROUP BY global_file_prevalence.count ORDER BY global_file_prevalence.count ASC;
218 |         #""".format(os_name)
219 | 
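        #(Illustrative aside, not part of the original module.) The histogram below
        #uses one bin per possible system count: a file seen on k of N systems falls
        #in bin k, so bins = range(1, num_systems + 2) yields edges 1..N+1. The
        #query returns (occurrences, k) pairs that are fed to hist() as weights.
        #E.g. for N = 3 systems:
        #
        #    bins = range(1, 3 + 2)    # [1, 2, 3, 4] -> bars for k = 1, 2, 3
        #    counts, ranges = zip(*[(500, 1), (80, 2), (40, 3)])
        #    # ax.hist(ranges, weights=counts, bins=bins) draws bars of 500/80/40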
    ##################################################
    #
    # DISCOVERY FUNCTIONS
    #
    ##################################################

    def discover_histogram_by_os(self, os_name, output=None):
        """
        Displays a histogram of the file distributions across all systems
        of the specified OS

        :param os_name: name of the operating system
        :param output: (optional) output filename in PNG format
        """

        print '[+] Running "Histogram by OS"...'
        cursor = self.cnx.cursor()

        num_systems = core.get_num_systems(self.cnx, os_name)

        print "Number of systems for {}: {}".format(os_name, num_systems)
        if num_systems is None or num_systems == 0:
            print "Error: OS {} does not exist".format(os_name)
            return

        bins = range(1, num_systems + 2)

        #query = """
        #    SELECT COUNT(file_metadata.os_id), global_file_prevalence.count FROM global_file_prevalence
        #    LEFT JOIN file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
        #    WHERE file_metadata.os_id = (SELECT os.id FROM os WHERE os.name = "{}")
        #    GROUP BY global_file_prevalence.count ORDER BY global_file_prevalence.count ASC;
        #""".format(os_name)

        cursor.execute("""
            SELECT COUNT(file_metadata.os_id), global_file_prevalence.count
            FROM global_file_prevalence
            LEFT JOIN file_metadata
                ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
            WHERE file_metadata.os_id =
                (SELECT os.id FROM os WHERE os.name = %s)
            GROUP BY global_file_prevalence.count
            ORDER BY global_file_prevalence.count ASC;
        """, (os_name,))
        data = cursor.fetchall()

        if data is None or len(data) == 0:
            print "No prevalence data found for OS {}".format(os_name)
            return

        counts, ranges = zip(*data)

        fig = plt.figure()
        perc = int(float(sum(counts[1:])) / sum(counts) * 100)
        ax = fig.add_subplot(111, title="File Prevalence of {} with {}% > 1".format(os_name, perc))
        ax.hist(ranges, weights=counts, bins=bins)
        ax.set_xlabel("Num of Systems")
        ax.set_ylabel("File Occurrences")

        if output is None:
            plt.show()
        else:
            print "Saving histogram to {}".format(output)
            plt.savefig(output)

    def discover_histogram_by_source(self, source_name, output=None):
        """
        Displays a histogram of the file distribution of a single source as it
        relates to all occurrences of that file across all systems

        :param source_name: the name of the source
        :param output: (optional) output filename in PNG format
        """

        print '[+] Running "Histogram by Source"...'
        cursor = self.cnx.cursor()

        src_info = core.get_source_info(self.cnx, source_name)

        if src_info is None:
            print "Source {} does not exist".format(source_name)
            return

        num_systems = core.get_num_systems(self.cnx, src_info.os_id)
        bins = range(1, num_systems + 2)

        #query = """
        #    SELECT COUNT(file_metadata.id), global_file_prevalence.count FROM global_file_prevalence
        #    LEFT JOIN file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
        #    WHERE file_metadata.source_id = (SELECT media_source.id FROM media_source WHERE media_source.name = "{}")
        #    GROUP BY global_file_prevalence.count ORDER BY global_file_prevalence.count ASC;
        #""".format(source_name)

        cursor.execute("""
            SELECT COUNT(file_metadata.id), global_file_prevalence.count
            FROM global_file_prevalence
            LEFT JOIN file_metadata
                ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
            WHERE file_metadata.source_id =
                (SELECT media_source.id
                 FROM media_source
                 WHERE media_source.name = %s)
            GROUP BY global_file_prevalence.count
            ORDER BY global_file_prevalence.count ASC;
        """, (source_name,))

        data = cursor.fetchall()

        if data is None or len(data) == 0:
            print "No prevalence data found for source {}".format(source_name)
            return

        counts, ranges = zip(*data)

        fig = plt.figure()
        perc = int(float(sum(counts[1:])) / sum(counts) * 100)
        ax = fig.add_subplot(111, title="File Prevalence of {} with {}% > 1".format(src_info.source_name, perc))
        ax.hist(ranges, weights=counts, bins=bins)
        ax.set_xlabel("Num of Systems")
        ax.set_ylabel("File Occurrences")

        if output is None:
            plt.show()
        else:
            print "Saving histogram to {}".format(output)
            plt.savefig(output)
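    # A hedged example of driving the two histograms above (the source and OS
    # names are hypothetical, and `flt` again stands for a constructed filter
    # instance):
    #
    #   flt.discover_histogram_by_os("Windows 7")                      # interactive window
    #   flt.discover_histogram_by_source("laptop-042", "laptop.png")   # saved as PNG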
    def discover_detect_anomalies(self, source, out=None):
        """
        Conducts an anomaly search on a given source

        :param source: name of the source
        :param out: (optional) output filename
        """

        cursor = self.cnx.cursor()

        src_info = core.get_source_info(self.cnx, source)

        if src_info is None:
            print "*** Error: Source not found"
            return

        # anomaly type: low prevalence files in normally high prevalence directories
        print "...Anomaly Detection: Unique files in common areas"

        #query = """
        #    SELECT (global_dir_combined_prevalence.average - global_file_prevalence.average) as difference,
        #    unique_path.full_path, file_metadata.file_name
        #    FROM global_file_prevalence
        #    LEFT JOIN file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
        #    LEFT JOIN global_dir_combined_prevalence ON file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id
        #    LEFT JOIN unique_path ON file_metadata.unique_path_id = unique_path.id
        #    where file_metadata.source_id = {}
        #    HAVING difference > 0
        #    ORDER BY difference desc limit 0, 100
        #""".format(src_info.source_id)

        cursor.execute("""
            SELECT (global_dir_combined_prevalence.average - global_file_prevalence.average) AS difference,
                unique_path.full_path, file_metadata.file_name
            FROM global_file_prevalence
            LEFT JOIN file_metadata
                ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
            LEFT JOIN global_dir_combined_prevalence
                ON file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id
            LEFT JOIN unique_path
                ON file_metadata.unique_path_id = unique_path.id
            WHERE file_metadata.source_id = %s
            HAVING difference > 0
            ORDER BY difference DESC LIMIT 0, 100
        """, (src_info.source_id,))

        if out is None:
            results = cursor.fetchall()
            cursor.close()
            if results is None or len(results) == 0:
                print "No anomalies found"
            else:
                print "Showing top {} results".format(len(results))
                for x in results:
                    print x
            return results

        print "Writing results to {}".format(out)

        with open(out, "w") as f:
            v = 0
            for x in cursor.fetchall():
                f.write("{}: {} {}/{}\n".format(v, x[0], x[1], x[2]))
                v += 1

        cursor.close()
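    # Sketch of the two output modes above (the source name is hypothetical):
    # with out=None the rows print to the console and are returned; with a
    # filename each row is written as "index: difference path/name" instead.
    #
    #   rows = flt.discover_detect_anomalies("laptop-042")
    #   flt.discover_detect_anomalies("laptop-042", out="anomalies.txt")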

    def run_survey(self, source_name):
        """
        Generates an HTML snapshot of this filter's results for a source

        :param source_name: name of the source
        :return: path of the directory containing the generated survey
        """

        print "...running survey for {}".format(self.name)

        resources = "resources"
        img_by_src = "hist_by_src.png"
        img_by_os = "hist_by_os.png"
        survey_file = "survey.html"
        survey_dir = "survey_{}_{}".format(self.name, source_name)

        resource_dir = os.path.join(survey_dir, resources)
        html_file = os.path.join(survey_dir, survey_file)

        try:
            shutil.rmtree(survey_dir)
        except OSError:
            pass

        os.mkdir(survey_dir)
        os.mkdir(resource_dir)

        src_info = core.get_source_info(self.cnx, source_name)

        self.discover_histogram_by_source(source_name, os.path.join(resource_dir, img_by_src))
        self.discover_histogram_by_os(src_info.os_name, os.path.join(resource_dir, img_by_os))
        anomalies = self.discover_detect_anomalies(source_name, None)
        results = self.show_results("bottom", 100, source_name, None)

        # NOTE: heading levels and the "last" row class below are assumptions
        # reconstructed from the rendered survey text; style.css may expect
        # different markup.
        with open(html_file, 'w') as f:
            f.write("""
                <html>
                <body>
                <h1>Filter Prevalence Snapshot</h1>

                <h2>Histogram for {}</h2>
                <img src="{}" />

                <h2>Histogram for Operating System - {}</h2>
                <img src="{}" />
            """.format(source_name,
                       os.path.join(resources, img_by_src),
                       src_info.os_name,
                       os.path.join(resources, img_by_os)))

            f.write("<h3>The lowest 100 reputations for this filter</h3>")
            f.write("<table>")
            f.write("<tr><th>Score</th><th>Parent Path</th><th>Filename</th></tr>")
            i = 0
            lr = len(results)
            for r in results:
                if i == lr - 1:
                    f.write("<tr class=\"last\">")
                    f.write("<td>{}</td><td>{}</td><td>{}</td></tr>".format(r[0], r[1], r[2]))
                else:
                    f.write("<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(r[0], r[1], r[2]))
                i += 1
            f.write("</table>")

            f.write("<h3>The top 100 anomalous files</h3>")
            f.write("<table>")
            f.write("<tr><th>Anomaly Value</th><th>Parent Path</th><th>Filename</th></tr>")
            i = 0
            lr = len(anomalies)
            for r in anomalies:
                if i == lr - 1:
                    f.write("<tr class=\"last\">")
                    f.write("<td>{}</td><td>{}</td><td>{}</td></tr>".format(r[0], r[1], r[2]))
                else:
                    f.write("<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(r[0], r[1], r[2]))
                i += 1
            #for r in anomalies:
            #    f.write("<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(r[0], r[1], r[2]))
            f.write("</table></body></html>")

        return survey_dir
--------------------------------------------------------------------------------