├── redwood ├── __init__.py ├── io │ ├── __init__.py │ └── csv_importer.py ├── helpers │ ├── __init__.py │ ├── visual.py │ └── core.py ├── shell │ ├── __init__.py │ ├── controller.py │ └── modes.py ├── connection │ ├── __init__.py │ └── connect.py ├── foundation │ ├── __init__.py │ ├── aggregator.py │ ├── report.py │ └── prevalence.py └── filters │ ├── __init__.py │ └── redwood_filter.py ├── .gitignore ├── CHANGELOG.md ├── MAINTAINERS.md ├── CONTRIBUTING.md ├── images ├── clustering.png ├── discovery.png ├── histogram0.png ├── redwood_0.png └── logo │ ├── favicon.ico │ ├── redwood_logo.png │ ├── redwood_logo.xcf │ └── license │ ├── README.txt │ ├── clker_tos.txt │ └── SIL_open_font_license.txt ├── reports └── resources │ ├── images │ ├── bot_left.png │ ├── top_left.png │ ├── bot_right.png │ ├── top_right.png │ ├── redwood_logo.png │ ├── bot_left_light.png │ ├── bot_right_light.png │ ├── top_left_light.png │ └── top_right_light.png │ └── css │ └── style.css ├── MANIFEST.in ├── AUTHORS ├── sql ├── filewalk.sh ├── create_redwood_db.sql ├── synthesize_data.sh ├── create_redwood_sp.sql └── filewalk.py ├── LICENSE.txt ├── bin └── redwood ├── setup.py ├── Filters ├── filenames.py ├── locality_uniqueness.py └── filter_prevalence.py ├── docs └── conf.py └── README.md /redwood/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /redwood/io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /redwood/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /redwood/shell/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | -------------------------------------------------------------------------------- /redwood/connection/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /redwood/foundation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | Current release is 0.1.0 2 | -------------------------------------------------------------------------------- /MAINTAINERS.md: -------------------------------------------------------------------------------- 1 | Paul M 2 | -------------------------------------------------------------------------------- /redwood/filters/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | filter_list = list() 3 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/CONTRIBUTING.md -------------------------------------------------------------------------------- /images/clustering.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/images/clustering.png -------------------------------------------------------------------------------- /images/discovery.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/images/discovery.png -------------------------------------------------------------------------------- /images/histogram0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/images/histogram0.png -------------------------------------------------------------------------------- /images/redwood_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/images/redwood_0.png -------------------------------------------------------------------------------- /images/logo/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/images/logo/favicon.ico -------------------------------------------------------------------------------- /images/logo/redwood_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/images/logo/redwood_logo.png -------------------------------------------------------------------------------- /images/logo/redwood_logo.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/images/logo/redwood_logo.xcf -------------------------------------------------------------------------------- /reports/resources/images/bot_left.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/reports/resources/images/bot_left.png -------------------------------------------------------------------------------- /reports/resources/images/top_left.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/reports/resources/images/top_left.png -------------------------------------------------------------------------------- /reports/resources/images/bot_right.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/reports/resources/images/bot_right.png -------------------------------------------------------------------------------- /reports/resources/images/top_right.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/reports/resources/images/top_right.png -------------------------------------------------------------------------------- /reports/resources/images/redwood_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/reports/resources/images/redwood_logo.png -------------------------------------------------------------------------------- /reports/resources/images/bot_left_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/reports/resources/images/bot_left_light.png 
-------------------------------------------------------------------------------- /reports/resources/images/bot_right_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/reports/resources/images/bot_right_light.png -------------------------------------------------------------------------------- /reports/resources/images/top_left_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/reports/resources/images/top_left_light.png -------------------------------------------------------------------------------- /reports/resources/images/top_right_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Redwood/HEAD/reports/resources/images/top_right_light.png -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS 2 | include *.md 3 | recursive-include docs *.md *.png *.rst *.bat *.pickle *.doctree *.html *.inv *.js *.txt *.gif *.css *.1 *.py 4 | recursive-include sql *.sh *.sql 5 | include docs/Makefile 6 | -------------------------------------------------------------------------------- /images/logo/license/README.txt: -------------------------------------------------------------------------------- 1 | Images in logo are modified versions of public domain clipart from clker.com: 2 | - "Tree Silhouettes" by stephen foley 3 | - "Tree" by Zeta 4 | 5 | Logo font is "Lobster Two," by Pablo Impallari, from Google Webfonts and licensed under the SIL Open Font License. 6 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | # This file lists all individuals having contributed content to the repository. 2 | # If you're submitting a patch, please add your name here in alphabetical order as part of the patch. 3 | # 4 | # For a list of active project maintainers, see the MAINTAINERS file. 5 | # 6 | Charlie Lewis 7 | Paul M 8 | -------------------------------------------------------------------------------- /sql/filewalk.sh: -------------------------------------------------------------------------------- 1 | echo "contents_hash,dirname,basename,inode,device,permissions,user_owner,group_owner,last_accessed,last_modified,last_changed,inode_birth,user_flags,links_to_file,size" > filewalk; sudo find / -type f -exec sh -c 'A=$(shasum "$0" | cut -d" " -f1-2 | tr -d " ") ; DIR="$(dirname "$0")/"; BASE=$(basename "$0"); B=$(stat -f "%i,%d,%p,%Su,%Sg,%a,%m,%c,%B,%f,%l,%z" "$0"); echo $A,$DIR,$BASE,$B >> filewalk ; ' {} \; 2 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /bin/redwood: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import exceptions 4 | import cmd 5 | import getopt 6 | import multiprocessing 7 | import os 8 | import string 9 | import sys 10 | import ConfigParser 11 | import MySQLdb 12 | import redwood.filters 13 | import redwood.helpers.core as core 14 | import redwood.connection.connect as rconn 15 | 16 | from redwood.shell.controller import SessionController 17 | 18 | 19 | def main(argv): 20 | 21 | if(len(argv) != 1 and len(argv) != 2): 22 | print "Please provide database configuration file" 23 | sys.exit(1) 24 | 25 | print '\033[1;31m\n\n#################################\nWelcome to Redwood\n#################################\n\033[1;m' 26 | 27 | print "Establishing connection to database...\n", 28 | print "...running with {} cores".format(multiprocessing.cpu_count()) 29 | 30 | cnx = rconn.connect_with_config(argv[0]) 31 | 32 | 33 | #import the filters 34 | if(len(argv) == 2): 35 | core.import_filters(argv[1], cnx) 36 | 37 | sc = SessionController(cnx) 38 | sc.preloop() 39 | sc.cmdloop() 40 | 41 | cnx.close() 42 | 43 | if __name__ == "__main__": 44 | main(sys.argv[1:]) 45 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup 4 | from setuptools.command.install import install 5 | import os 6 | 7 | class MyInstall(install): 8 | 9 | def run(self): 10 | install.run(self) 11 | print "Installing dependencies" 12 | 13 | setup( 14 | name='RedwoodUtility', 15 | version='0.1.0', 16 | author='Lab41', 17 | author_email='paulm@lab41.org', 18 | description='A project that implements statistical methods for identifying anomalous files.', 19 | url='http://lab41.github.io/Redwood', 20 | packages=['redwood', 'redwood.filters', 'redwood.shell','redwood.io','redwood.helpers', 'redwood.connection', 'redwood.foundation'], 21 | scripts=['bin/redwood'], 22 | license='LICENSE.txt', 23 | long_description=open('README.md').read(), 24 | keywords='redwood stats statistics anomalies'.split(), 25 | cmdclass={'install': MyInstall}, 26 | classifiers=[ 27 | 'Programming Language :: Python', 28 | 'Operating System :: POSIX :: Linux', 29 | 'Topic :: Software Development :: Libraries :: Application Frameworks', 30 | 'Environment :: Other Environment' 31 | ], 32 | data_files=[ 33 | ('', ['LICENSE.txt']) 34 | ], 35 | install_requires=[ 36 | 'numpy', 37 | 'scipy', 38 | ] 39 | ) 40 | -------------------------------------------------------------------------------- /redwood/connection/connect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | """
19 | Created on 19 October 2013
20 | @author: Lab41
21 |
22 | This package provides connection functionality to a redwood MySQL db
23 | """
24 |
25 |
26 | import sys
27 | import os
28 | import getopt
29 | import string
30 | import MySQLdb
31 | import exceptions
32 | import ConfigParser
33 |
34 |
35 | def connect_with_config(config_path):
36 |     """
37 |     Given a path, returns a connection object
38 |
39 |     :param config_path: path to the configuration file
40 |
41 |     :return: MySQL connection object, or None on failure
42 |     """
43 |
44 |     cnx = None
45 |
46 |     if config_path is None:
47 |         print "Error: A config file must be provided"
48 |         return cnx
49 |
50 |     try:
51 |         with open(config_path): pass
52 |     except IOError:
53 |         print ('Error: Configuration file \'{}\' not found'.format(config_path))
54 |         return cnx
55 |
56 |     config = ConfigParser.RawConfigParser()
57 |     config.read(config_path)
58 |     user = config.get("mysqld", "username")
59 |     password = config.get("mysqld", "password")
60 |     host = config.get("mysqld", "host")
61 |     database = config.get("mysqld", "database")
62 |     try:
63 |         port = int(config.get("mysqld", "port"))
64 |     except (ConfigParser.Error, ValueError):
65 |         port = 3306
66 |
67 |     try:
68 |
69 |         cnx = MySQLdb.connect(host=host, user=user, passwd=password, db=database, port=port, local_infile=1)
70 |     except MySQLdb.Error as e:
71 |         print(e)
72 |         return None
73 |
74 |     if cnx is None:
75 |         print "Error: Unable to connect to database"
76 |         return None
77 |
78 |     return cnx
79 |
--------------------------------------------------------------------------------
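connect_with_config reads an INI-style file with a [mysqld] section. A minimal example of such a config (values are placeholders; port is optional and falls back to 3306):

    [mysqld]
    username = redwood
    password = secret
    host = localhost
    database = redwood_db
    port = 3306
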
"{}".format(f.name) 55 | else: 56 | print "No filters found" 57 | 58 | def do_load_csv(self, line): 59 | '''[*] load_csv 60 | |-[path] - path where csv files exist or a path to a csv file 61 | ''' 62 | try: 63 | csv_load.run(self.cnx, line) 64 | except Exception as e: 65 | print "Error occurred {}".format(e) 66 | return 67 | 68 | def do_quit(self, line): 69 | '''quit: Exit the redwood console''' 70 | if self.cnx != None: 71 | self.cnx.close() 72 | sys.stdout.write('\n') 73 | print "quitting" 74 | sys.exit(0) 75 | return True 76 | -------------------------------------------------------------------------------- /redwood/helpers/visual.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """ 18 | Created on 19 October 2013 19 | @author: Lab41 20 | 21 | Helper functions for creating visualizations 22 | """ 23 | import array 24 | import matplotlib.pyplot as plt 25 | import numpy as np 26 | import matplotlib 27 | 28 | 29 | def visualize_scatter(counts, codes, data, codebook, num_clusters, xlabel="", ylabel="", title=""): 30 | """ 31 | Generates a 2-d scatter plot visualization of two feature data for 32 | 33 | :param counts: dictionary of counts for the number of observations pairs for 34 | each cluster 35 | :param codes: list of codes for each observation row in the order returned by the original query 36 | :param data: list of observations returned from query in their original order 37 | :param codebook: the coordinates of the centroids 38 | :param num_clusters: number of specified clusters up to 8 39 | :param xlabel: a label for the x axis (Default: None) 40 | :param ylabel: a label for the y axis (Default: None) 41 | """ 42 | if num_clusters > 8: 43 | print "Visualize scatter only supports up to 8 clusters" 44 | return 45 | 46 | num_features = 2 47 | list_arrays = list() 48 | list_arr_idx = array.array("I", [0, 0, 0]) 49 | 50 | for idx in range(num_clusters): 51 | list_arrays.append(np.zeros((counts[idx], num_features))) 52 | 53 | 54 | for i, j in zip(codes, data): 55 | 56 | list_arrays[i][list_arr_idx[i]][0] = j[0] 57 | list_arrays[i][list_arr_idx[i]][1] = j[1] 58 | list_arr_idx[i] += 1 59 | 60 | #plot the clusters first as relatively larger circles 61 | plt.scatter(codebook[:,0], codebook[:,1], color='orange', s=260) 62 | 63 | colors = ['red', 'blue', 'green', 'purple', 'cyan', 'black', 'brown', 'grey'] 64 | 65 | for idx in range(num_clusters): 66 | plt.scatter(list_arrays[idx][:,0], list_arrays[idx][:,1], c=colors[idx]) 67 | 68 | plt.title(title) 69 | plt.ylabel(ylabel) 70 | plt.xlabel(xlabel) 71 | plt.show() 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /redwood/foundation/aggregator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # 
Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | """
19 | Created on 19 October 2013
20 | @author: Lab41
21 | """
22 |
23 |
24 | class Aggregator():
25 |
26 |     def __init__(self, cnx):
27 |         self.cnx = cnx
28 |
29 |
30 |     def aggregate(self, filter_list, dist_str=None):
31 |         '''
32 |         dist_str should come in as a:x, b:y, c:z, etc., where x+y+z = 100 and a-c are filter ids;
33 |         the standard aggregate is equally weighted
34 |         '''
35 |
36 |         weights = list()
37 |         #TODO: make the dup_list a dict
38 |         dup_list = list()
39 |
40 |         if dist_str is not None:
41 |             if len(dist_str) != len(filter_list):
42 |                 print "The number of loaded filters ({}) does not equal the number of provided weights ({})".format(len(filter_list), len(dist_str))
43 |                 return
44 |             try:
45 |                 for s in dist_str:
46 |                     p = s.split(':')
47 |                     filter_id = int(p[0])
48 |                     percent = float(p[1])
49 |
50 |                     if filter_id in dup_list:
51 |                         print "Error: Multiple weights entered for filter with id {}".format(filter_id)
52 |                         return
53 |                     dup_list.append(filter_id)
54 |                     weights.append((filter_id, percent / float(100)))
55 |
56 |                 if abs(sum([w[1] for w in weights]) - 1.0) > 1e-9:  #tolerance avoids float rounding issues
57 |                     print "The filter weights must total 100"
58 |                     return
59 |             except:
60 |                 print "There was an error with your syntax, try again"
61 |                 return
62 |         else:
63 |             i = 0
64 |             even_split = 1 / float(len(filter_list))
65 |             for f in filter_list:
66 |                 weights.append((i, even_split))
67 |                 i += 1
68 |
69 |         query = "UPDATE unique_file\n"
70 |
71 |         #now create the query
72 |
73 |         for w in weights:
74 |             fltr = filter_list[w[0]]
75 |             print "{} weight -> {}".format(fltr.name, w[1])
76 |             query += "LEFT JOIN " + fltr.score_table + " ON " + fltr.score_table + ".id = unique_file.id\n"
77 |
78 |         query += "SET unique_file.reputation = ("
79 |
80 |         for filter_id, weight in weights:
81 |             fltr = filter_list[filter_id]
82 |             query += "{} * {}.score + ".format(weight, fltr.score_table)
83 |
84 |         #remove the last +
85 |         query = query[0:len(query)-3]
86 |         query += ")"
87 |         cursor = self.cnx.cursor()
88 |         cursor.execute(query)
89 |         self.cnx.commit()
90 |         cursor.close()
91 |
--------------------------------------------------------------------------------
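For two filters weighted 50/50, the aggregate() method above builds and executes SQL along these lines (the score-table names are illustrative):

    UPDATE unique_file
    LEFT JOIN filter_a_scores ON filter_a_scores.id = unique_file.id
    LEFT JOIN filter_b_scores ON filter_b_scores.id = unique_file.id
    SET unique_file.reputation = (0.5 * filter_a_scores.score + 0.5 * filter_b_scores.score)
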
/sql/create_redwood_db.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE IF NOT EXISTS os (
2 |     id INT UNSIGNED NOT NULL AUTO_INCREMENT,
3 |     name VARCHAR(150) NOT NULL,
4 |     PRIMARY KEY (id),
5 |     UNIQUE INDEX name_UNIQUE (name ASC)
6 | ) ENGINE=InnoDB;
7 |
8 | CREATE TABLE IF NOT EXISTS media_source (
9 |     id INT UNSIGNED NOT NULL AUTO_INCREMENT,
10 |     reputation INT NULL,
11 |     name VARCHAR(150) NULL,
12 |     date_acquired DATETIME NULL,
13 |     os_id INT UNSIGNED NOT NULL,
14 |     PRIMARY KEY (id),
15 |     UNIQUE INDEX name_UNIQUE (name ASC),
16 |     CONSTRAINT fk_os_id FOREIGN KEY (os_id)
17 |         REFERENCES os (id)
18 |         ON DELETE NO ACTION ON UPDATE NO ACTION
19 | ) ENGINE=InnoDB;
20 |
21 | CREATE TABLE IF NOT EXISTS unique_file (
22 |     id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
23 |     hash CHAR(40) NOT NULL,
24 |     reputation DOUBLE NOT NULL DEFAULT .5,
25 |     status INT DEFAULT 0,
26 |     PRIMARY KEY (id),
27 |     INDEX file_reputation (reputation ASC),
28 |     UNIQUE INDEX hash_UNIQUE (hash ASC)
29 | ) ENGINE=InnoDB;
30 |
31 |
32 | CREATE TABLE IF NOT EXISTS unique_path (
33 |     id INT UNSIGNED NOT NULL AUTO_INCREMENT,
34 |     full_path VARCHAR(4096) NOT NULL,
35 |     path_hash CHAR(40) NULL,
36 |     PRIMARY KEY (id),
37 |     UNIQUE INDEX path_hash_UNIQUE (path_hash ASC)
38 | ) ENGINE=InnoDB;
39 |
40 |
41 | CREATE TABLE IF NOT EXISTS file_metadata (
42 |     id BIGINT UNSIGNED UNIQUE NOT NULL,
43 |     unique_file_id BIGINT UNSIGNED NULL,
44 |     source_id INT UNSIGNED NOT NULL,
45 |     unique_path_id INT UNSIGNED NOT NULL,
46 |     parent_id BIGINT UNSIGNED NULL,
47 |     file_name VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,
48 |     filesystem_id INT UNSIGNED NULL DEFAULT NULL,
49 |     device_id INT NULL DEFAULT NULL,
50 |     attributes INT NULL DEFAULT NULL,
51 |     user_owner VARCHAR(45) NULL DEFAULT NULL,
52 |     group_owner VARCHAR(45) NULL DEFAULT NULL,
53 |     size INT UNSIGNED NULL DEFAULT NULL,
54 |     created DATETIME NULL DEFAULT NULL,
55 |     last_accessed DATETIME NULL DEFAULT NULL,
56 |     last_modified DATETIME NULL DEFAULT NULL,
57 |     last_changed DATETIME NULL DEFAULT NULL,
58 |     user_flags INT NULL DEFAULT NULL,
59 |     links_to_file INT NULL DEFAULT NULL,
60 |     disk_offset BIGINT NULL,
61 |     entropy TINYINT NULL,
62 |     file_content_status TINYINT NULL,
63 |     extension VARCHAR(32) NULL,
64 |     file_type VARCHAR(64) NULL,
65 |     os_id INT UNSIGNED NOT NULL,
66 |     INDEX fk_source_id_idx USING BTREE (source_id ASC),
67 |     CONSTRAINT fk_source_id FOREIGN KEY (source_id)
68 |         REFERENCES media_source (id)
69 |         ON DELETE NO ACTION ON UPDATE NO ACTION,
70 |     INDEX fk_unique_file_id_idx (unique_file_id ASC),
71 |     CONSTRAINT fk_unique_file_id FOREIGN KEY (unique_file_id)
72 |         REFERENCES unique_file (id)
73 |         ON DELETE NO ACTION ON UPDATE NO ACTION,
74 |     INDEX fk_unique_path_idx (unique_path_id ASC),
75 |     CONSTRAINT fk_unique_path FOREIGN KEY (unique_path_id)
76 |         REFERENCES unique_path (id)
77 |         ON DELETE NO ACTION ON UPDATE NO ACTION,
78 |     UNIQUE INDEX source_id_unique_path_id_file_name_idx USING BTREE (unique_path_id ASC , file_name ASC , source_id ASC),
79 |     INDEX fk_os_id_idx (os_id ASC),
80 |     CONSTRAINT fk_os_id2 FOREIGN KEY (os_id)
81 |         REFERENCES os (id)
82 |         ON DELETE NO ACTION ON UPDATE NO ACTION,
83 |     INDEX parent_id_idx USING BTREE (parent_id ASC),
84 |     INDEX file_name_idx USING BTREE (file_name ASC)
85 | ) ENGINE=InnoDB;
86 |
87 |
--------------------------------------------------------------------------------
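A typical read against this schema joins the three core tables to resolve a file's reputation, path, and name; for example, listing the lowest-reputation files for a source (a sketch; the source id is arbitrary):

    SELECT uf.reputation, up.full_path, fm.file_name
    FROM unique_file uf
    LEFT JOIN file_metadata fm ON fm.unique_file_id = uf.id
    LEFT JOIN unique_path up ON up.id = fm.unique_path_id
    WHERE fm.source_id = 1
    ORDER BY uf.reputation ASC
    LIMIT 10;
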
/sql/synthesize_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SOURCES_ROOT=sources
4 | DIR_BASE=Home
5 | DIR_TEMPLATE=${DIR_BASE}_0
6 | NUM_SUBDIRS=10
7 | NUM_BASE_FILES=30
8 | NUM_SOURCES=20
9 | NUM_SUBDIR_FILES=10
10 |
11 |
12 |
13 |
14 |
15 |
16 | function genAnomalyPrevalence {
17 |
18 |     ########## PREVALENCE ANOMALIES ################
19 |
20 |     #now add the anomalies
21 |     NUM_PREVALENCE_ANOMALIES=5
22 |
23 |     i=0
24 |
25 |     while [ $i -lt $NUM_PREVALENCE_ANOMALIES ]; do
26 |         source_name=$((RANDOM % NUM_SOURCES))
27 |         sub_dir=$((RANDOM % NUM_SUBDIRS))
28 |         dd bs=100 count=10 if=/dev/urandom of=${SOURCES_ROOT}/${DIR_BASE}_${source_name}/${sub_dir}A_DIR/anom_${i} &>/dev/null
29 |         i=$[$i+1]
30 |     done
31 |
32 | }
33 |
34 |
35 | function genAnomalyLocality {
36 |
37 |     echo "Generating anomalies for Locality Uniqueness"
38 |
39 |     ############ LOCALITY UNIQUENESS ANOMALIES ########################
40 |
41 |     #since loc unq is currently using time
42 |     sleep 3
43 |
44 |     NUM_LOCUNQ_ANOMALIES=5
45 |
46 |     i=0
47 |
48 |     while [ $i -lt $NUM_LOCUNQ_ANOMALIES ]; do
49 |         source_name=$((RANDOM % NUM_SOURCES))
50 |         sub_dir=$((RANDOM % NUM_SUBDIRS))
51 |         dd bs=100 count=10 if=/dev/urandom of=${SOURCES_ROOT}/${DIR_BASE}_${source_name}/${sub_dir}A_DIR/newer_anom_${i} &>/dev/null
52 |         i=$[$i+1]
53 |     done
54 | }
55 |
56 |
57 | function genAnomalyFileName {
58 |
59 |     echo "Generating anomalies for file name"
60 |
61 |     ############# FILE NAME ANOMALY ###############################
62 |
63 |     i=0
64 |     NUM_NAME_ANOMALIES=3
65 |
66 |     while [ $i -lt $NUM_NAME_ANOMALIES ]; do
67 |
68 |         #anomaly 1
69 |         source_id=$((RANDOM % NUM_SOURCES))
70 |         base_file=$((RANDOM % NUM_BASE_FILES))
71 |
72 |
73 |         mv ${SOURCES_ROOT}/${DIR_BASE}_${source_id}/file_${base_file} ${SOURCES_ROOT}/${DIR_BASE}_${source_id}/diff_name_${base_file}
74 |         i=$[$i+1]
75 |
76 |     done
77 |
78 | }
79 |
80 |
81 |
82 |
83 |
84 | rm -rf $SOURCES_ROOT
85 | mkdir $SOURCES_ROOT
86 | mkdir ${SOURCES_ROOT}/${DIR_TEMPLATE}
87 |
88 | i=0
89 |
90 | #make subdir level A
91 | while [ $i -lt ${NUM_SUBDIRS} ]; do
92 |     mkdir "${SOURCES_ROOT}/${DIR_TEMPLATE}/${i}A_DIR"
93 |     i=$[$i+1]
94 | done
95 |
96 |
97 | i=0
98 |
99 | while [ $i -lt $NUM_BASE_FILES ]; do
100 |     dd bs=32 count=10 if=/dev/urandom of="${SOURCES_ROOT}/${DIR_TEMPLATE}/file_${i}" &>/dev/null
101 |     i=$[$i+1]
102 | done
103 |
104 | i=0
105 |
106 | while [ $i -lt $NUM_SUBDIR_FILES ]; do
107 |     num=$((RANDOM % NUM_SUBDIRS))
108 |     dd bs=50 count=10 if=/dev/urandom of="${SOURCES_ROOT}/${DIR_TEMPLATE}/${num}A_DIR/f_${i}" &>/dev/null
109 |     i=$[$i+1]
110 | done
111 |
112 |
113 | i=1
114 |
115 | while [ $i -lt $(( NUM_SOURCES )) ]; do
116 |
117 |     cp -rf ${SOURCES_ROOT}/${DIR_TEMPLATE} ${SOURCES_ROOT}/${DIR_BASE}_${i}
118 |     i=$[$i+1]
119 | done
120 |
121 |
122 | #now create the anomalies
123 | genAnomalyFileName
124 | genAnomalyLocality
125 | genAnomalyPrevalence
126 |
127 | i=0
128 |
129 |
130 | #go ahead and create the csvs
131 | if [ -e filewalk.py ]; then
132 |
133 |     rm -rf csv
134 |     mkdir csv
135 |
136 |     while [ $i -lt ${NUM_SOURCES} ]; do
137 |         loc="${SOURCES_ROOT}/${DIR_BASE}_${i}"
138 |         python filewalk.py $loc test_os source_${i} csv
139 |         i=$[$i+1]
140 |     done
141 | fi
142 |
143 | echo "done"
144 |
145 |
--------------------------------------------------------------------------------
/reports/resources/css/style.css:
--------------------------------------------------------------------------------
1 | .redwood-title
2 | {
3 |     font-size: 40px;
4 |     text-indent: 5px;
5 | }
6 | .redwood-header
7 | {
8 |     background-color: orange;
9 |     text-indent: 5px;
10 | }
11 |
12 | #redwood-table
13 | {
14 |     font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif;
15 |     font-size: 12px;
16 |     margin: 15px;
17 |     width: 480px;
18 |     text-align: left;
19 |     border-collapse: collapse;
20 |     border-style: hidden;
21 | }
22 | #redwood-table caption.caption
23 | {
24 |     text-align: left;
25 |     font-size: 25px;
26 |     font-weight: bold;
27 | }
28 | #redwood-table thead th.rounded-head-left
29 | {
30 |     background: orange url('../images/top_left.png') left -1px no-repeat;
31 | }
32 | #redwood-table thead th.rounded-head-left-light
33 | {
34 |     background: #FFE0B2 url('../images/top_left_light.png') left -1px no-repeat;
35 | }
36 | #redwood-table thead th.rounded-head-right
37 | {
38 |     background: orange url('../images/top_right.png') right -1px no-repeat;
39 | }
40 | #redwood-table thead th.rounded-head-right-light
41 | {
42 |     background: #FFE0B2 url('../images/top_right_light.png') right -1px no-repeat;
43 | }
44 | #redwood-table th
45 | {
46 |     padding: 8px;
47 |     font-weight: normal;
48 |     font-size: 13px;
49 |     color: black;
50 |     background: orange;
51 | }
52 | #redwood-table th.score-divider
53 | {
54 |     background: orange;
55 | }
56 | #redwood-table th.count-divider
57 | {
58 |     /*border-right: 2px solid;*/
59 |     background: #FFE0B2;
60 | }
61 | #redwood-table td
62 | {
63 |     padding: 8px;
64 |     background: #FFE0B2;
65 |     border-top: 1px solid #fff;
66 |     color: black;
67 | }
68 | #redwood-table td.score-divider
69 | {
70 |     background: orange;
71 | }
72 | #redwood-table td.count-divider
73 | {
74 |     /*border-right: 2px solid;*/
75 | }
76 | #redwood-table tfoot td.rounded-foot-left
77 | {
78 |     background: orange url('../images/bot_left.png') left bottom no-repeat;
79 | }
80 | #redwood-table tfoot td.rounded-foot-left-light
81 | {
82 |     background: #FFE0B2 url('../images/bot_left_light.png') left bottom no-repeat;
83 | }
84 | #redwood-table tfoot td.rounded-foot-right
85 | {
86 |     background: orange url('../images/bot_right.png') right bottom no-repeat;
87 | }
88 | #redwood-table tfoot td.rounded-foot-right-light
89 | {
90 |     background: #FFE0B2 url('../images/bot_right_light.png') right bottom no-repeat;
91 | }
92 | #redwood-table tbody tr:hover td
93 | {
94 |     background: #FFC266;
95 | }
96 | div#top
97 | {
98 |     margin-left: 225px;
99 | }
100 | div#navigation
101 | {
102 |     display: inline-block;
103 |     position: absolute;
104 |     /*float: left/right;*/
105 |     width: 200px;
106 |     height: 97.9%;
107 |     top: 15%;
108 |     right: auto;
109 |     bottom: 100px;
110 |     left: 0;
111 |     color: #ffffff;
112 |     background-color: #FFE0B2;
113 |     background-image: url('../images/top_left_light.png'), url('../images/top_right_light.png'), url('../images/bot_right_light.png'), url('../images/bot_left_light.png');
114 |     background-position: left top, right top, right bottom, left bottom;
115 |     background-repeat: no-repeat;
116 |     padding: 5px;
117 | }
118 | div#content
119 | {
120 |     margin-left: 210px;
121 | }
122 | div#navigation .center
123 | {
124 |     width: 150px;
125 |     height: 150px;
126 |     display: block;
127 |     margin-left: auto;
128 |     margin-right: auto;
129 | }
130 | div.a
131 | {
132 |     display: block;
133 |     width: 60px;
134 | }
135 | dl.list
136 | {
137 |     text-align: center;
138 | }
139 | .button
140 | {
141 |     width: 180px;
142 |     height: 25px;
143 |     background: orange;
144 |     padding: 10px;
145 |     border-radius: 5px;
146 |     color: black;
147 |     font-size: 20px;
148 |     font-weight: bold;
149 |     margin-bottom: 5px;
150 | }
151 | ul#navigation
152 | {
153 |     margin: 0;
154 |     margin-top: 10px;
155 |     padding: 0;
156 |     width: 300px;
157 | }
158 | ul#navigation li
159 | {
160 |     list-style-type: none;
161 | }
162 | a
163 | {
164 |     display: block;
165 |     padding: 10px 20px;
166 |     width: 60px;
167 |     text-decoration: none;
168 | }
169 |
--------------------------------------------------------------------------------
/sql/create_redwood_sp.sql:
--------------------------------------------------------------------------------
1 | DROP PROCEDURE IF EXISTS map_staging_table;
2 |
3 | DELIMITER //
4 | CREATE PROCEDURE map_staging_table(IN source_id INT, IN os_id INT)
5 | BEGIN
6 |     INSERT INTO `unique_file` (hash)
7 |         SELECT DISTINCT contents_hash
8 |         FROM `staging_table` where basename != "/" and LENGTH(contents_hash) > 0
9 |     ON DUPLICATE KEY UPDATE hash =
hash; 10 | INSERT IGNORE INTO `unique_path` (full_path, path_hash) 11 | SELECT dirname, dirname_hash 12 | FROM `staging_table`; 13 | INSERT IGNORE INTO `file_metadata` 14 | (id, 15 | unique_file_id, 16 | source_id, 17 | unique_path_id, 18 | parent_id, 19 | file_name, 20 | filesystem_id, 21 | device_id, 22 | attributes, 23 | user_owner, 24 | group_owner, 25 | size, 26 | created, 27 | last_accessed, 28 | last_modified, 29 | last_changed, 30 | user_flags, 31 | links_to_file, 32 | disk_offset, 33 | entropy, 34 | file_content_status, 35 | extension, 36 | file_type, 37 | os_id) 38 | SELECT 39 | staging_table.global_file_id, 40 | unique_file.id, 41 | source_id, 42 | unique_path.id, 43 | staging_table.parent_id, 44 | staging_table.basename, 45 | staging_table.filesystem_id, 46 | staging_table.device_id, 47 | staging_table.attributes, 48 | staging_table.user_owner, 49 | staging_table.group_owner, 50 | staging_table.size, 51 | staging_table.created, 52 | staging_table.last_accessed, 53 | staging_table.last_modified, 54 | staging_table.last_changed, 55 | staging_table.user_flags, 56 | staging_table.links_to_file, 57 | staging_table.disk_offset, 58 | staging_table.entropy, 59 | staging_table.file_content_status, 60 | staging_table.extension, 61 | staging_table.file_type, 62 | os_id 63 | FROM `staging_table` 64 | LEFT JOIN `unique_file` 65 | ON (staging_table.contents_hash = unique_file.hash) 66 | LEFT JOIN `unique_path` 67 | ON (staging_table.dirname_hash = unique_path.path_hash); 68 | END // 69 | DELIMITER ; 70 | 71 | 72 | DROP VIEW IF EXISTS joined_file_metadata; 73 | 74 | CREATE VIEW `joined_file_metadata` AS 75 | SELECT 76 | `file_metadata`.id AS file_metadata_id, 77 | unique_file_id, 78 | source_id, 79 | unique_path_id, 80 | file_name, 81 | parent_id, 82 | filesystem_id, 83 | device_id, 84 | attributes, 85 | user_owner, 86 | group_owner, 87 | size, 88 | created, 89 | last_accessed, 90 | last_modified, 91 | last_changed, 92 | user_flags, 93 | links_to_file, 94 | disk_offset, 95 | entropy, 96 | file_content_status, 97 | extension, 98 | file_type, 99 | hash, 100 | reputation, 101 | full_path, 102 | path_hash 103 | FROM 104 | file_metadata 105 | LEFT JOIN 106 | unique_file ON `file_metadata`.unique_file_id = `unique_file`.id 107 | LEFT JOIN 108 | unique_path ON `unique_path`.id = `file_metadata`.unique_path_id; 109 | 110 | 111 | -------------------------------------------------------------------------------- /images/logo/license/clker_tos.txt: -------------------------------------------------------------------------------- 1 | Terms of use 2 | Clker.com is owned by Rolera LLC, an Illinois Limited Liability Corporation. Clker and Clker.com are trademarks of Rolera LLC. 3 | 4 | Clker.com is an online sharing service where users share free public domain vector cliparts, or share public domain photos and derive vector cliparts from those photos using clker's online tracer. 5 | 6 | Users who upload shared cliparts and photos on Clker.com shall certify they are in public domain, as it is shown on the upload page. Please flag any content suspected otherwise. We have no other information concerning the status of the uploaded pictures and cliparts. 7 | 8 | Using clker.com 9 | You are allowed to use clker.com and any content provided by clker.com if you are 18 years or older or if your parent or guardian who is 18 years or older reads the entire terms listed on this page including disclaimers and agrees to all of them. 
If you or your parent or guardian do not agree to the entire terms listed on this page then you shall not use clker.com or any content provided by clker.com. 10 | 11 | You shall not use clker.com to draw any image, or download images from clker.com that will be used or characterized as: 12 | 13 | derogatory, humliating or condescending towards any person, group of people, associations, organizations or corporations. 14 | expressing hate towards any one or group. 15 | porn or advertises sexual activities even if it was legal in your state or country. 16 | violating any US laws. 17 | The terms listed here are subject to change without notice. If you download content from, upload or draw on clker.com, you hereby agree that it is your responsibility to continuously check the terms of use for updates. In the event that the terms change and you do not agree with the new terms, you shall cease using any content downloaded or delete any content drawn or uploaded from or to clker.com upon publishing those newer terms. 18 | 19 | DISCLAIMER & NO WARRANTY 20 | BECAUSE CLKER.COM AND ITS CONTENTS ARE FREE OF CHARGE, WE PROVIDE ABSOLUTELY NO WARRANTY, TO THE EXTENT PERMITTED BY APPLICABLE STATE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING, CLKER.COM AND ITS OWNERS PROVIDE THE CONTENT AND IMAGES 'AS IS' WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE CONTENT IS WITH YOU. SHOULD ANY PART OF CLKER.COM OR ITS CONTENT PROVE DEFECTIVE, OR NOT PUBLIC DOMAIN YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR, CORRECTION AND OTHER COSTS THAT MIGHT HAPPEN TO YOU OR YOUR PRODUCT OR CLIENTS OR CUSTOMERS FROM USING CONTENT OR IMAGES FROM CLKER.COM OR ANY DERIVATIVES OF YOUR WORK THAT INCLUDED OR WAS DERIVED FROM OUR CONTENT. 21 | 22 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW WILL CLKER.COM, IT'S OWNERS, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY LOST PROFITS, LOST MONIES, OR OTHER SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY THIRD PARTIES OR A FAILURE OF THE PICTURES/IMAGES/SOFTWARE TO OPERATE WITH ANY PROGRAMS) THE SITE OR ITS CONTENTS, EVEN IF YOU HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES, OR FOR ANY CLAIM BY ANY OTHER PARTY. 23 | 24 | Uploading content 25 | By uploading content you hereby declare your responsibility for what you upload and release clker.com from any liabilities and responibilities towards your upload. Simply, clker.com is just hosting your upload. 26 | 27 | By uploading content, you certify that it is free from any copyrights and trademarks, and in case you are the copyright holder you hereby release it under the lastest version of the creative commons CC0 public domain dedication found here. All uploads must not a. Contain profanity words, or imply profanity by using gestures b. Must not contain porn, adult content or not safe for work images c. Photo pictures of tatto, body piercing and other forms of body art are not allowed in raster format. However, tattoo designs are allowed in vector SVG format d. Must not show hatred or imply hatered to any group or ethnicity e. Must not be discriminatory in any way f. Must not violate any US laws including copyrights. 
28 | 29 | You may not upload any photos that contain identifiable living or recently deceased people unless the photo and context cannot be protected by personal privacy or publicity rights. For example, picture of prominant members of governments while doing their job are allowed, but pictures of your family and yourself are not allowed if people can be identified in the photos. 30 | 31 | Clker.com reserves the right to delete any content deemed unacceptable, and reservers the right to determine what content is acceptable even if you uploaded content that you believe complies with the upload policy. Any content that turns out to be copyright protected, will be deleted as soon as Clker.com learns of it. 32 | 33 | Clker.com reserves the right to terminate any user account for repeated policy violations or for uploading copyright protected images. 34 | 35 | DMCA notices 36 | Greiman, Rome & Griesmeyer is our DMCA agent. Please do not email or call them for support questions.. For support questions please email support at clker dot com. 37 | 38 | Please send all DMCA notices to: 39 | Brian J. Pleviak, Attorney 40 | Ginsberg Jacobs LLC 41 | 300 South Wacker Drive, Suite 2450 42 | Chicago Illinois 60606 43 | Phone: (312) 660-9626 44 | Fax: (312) 660-9612 45 | -------------------------------------------------------------------------------- /sql/filewalk.py: -------------------------------------------------------------------------------- 1 | import binascii 2 | import datetime 3 | import hashlib 4 | import mimetypes 5 | import os 6 | import re 7 | import struct 8 | import subprocess 9 | import sys 10 | import time 11 | import urllib 12 | import csv 13 | from Queue import Queue 14 | 15 | # 8 byte unique ID generator give a path. 16 | # - first five bytes are first five from sha1 of path name 17 | # - last 3 are the first three from the current time 18 | # Returns a long 19 | def generateUniqueId(path): 20 | 21 | m = hashlib.md5() 22 | m.update(path) 23 | first_five = m.digest()[:5] 24 | last_three = struct.pack("I", int(time.time()))[:3] 25 | combined = first_five + last_three 26 | return long(binascii.hexlify(combined), 16) 27 | 28 | 29 | def write_stat_info(basename, dirname, file_id, parent_id, dirname_digest, csv_writer): 30 | 31 | #need to escape commas from base name and dirname since we are creating a csv 32 | 33 | 34 | path = os.path.join(dirname, basename) 35 | 36 | try: 37 | stat_obj = os.stat(path) 38 | except Exception: 39 | # print "Error trying to stat {}".format(path) 40 | return 41 | 42 | url = urllib.pathname2url(path) 43 | file_type = mimetypes.guess_type(url)[0] 44 | hash_val = hash_file(path, file_type) 45 | 46 | #file_id, parent_id,dirname,basename,hash,fs_id,device,permissions,uid,gid,size,create_time,access_time,mod_time,metadata_change_time,user_flags,links,disk_offset,entropy,file_content_status,extensions,file_type 47 | 48 | csv_writer.writerow([file_id, parent_id, dirname, basename, hash_val, dirname_digest, stat_obj.st_ino, stat_obj.st_dev, 49 | str(oct(stat_obj.st_mode)), stat_obj.st_uid, stat_obj.st_gid, stat_obj.st_size, long(os.path.getctime(path)), 50 | long(stat_obj.st_atime), long(stat_obj.st_mtime), long(stat_obj.st_ctime), "", stat_obj.st_nlink, "", "", "", 51 | os.path.splitext(basename)[1], file_type]) 52 | 53 | 54 | BUFFER = 4096 55 | 56 | def hash_file(path, file_type): 57 | 58 | ret = "" 59 | # some files you can't hash 60 | if(file_type == 'inode/chardevice' \ 61 | or file_type == 'inode/symlink' \ 62 | or file_type == 'inode/socket' \ 63 | or file_type 
== 'inode/blockdevice' \
64 |        or file_type == 'inode/x-empty' \
65 |        or file_type == 'application/x-coredump' \
66 |        or file_type == 'inode/directory'):
67 |         ret = "0"
68 |         return ret
69 |
70 |     fd = None
71 |     try:
72 |         h = hashlib.sha1()
73 |         fd = os.open(path, os.O_RDONLY | getattr(os, 'O_NONBLOCK', 0) | os.O_NONBLOCK)
74 |         data = os.read(fd, BUFFER)
75 |         while(len(data)>0):
76 |             h.update(data)
77 |             data = os.read(fd, BUFFER)
78 |         ret = h.hexdigest()
79 |     except Exception, err:
80 |         # print "Hash Error: {} on file {} with type {}".format(err, path,
81 |         # file_type)
82 |         pass
83 |     finally:
84 |         if(fd != None):
85 |             os.close(fd)
86 |     return ret
87 |
88 |
89 | omitted_dirs = ['/dev', '/proc', '/sys', '/Volumes', '/mnt', '/net']
90 |
91 |
92 | def main(argv):
93 |
94 |     if(len(argv) != 5):
95 |         print "filewalk.py <start_dir> <os_name> <source_name> <output_dir>"
96 |         return
97 |
98 |
99 |     #make sure output dir exists
100 |     if os.path.exists(argv[4]) is False:
101 |         print "Output dir {} does not exist".format(argv[4])
102 |         return
103 |
104 |     today = datetime.date.today()
105 |     str_date = today.strftime('%Y-%m-%d')
106 |     out_file = os.path.join(argv[4], "{}--{}--{}".format(str_date, argv[2], argv[3]))
107 |     start_dir = argv[1]
108 |
109 |
110 |     stack = list()
111 |
112 |     with open(out_file, "w") as file_handle:
113 |
114 |         csv_writer = csv.writer(file_handle)
115 |         csv_writer.writerow(["file_id","parent_id","dirname","basename","contents_hash", "dirname_hash", "fs_id","device","permissions",
116 |             "uid","gid","size","create_time","access_time","mod_time","metadata_change_time",
117 |             "user_flags","links","disk_offset","entropy","file_content_status","extensions","file_type"])
118 |
119 |         # start the queue with a 0 value
120 |         stack.append(0L)
121 |
122 |         for root, dirs, files in os.walk(start_dir):
123 |             # We want to have a nice, dynamic output that doesn't flood the
124 |             # terminal with lines of text. So we'll write a line, then flush it
125 |             # with '\r'. In order to do this properly, we need to first measure
126 |             # the width of the terminal.
127 |             # We're also going to put it inside the loop in case the window
128 |             # gets resized while it's running
129 |             rows,columns = os.popen('stty size', 'r').read().split()
130 |             rows = int(rows)
131 |             columns = int(columns)
132 |
133 |             parent_id = stack.pop()
134 |
135 |             #some directories we will ignore as so
136 |             if root in omitted_dirs:
137 |                 del dirs[:]
138 |                 continue
139 |
140 |             sys.stdout.write('\r')
141 |             sys.stdout.write(' ' * columns)
142 |             sys.stdout.write('\r')
143 |             sys.stdout.write('processing {}'.format(root[:columns-12]))
144 |             sys.stdout.flush()
145 |
146 |             new_parent_id = generateUniqueId(root)
147 |
148 |             # for each of the child dirs, add the parent id. This assumes a BFS
149 |             # search
150 |             for d in dirs:
151 |                 stack.append(new_parent_id)
152 |
153 |             h = hashlib.sha1()
154 |             h.update(root)
155 |             root_digest = h.hexdigest()
156 |
157 |             # write the parent directory
158 |             write_stat_info("/", root, new_parent_id, parent_id, root_digest, csv_writer)
159 |             for f in files:
160 |                 _id = generateUniqueId(os.path.join(root, f))
161 |                 write_stat_info(f, root, _id, new_parent_id, root_digest, csv_writer)
162 |             file_handle.flush()
163 |
164 | if __name__=="__main__":
165 |     main(sys.argv)
166 |
--------------------------------------------------------------------------------
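Taken together with synthesize_data.sh above, a typical acquisition pass runs the walker against a directory and then loads the resulting CSV from the redwood shell (the paths, OS name, source name, and config file here are examples):

    $ python sql/filewalk.py /mnt/image some_os some_source csv
    $ ./bin/redwood my_config.cfg
    redwood$ load_csv csv
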
/redwood/filters/redwood_filter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | """
19 | Created on 19 October 2013
20 | @author: Lab41
21 | """
22 | from collections import namedtuple
23 | import inspect
24 |
25 | class RedwoodFilter(object):
26 |     """
27 |     Base class for Filter creation
28 |
29 |     :ivar name: Name of the filter. This should be one word, lower case, with underscores if needed
30 |     :ivar cnx: connection instance to the database
31 |     :ivar score_table: name of the table containing reputation scores. The table must have exactly two columns (id, score)
32 |     """
33 |     def __init__(self):
34 |         self.name = "generic"
35 |         self.cnx = None
36 |         self.score_table = None
37 |     def clean(self):
38 |         """
39 |         Deletes all required tables for this filter (method must be overridden)
40 |         """
41 |         raise NotImplementedError
42 |
43 |     def update(self, source):
44 |         """
45 |         Updates filter tables with new data from the given source (method must be overridden)
46 |
47 |         :param source: name of the media source
48 |         """
49 |         raise NotImplementedError
50 |
51 |     def rebuild(self):
52 |         """
53 |         Deletes all tables for this filter, recreates them, then rebuilds data for them from the datastore
54 |         """
55 |         self.clean()
56 |         self.build()
57 |
58 |         #get a list of the sources
59 |         query = """
60 |         SELECT media_source.name FROM media_source
61 |         """
62 |
63 |         cursor = self.cnx.cursor()
64 |         cursor.execute(query)
65 |
66 |         print "...Rebuild process started"
67 |         for source in cursor.fetchall():
68 |             print "rebuilding for source: {}".format(source[0])
69 |             self.update(source[0])
70 |
71 |     def show_results(self, direction, count, source, out=None):
72 |         """
73 |         Displays the scores for a given source in ranked order
74 |
75 |         :param direction: either [top] or [bottom]
76 |         :param count: number of rows to retrieve from the direction
77 |         :param out: file to write results to
78 |         """
79 |
80 |         print "[+] Running show_results..."
81 | cursor = self.cnx.cursor() 82 | dir_val = ("desc" if direction == "top" else "asc") 83 | 84 | if direction == "top": 85 | dir_val = "desc" 86 | elif direction == "bottom": 87 | dir_val = "asc" 88 | else: 89 | print "Error: direction must be \"top\" or \"bottom\"" 90 | return 91 | 92 | 93 | print "Fetching {} results from {} for filter {}".format(direction, source, self.name) 94 | 95 | query = """ 96 | SELECT {}.score, unique_path.full_path, file_metadata.file_name 97 | FROM {} LEFT JOIN file_metadata ON {}.id = file_metadata.unique_file_id 98 | LEFT JOIN unique_path ON file_metadata.unique_path_id = unique_path.id 99 | WHERE file_metadata.source_id = (SELECT media_source.id FROM media_source WHERE media_source.name = "{}") 100 | ORDER BY {}.score {} LIMIT 0, {} 101 | """.format(self.score_table, self.score_table, self.score_table, source, self.score_table, dir_val, count) 102 | 103 | cursor.execute(query) 104 | 105 | if out is None: 106 | results = cursor.fetchall() 107 | i = 0 108 | for r in results: 109 | print "{}:\t{}\t{}/{}".format(i, r[0], r[1], r[2]) 110 | i+=1 111 | return results 112 | else: 113 | 114 | with open (out, "w") as f: 115 | v = 0 116 | for x in cursor.fetchall(): 117 | f.write("{}:\t{}\t{}/{}\n".format(v, x[0], x[1], x[2])) 118 | v += 1 119 | 120 | cursor.close() 121 | 122 | 123 | def build(self): 124 | """ 125 | Builds necessary tables for the filter. This function must create the scores table. The standard practice 126 | is to create a table called "filter_name"_scores that has two columns (id, double score). As an example for a 127 | filter called "woohoo", you would want to add the following create table:: 128 | 129 | CREATE TABLE IF NOT EXISTS `woohoo_scores` ( 130 | id BIGINT unsigned NOT NULL, 131 | score double DEFAULT NULL, 132 | PRIMARY KEY(id), 133 | CONSTRAINT `fk_unique_file_woohoo_id` FOREIGN KEY (`id`) 134 | REFERENCES `unique_file` (`id`) ON DELETE NO ACTION ON UPDATE NO ACTION 135 | ) ENGINE=InnoDB 136 | """ 137 | 138 | raise NotImplementedError 139 | 140 | def run_survey(self, source_name): 141 | """ 142 | Given a source name, this function will create an html file summarizing its analysis. The survey should be an 143 | html file named "survey.html", and it should be located in a directory called "survey_[your file name]_[source name]. 
144 |         The survey directory should also contain a resources directory where html resources such as images will be saved::
145 |
146 |             survey_filtername__sourcename
147 |                 |- survey.html
148 |                 |- resources
149 |
150 |         :param source_name: name of the source
151 |
152 |         :return: path to the survey directory
153 |         """
154 |
155 |         raise NotImplementedError
156 |
157 |     def run_func(self, func_name, *args):
158 |         """
159 |         Helper function that runs the named discover function for this filter
160 |
161 |         :param func_name: name of the function to run
162 |         :param args: list of arguments to run with the function
163 |         """
164 |         func = getattr(self, 'discover_' + func_name, None)
165 |         if not func:
166 |             return False
167 |
168 |         ret = inspect.getargspec(func)
169 |         #subtract one for the "self"
170 |         upper_num_args = len(ret.args) - 1
171 |
172 |         if ret.defaults is not None:
173 |             lower_num_args = upper_num_args - len(ret.defaults)
174 |         else:
175 |             lower_num_args = upper_num_args
176 |
177 |         actual_args = len(args)
178 |
179 |         if actual_args > upper_num_args or actual_args < lower_num_args:
180 |             print "Error: Incorrect number of args"
181 |             return False
182 |
183 |         func(*args)
184 |         return True
185 |
186 |     def do_help(self, cmd):
187 |         "Get help on a command. Usage: help command"
188 |         if cmd:
189 |             func = getattr(self, 'discover_' + cmd, None)
190 |             if func:
191 |                 print func.__doc__
192 |                 return True
193 |         return False
194 |
--------------------------------------------------------------------------------
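A skeletal subclass following the conventions documented above (the table name and scoring logic are placeholders, and the shell assigns cnx after import; compare the real example in Filters/filenames.py below):

    from redwood.filters.redwood_filter import RedwoodFilter

    class WoohooFilter(RedwoodFilter):

        def __init__(self):
            self.name = "woohoo"
            self.score_table = "woohoo_scores"

        def build(self):
            #create the required (id, score) scores table
            cursor = self.cnx.cursor()
            cursor.execute("""CREATE TABLE IF NOT EXISTS `woohoo_scores` (
                id BIGINT unsigned NOT NULL,
                score double DEFAULT NULL,
                PRIMARY KEY(id)) ENGINE=InnoDB""")
            self.cnx.commit()
            cursor.close()

        def clean(self):
            cursor = self.cnx.cursor()
            cursor.execute("DROP TABLE IF EXISTS woohoo_scores")
            self.cnx.commit()
            cursor.close()

        def update(self, source):
            self.build()
            #score computation for the given media source would go here
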
70 |         if self.cnx != None:
71 |             self.cnx.close()
72 |         sys.stdout.write('\n')
73 |         sys.exit(0)
74 | 
75 | class SubInterpreterFilter(cmd.Cmd):
76 |     prompt = '\033[1;32mredwood-filter$ \033[1;m'
77 | 
78 |     def __init__(self, cnx):
79 |         cmd.Cmd.__init__(self)
80 |         self.cnx = cnx
81 | 
82 |     def do_quit(self, line):
83 |         '''quit: Exit the redwood console'''
84 |         if self.cnx != None:
85 |             self.cnx.close()
86 |         sys.stdout.write('\n')
87 |         sys.exit(0)
88 | 
89 |     def default(self, line):
90 |         if line == 'EOF' or line == 'exit' or line == 'quit':
91 |             self.do_back(line)
92 |             return True
93 |         else:
94 |             print "*** Command not recognized, try 'help'"
95 | 
96 |     def emptyline(self):
97 |         pass
98 | 
99 |     def help_help(self):
100 |         self.do_help('')
101 | 
102 |     def do_back(self, line):
103 |         '''Go back a level in the shell'''
104 |         return True
105 | 
106 |     def do_discover(self, line):
107 |         '''
108 |         discover <filter-id>
109 | 
110 |         activates discover mode for the given filter with id "filter-id"
111 |         '''
112 |         if line:
113 |             v = SubInterpreterFilter.validateFilterId(line)
114 |             if v >= 0:
115 |                 sub_cmd = SubInterpreterDiscover(self.cnx, line)
116 |                 sub_cmd.cmdloop()
117 |         else:
118 |             print "Error: Filter Id required"
119 | 
120 |     def do_show_results(self, line):
121 |         '''
122 |         show_results <filter-id> <direction> <count> <source> [out]
123 | 
124 |         shows the results for the given filter's score table
125 | 
126 |         filter-id - id of filter
127 |         direction - top or bottom
128 |         count - items to display
129 |         source - source name
130 |         out - file to write output to (optional)
131 |         '''
132 |         args = line.split()
133 |         if len(args) != 5 and len(args) != 4:
134 |             print "Error: incorrect number of arguments"
135 |             return
136 |         v = self.validateFilterId(args[0])
137 |         #bail out on an invalid filter id rather than indexing the list with -1
138 |         if v < 0:
139 |             return
140 |         plugin = filter_list[v]
141 |         plugin.show_results(*args[1:])
142 | 
143 | 
144 |     def do_rerun(self, line):
145 |         '''
146 |         rerun <filter-id>
147 | 
148 |         Reruns a filter on all sources
149 |         '''
150 |         args = line.split()
151 |         if(len(args) != 1):
152 |             print "Error: Filter Id required"
153 |             return
154 | 
155 |         v = self.validateFilterId(args[0])
156 |         if v < 0:
157 |             return
158 |         plugin = filter_list[v]
159 | 
160 |         print "Deleting old data in filter storage"
161 |         plugin.clean()
162 | 
163 |         sources = core.get_all_sources(self.cnx)
164 | 
165 |         print "Creating new data"
166 |         for src_info in sources:
167 |             print "Running filter on source: {}".format(src_info.source_name)
168 |             plugin.update(src_info.source_name)
169 | 
170 |         print "Rerun complete"
171 | 
172 |     def do_list(self, line):
173 |         '''list: lists the available filters'''
174 |         print "Available Filters"
175 |         i = 0
176 |         for plugin in filter_list:
177 |             print "{}............{}".format(i, plugin.name)
178 |             i += 1
179 | 
180 |     def do_aggregate_scores(self, line):
181 |         '''
182 |         aggregate_scores filter_id:weight filter_id:weight ...
183 | 
184 |         Aggregates the reputations of all files using the list of filters and weights provided. If no list is
185 |         provided, all filters are weighted equally. The "filter_id" is the numeric id of the filter. The "weight"
186 |         is a percentage between 0-100, such that the total of all specified weights is 100.
187 | 
188 |         For example, if you have 3 filters loaded, and you want to aggregate the scores such that the distribution of weights
189 |         is 50, 30, 20 respectively, then you would run the following command
190 | 
191 |         Example
192 | 
193 |             aggregate_scores 0:50 1:30 2:20
194 |         '''
195 | 
196 |         print "Aggregating Scores"
197 |         args = line.split()
198 |         ag = Aggregator(self.cnx)
199 |         if args:
200 |             ag.aggregate(filter_list, args)
201 |         else:
202 |             ag.aggregate(filter_list)
203 | 
204 |     def do_run_survey(self, line):
205 |         '''
206 |         run_survey <source_name> [filter_id:weight ...]
207 | 
208 |         runs the survey function for the given source
209 | 
210 |         source_name - name of the source to process
211 |         [filter_id:weight ...] - optional weights passed to the score aggregation (see aggregate_scores)
212 |         '''
213 | 
214 |         args = shlex.split(line)
215 | 
216 |         if len(args) < 1:
217 |             print "Error: Incorrect # of arguments"
218 |             return
219 | 
220 |         src_obj = core.get_source_info(self.cnx, args[0])
221 | 
222 |         if src_obj is None:
223 |             print "Error: Unable to find source {}".format(args[0])
224 |             return
225 |         else:
226 |             rpt = Report(self.cnx, src_obj)
227 |             if len(args) > 1:
228 |                 rpt.run(args[1:])
229 |             else:
230 |                 rpt.run(None)
231 | 
232 |     @staticmethod
233 |     def validateFilterId(str_val):
234 | 
235 |         try:
236 |             value = int(str_val)
237 |         except exceptions.ValueError:
238 |             print "Error: \'{}\' is not a number".format(str_val)
239 |             return -1
240 | 
241 |         if(value < 0 or value >= len(filter_list)):
242 |             print "Error: no plugin exists for that number"
243 |             return -1
244 | 
245 |         return value
246 | 
--------------------------------------------------------------------------------
/Filters/filenames.py:
--------------------------------------------------------------------------------
1 | from redwood.filters.redwood_filter import RedwoodFilter
2 | import redwood.helpers.core as core
3 | import time
4 | import os
5 | import shutil
6 | 
7 | class FileNameFilter(RedwoodFilter):
8 | 
9 |     def __init__(self):
10 |         self.name = "FileNameFilter"
11 |         self.score_table = "FileNameFilter_scores"
12 | 
13 |     def clean(self):
14 |         """
15 |         Cleans all tables associated with this filter
16 |         """
17 |         cursor = self.cnx.cursor()
18 |         cursor.execute("DROP TABLE IF EXISTS FileNameFilter_scores")
19 |         cursor.execute("DROP TABLE IF EXISTS FileNameFilter_unique_name")
20 |         self.cnx.commit()
21 |         cursor.close()
22 | 
23 |     def build(self):
24 |         """
25 |         Builds all persistent tables associated with this filter
26 |         """
27 |         cursor = self.cnx.cursor()
28 |         query = """
29 |             CREATE TABLE IF NOT EXISTS `FileNameFilter_scores` (
30 |             id BIGINT unsigned NOT NULL,
31 |             score double DEFAULT NULL,
32 |             PRIMARY KEY(id),
33 |             CONSTRAINT `FNF_unique_file1_id` FOREIGN KEY (`id`)
34 |             REFERENCES `unique_file` (`id`) ON DELETE NO ACTION ON UPDATE NO ACTION
35 |             ) ENGINE=InnoDB
36 |         """
37 |         cursor.execute(query)
38 |         self.cnx.commit()
39 | 
40 |         query = """
41 |             CREATE TABLE IF NOT EXISTS FileNameFilter_unique_name (
42 |             id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
43 |             file_name VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,
44 |             unique_path_id INT(10) NOT NULL,
45 |             count INT DEFAULT 1,
46 |             PRIMARY KEY (id),
47 |             UNIQUE INDEX file_path_idx USING BTREE (file_name ASC, unique_path_id),
48 |             INDEX file_name_idx USING BTREE (file_name ASC)
49 |             ) ENGINE=InnoDB;
50 |         """
51 |         cursor.execute(query)
52 |         self.cnx.commit()
53 |         cursor.close()
54 | 
55 |     def update(self, source):
56 |         print "[+] FileName Filter running on {} ".format(source)
57 | 
58 |         #creates the basic tables if they do not exist
59 |         self.build()
60 | 
61 |         cursor = self.cnx.cursor()
62 | 
63 |         src_info = core.get_source_info(self.cnx, source)
64 | 
65 |         if src_info is None:
66 |             print "Error: Source {} not found".format(source)
67 |             return
68 | 
69 |         now = time.time()
70 | 
71 |         # self.cnx.autocommit(False)
72 |         #query = """
73 |         #    INSERT INTO FileNameFilter_unique_name
74 |         #    (file_name, unique_path_id)
75 |         #    (SELECT file_name, unique_path_id FROM file_metadata WHERE file_name != "/" and source_id = {})
76 |         #    ON DUPLICATE KEY UPDATE count = count + 1;
77 |         #""".format(src_info.source_id)
78 |         cursor.execute("""
79 |             INSERT INTO FileNameFilter_unique_name
80 |             (file_name, unique_path_id)
81 |             (SELECT file_name, unique_path_id
82 |             FROM file_metadata
83 |             WHERE file_name != "/" and source_id = %s)
84 |             ON DUPLICATE KEY UPDATE count = count + 1;
85 |         """, (src_info.source_id,))
86 |         # self.cnx.autocommit(True)
87 | 
88 |         later = time.time()
89 | 
90 |         #print "Updated counts in {} secs\nUpdating Scores".format(later - now)
91 | 
92 |         cursor.execute("SELECT MAX(count) FROM FileNameFilter_unique_name")
93 |         (max_count,) = cursor.fetchone()
94 | 
95 |         now = time.time()
96 |         #query = """
97 |         #    INSERT INTO FileNameFilter_scores
98 |         #    (id, score)
99 |         #    (
100 |         #        SELECT
101 |         #        fm.unique_file_id, MIN(fnfun.count / {})
102 |         #        FROM FileNameFilter_unique_name fnfun
103 |         #        LEFT JOIN file_metadata fm
104 |         #        ON fnfun.file_name = fm.file_name
105 |         #        AND fnfun.unique_path_id = fm.unique_path_id
106 |         #        WHERE not isnull(fm.unique_file_id)
107 |         #        GROUP BY fm.unique_file_id
108 |         #    )
109 |         #    ON DUPLICATE KEY UPDATE score = score
110 |         #    """.format(max_count)
111 |         cursor.execute("""
112 |             INSERT INTO FileNameFilter_scores
113 |             (id, score)
114 |             (SELECT fm.unique_file_id, MIN(fnfun.count / %s)
115 |             FROM FileNameFilter_unique_name fnfun
116 |             LEFT JOIN file_metadata fm
117 |             ON fnfun.file_name = fm.file_name
118 |             AND fnfun.unique_path_id = fm.unique_path_id
119 |             WHERE not isnull(fm.unique_file_id)
120 |             GROUP BY fm.unique_file_id)
121 |             ON DUPLICATE KEY UPDATE score = score
122 |         """, (max_count,))
123 |         self.cnx.commit()
124 |         later = time.time()
125 |         #print "Scores updated in {} secs".format(later - now)
126 |         cursor.close()
127 | 
128 |     def discover_unique_names(self, source):
129 |         """usage: unique_names source_name"""
130 | 
131 |         data = self.get_unique_names(source)
132 | 
133 |         if data is not None:
134 |             for (file, dir) in data:
135 |                 print "Unique file %s %s" % (file, dir)
136 | 
137 | 
138 |     def get_unique_names(self, source):
139 |         """Returns (file_name, full_path) rows for files whose name appears exactly once on the given source"""
140 | 
141 |         #creates the basic tables if they do not exist
142 |         self.build()
143 | 
144 |         cursor = self.cnx.cursor()
145 | 
146 |         src_info = core.get_source_info(self.cnx, source)
147 | 
148 |         if src_info is None:
149 |             print "Error: Source {} not found".format(source)
150 |             return
151 | 
152 |         #query = """
153 |         #    SELECT fm.file_name, up.full_path
154 |         #    FROM file_metadata fm
155 |         #    LEFT JOIN FileNameFilter_unique_name fnfun
156 |         #    ON fnfun.file_name = fm.file_name
157 |         #    AND fnfun.unique_path_id = fm.unique_path_id
158 |         #    LEFT JOIN unique_path up
159 |         #    ON up.id = fm.unique_path_id
160 |         #    WHERE not isnull(fm.unique_file_id)
161 |         #    AND fnfun.count = 1
162 |         #    AND fm.source_id = {}
163 |         #""".format(src_info.source_id)
164 |         cursor.execute("""
165 |             SELECT fm.file_name, up.full_path
166 |             FROM file_metadata fm
167 |             LEFT JOIN FileNameFilter_unique_name fnfun
168 |             ON fnfun.file_name = fm.file_name
169 |             AND fnfun.unique_path_id = fm.unique_path_id
170 |             LEFT JOIN unique_path up
171 |             ON up.id = fm.unique_path_id
172 |             WHERE not isnull(fm.unique_file_id)
173 |             AND fnfun.count = 1
174 |             AND fm.source_id = %s
175 |         """, (src_info.source_id,))
176 |         data = cursor.fetchall()
177 | 
178 |         cursor.close()
179 |         return data
180 | 
181 |     def run_survey(self, source_name):
182 | 
183 |         resources = "resources"
184 |         survey_file = "survey.html"
185 |         survey_dir = "survey_{}_{}".format(self.name, source_name)
186 | 
187 |         resource_dir = os.path.join(survey_dir, resources)
188 |         html_file = os.path.join(survey_dir, survey_file)
189 | 
190 |         try:
191 |             shutil.rmtree(survey_dir)
192 |         except:
193 |             pass
194 | 
195 |         os.mkdir(survey_dir)
196 |         os.mkdir(resource_dir)
197 | 
198 |         results = self.get_unique_names(source_name)
199 | 
200 |         with open(html_file, 'w') as f:
201 | 
202 |             f.write("""
203 |                 <html>
204 |                 <head>
205 |                 <title>FileNameFilter Survey</title>
206 |                 </head>
207 |                 <body>
208 |                 <h1>FileNameFilter Snapshot</h1>
209 |             """)
210 |             f.write("<h2>One Timers in Directories</h2>")
211 |             f.write("<table>")
212 |             f.write("<tr>")
213 |             f.write("<th>Parent Path</th>")
214 |             f.write("<th>Filename</th></tr>")
215 |             i = 0
216 |             lr = len(results)
217 |             for (b,a) in results:
218 |                 if i == lr - 1:
219 |                     f.write("<tr>")
220 |                     f.write("<td>{}</td><td>{}</td></tr>".format(a, b))
221 |                 else:
222 |                     f.write("<tr><td>{}</td><td>{}</td></tr>".format(a, b))
223 |                 i += 1
224 |             f.write("</table>")
225 |             f.write("</body></html>")
226 | 
227 |         return survey_dir
228 | 
--------------------------------------------------------------------------------
/redwood/helpers/core.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #    http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | 
18 | """
19 | Created on 19 October 2013
20 | @author: Lab41
21 | 
22 | This module contains core helper functions for Redwood
23 | """
24 | 
25 | import sys
26 | import os
27 | import inspect
28 | import time
29 | from collections import namedtuple
30 | from redwood.filters.redwood_filter import RedwoodFilter
31 | from redwood.filters import filter_list
32 | from redwood.foundation.prevalence import PrevalenceAnalyzer
33 | 
34 | SourceInfo = namedtuple('SourceInfo', 'source_id source_name os_id os_name date_acquired')
35 | 
36 | 
37 | def get_filter_by_name(filter_name):
38 |     """
39 |     Fetches an instance of a loaded filter by its name
40 | 
41 |     :param filter_name: the name of the filter
42 | 
43 |     :return an instance of a loaded filter with name filter_name
44 |     """
45 |     for f in filter_list:
46 |         if f.name == filter_name:
47 |             return f
48 | 
49 |     return None
50 | 
51 | def import_filters(path, cnx):
52 |     """
53 |     Imports filters from an external directory at runtime. Imported filters will be added
54 |     to the global filter_list
55 | 
56 |     :param path: path where the modules reside
57 |     :param cnx: an instance of the connection
58 | 
59 |     :return list of newly added filter instances
60 |     """
61 | 
62 |     new_filters = list()
63 | 
64 | 
65 |     print "Importing specified filters from {}".format(path)
66 | 
67 |     #make sure path exists
68 |     if not os.path.isdir(path):
69 |         print "Error: path {} does not exist".format(path)
70 |         return None
71 | 
72 |     #add the path to the PYTHONPATH
73 |     sys.path.append(path)
74 | 
75 |     #acquire list of files in the path
76 |     mod_list = os.listdir(path)
77 | 
78 |     for f in mod_list:
79 | 
80 |         #continue if it is not a python file
81 |         if f[-3:] != '.py':
82 |             continue
83 | 
84 |         #get module name by removing extension
85 |         mod_name = os.path.basename(f)[:-3]
86 | 
87 |         #import the module
88 |         module = __import__(mod_name, globals(), locals())
89 |         for name,cls in inspect.getmembers(module):
90 |             #check the name comparison too since RedwoodFilter is a subclass of itself
91 |             if inspect.isclass(cls) and issubclass(cls, RedwoodFilter) and name != "RedwoodFilter":
92 |                 instance = cls()
93 |                 #append an instance of the class to the filter_list
94 |                 instance.cnx = cnx
95 |                 filter_list.append(instance)
96 |                 new_filters.append(instance)
97 |                 print name
98 | 
99 |     return new_filters
100 | 
101 | def get_source_info(cnx, source_name):
102 |     """
103 |     Retrieves a SourceInfo instance given a source name
104 | 
105 |     :param cnx: an instance of the connection
106 |     :param source_name: name of the media source
107 | 
108 |     :return SourceInfo instance or None if not found
109 |     """
110 |     cursor = cnx.cursor()
111 | 
112 |     #query = """
113 |     #    SELECT media_source.id as source_id,
114 |     #           media_source.name as source_name,
115 |     #           os.id as os_id, os.name as os_name,
116 |     #           media_source.date_acquired as date_acquired
117 |     #    FROM media_source
118 |     #    LEFT JOIN os
119 |     #    ON media_source.os_id = os.id
120 |     #    WHERE media_source.name = "{}";""".format(source_name)
121 | 
122 |     cursor.execute("""
123 |         SELECT media_source.id as source_id,
124 |                media_source.name as source_name,
125 |                os.id as os_id, os.name as os_name,
126 |                media_source.date_acquired as date_acquired
127 |         FROM media_source
128 |         LEFT JOIN os
129 |         ON media_source.os_id = os.id
130 |         WHERE media_source.name = %s;""", (source_name,))
131 |     r = cursor.fetchone()
132 | 
133 |     if r is None:
134 |         return r
135 | 
136 |     return SourceInfo(r[0], r[1], r[2], r[3], r[4])
137 | 
138 | def get_malware_reputation_threshold(cnx):
139 |     """
140 |     Retrieves the average reputation of all confirmed malware
141 | 
142 |     :param cnx: mysql connection instance
143 | 
144 |     :return average reputation score
145 |     """
146 | 
147 |     cursor = cnx.cursor()
148 | 
149 |     query = """
150 |         select AVG(unique_file.reputation)
151 |         from validator_0 left join unique_file on validator_0.id=unique_file.id
152 |         LEFT JOIN file_metadata ON file_metadata.unique_file_id=unique_file.id where validator_0.status=3;
153 |     """
154 | 
155 |     cursor.execute(query)
156 | 
157 |     r = cursor.fetchone()
158 | 
159 |     if r is None:
160 |         return r
161 | 
162 |     return r[0]
163 | 
164 | def get_num_systems(cnx, os_name_or_id):
165 |     """
166 |     Retrieves the number of unique media sources for a given os
167 | 
168 |     :param cnx: mysql connection instance
169 |     :param os_name_or_id: os name or os id
170 | 
171 |     :return the number of systems found or None if the os does not exist
172 |     """
173 | 
174 |     cursor = cnx.cursor()
175 | 
176 | 
177 |     try:
178 |         val = int(os_name_or_id)
179 | 
180 |         cursor.execute("""
cursor.execute(""" 181 | SELECT COUNT(media_source.id) FROM os 182 | LEFT JOIN media_source ON os.id = media_source.os_id 183 | WHERE os.id = %s 184 | GROUP BY os.id 185 | """, (val,)) 186 | 187 | except Exception as e: 188 | cursor.execute(""" 189 | SELECT COUNT(media_source.id) FROM os 190 | LEFT JOIN media_source ON os.id = media_source.os_id 191 | WHERE os.id = (SELECT DISTINCT os.id from os where os.name = %s) GROUP BY os.id""", (os_name_or_id,)) 192 | 193 | r = cursor.fetchone() 194 | 195 | if r is None: 196 | return None 197 | 198 | return r[0] 199 | 200 | 201 | def update_analyzers(cnx, sources): 202 | """ 203 | Runs Analyzers and Filters against each source in the source_os_list, updating the 204 | approriate tables 205 | 206 | :param sources: list of SourceInfo instances 207 | """ 208 | print "...Beginning Analyzers and Filters for inputted sources" 209 | 210 | start_time = time.time() 211 | 212 | #now let's run the prevalence analyzer 213 | pu = PrevalenceAnalyzer(cnx) 214 | pu.update(sources) 215 | 216 | elapsed_time = time.time() - start_time 217 | print "...completed analyzers on inputed sources in {}".format(elapsed_time) 218 | 219 | 220 | def update_filters(cnx, sources): 221 | 222 | start_time = time.time() 223 | 224 | #set the cnx for each plugin 225 | for p in filter_list: 226 | p.cnx = cnx 227 | 228 | for source in sources: 229 | #for source in sources: 230 | print "==== Beginning filter analysis of {} ====".format(source.source_name) 231 | for p in filter_list: 232 | p.update(source.source_name) 233 | 234 | elapsed_time = time.time() - start_time 235 | print "...completed filter analysis on inputted sources in {}".format(elapsed_time) 236 | 237 | 238 | 239 | def table_exists(cnx, name): 240 | """ 241 | Checks if the mysql table with exists 242 | 243 | :param cnx: mysql connection instance 244 | :param name: table name 245 | 246 | :return True if exists, else False 247 | """ 248 | cursor = cnx.cursor() 249 | result = None 250 | try: 251 | cursor.execute("""select COUNT(id) from %s""", (name,)) 252 | result = cursor.fetchone() 253 | cursor.close() 254 | except Exception as err: 255 | print err 256 | pass 257 | 258 | 259 | if(result == None or result[0] == 0): 260 | return False 261 | else: 262 | return True 263 | 264 | def get_all_sources(cnx): 265 | """ 266 | Returns a list of all sources currently loaded into Redwood 267 | 268 | :param cnx: mysql connection instance 269 | """ 270 | 271 | cursor = cnx.cursor() 272 | result = list() 273 | try: 274 | cursor.execute("""SELECT media_source.id, media_source.name, os.id, os.name, date_acquired FROM media_source 275 | INNER JOIN os 276 | ON media_source.os_id = os.id 277 | """) 278 | result = cursor.fetchall() 279 | cursor.close() 280 | except Exception as err: 281 | print err 282 | return None 283 | 284 | sources = list() 285 | for r in result: 286 | sources.append(SourceInfo(r[0],r[1], r[2],r[3],r[4])) 287 | 288 | return sources 289 | 290 | def get_reputation_by_source(cnx, source_name): 291 | """ 292 | Returns a list of scores for every file on the source 293 | 294 | :param cnx: myqsl connection instance 295 | """ 296 | 297 | cursor = cnx.cursor() 298 | result = list() 299 | 300 | try: 301 | cursor.execute("""SELECT ROUND(unique_file.reputation, 2), 302 | COUNT(DISTINCT unique_file.id) FROM unique_file 303 | INNER JOIN file_metadata 304 | ON unique_file.id = file_metadata.unique_file_id 305 | INNER JOIN media_source 306 | ON file_metadata.source_id = media_source.id 307 | WHERE media_source.name = %s 308 | GROUP BY 
310 |             """, (source_name,))
311 |         result = cursor.fetchall()
312 |         cursor.close()
313 |     except Exception as err:
314 |         print err
315 |         return None
316 | 
317 |     return result
318 | 
--------------------------------------------------------------------------------
/redwood/io/csv_importer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #    http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | 
18 | """
19 | Created on 19 October 2013
20 | @author: Lab41
21 | """
22 | 
23 | 
24 | import sys
25 | import os
26 | import shutil
27 | import getopt
28 | import string
29 | import time
30 | from datetime import datetime
31 | import MySQLdb
32 | from redwood.helpers.core import SourceInfo
33 | from redwood.foundation.prevalence import PrevalenceAnalyzer
34 | from redwood.filters import filter_list
35 | import redwood.helpers.core as core
36 | from redwood.foundation.report import Report
37 | 
38 | def db_load_file(connection, path):
39 |     """
40 |     Loads the file located at the given path into the database
41 | 
42 |     :param connection: connection object for the database
43 |     :param path: path where the file is located
44 | 
45 |     :return SourceInfo representing the inputted source
46 |     """
47 | 
48 |     try:
49 |         with open(path): pass
50 |     except IOError:
51 |         print '*** Error: File \'{}\' does not exist'.format(path)
52 |         return
53 | 
54 | 
55 |     filename = os.path.basename(path)
56 |     fields = string.split(filename, '--')
57 | 
58 |     if(len(fields) != 3):
59 |         print "*** Error: Improper naming scheme with {} fields".format(len(fields))
60 |         print path
61 |         print fields
62 |         return
63 | 
64 |     cursor = connection.cursor()
65 |     os_id = None
66 | 
67 |     source_name = fields[2]
68 |     os_name = fields[1]
69 | 
70 |     print "=== Loading \"{}\" into database ===".format(source_name)
71 |     #transaction for adding to media and os tables. Both succeed or both fail
72 |     try:
73 | 
74 |         data_os = {
75 |             'name':os_name,
76 |         }
77 | 
78 |         #add os
79 |         add_os = ("INSERT INTO `os` (name) VALUES('%(name)s') ON DUPLICATE KEY UPDATE id=id") % data_os
80 |         cursor.execute(add_os)
81 |         connection.commit()
82 | 
83 |     except MySQLdb.Error, e:
84 |         if connection:
85 |             connection.rollback()
86 |         print "*** Error %d: %s" % (e.args[0],e.args[1])
87 |         return
88 | 
89 |     #now get the os_id for the os_name
90 |     #query = "SELECT os.id FROM os WHERE os.name = \"{}\"".format(os_name)
91 |     cursor.execute("""SELECT os.id FROM os WHERE os.name = %s""", (os_name,))
92 |     r = cursor.fetchone()
93 | 
94 |     #check the row before indexing into it, in case the os lookup failed
95 |     if r is None:
96 |         print "*** Error: Unable to find corresponding os"
97 |         return
98 | 
99 |     os_id = r[0]
100 | 
101 |     try:
102 |         date_object = datetime.strptime(fields[0], '%Y-%m-%d')
103 | 
104 |         data_media_source = {
105 | 
106 |             'name':fields[2],
107 |             'date_acquired':date_object.isoformat(),
108 |             'os_id':os_id,
109 |         }
110 | 
111 |         #add the media source
112 |         add_media_source = ("INSERT INTO `media_source` (reputation, name, date_acquired, os_id) "
113 |                             "VALUES(0, '%(name)s', '%(date_acquired)s', '%(os_id)s') ") % data_media_source
114 | 
115 |         cursor.execute(add_media_source)
116 |         connection.commit()
117 |         source_id = cursor.lastrowid
118 | 
119 |     except MySQLdb.Error, e:
120 |         if connection:
121 |             connection.rollback()
122 |         print "*** Error %d: %s" % (e.args[0],e.args[1])
123 |         return
124 | 
125 |     #the media source id was already captured right after the insert above
126 |     media_source_id = source_id
127 | 
128 |     path = path.replace('\\','\\\\')
129 |     #load raw csv into the staging table from the client
130 |     #add_staging_table = ("""LOAD DATA LOCAL INFILE '{}' INTO TABLE `staging_table`
131 |     #    FIELDS TERMINATED BY ',' ENCLOSED BY '\"' LINES TERMINATED BY '\\n'
132 |     #    IGNORE 1 LINES
133 |     #    (global_file_id, parent_id, dirname, basename,contents_hash,dirname_hash,filesystem_id,device_id,
134 |     #    attributes,user_owner,group_owner,size,@created_param,@accessed_param,@modified_param,@changed_param,
135 |     #    @user_flags,links_to_file, @disk_offset, @entropy, @file_content_status, @extension, file_type)
136 |     #    SET created = FROM_UNIXTIME(@created_param),
137 |     #    last_accessed = FROM_UNIXTIME(@accessed_param),
138 |     #    last_modified = FROM_UNIXTIME(@modified_param),
139 |     #    last_changed = FROM_UNIXTIME(@changed_param),
140 |     #    user_flags = nullif(@user_flags,''), disk_offset = nullif(@disk_offset,''),
141 |     #    entropy=nullif(@entropy,''), file_content_status=nullif(@file_content_status,''),
142 |     #    extension = nullif(@extension,'');""").format(path)
143 | 
144 |     try:
145 | 
146 |         #create the staging table
147 |         query = """
148 |             CREATE TABLE IF NOT EXISTS staging_table (
149 |             global_file_id LONG NOT NULL,
150 |             parent_id LONG NULL,
151 |             dirname VARCHAR(4096) NULL,
152 |             basename VARCHAR(255) NULL,
153 |             contents_hash CHAR(40) NULL,
154 |             dirname_hash CHAR(40) NULL,
155 |             filesystem_id INT UNSIGNED NULL,
156 |             device_id INT NULL,
157 |             attributes INT NULL,
158 |             user_owner INT NULL,
159 |             group_owner INT NULL,
160 |             size INT UNSIGNED NULL,
161 |             created DATETIME NULL,
162 |             last_accessed DATETIME NULL,
163 |             last_modified DATETIME NULL,
164 |             last_changed DATETIME NULL,
165 |             user_flags INT NULL DEFAULT NULL,
166 |             links_to_file INT NULL,
167 |             disk_offset BIGINT NULL,
168 |             entropy TINYINT NULL,
169 |             file_content_status TINYINT NULL,
170 |             extension VARCHAR(32) NULL,
171 |             file_type VARCHAR(64) NULL,
172 |             INDEX contents_hash_idx (contents_hash ASC),
173 |             INDEX dirname_hash_idx (dirname_hash ASC)
174 |             ) ENGINE=InnoDB;
175 |         """
176 | 
177 |         cursor.execute(query)
178 |         connection.commit()
179 | 
180 |         start_time = time.time()
181 |         cursor.execute("""
182 |             LOAD DATA LOCAL INFILE %s INTO TABLE `staging_table`
183 |             FIELDS TERMINATED BY ','
184 |             ENCLOSED BY '\"' LINES TERMINATED BY '\\n'
185 |             IGNORE 1 LINES
186 |             (global_file_id, parent_id, dirname, basename,contents_hash,dirname_hash,filesystem_id,device_id,
187 |             attributes,user_owner,group_owner,size,@created_param,@accessed_param,@modified_param,@changed_param,
188 |             @user_flags,links_to_file, @disk_offset, @entropy, @file_content_status, @extension, file_type)
189 |             SET created = FROM_UNIXTIME(@created_param),
190 |             last_accessed = FROM_UNIXTIME(@accessed_param),
191 |             last_modified = FROM_UNIXTIME(@modified_param),
192 |             last_changed = FROM_UNIXTIME(@changed_param),
193 |             user_flags = nullif(@user_flags,''),
194 |             disk_offset = nullif(@disk_offset,''),
195 |             entropy=nullif(@entropy,''),
196 |             file_content_status=nullif(@file_content_status,''),
197 |             extension = nullif(@extension,'');""", (path,))
198 |         connection.commit()
199 |         print "...data transfer to staging table in {}".format(time.time() - start_time)
200 |         start_time = time.time()
201 | 
202 |         cursor.callproc('map_staging_table', (media_source_id, os_id))
203 |         cursor.execute("DROP TABLE `staging_table`;")
204 |         connection.commit()
205 |         print "...data written from staging table to main tables in {}".format(time.time() - start_time)
206 |     except Exception as err:
207 |         print "Exception occurred: {}".format(err)
208 |         cursor.close()
209 |         sys.exit(1)
210 | 
211 |     total_time = time.time() - start_time
212 |     print "...completed in {}".format(total_time)
213 |     cursor.close()
214 |     #TODO: just call get source info here
215 |     return SourceInfo(source_id, source_name, os_id, os_name, None)
216 | 
217 | def run(cnx, path):
218 |     """
219 |     Loads all csv files from the path into the database
220 | 
221 |     :param cnx: mysql connection object
222 |     :param path: directory containing csv files or the full path to a csv file
223 |     """
224 |     src_os_list = list()
225 | 
226 |     if path is None:
227 |         print "*** Error: Path is required"
228 |         return
229 | 
230 |     if(os.path.isfile(path)):
231 |         info = db_load_file(cnx, path)
232 |         if info is not None:
233 |             src_os_list.append(info)
234 |     elif(os.path.isdir(path)):
235 |         for r, d, f in os.walk(path):
236 |             #clear the directory list so the walk does not descend into subdirectories
237 |             while len(d) > 0:
238 |                 d.pop()
239 |             for file in f:
240 |                 if not file.startswith('.'):
241 |                     #load each csv using its absolute path
242 |                     info = db_load_file(cnx, os.path.abspath(os.path.join(r, file)))
243 |                     if info is not None:
244 |                         src_os_list.append(info)
245 |     else:
246 |         print 'Please input a valid file or a directory for import'
247 |         return
248 | 
249 |     #update the analyzers and filters
250 |     core.update_analyzers(cnx, src_os_list)
251 |     core.update_filters(cnx, src_os_list)
252 | 
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Redwood documentation build configuration file, created by
4 | # sphinx-quickstart on Mon Sep  9 22:31:44 2013.
5 | #
6 | # This file is execfile()d with the current directory set to its containing dir.
7 | #
8 | # Note that not all possible configuration values are present in this
9 | # autogenerated file.
10 | #
11 | # All configuration values have a default; values that are commented out
12 | # serve to show the default.
13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | sys.path.insert(0, os.path.abspath('../')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | #needs_sphinx = '1.0' 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 28 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'sphinx.ext.doctest', 'sphinx.ext.coverage'] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['_templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | project = u'Redwood' 44 | copyright = u'2013 In-Q-Tel, Inc/Lab41, All Rights Reserved.' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = '0.1.0' 52 | # The full version, including alpha/beta/rc tags. 53 | release = '0.1.0' 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 57 | #language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | #today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | #today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | exclude_patterns = ['_build'] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | #add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | #show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 87 | #modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | html_theme = 'default' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | #html_theme_options = {} 100 | 101 | # Add any paths that contain custom themes here, relative to this directory. 
102 | #html_theme_path = []
103 | 
104 | # The name for this set of Sphinx documents.  If None, it defaults to
105 | # "<project> v<release> documentation".
106 | #html_title = None
107 | 
108 | # A shorter title for the navigation bar.  Default is the same as html_title.
109 | #html_short_title = None
110 | 
111 | # The name of an image file (relative to this directory) to place at the top
112 | # of the sidebar.
113 | #html_logo = None
114 | 
115 | # The name of an image file (within the static path) to use as favicon of the
116 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
117 | # pixels large.
118 | #html_favicon = None
119 | 
120 | # Add any paths that contain custom static files (such as style sheets) here,
121 | # relative to this directory. They are copied after the builtin static files,
122 | # so a file named "default.css" will overwrite the builtin "default.css".
123 | html_static_path = ['_static']
124 | 
125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
126 | # using the given strftime format.
127 | #html_last_updated_fmt = '%b %d, %Y'
128 | 
129 | # If true, SmartyPants will be used to convert quotes and dashes to
130 | # typographically correct entities.
131 | #html_use_smartypants = True
132 | 
133 | # Custom sidebar templates, maps document names to template names.
134 | #html_sidebars = {}
135 | 
136 | # Additional templates that should be rendered to pages, maps page names to
137 | # template names.
138 | #html_additional_pages = {}
139 | 
140 | # If false, no module index is generated.
141 | #html_domain_indices = True
142 | 
143 | # If false, no index is generated.
144 | #html_use_index = True
145 | 
146 | # If true, the index is split into individual pages for each letter.
147 | #html_split_index = False
148 | 
149 | # If true, links to the reST sources are added to the pages.
150 | html_show_sourcelink = True
151 | 
152 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
153 | #html_show_sphinx = True
154 | 
155 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
156 | #html_show_copyright = True
157 | 
158 | # If true, an OpenSearch description file will be output, and all pages will
159 | # contain a <link> tag referring to it.  The value of this option must be the
160 | # base URL from which the finished HTML is served.
161 | #html_use_opensearch = ''
162 | 
163 | # This is the file name suffix for HTML files (e.g. ".xhtml").
164 | #html_file_suffix = None
165 | 
166 | # Output file base name for HTML help builder.
167 | htmlhelp_basename = 'Redwooddoc'
168 | 
169 | 
170 | # -- Options for LaTeX output --------------------------------------------------
171 | 
172 | latex_elements = {
173 | # The paper size ('letterpaper' or 'a4paper').
174 | #'papersize': 'letterpaper',
175 | 
176 | # The font size ('10pt', '11pt' or '12pt').
177 | #'pointsize': '10pt',
178 | 
179 | # Additional stuff for the LaTeX preamble.
180 | #'preamble': '',
181 | }
182 | 
183 | # Grouping the document tree into LaTeX files. List of tuples
184 | # (source start file, target name, title, author, documentclass [howto/manual]).
185 | latex_documents = [
186 |   ('index', 'Redwood.tex', u'Redwood Documentation',
187 |    u'Paul M', 'manual'),
188 | ]
189 | 
190 | # The name of an image file (relative to this directory) to place at the top of
191 | # the title page.
192 | #latex_logo = None
193 | 
194 | # For "manual" documents, if this is true, then toplevel headings are parts,
195 | # not chapters.
196 | #latex_use_parts = False
197 | 
198 | # If true, show page references after internal links.
199 | #latex_show_pagerefs = False
200 | 
201 | # If true, show URL addresses after external links.
202 | #latex_show_urls = False
203 | 
204 | # Documents to append as an appendix to all manuals.
205 | #latex_appendices = []
206 | 
207 | # If false, no module index is generated.
208 | #latex_domain_indices = True
209 | 
210 | 
211 | # -- Options for manual page output --------------------------------------------
212 | 
213 | # One entry per manual page. List of tuples
214 | # (source start file, name, description, authors, manual section).
215 | man_pages = [
216 |     ('index', 'redwood', u'Redwood Documentation',
217 |      [u'Paul M'], 1)
218 | ]
219 | 
220 | # If true, show URL addresses after external links.
221 | #man_show_urls = False
222 | 
223 | 
224 | # -- Options for Texinfo output ------------------------------------------------
225 | 
226 | # Grouping the document tree into Texinfo files. List of tuples
227 | # (source start file, target name, title, author,
228 | #  dir menu entry, description, category)
229 | texinfo_documents = [
230 |   ('index', 'Redwood', u'Redwood Documentation',
231 |    u'Paul M', 'Redwood', 'A project that implements statistical methods for identifying anomalous files.',
232 |    'Miscellaneous'),
233 | ]
234 | 
235 | # Documents to append as an appendix to all manuals.
236 | #texinfo_appendices = []
237 | 
238 | # If false, no module index is generated.
239 | #texinfo_domain_indices = True
240 | 
241 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
242 | #texinfo_show_urls = 'footnote'
243 | 
244 | # If true, do not generate a @detailmenu in the "Top" node's menu.
245 | #texinfo_no_detailmenu = False
246 | 
247 | 
248 | # -- Options for Epub output ---------------------------------------------------
249 | 
250 | # Bibliographic Dublin Core info.
251 | epub_title = u'Redwood'
252 | epub_author = u'Paul M'
253 | epub_publisher = u'Paul M'
254 | epub_copyright = u'2013 In-Q-Tel, Inc/Lab41, All Rights Reserved.'
255 | 
256 | # The language of the text. It defaults to the language option
257 | # or en if the language is not set.
258 | #epub_language = ''
259 | 
260 | # The scheme of the identifier. Typical schemes are ISBN or URL.
261 | #epub_scheme = ''
262 | 
263 | # The unique identifier of the text. This can be a ISBN number
264 | # or the project homepage.
265 | #epub_identifier = ''
266 | 
267 | # A unique identification for the text.
268 | #epub_uid = ''
269 | 
270 | # A tuple containing the cover image and cover page html template filenames.
271 | #epub_cover = ()
272 | 
273 | # A sequence of (type, uri, title) tuples for the guide element of content.opf.
274 | #epub_guide = ()
275 | 
276 | # HTML files that should be inserted before the pages created by sphinx.
277 | # The format is a list of tuples containing the path and title.
278 | #epub_pre_files = []
279 | 
280 | # HTML files that should be inserted after the pages created by sphinx.
281 | # The format is a list of tuples containing the path and title.
282 | #epub_post_files = []
283 | 
284 | # A list of files that should not be packed into the epub file.
285 | #epub_exclude_files = []
286 | 
287 | # The depth of the table of contents in toc.ncx.
288 | #epub_tocdepth = 3
289 | 
290 | # Allow duplicate toc entries.
291 | #epub_tocdup = True
292 | 
293 | # Fix unsupported image types using the PIL.
294 | #epub_fix_images = False
295 | 
296 | # Scale large images.
297 | #epub_max_image_width = 0 298 | 299 | # If 'no', URL addresses will not be shown. 300 | #epub_show_urls = 'inline' 301 | 302 | # If false, no index is generated. 303 | #epub_use_index = True 304 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #Association-Based Data Reduction (REDWOOD) 2 | 3 | Finding the Tree in the Forest 4 | 5 | ![Redwood](https://raw.github.com/Lab41/Redwood/master/images/logo/redwood_logo.png "Redwood") 6 | 7 | 8 |

Redwood is a Python framework intended to identify anomalous files by analyzing the file metadata of a collection of media. Each file analyzed is assigned a score that signals its reputation relative to other files in the system -- the lower a file's reputation score, the more likely that the file is anomalous. The final reputation score of a given file is based on an aggregation of the scores assigned to it by modules that we call "Filters".

9 |

A Filter is a plugin whose functionality is only limited by the creativity of the developer. Redwood can support any number of Filters, so long as a Filter extends the RedwoodFilter class and produces a table assigning a reputation score to each unique file in the system. Much of the Redwood framework is aimed at making the process of adding new Filters to the system as frictionless as possible (see the Filter section below for more information).

10 |

In addition to the Filters, Redwood also provides an effective data model for analyzing and storing file metadata, an API for interacting with that data, a simple shell for executing Redwood commands, and two example Filters (a "prevalence" Filter and a "locality uniqueness" Filter). Though sample Filters are included in the project, ultimately the effectiveness of Redwood will be based on the Filters that you write for the particular anomaly that you are looking for. To that end, Redwood is nothing more than a simple framework for connecting Filters to a well-formed data model.
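
To get a feel for the workflow before diving into setup, here is a hypothetical session in the Redwood Shell's filter mode (the filter indices, filter names, and source name are illustrative -- your loaded filters and sources will differ):

```
redwood-filter$ list
Available Filters
0............prevalence
1............locality_uniqueness
redwood-filter$ show_results 0 top 5 my_source
Fetching top results from my_source for filter prevalence
redwood-filter$ aggregate_scores 0:60 1:40
Aggregating Scores
```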

11 | 
12 | ##Quick Setup
13 | The instructions that follow should get you up and running quickly. Redwood has been tested on OS X and Linux. Windows will likely work with a few changes.
14 | 
15 | #### Stuff to Download
16 | 1. Python 2.7
17 | 2. Python packages
18 |    * SciPy, Matplotlib, and MySQLdb
19 | 3. MySQL Client for your client OS
20 | 4. MySQL Server for the server hosting the DB
21 | 
22 | #### Prep the Database
23 | Redwood uses a MySQL database to store metadata. In order to use Redwood, you will need to first set up your own MySQL DB, then run the following two SQL scripts to create the required tables and subroutines.
24 | 
25 | ```bash
26 | mysql -uyour_db_user -pyour_password -hyour_host -Dyour_database < sql/create_redwood_db.sql
27 | mysql -uyour_db_user -pyour_password -hyour_host -Dyour_database < sql/create_redwood_sp.sql
28 | ```
29 | 
30 | #### Create a config
31 | 
32 | Create a file containing the following configuration information specific to your database
33 | 
34 | ```
35 | [mysqld]
36 | database:your_db_name
37 | host:your_host
38 | username:your_username
39 | password:your_password
40 | ```
41 | 
42 | ## Run Redwood
43 | 
44 | There are two ways that you can run Redwood. If you just want to play with the tool, and maybe create a couple of filters, the "Redwood Shell" method is probably the best choice. If you want to make modifications to the core package and/or create your own UI, then you probably want to use the API. Examples of how to do both are below:
45 | 
46 | #### Using the Redwood Shell
47 | 
48 | ```bash
49 | #append the Redwood directory to the python path
50 | export PYTHONPATH=/path/to/Redwood
51 | #from the Redwood directory run
52 | python bin/redwood /path/to/config
53 | ```
54 | 
55 | #### Using the API to create your Application
56 | This is a brief example of how to use the API to load a media source into the database and then run specific filter functions on that source
57 | 
58 | ```python
59 | import redwood.connection.connect as connect
60 | import redwood.io.csv_importer as loader
61 | import redwood.helpers.core as core
62 | 
63 | #connect to the database
64 | cnx = connect.connect_with_config("my_db.cfg")
65 | 
66 | #load a directory of csv data pulls into the database
67 | loader.run(cnx, "directory_containing_csv_data_pulls")
68 | 
69 | core.import_filters("./Filters", cnx)
70 | 
71 | #grab instances of two specific filters
72 | fp = core.get_filter_by_name("prevalence")
73 | lu = core.get_filter_by_name("locality_uniqueness")
74 | 
75 | #generate a histogram to see distribution of files for that source
76 | fp.discover_histogram_by_source("some_source")
77 | 
78 | #run a survey for a particular source
79 | fp.run_survey("some_source")
80 | ```
81 | 
82 | 
83 | ##Documentation
84 | From the root project directory, run the following:
85 | ```bash
86 | sphinx-apidoc -o docs redwood -F; pushd docs; make html; make man; popd
87 | ```
88 | 
89 | ###Data
90 | 
91 | Redwood currently only loads data from a CSV file with the fields below.
Information about these fields can typically be found in a stat struct or the output of the stat command.
92 | 
93 | |Field Name | Field Description|
94 | |-----------|------------------|
95 | |file_id| Unique id of the file |
96 | |parent_id| file_id of the parent |
97 | |dirname| path excluding filename |
98 | |basename| filename |
99 | |hash| SHA-1 of file contents |
100 | |fs_id| Inode on Linux, or the non-Linux equivalent |
101 | |device| Device node identifier |
102 | |permissions| Permissions of the file |
103 | |uid| User owner of the file |
104 | |gid| Group owner of the file |
105 | |size| Size in bytes |
106 | |create_time | file creation time in seconds since epoch |
107 | |access_time| file last accessed in seconds since epoch |
108 | |mod_time| file last modified in seconds since epoch |
109 | |metadata_change_time| file metadata last changed in seconds since epoch |
110 | |user_flags| user flags |
111 | |links| links to the file |
112 | |disk_offset| disk offset |
113 | |entropy| entropy of the file |
114 | |file_content_status|file content status|
115 | |extensions| file extension if available |
116 | |file_type| file type if auto discovered |
117 | 
118 | 
119 | The **sql/filewalk.py** script will walk an HFS+ file system and (perhaps) other Unix/Linux file systems, collecting the relevant metadata using the stat command. The output will be in the appropriate format for the load_csv command. Note that the importer derives the acquisition date, operating system, and source name from the CSV file name, which must consist of three fields separated by "--", with the date formatted as YYYY-MM-DD (e.g. 2013-10-19--osx--my_laptop). Also note that this script has been optimized for Linux/OS X; it will not work on a Windows system (updates welcome).
120 | 
121 | 
122 | 
123 | ##Redwood Architecture
124 | 
125 | Redwood is composed of 5 core engines, all backed by a MySQL DB
126 | 
127 | 1. Ingestion Engine
128 |   - The ingestion engine is responsible for importing data into the datastore from a metadata source file (currently only supporting csv).
129 | 2. Global Analytics Engine
130 |   - The Global Analytics Engine is responsible for performing analytics on a global scale against all metadata and then providing those results to all filters for subsequent computation in the form of queryable tables. This engine typically conducts time-intensive queries that you only want to perform once per new source. Currently, the only Global Analytics Engine is the "Prevalence" analyzer. This is not to be confused with the prevalence filter, which leverages the tables produced by the Prevalence analyzer.
131 | 3. Filter Engine
132 |   - The Filter Engine has two main responsibilities. The first is to create a table for the reputation scores that it has calculated for each unique file in the database. The second is to optionally provide a series of "Discovery" functions that are associated with the filter scoring yet can be used independently by the end user or developer to discover in more detail why a file has a particular score. For more information, please refer to the "All About Filters" section.
133 | 4. Aggregation Engine
134 |   - The Aggregation Engine is responsible for two main duties: (1) aggregating the scores of each filter into a single reputation score based on some aggregation algorithm, and (2) freezing global reputation scores if the engine deems them either definitely high or definitely low.
135 | 5. Reporting Engine
136 |   - The Reporting Engine is responsible for generating a comprehensive report highlighting user-specified information about the data.
137 | 
138 | 
139 | ##All About Filters
140 | 
141 | ####Summary
142 | Filters are the foundation of file scoring in Redwood. A Filter's central purpose is to create a score for each unique file in the system.
After Redwood runs all of the filters, each unique file should have a score from each filter. Redwood then combines these scores using an aggregation function, so that each unique file ends up with a single reputation score in the unique_file table. Keep in mind that numerous filters can exist in a Redwood project.
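
As a rough illustration of that weighted combination (a simplified sketch, not the shipped Aggregator -- see redwood/foundation/aggregator.py for the real logic), using the percentage weights described under the shell's aggregate_scores command:

```python
#Simplified sketch of weighted score aggregation -- not the shipped Aggregator.
#Weights follow the aggregate_scores convention: percentages that sum to 100.
def aggregate_sketch(filter_scores, weights):
    """filter_scores: one dict per filter mapping unique_file_id -> score;
    weights: one percentage (0-100) per filter."""
    reputation = {}
    for scores, weight in zip(filter_scores, weights):
        for file_id, score in scores.items():
            reputation[file_id] = reputation.get(file_id, 0.0) + score * weight / 100.0
    return reputation

#Three filters weighted 50/30/20, mirroring "aggregate_scores 0:50 1:30 2:20":
#a file scored 0.2, 0.5, and 0.9 ends up with a reputation of 0.43
print aggregate_sketch([{1: 0.2}, {1: 0.5}, {1: 0.9}], [50, 30, 20])
```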
143 | In addition to generating a score for each file, a Filter can optionally create one or more "Discovery" functions. A Discovery function is a function that allows the user of the Filter to explore the data beyond just deriving a score. It is common for a Discovery function to also be used in the calculations for file scoring -- the Redwood model just provides a structured way for the developer to make that function available to the end user.
144 | 
145 | ####Writing your own Filter
146 | Your filter should inherit from the base class RedwoodFilter in redwood.filters.redwood_filter. You must override those functions that raise a "NotImplementedError". To assist in writing your own filter, look at the sample filters (locality_uniqueness and filter_prevalence) in the Filters directory.
147 | 
148 | - If you are using the Redwood Shell, any Filter placed in the Filters directory will be automatically imported into the application.
149 | - All discovery functions should be prefixed with "discover_" in their name so that during introspection a developer knows which functions are intended for discovery
150 | - A Filter is free to create any tables in the database. This can become necessary for efficiently calculating the reputation scores
151 | - The update function must produce (or update if it exists) a table called self.score_table with two columns (id, score) where the id is the unique_file.id of the given file and the score is the calculated score
152 | - The self.cnx instance variable must be set prior to running any of the functions of the filter. The self.cnx is a mysql connection object. Redwood will set the cnx instance if you use its import functions.
153 | 
154 | 
155 | ```python
156 | class YourFilterName(RedwoodFilter):
157 | 
158 |     def __init__(self):
159 |         self.name = "YourFilterName"
160 |         self.score_table = "YourScoreTableName"
161 |         self.cnx = None
162 | 
163 |     def usage(self):
164 |         print "Your usage statement"
165 | 
166 |     def update(self, source_name):
167 |         #code to update all filter tables with source_name data
168 | 
169 |     #survey function
170 |     def run_survey(self, source):
171 |         your code
172 | 
173 |     #build
174 |     def build(self):
175 |         your code
176 | 
177 |     #clean
178 |     def clean(self):
179 |         your code
180 | 
181 |     #discovery functions
182 |     def discover_your_discover_func0(self, arg0, ..., argN):
183 |         your code
184 |     ...
185 |     def discover_your_discover_funcM(self, arg0, ..., argN):
186 |         your code
187 | 
188 | ```
189 | 
190 | ##Screen Shots
191 | Screenshot of the Sample Shell
191 | ![Shell](https://raw.github.com/Lab41/Redwood/master/images/redwood_0.png "Redwood Shell") 192 |
Screenshot of the Filter Options
193 | ![Shell](https://raw.github.com/Lab41/Redwood/master/images/discovery.png "Filter Options") 194 |
Screenshot of the File Distribution discovery function for Filter Prevalence
195 | ![Shell](https://raw.github.com/Lab41/Redwood/master/images/histogram0.png "Prevalence Filter file distribution") 196 |
Screenshot of the discovery function for Locality Uniqueness
197 | ![Clustering](https://raw.github.com/Lab41/Redwood/master/images/clustering.png "Locality Uniqueness Clustering")
198 | 
199 | 
200 | 
201 | 
202 | 
203 | ##Optimizing MySQL Notes
204 | bulk_insert_buffer_size: 8G (set in the [mysqld] section of the MySQL server configuration)
205 | 
206 | [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/Lab41/redwood/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
207 | 
--------------------------------------------------------------------------------
/redwood/foundation/report.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #    http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | 
18 | """
19 | Created on 19 October 2013
20 | @author: Lab41
21 | """
22 | 
23 | import os
24 | import shutil
25 | import math
26 | from redwood.filters import filter_list
27 | from redwood.helpers import core
28 | import matplotlib.pylab as plt
29 | from redwood.foundation.aggregator import Aggregator
30 | 
31 | 
32 | class Report():
33 |     def __init__(self, cnx, source_info):
34 |         self.report_dir = "reports"
35 |         self.cnx = cnx
36 |         self.source = source_info
37 | 
38 |     def run(self, agg_weights=None):
39 | 
40 |         print "Running report survey for: " + self.source.source_name
41 |         print "... aggregating most recent filter scores"
aggregating most recent filter scores" 42 | ag = Aggregator(self.cnx) 43 | ag.aggregate(filter_list, agg_weights) 44 | self.run_filter_survey() 45 | self.generate_report() 46 | 47 | #collects survey reports from each filter and aggregates the results into one central report 48 | def run_filter_survey(self): 49 | print "...Generating Report" 50 | for f in filter_list: 51 | f.cnx = self.cnx 52 | print f.name 53 | path = f.run_survey(self.source.source_name) 54 | try: 55 | shutil.rmtree(self.report_dir + "/" + self.source.source_name + "/filters/" + f.name) 56 | except: 57 | pass 58 | 59 | if path == None: 60 | continue 61 | 62 | shutil.move(path, self.report_dir + "/" + self.source.source_name + "/filters/" + f.name) 63 | 64 | def generate_report(self): 65 | report_dir = "reports/" + self.source.source_name 66 | report_file = self.source.source_name + "_report.html" 67 | html_file = os.path.join(report_dir, report_file) 68 | 69 | score_counts = core.get_reputation_by_source(self.cnx, self.source.source_name) 70 | 71 | bins = [.05,.1,.15,.2,.25,.30,.35,.40,.45,.50,.55,.60,.65,.70,.75,.80,.85,.90,.95,1.00] 72 | scores, counts = zip(*score_counts) 73 | fig = plt.figure() 74 | ax = fig.add_subplot(111, title="Reputation Distribution") 75 | ax.hist(scores, weights=counts, bins = bins) 76 | ax.set_xlabel("Reputation Score") 77 | ax.set_ylabel("File Occurrences") 78 | 79 | threshold = None 80 | #TODO: if you have a truth source, use it here 81 | #if you have a validation engine, use the line below 82 | #threshold = core.get_malware_reputation_threshold(self.cnx) 83 | #print "thres: {}".format(threshold) 84 | #if threshold is not None: 85 | # plt.axvline(x=threshold, color="r", ls='--') 86 | #plt.xticks(bins) 87 | 88 | #for tick in ax.xaxis.get_major_ticks(): 89 | # tick.label.set_fontsize(8) 90 | 91 | hist_reputation = os.path.join(report_dir, "rep.png") 92 | plt.savefig(hist_reputation) 93 | 94 | 95 | table_height = int(math.ceil(len(score_counts) / float(3))) 96 | file_count = 0 97 | for s in score_counts: 98 | file_count += s[1] 99 | 100 | with open(html_file, 'w') as f: 101 | f.write(""" 102 | 103 | 104 | 105 | 106 | 107 | 119 |
120 |

Report for {}

\n""".format(self.source.source_name)) 121 | f.write("\t\t

Source Information

\n") 122 | f.write("\t\t
\n") 123 | f.write("\t\t\t
Acquisition Date: {}
\n".format(self.source.date_acquired)) 124 | f.write("\t\t\t
Operating System: {}
\n".format(self.source.os_name)) 125 | f.write("\t\t\t
File Count: {}
\n".format(file_count)) 126 | f.write("\t\t
\n\t\t
\n") 127 | f.write("\t\t
\n") 128 | f.write("\t\t\n") 129 | f.write("\t\t\t\n") 130 | f.write("\t\t\t\n") 131 | f.write(""" 132 | 133 | 134 | """) 135 | f.write(""" 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | """) 147 | for i in range(0, table_height): 148 | if len(score_counts) == 1: 149 | f.write(""" 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | """.format(score_counts[i][0], score_counts[i][1])) 161 | elif table_height * 2 + i >= len(score_counts): 162 | if i == table_height - 1: 163 | f.write(""" 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | """.format(score_counts[i][0], score_counts[i][1], 175 | score_counts[table_height + i][0], score_counts[table_height + i][1])) 176 | else: 177 | f.write(""" 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | """.format(score_counts[i][0], score_counts[i][1], 186 | score_counts[table_height + i][0], score_counts[table_height + i][1])) 187 | else: 188 | f.write(""" 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | """.format(score_counts[i][0], score_counts[i][1], 197 | score_counts[table_height + i][0], score_counts[table_height + i][1], 198 | score_counts[table_height * 2 + i][0], score_counts[table_height * 2 + i][1])) 199 | f.write(""" 200 |
File Score DistributionReputation Distribution
ScoreCountScoreCountScoreCount
{}{}
{}{}{}{}
{}{}{}{}
{}{}{}{}{}{}
201 |
202 | """) 203 | #This is what the query should look like in production 204 | cursor = self.cnx.cursor() 205 | #cursor.execute(""" 206 | # SELECT file_metadata.file_name AS Filename, 207 | # unique_file.reputation AS Reputation, 208 | # unique_path.full_path As Path, 209 | # unique_file.hash AS Hash 210 | # FROM file_metadata 211 | # INNER JOIN unique_file 212 | # ON file_metadata.unique_file_id = unique_file.id 213 | # INNER JOIN unique_path 214 | # ON file_metadata.unique_path_id = unique_path.id 215 | # WHERE source_id = {} 216 | # ORDER BY unique_file.reputation ASC 217 | # LIMIT 0, 100 218 | # """.format(source.os_id)) 219 | #Use this query if unique_file.reputation is no indexed 220 | cursor.execute(""" 221 | SELECT file_metadata.file_name AS Filename, 222 | unique_file.reputation AS Reputation, 223 | unique_path.full_path As Path, 224 | unique_file.hash AS Hash 225 | FROM file_metadata 226 | INNER JOIN unique_file 227 | ON file_metadata.unique_file_id = unique_file.id 228 | INNER JOIN unique_path 229 | ON file_metadata.unique_path_id = unique_path.id 230 | WHERE source_id = %s 231 | ORDER BY unique_file.reputation ASC 232 | LIMIT 0, 100 233 | """, (self.source.source_id,)) 234 | col_length = len(cursor.description) 235 | field_names = cursor.description 236 | results = cursor.fetchall() 237 | f.write("\t\t
\n") 238 | f.write("\t\t\n") 239 | f.write("\t\t\t\n") 240 | f.write(""" 241 | 242 | """) 243 | for i in range(0, col_length): 244 | if i == 0: 245 | f.write("".format(field_names[i][0])) 246 | elif i == col_length - 1: 247 | f.write("".format(field_names[i][0])) 250 | f.write(""" 251 | 252 | 253 | """) 254 | for row in results: 255 | f.write("") 256 | for l in row: 257 | f.write("".format(l)) 258 | f.write("\n") 259 | f.write("
Lowest Reputation Files (100)
{}{}".format(field_names[i][0])) 248 | else: 249 | f.write("{}
{}
") 260 | f.write("") 261 | f.close() 262 | -------------------------------------------------------------------------------- /redwood/foundation/prevalence.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | """ 19 | Created on 19 October 2013 20 | @author: Lab41 21 | """ 22 | 23 | 24 | class PrevalenceAnalyzer(): 25 | """ 26 | The PrevalenceAnalyzer is a core component of Redwood for determining prevalence 27 | analytics that can then be made available to all filters. 28 | """ 29 | 30 | def __init__(self, cnx): 31 | self.cnx = cnx 32 | 33 | def update(self, sources): 34 | """ 35 | Analyzes all sources from the source_os_list, storing results in the global tables 36 | for prevalence 37 | 38 | :param sources: a list of SourceInfo instances containing information about the sources. 39 | """ 40 | self.build() 41 | 42 | print "[+] Conducting global analysis for prevalence" 43 | 44 | cursor = self.cnx.cursor() 45 | #iterate through each of the new sources, updating the prevalence table accordingly 46 | for source in sources: 47 | print source.source_name 48 | #will need to fetch the number of systems first for the given os 49 | #query = """ 50 | # select COUNT(os.name) from os LEFT JOIN media_source ON(os.id = media_source.os_id) 51 | # where os.id = {} GROUP BY os.name 52 | #""".format(source.os_id) 53 | 54 | cursor.execute(""" 55 | select COUNT(os.name) from os 56 | LEFT JOIN media_source 57 | ON(os.id = media_source.os_id) 58 | where os.id = %s GROUP BY os.name 59 | """, (source.os_id,)) 60 | num_systems = cursor.fetchone()[0] 61 | 62 | #this query will either insert a new entry into the table or update an existing ones 63 | #This will only get prevalence of files, NOT directories since all directories have the same zero 64 | #contents hash. 
65 |             #query = """
66 |             #    INSERT INTO global_file_prevalence(unique_file_id, count, num_systems, os_id)
67 |             #    SELECT t.unique_file_id, COUNT(unique_file_id) as count, t.num_systems, t.os_idd from
68 |             #    (SELECT DISTINCT unique_file_id, media_source.id as src, s.os_idd, num_systems
69 |             #    from file_metadata JOIN media_source ON (file_metadata.source_id = media_source.id)
70 |             #    LEFT JOIN( select os.id as os_idd, os.name as os, COUNT(os.name) as num_systems
71 |             #    from os LEFT JOIN media_source ON(os.id = media_source.os_id)
72 |             #    WHERE os.id = {} GROUP BY os.name ) s
73 |             #    ON (s.os_idd = file_metadata.os_id) where media_source.id = {} AND file_metadata.unique_file_id is not null) t
74 |             #    GROUP BY t.os_idd, t.unique_file_id
75 |             #    ON DUPLICATE KEY UPDATE count=count+1
76 |             #""".format(source.os_id, source.source_id)
77 | 
78 |             cursor.execute("""
79 |                 INSERT INTO global_file_prevalence(unique_file_id, count, num_systems, os_id)
80 |                 SELECT t.unique_file_id, COUNT(unique_file_id)
81 |                 as count, t.num_systems, t.os_idd from
82 |                 (SELECT DISTINCT unique_file_id, media_source.id
83 |                 as src, s.os_idd, num_systems
84 |                 from file_metadata JOIN media_source
85 |                 ON (file_metadata.source_id = media_source.id)
86 |                 LEFT JOIN(select os.id as os_idd, os.name
87 |                 as os, COUNT(os.name) as num_systems
88 |                 from os LEFT JOIN media_source
89 |                 ON(os.id = media_source.os_id)
90 |                 WHERE os.id = %s GROUP BY os.name) s
91 |                 ON (s.os_idd = file_metadata.os_id)
92 |                 where media_source.id = %s
93 |                 AND file_metadata.unique_file_id is not null) t
94 |                 GROUP BY t.os_idd, t.unique_file_id
95 |                 ON DUPLICATE KEY UPDATE count=count+1
96 |                 """, (source.os_id, source.source_id,))
97 | 
98 |             #TODO: use a local variable for num_systems
99 |             #query = """
100 |             #    UPDATE global_file_prevalence SET num_systems = {}, average = (SELECT count/num_systems) where os_id = {}
101 |             #""".format(num_systems, source.os_id)
102 | 
103 |             cursor.execute("""
104 |                 UPDATE global_file_prevalence
105 |                 SET num_systems = %s, average =
106 |                 (SELECT count/num_systems) where os_id = %s
107 |                 """, (num_systems, source.os_id,))
108 | 
109 |             #get the prevalence of directories
110 |             #query = """
111 |             #    INSERT INTO global_dir_prevalence (unique_path_id, count, num_systems, os_id)
112 |             #    SELECT unique_path.id as path_id, COUNT(file_metadata.id) as count, t.num_systems, file_metadata.os_id
113 |             #    from unique_path LEFT JOIN file_metadata
114 |             #    ON file_metadata.unique_path_id = unique_path.id LEFT JOIN
115 |             #    (SELECT os.id as os_i, COUNT(media_source.id) as num_systems from os
116 |             #    LEFT JOIN media_source ON os.id = media_source.os_id
117 |             #    GROUP BY os.id) as t ON (file_metadata.os_id = t.os_i)
118 |             #    where file_metadata.file_name = '/' AND file_metadata.source_id = {}
119 |             #    GROUP BY file_metadata.os_id, unique_path.id
120 |             #    ON DUPLICATE KEY UPDATE count=count+1
121 |             #""".format(source.source_id)
122 | 
123 |             cursor.execute("""
124 |                 INSERT INTO global_dir_prevalence (unique_path_id, count, num_systems, os_id)
125 |                 SELECT unique_path.id as path_id,
126 |                 COUNT(file_metadata.id)
127 |                 as count, t.num_systems, file_metadata.os_id
128 |                 from unique_path LEFT JOIN file_metadata
129 |                 ON file_metadata.unique_path_id = unique_path.id
130 |                 LEFT JOIN (SELECT os.id as os_i,
131 |                 COUNT(media_source.id)
132 |                 as num_systems from os
133 |                 LEFT JOIN media_source
134 |                 ON os.id = media_source.os_id
135 |                 GROUP BY os.id)
136 |                 as t ON (file_metadata.os_id = 
t.os_i)
137 |                 where file_metadata.file_name = '/'
138 |                 AND file_metadata.source_id = %s
139 |                 GROUP BY file_metadata.os_id, unique_path.id
140 |                 ON DUPLICATE KEY UPDATE count=count+1
141 |                 """, (source.source_id,))
142 | 
143 |             #query = """
144 |             #    UPDATE global_dir_prevalence SET num_systems = {}, average = (SELECT count/num_systems) where os_id = {}
145 |             #""".format(num_systems, source.os_id)
146 | 
147 |             cursor.execute("""
148 |                 UPDATE global_dir_prevalence
149 |                 SET num_systems = %s,
150 |                 average = (SELECT count/num_systems)
151 |                 where os_id = %s
152 |                 """, (num_systems, source.os_id,))
153 | 
154 |         self.cnx.commit()
155 | 
156 |         #TODO: There should be a better way to do this
157 |         print "[+] Rebuilding the aggregated prevalence table for directories"
158 | 
159 |         if len(sources) == 0:
160 |             return
161 | 
162 |         cursor.execute("DROP TABLE IF EXISTS global_dir_combined_prevalence")
163 | 
164 |         self.cnx.commit()
165 | 
166 |         query = """
167 |             CREATE TABLE IF NOT EXISTS global_dir_combined_prevalence (
168 |             unique_path_id INT UNSIGNED NOT NULL,
169 |             average DOUBLE NOT NULL DEFAULT .5,
170 |             PRIMARY KEY(unique_path_id),
171 |             CONSTRAINT fk_unique_path_idx3 FOREIGN KEY(unique_path_id)
172 |             REFERENCES unique_path (id)
173 |             ON DELETE NO ACTION ON UPDATE NO ACTION
174 |             ) ENGINE = InnoDB;
175 |         """
176 | 
177 |         cursor.execute(query)
178 |         self.cnx.commit()
179 | 
180 |         query = """
181 |             INSERT INTO global_dir_combined_prevalence
182 |             SELECT unique_path_id, avg(average) FROM file_metadata
183 |             INNER JOIN global_file_prevalence ON file_metadata.unique_file_id = global_file_prevalence.unique_file_id
184 |             where file_metadata.file_name != '/' GROUP BY unique_path_id
185 |         """
186 | 
187 |         cursor.execute(query)
188 |         self.cnx.commit()
189 | 
190 |     def clean(self):
191 |         """
192 |         Removes all tables created by this analyzer
193 |         """
194 | 
195 |         cursor = self.cnx.cursor()
196 |         cursor.execute("DROP TABLE IF EXISTS global_file_prevalence")
197 |         cursor.execute("DROP TABLE IF EXISTS global_dir_prevalence")
198 |         cursor.execute("DROP TABLE IF EXISTS global_dir_combined_prevalence")
199 |         self.cnx.commit()
200 | 
201 | 
202 |     def build(self):
203 |         """
204 |         Builds all required tables
205 |         """
206 |         cursor = self.cnx.cursor()
207 | 
208 |         query = """
209 |             CREATE TABLE IF NOT EXISTS global_file_prevalence (
210 |             unique_file_id BIGINT UNSIGNED NOT NULL,
211 |             average DOUBLE NOT NULL DEFAULT .5,
212 |             count INT NOT NULL DEFAULT 0,
213 |             num_systems INT NOT NULL DEFAULT 0,
214 |             os_id INT UNSIGNED NOT NULL,
215 |             PRIMARY KEY(unique_file_id, os_id),
216 |             INDEX idx_fp_average (average) USING BTREE,
217 |             INDEX fk_unique_file_idx1 (unique_file_id),
218 |             INDEX fk_os_id_idx1 (os_id),
219 |             CONSTRAINT fk_unique_file_idx1 FOREIGN KEY(unique_file_id)
220 |             REFERENCES unique_file (id)
221 |             ON DELETE NO ACTION ON UPDATE NO ACTION,
222 |             CONSTRAINT fk_os_id_idx1 FOREIGN KEY(os_id)
223 |             REFERENCES os (id)
224 |             ON DELETE NO ACTION ON UPDATE NO ACTION
225 |             ) ENGINE = InnoDB;
226 |         """
227 | 
228 |         cursor.execute(query)
229 | 
230 |         query = """
231 |             CREATE TABLE IF NOT EXISTS global_dir_prevalence (
232 |             unique_path_id INT UNSIGNED NOT NULL,
233 |             average DOUBLE NOT NULL DEFAULT .5,
234 |             count INT NOT NULL DEFAULT 0,
235 |             num_systems INT NOT NULL DEFAULT 0,
236 |             os_id INT UNSIGNED NOT NULL,
237 |             PRIMARY KEY(unique_path_id, os_id),
238 |             INDEX fk_unique_path_idx1 (unique_path_id),
239 |             INDEX fk_os_id_idx2 (os_id),
240 |             CONSTRAINT fk_unique_path_idx2 FOREIGN KEY(unique_path_id)
241 |             REFERENCES unique_path (id)
242 |             ON 
DELETE NO ACTION ON UPDATE NO ACTION, 243 | CONSTRAINT fk_os_id_idx2 FOREIGN KEY(os_id) 244 | REFERENCES os (id) 245 | ON DELETE NO ACTION ON UPDATE NO ACTION 246 | ) ENGINE = InnoDB; 247 | """ 248 | 249 | cursor.execute(query) 250 | 251 | self.cnx.commit() 252 | cursor.close() 253 | -------------------------------------------------------------------------------- /images/logo/license/SIL_open_font_license.txt: -------------------------------------------------------------------------------- 1 | SIL Open Font License (OFL) 2 | 3 | General Information 4 | 5 | Text of the SIL Open Font License 6 | 7 | The OFL FAQ 8 | 9 | Fonts licensed via the OFL 10 | 11 | OFL Graphics 12 | 13 | 14 | Overview 15 | Documents 16 | Current version - 1.1 17 | Translations 18 | Using the OFL 19 | History 20 | Community review 21 | OFL fonts 22 | Details and rationale 23 | FLOSS-friendliness 24 | The 4 FSF Freedoms 25 | DFSG compatibility 26 | OSD compatibility 27 | "Human readable" version and visual representation 28 | Terminology 29 | Visual representation 30 | Attribution 31 | Share Alike 32 | Embedding 33 | DerivativeRenaming 34 | BundlingWhenSelling 35 | 36 | 37 | 38 | New version of the OFL-FAQ available: version 1.1-update3 39 | 40 | There is a new version of the OFL-FAQ (version 1.1-update3) available based on feedback from the wider open font design community. Various sections have been clarified but the main changes are mostly related to web font use and modification. There is also a separate discussion paper on Web Font and Reserved Font Names. 41 | 42 | Overview 43 | 44 | 45 | 46 | The SIL Open Font License (OFL) is a free, libre and open source license specifically designed for fonts and related software based on our experience in font design and linguistic software engineering. 47 | 48 | The OFL provides a legal framework and infrastructure for worldwide development, sharing and improvement of fonts and related software in a collaborative manner. It enables font authors to release their work under a common license that allows use, bundling, modification and redistribution. It encourages shared value, is not limited to any specific computing platform or environment, and can be used by other organizations or individuals. 49 | 50 | The OFL meets the specific needs of typographic design and engineering as well as the gold standards of the FLOSS (Free/Libre and Open Source Software) community, namely the cultural values and guidelines from the FSF 1, the Debian Free Software Guidelines2, as well as the Open Source Definition3. It draws inspiration from concepts and elements found in other licenses, but our improvements in the specific area of fonts have made the licensing model work better than other approaches currently in use. 51 | 52 | SIL International serves language communities worldwide, building their capacity for sustainable language development, by means of research, translation, training and materials development. We have been thinking about more open and participative models for a while, for example through our partnerships with UNESCO (Initiative B@bel) and our work on the Gentium typeface. See www.sil.org/resources/software_fonts for a detailed list of free/libre and open source software resources provided by SIL. 
53 | 54 | We want to: 55 | 56 | enable others to participate in our projects 57 | enable others to cater to needs for which we don't have the resources 58 | share our wealth of knowledge and experience in the area of writing systems and pass on our tools 59 | equip the community to meet its font needs 60 | We serve the peoples of the world without regard to their material wealth, so we are grateful to those that do fund our work. Please visit Donate to SIL International for information on supporting our efforts. 61 | 62 | Documents 63 | 64 | We have gone through a lot of effort to make our license readable and easily understood by users, designers and software developers as well as package maintainers and distributors. To make the OFL even more human-readable, we have provided a FAQ (Frequently Asked Questions) to help everyone understand the intent and the practical aspects of using the license itself. Although it already covers many items, the FAQ will grow as needed. Please let us know if you have more questions. 65 | 66 | Current version - 1.1 67 | We recommend all authors use version 1.1 of the OFL, but version 1.0 is given here for reference. A full list of changes from 1.0 to 1.1 can be found on the OFL Review page. The most important change for authors is that no font names are reserved by default. Reserved Font Names must be explicitly listed alongside the copyright statement in the OFL header. 68 | 69 | Format OFL OFL-FAQ 70 | web (html) OFL 1.1 OFL-FAQ 1.1-update3 71 | plain text 72 | OFL Plaintext 73 | Nicolas Spalinger & Victor Gaultney, 2007-02-26 74 | Download "OFL.txt", Text document, 5KB [45809 downloads] 75 | OFL-FAQ Plaintext (1.1-update3) 76 | Nicolas Spalinger & Victor Gaultney, 2013-09-19 77 | Download "OFL-FAQ.txt", Text document, 57KB [19412 downloads] 78 | OFL 1.1 Documents 79 | 80 | Format OFL OFL-FAQ 81 | web (html) OFL 1.0 OFL-FAQ 1.0 82 | plain text 83 | OFL 1.0 Plaintext 84 | Nicolas Spalinger & Victor Gaultney, 2005-11-22 85 | Download "OFL10.txt", Text document, 4KB [10968 downloads] 86 | OFL-FAQ 1.0 Plaintext 87 | Nicolas Spalinger & Victor Gaultney, 2005-11-22 88 | Download "ofl-faq10.txt", Text document, 18KB [8591 downloads] 89 | OFL 1.0 Documents (for reference only) 90 | 91 | Translations 92 | We also recognise the need for people who are not familiar with English to be able to understand the OFL and this FAQ better - in their own language. If you are an experienced translator, you are very welcome to help by translating the OFL and its FAQ so that designers and users in your language community can understand the license better. But only the original English version of the license has legal value and has been approved by the community. Translations do not count as legal substitutes and should only serve as a way to explain the original license. SIL - as the author and steward of the license for the community at large - does not approve any translation of the OFL as legally valid because even small translation ambiguities could be abused and create problems. 93 | 94 | We give permission to publish unofficial translations into other languages provided that they comply with the following guidelines: 95 | 96 | 1) Put the following disclaimer in both English and the target language stating clearly that the translation is unofficial: 97 | 98 | "This is an unofficial translation of the SIL Open Font License into $language. It was not published by SIL International, and does not legally state the distribution terms for fonts that use the OFL. 
A release under the OFL is only valid when using the original English text. 99 | 100 | However, we recognize that this unofficial translation will help users and designers not familiar with English to understand the SIL OFL better and make it easier to use and release font families under this collaborative font design model. We encourage designers who consider releasing their creation under the OFL to read the FAQ in their own language if it is available. 101 | Please go to http://scripts.sil.org/OFL for the official version of the license and the accompanying FAQ." 102 | 103 | 2) Keep your unofficial translation current and update it at our request if needed, for example, if there is any ambiguity which could lead to confusion. 104 | 105 | If you start such a unofficial translation effort of the OFL and its accompanying FAQ please let us know, thank you. 106 | 107 | Using the OFL 108 | 109 | It is relatively simple to use the OFL for your own font project. If you are the copyright owner you only need to do the following: 110 | 111 | Put your copyright and Reserved Font Names information at the beginning of the main OFL.txt file in place of the dedicated placeholders (marked with the <> characters). Include this file in your release package. 112 | Put your copyright and the OFL text with your chosen Reserved Font Name(s) into your font files (the copyright and license fields). A link to the OFL text on the OFL web site is an acceptable (but not recommended) alternative. Also add this information to any other components (build scripts, glyph databases, documentation, test files, etc). Accurate metadata in your font files is beneficial to you as an increasing number of applications are exposing this information to the user. For example, clickable links can bring users back to your website and let them know about other work you have done or services you provide. Depending on the format of your fonts and sources, you can use template human-readable headers or machine-readable metadata. You should also double-check that there is no conflicting metadata in the font itself contradicting the license, such as the fstype bits in the os2 table or fields in the name table. 113 | Write an initial FONTLOG.txt for your font and include it in the release package (see Section 6 and Appendix A of the OFL-FAQ for details including a template). 114 | Include the relevant practical documentation on the license by adding the current OFL-FAQ.txt file in your package. 115 | If you wish, you can use the OFL Graphics on your web page. 116 | More information can be found in the OFL-FAQ. 117 | 118 | History 119 | 120 | Current version: 1.1 121 | 122 | 2013-09-19 - OFL-FAQ 1.1-update3. 123 | 124 | 2013-05-17 - OFL-FAQ 1.1-update3-draft and discussion paper on Web Font and Reserved Font Names available for review and comment. 125 | 126 | 2010-08-23 - OFL-FAQ 1.1-update2. 127 | 128 | 2009-04-06 - OFL recognized as compliant with the OSD (Open Source Definition) by the OSI board and placed on their list of approved licenses. 129 | 130 | 2007-02-26 - Version 1.1 released. 131 | 132 | 2006-03-18 - A minor revision of the OFL entered the review phase. OFL-1.1-review1 was followed by OFL-1.1-review2 a few months later. 133 | 134 | 2006-01-23 - OFL recognized as a free license by the FSF (Free Software Foundation) on their License List. 135 | 136 | 2005-11-22 - Version 1.0 released. 137 | 138 | 2005-11-07 - Version 1.0-review2 submitted to ofl-discuss. 
139 | 140 | 2005-09-07 - Version 1.0-review1 submitted to the first round of public reviewers. 141 | 142 | Community review 143 | Between November 2005 and January 2007 the OFL was in a public review stage, with efforts going towards version 1.1. We selected a number of reviewers we felt were the relevant experts and sought their input. We submitted our draft for review and received very insightful feedback. 144 | 145 | The review period is over and even though we feel version 1.1 will likely meet the needs for open font licensing for quite some time, we remain open to community feedback. Please contact us with your queries and suggestions. 146 | 147 | Various font-related BoFs (Birds of a Feather meetings) have taken place at FLOSS conference (like Libre Graphics Meeting, Ubuntu Summit, GUADEC, DebConf, TextLayoutSummit among others) to discuss what would be needed to improve the font landscape. One key aspect was appropriate licensing of the fonts, flexibility to maintain and branch fonts without breaking rendering, interoperability across distributions, and the definition of a core set of fonts with recognized glyph quality, sufficient Unicode coverage and a good community-recognized license. The OFL has been recognised by many contributors to these discussion as a good solution for these issues. 148 | 149 | The goals of the OFL and its methodology have been presented and discussed at major conferences from the type industry like AtypI. 150 | 151 | Open font-related presentation have also been made at TUG (TeX User Group conferences). 152 | 153 | There is a campaign with support from various key organisations in the FLOSS community (Unifont.org, Freedesktop.org, the GNOME foundation, KDE e.V., the Linux Foundation and the Free Software Foundation) to encourage more designers and supporting institutions to consider choosing the OFL for their font projects. Visit Unifont.org/go_for_ofl for more details and ways you can participate. 154 | 155 | The OFL is now well-established as the most widely used licensing model for releasing and developing unrestricted font software. It is being used successfully by various organisations, both for-profit and not-for-profit, to release fonts of varying levels of scope and complexity. A number of institutions have now made the OFL their default recommended license for fonts. 156 | 157 | OFL fonts 158 | We intend to use the OFL for all our future font releases, and will re-release our existing and older font packages under the OFL as we have personnel time. The priority of older packages will depend on demand. 159 | 160 | If you release (or intend to release) your font(s) under the OFL, let us know and we may place a link to the fonts on our OFL fonts page. 161 | 162 | Details and rationale 163 | 164 | FLOSS-friendliness 165 | The OFL is designed to be in tune with the FLOSS (Free/Libre and Open Source Software) culture. It builds upon good ideas already in existence in some free/libre and open projects but by bringing our extensive font design experience and linguistic software engineering know-how into the mix, we have produced a font-specific license which is simpler, more human-readable, neutral and reusable and dedicated to the needs of font creators. 166 | 167 | The OFL authors were inspired by the partnership between GNOME and Bitstream for the Vera family of fonts and the licensing model which was chosen. They have also studied the community impact and some of the difficulties faced by this model. 
168 | 169 | The 4 FSF Freedoms 170 | The OFL is listed and recognized as a valid Free Software license on the FSF License List. It complies with the Free Software Definition and its four foundational freedoms as defined by the Free Software Foundation for the GNU project: 171 | 172 | Use: the freedom to use font software for any purpose. (freedom 0) 173 | Study and adaptation: the freedom to study how font software works, and adapt it to your needs (freedom 1). Access and rights to the source code is a precondition for this. 174 | Redistribution: the freedom to redistribute copies of the font software so you can help your neighbor (freedom 2). 175 | Improvement and redistribution of modifications: the freedom to improve the font software and release your improvements (freedom 3), so that the community benefits. Access and rights to the source code is a precondition for this. 176 | DFSG compatibility 177 | Font Software released under the OFL complies with the Debian Free Software Guidelines: 178 | 179 | reselling: DFSG #1 180 | source code redistribution: DFSG #2 181 | derivatives: DFSG #3 182 | "compromise" clause permitting name change: DFSG #4 (this is very important for font derivatives for artistic integrity and anti-collision purposes) 183 | no discrimination against people/groups: DFSG #5 184 | no discrimination against fields of endeavour: DFSG #6 185 | license distribution: DFSG #7 186 | non-Debian specific: DFSG #8 187 | no contamination of other software: DFSG #9 188 | Various font families under OFL have been accepted in the main archive of Debian (as well as Ubuntu) by the ftp-masters. An increasing number of Debian and Ubuntu developers are maintaining font packages under the OFL in main (the component of the archive which only holds Free/Libre and Open Source software). 189 | 190 | OSD compatibility 191 | The OFL complies with the Open Source Definition: 192 | 193 | free redistribution: #1 194 | source code: #2 195 | derived works: #3 196 | integrity of the author(s) source code: #4 (with the possibility of requiring a name change) 197 | no discrimination against persons or groups: #5 198 | no discrimination against fields of endeavour: #6 199 | distribution of license: #7 200 | license must not be specific to a product: #8 201 | license must not restrict other software: #9 202 | license must be technology-neutral: #10 203 | The OSI (Open Source Initiative) has recognized the OFL's compliance with the OSD and placed it on their list of approved licenses. 204 | 205 | "Human readable" version and visual representation 206 | The spirit and working model of the OFL can be expressed in human-readable Creative Commons-like 4 terminology using the following permits / requires elements and visual representations: 207 | 208 | Please note that this terminology and visual representation is simply an expression of the working model of the license and has no legal value in itself. It is designed to help you understand and use the Open Font License in a similar way to the OFL FAQ. It is always intended to link back to the full license text of the OFL. Please note that although the terminology and visual representation of the OFL is based on work by Creative Commons, the OFL is not officially affiliated with Creative Commons. 
209 | 210 | Terminology 211 | permits 212 | Distribution, Reproduction, Embedding, DerivativeWorks 213 | 214 | requires 215 | Attribution, Notice, ShareAlike, DerivativeRenaming, BundlingWhenSelling 216 | 217 | Visual representation 218 | 219 | Human-readable representation 220 | 221 | 222 | 223 | (the Distribution, Reproduction, DerivativeWorks and Notice elements are implied and not represented as icons). 224 | 225 | This is what each icon means: 226 | 227 | Attribution 228 | 229 | requirement 230 | 231 | The icon shows a person and represents the author(s). 232 | The requirement is for proper attribution of the author(s): name(s) and notice(s) must be preserved and abuse of the name(s) and reputation of the author(s) is forbidden. 233 | See condition 2) and 4) of the OFL. 234 | 235 | Share Alike 236 | 237 | requirement 238 | 239 | The icon shows a cycle and represents the way font software can be re-used by all under equivalent terms. 240 | The requirement is for derivative works to remain under the same license to encourage fair collaboration and prevent anyone from locking away contributions. 241 | See condition 5) of the OFL 242 | 243 | Embedding 244 | 245 | permission 246 | 247 | The icon shows a letter on a piece of paper and represents a font placed inside a document. 248 | The permission is for fonts to be embedded in any kind of document. This does not affect the licensing status of the document but makes it easier for documents to be used in different environments. 249 | See the first paragraph of the Permission and Conditions section as well as section 5) of the OFL. 250 | 251 | DerivativeRenaming 252 | 253 | requirement 254 | 255 | The icon shows letters A and B close to each other representing a font (A) from which another font (B) of a different shape is derived. It refers to a derivative branched from the original font and bearing a new name. 256 | The requirement is for derivative fonts to be renamed to allow branching while retaining artistic integrity. 257 | See condition 3) of the OFL 258 | 259 | BundlingWhenSelling 260 | 261 | requirement 262 | 263 | The icon shows a dollar sign between parentheses. The dollar sign represents money (although there are many other currencies in the world) and the parentheses refers to the bundling. 264 | The requirement is for fonts to be bundled with software when they are sold. Fonts cannot be sold on their own. Redistribution without selling is not restricted. 265 | See condition 1) of the OFL. 266 | 267 | 268 | 269 | 1 The Free Software Foundation Licensing Lab: www.fsf.org/licensing 270 | 2 The Debian Free Software Guidelines: www.debian.org/social_contract 271 | 3 The Open Source Definition: opensource.org/docs/definition.php 272 | 4 Creative Commons: http://creativecommons.org/about/licenses/ 273 | © 2003-2013 SIL International, all rights reserved, unless otherwise noted elsewhere on this page. 274 | Provided by SIL's Non-Roman Script Initiative. Contact us at nrsi@sil.org. 275 | -------------------------------------------------------------------------------- /Filters/locality_uniqueness.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at
8 | #
9 | #    http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | 
18 | """
19 | Created on 19 October 2013
20 | @author: Lab41
21 | """
22 | 
23 | 
24 | import time
25 | import operator
26 | import os
27 | import numpy as np
28 | import matplotlib.pyplot as plt
29 | from collections import namedtuple, defaultdict
30 | from hashlib import sha1
31 | from scipy.cluster.vq import vq, kmeans, whiten
32 | from redwood.filters.redwood_filter import RedwoodFilter
33 | import calendar
34 | import random
35 | import warnings
36 | from multiprocessing import Pool, Manager
37 | import Queue
38 | import redwood.helpers.core as core
39 | import redwood.helpers.visual as visual
40 | import shutil
41 | 
42 | warnings.filterwarnings('ignore')
43 | 
44 | #NOTE: the find_anomalies and do_eval functions are outside the class so that we
45 | #can run them in parallel using the apply_async function for process pools
46 | 
47 | SMALL_CLUSTERS_SCORE = .3
48 | DEFAULT_NUM_CLUSTERS = 3
49 | 
50 | def find_anomalies(rows, sorted_results, code_count_dict):
51 |     """
52 |     Helper function that given a list of results from kmeans will assign
53 |     scores to each file given their distance from their centroid
54 | 
55 |     :param rows: output rows to append to
56 |     :param sorted_results: results from kmeans sorted by their cluster code
57 |     :param code_count_dict: count of observations per cluster code
58 |     """
59 |     #definitely want to adjust these distance thresholds
60 |     distance_threshold0 = 1.0
61 |     distance_threshold1 = 2.0
62 |     distance_threshold2 = 5.0
63 |     distance_threshold3 = 10.0
64 | 
65 |     #assign scores based on distance
66 |     for c, d, r in sorted_results:
67 | 
68 |         #get code count
69 |         code_count = code_count_dict[c]
70 | 
71 |         #if a file belongs to a cluster with fewer than three elements, we automatically assign a lower score
72 |         if code_count < 3:
73 |             score = SMALL_CLUSTERS_SCORE
74 |         elif d > distance_threshold3:
75 |             score = .1
76 |         elif d > distance_threshold2:
77 |             score = .2
78 |         elif d > distance_threshold1:
79 |             score = .3
80 |         elif d > distance_threshold0:
81 |             score = .4
82 |         else:
83 |             score = .8
84 |         file_metadata_id = r[0]
85 |         rows.put((file_metadata_id, score))
86 | 
87 | 
88 | def do_eval(rows, full_path, files, num_clusters, num_features):
89 |     """
90 |     Helper function that analyzes a directory, looking for outliers in clusters based on the input features.
91 |     Currently, only two static features are analyzed, however future versions could allow
92 |     for a selectable set of features
93 | 
94 |     :param rows: output variable to append results to
95 |     :param full_path: the path that is being analyzed
96 |     :param files: meta data for files in the directory
97 |     :param num_clusters: number of clusters to specify for kmeans
98 |     :param num_features: number of features included
99 | 
100 |     :return: nothing... results are appended to the rows queue
101 |     """
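    #(Illustrative aside, not part of the original module.) The body below is the
    #standard scipy k-means pipeline: whiten() rescales each feature to unit
    #variance, kmeans() finds the centroids, and vq() assigns every observation to
    #its nearest centroid along with the distance that find_anomalies() turns into
    #a score. A condensed, self-contained sketch of that pipeline on toy data:
    #
    #    import numpy as np
    #    from scipy.cluster.vq import vq, kmeans, whiten
    #
    #    obs = np.array([[1.0, 9.0], [1.1, 9.2], [8.0, 2.0], [8.2, 2.1]])
    #    whitened = whiten(obs)               # scale each column to unit variance
    #    codebook, _ = kmeans(whitened, 2)    # two centroids
    #    code, dist = vq(whitened, codebook)  # cluster id + distance per observation
    #    # large dist -> outlier -> low score; clusters of < 3 files get a flat .3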
102 | 
103 |     num_obs = len(files)
104 | 
105 |     #if the number of observations is less than the num_clusters we do not cluster
106 |     #but rather give each file SMALL_CLUSTERS_SCORE
107 |     if(num_obs < num_clusters):
108 |         for f in files:
109 |             rows.put((f[0], SMALL_CLUSTERS_SCORE))
110 |         return
111 | 
112 |     #zero out the two dimensional array
113 |     observations = np.zeros((num_obs, num_features))
114 | 
115 |     i = 0
116 | 
117 |     #transfer the observations to the numpy array
118 |     for file_metadata_id, mod_date, full_path, file_name, inode, parent_id, in files:
119 |         seconds = calendar.timegm(mod_date.utctimetuple())
120 |         observations[i] = (inode, seconds)
121 |         i += 1
122 | 
123 |     #normalize the observations
124 |     whitened = whiten(observations)
125 | 
126 |     #get the centroids (aka codebook)
127 |     codebook,_ = kmeans(whitened, num_clusters)
128 | 
129 |     #sometimes if all observations for a given feature are the same
130 |     #the centroids will not be found. In that case we give a neutral score
131 |     if len(codebook) != num_clusters:
132 |         for f in files:
133 |             rows.put((f[0], .5))
134 |         return
135 | 
136 |     #calculate the distances
137 |     code, dist = vq(whitened, codebook)
138 | 
139 | 
140 |     d = defaultdict(int)
141 | 
142 |     #quick way to get count of cluster sizes
143 |     for c in code:
144 |         d[c] += 1
145 | 
146 |     #combine the results with the original data, then sort by the code
147 |     combined = zip(code, dist, files)
148 |     sorted_results = sorted(combined, key=lambda tup: tup[0])
149 | 
150 |     find_anomalies(rows, sorted_results, d)
151 | 
152 | 
153 | 
154 | 
155 | class LocalityUniqueness(RedwoodFilter):
156 |     """
157 |     LocalityUniqueness seeks to identify anomalies through clustering of file features in a given directory. The
158 |     assumption is that files of interest are those that are different than most of their neighbors in a given
159 |     domain -- in this case the directory. As a result, this filter is responsible for giving outliers of clusters
160 |     lower reputation scores than those files closer to the centroid
161 |     """
162 | 
163 |     def __init__(self, cnx=None):
164 |         self.score_table = "lu_scores"
165 |         self.name = "Locality_Uniqueness"
166 |         self.cnx = cnx
167 | 
168 |     def usage(self):
169 |         """
170 |         Prints the usage statements for the discovery functions
171 |         """
172 |         print "[*] evaluate_dir [full_path] [source] [clusters]"
173 |         print "\t|- runs kmeans and shows scatter plot"
174 |         print "\t| [full_path] - path to analyze"
175 |         print "\t| [source] - source where the path exists"
176 |         print "\t| [clusters] - number of clusters to use"
177 | 
178 |     def update(self, source):
179 |         """
180 |         Applies the Locality Uniqueness filter to the given source, updating existing data
181 |         analyzed from previous sources. Currently the update function uses 3 clusters for clustering
182 |         analysis. This will be dynamic in future versions.
183 | 
184 |         :param source: media source name
185 |         """
186 |         print "[+] Locality Uniqueness Filter running on {}".format(source)
187 |         self.build()
188 |         self.evaluate_source(source)
189 | 
190 |     def evaluate_source(self, source_name, num_clusters=DEFAULT_NUM_CLUSTERS):
191 |         """
192 |         Evaluates and scores a given source with a specified number of clusters for kmeans. Currently
193 |         this function uses two set features as inputs (modification time and inode number), however
194 |         future versions will allow for dynamic feature inputs
195 | 
196 |         :param source_name: media source name
197 |         :param num_clusters: number of clusters to input into kmeans (Default: 3)
198 |         """
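        #(Illustrative aside, not part of the original module.) The loop below walks
        #a cursor ordered by parent_id and flushes one batch per directory: whenever
        #parent_id changes, the files collected so far are handed to do_eval() as a
        #single clustering unit. The grouping pattern in isolation, over hypothetical
        #(parent_id, name) pairs and a hypothetical per-directory handler process():
        #
        #    rows = [(1, "a"), (1, "b"), (2, "c")]
        #    batch, prev = [], None
        #    for parent_id, name in rows:
        #        if prev is not None and parent_id != prev:
        #            process(batch)     # directory finished
        #            batch = []
        #        batch.append(name)
        #        prev = parent_id
        #    if batch:
        #        process(batch)         # flush the final directory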
199 | 
200 |         cursor = self.cnx.cursor()
201 |         src_info = core.get_source_info(self.cnx, source_name)
202 | 
203 |         #returns all files sorted by directory for the given source
204 |         #query = """
205 |         #    SELECT file_metadata_id, last_modified, full_path, file_name, filesystem_id, parent_id, hash
206 |         #    FROM joined_file_metadata
207 |         #    where source_id = {} order by parent_id asc
208 |         #    """.format(src_info.source_id)
209 | 
210 |         cursor.execute("""
211 |             SELECT file_metadata_id, last_modified, full_path, file_name, filesystem_id, parent_id, hash
212 |             FROM joined_file_metadata
213 |             where source_id = %s order by parent_id asc
214 |             """, (src_info.source_id,))
215 | 
216 |         files = list()
217 | 
218 |         print "...Beginning clustering analysis"
219 |         pool = Pool(processes=4)  # start 4 worker processes
220 |         manager = Manager()
221 |         rows = manager.Queue()
222 |         is_first = True
223 | 
224 |         parent_id_prev = None
225 |         #should iterate by dir of a given source at this point
226 |         for(file_metadata_id, last_modified, full_path, file_name, filesystem_id, parent_id, hash_val) in cursor:
227 | 
228 |             if is_first:
229 |                 is_first = False
230 |                 parent_id_prev = parent_id
231 | 
232 |             #if parent_id is diff than previous, we are in new directory, so pack it up for analysis
233 |             if parent_id_prev != parent_id:
234 | 
235 |                 parent_id_prev = parent_id
236 | 
237 |                 if len(files) > 0:
238 |                     pool.apply_async(do_eval, [rows, full_path, files, num_clusters, 2])
239 |                     files = list()
240 | 
241 |             #make sure to omit directories from the clustering analysis
242 |             if file_name != '/' and hash_val != "":
243 |                 files.append((file_metadata_id, last_modified, full_path, file_name, filesystem_id, parent_id))
244 | 
245 |         if len(files) > 0:
246 |             pool.apply_async(do_eval, [rows, full_path, files, num_clusters, 2])
247 | 
248 |         pool.close()
249 |         pool.join()
250 | 
251 |         input_rows = []
252 |         count = 0
253 |         while not rows.empty():
254 |             curr = rows.get()
255 |             input_rows.append(curr)
256 |             count += 1
257 |             if count % 50000 == 0:
258 |                 print "...sending {} results to server".format(len(input_rows))
259 |                 cursor.executemany("""REPLACE INTO locality_uniqueness(file_metadata_id, score) values(%s, %s)""", input_rows)
260 |                 input_rows = []
261 |                 count = 0
262 |         print "...sending {} results to server".format(len(input_rows))
263 | 
264 |         cursor.executemany("""REPLACE INTO locality_uniqueness(file_metadata_id, score) values(%s, %s)""", input_rows)
265 |         self.cnx.commit()
266 |         #need to drop the lu_scores and recalculate
267 |         cursor.execute("drop table if exists lu_scores")
268 | 
269 |         query = ("""CREATE TABLE IF NOT EXISTS `lu_scores` (
270 |             `id` bigint(20) unsigned NOT NULL,
271 |             `score` double DEFAULT NULL,
272 |             KEY `fk_unique_file0_id` (`id`),
273 |             CONSTRAINT `fk_unique_file0_id` FOREIGN KEY (`id`)
274 |             REFERENCES `unique_file` (`id`) ON DELETE NO ACTION ON UPDATE NO ACTION
275 |             ) ENGINE=InnoDB""")
276 | 
277 |         cursor.execute(query)
278 | 
279 |         print "...updating scores on the server"
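        #(Illustrative aside, not part of the original module.) locality_uniqueness
        #holds one score per file *instance*; the INSERT below rolls them up to one
        #score per unique file by averaging every instance of the same
        #unique_file_id. The same rollup in plain Python over hypothetical
        #(unique_file_id, score) pairs:
        #
        #    from collections import defaultdict
        #    sums = defaultdict(lambda: [0.0, 0])
        #    for uid, score in [(7, .4), (7, .8), (9, .3)]:
        #        sums[uid][0] += score
        #        sums[uid][1] += 1
        #    lu = dict((uid, s / n) for uid, (s, n) in sums.items())  # {7: 0.6, 9: 0.3}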
280 |         query = """
281 |             INSERT INTO lu_scores
282 |             (SELECT file_metadata.unique_file_id, avg(locality_uniqueness.score) FROM
283 |             locality_uniqueness LEFT JOIN file_metadata on (locality_uniqueness.file_metadata_id = file_metadata.id)
284 |             WHERE file_metadata.unique_file_id is not null
285 |             GROUP BY file_metadata.unique_file_id)
286 |             """
287 | 
288 |         cursor.execute(query)
289 |         self.cnx.commit()
290 | 
291 | 
292 |     def clean(self):
293 |         """
294 |         Removes all tables associated with this filter
295 |         """
296 | 
297 |         cursor = self.cnx.cursor()
298 |         cursor.execute("DROP TABLE IF EXISTS lu_scores")
299 |         cursor.execute("DROP TABLE IF EXISTS locality_uniqueness")
300 |         self.cnx.commit()
301 | 
302 | 
303 |     def build(self):
304 |         """
305 |         Build all persistent tables associated with this filter
306 |         """
307 |         cursor = self.cnx.cursor()
308 | 
309 |         query = """
310 |             CREATE table IF NOT EXISTS locality_uniqueness (
311 |             file_metadata_id BIGINT unsigned unique,
312 |             score DOUBLE NOT NULL,
313 |             PRIMARY KEY(file_metadata_id),
314 |             INDEX lu_score (score ASC),
315 |             CONSTRAINT fk_file_metadata11 FOREIGN KEY (file_metadata_id)
316 |             REFERENCES file_metadata (id)
317 |             ON DELETE NO ACTION ON UPDATE NO ACTION
318 |             ) ENGINE = InnoDB;
319 |         """
320 | 
321 |         cursor.execute(query)
322 | 
323 |         self.cnx.commit()
324 | 
325 |     ##################################################
326 |     #
327 |     # DISCOVERY FUNCTIONS
328 |     #
329 |     ##################################################
330 | 
331 |     def discover_evaluate_dir(self, dir_name, source, num_clusters=DEFAULT_NUM_CLUSTERS):
332 |         """
333 |         Discovery function that applies kmeans clustering to a specified directory, displays
334 |         the resulting scatter plot with the clusters, and then prints out an ordered list of
335 |         the files by their distance from their respective centroid. Currently,
336 |         this function uses two static features of "modification date" and "inode number" but
337 |         future versions will allow for dynamic feature inputs.
338 | 
339 |         :param dir_name: directory name to be analyzed (Required)
340 |         :param source: source name to be analyzed (Required)
341 |         :param num_clusters: specified number of clusters to use for kmeans (Default: 3)
342 |         """
343 | 
344 |         num_features = 2
345 |         num_clusters = int(num_clusters)
346 |         cursor = self.cnx.cursor()
347 | 
348 |         if(dir_name.endswith('/')):
349 |             dir_name = dir_name[:-1]
350 | 
351 |         print "...Running discovery function on source {} at directory {}".format(source, dir_name)
352 | 
353 |         src_info = core.get_source_info(self.cnx, source)
354 |         if src_info is None:
355 |             print "Error: Source {} does not exist".format(source)
356 |             return
357 | 
358 |         #grab all files for a particular directory from a specific source
359 |         hash_val = sha1(dir_name).hexdigest()
360 | 
361 |         #query = """
362 |         #    SELECT file_name, file_metadata_id, filesystem_id, last_modified
363 |         #    FROM joined_file_metadata
364 |         #    WHERE source_id ='{}' AND path_hash = '{}' AND file_name !='/'
365 |         #    """.format(src_info.source_id, hash_val)
366 | 
367 |         cursor.execute("""
368 |             SELECT file_name, file_metadata_id, filesystem_id, last_modified
369 |             FROM joined_file_metadata
370 |             WHERE source_id = %s AND path_hash = %s AND file_name !='/'
371 |             """, (src_info.source_id, hash_val,))
372 | 
373 |         #bring all results into memory
374 |         sql_results = cursor.fetchall()
375 | 
376 |         if(len(sql_results) == 0):
377 |             return
378 | 
379 |         print "...Found {} files in specified directory".format(len(sql_results))
380 |         print "...Will form into {} clusters".format(num_clusters)
381 |         if num_clusters > len(sql_results):
382 |             print "Number of clusters ({}) exceeds number of files ({})".format(num_clusters, len(sql_results))
383 |             num_clusters = len(sql_results)
384 |             print "Number of clusters is now: 
{}".format(num_clusters) 385 | 386 | 387 | #zero out the array that will contain the inodes 388 | filesystem_id_arr = np.zeros((len(sql_results), num_features)) 389 | 390 | i = 0 391 | for _, _,inode, mod_date in sql_results: 392 | seconds = calendar.timegm(mod_date.utctimetuple()) 393 | filesystem_id_arr[i] = (inode, seconds) 394 | i += 1 395 | whitened = whiten(filesystem_id_arr) 396 | #get the centroids 397 | codebook,_ = kmeans(whitened, num_clusters) 398 | code, dist = vq(whitened, codebook) 399 | d = defaultdict(int) 400 | 401 | #quick way to get count of cluster sizes 402 | for c in code: 403 | d[c] += 1 404 | 405 | #sorts the codes and sql_results together as pairs 406 | combined = zip(dist, code, sql_results) 407 | 408 | #sort results by distances from centroid 409 | sorted_results = sorted(combined, key=lambda tup: tup[0]) 410 | 411 | for dist_val, c, r in sorted_results: 412 | print "Dist: {} Cluster: {} Data: {}".format(dist_val,c,r) 413 | 414 | 415 | if codebook is None or len(codebook) == 0: 416 | print "Data is not suitable for visualization" 417 | return 418 | 419 | visual.visualize_scatter(d, code, whitened, codebook, num_clusters, "inode number", "modification datetime", dir_name) 420 | 421 | 422 | ################################################## 423 | # 424 | # SURVEY 425 | # 426 | ################################################## 427 | 428 | def run_survey(self, source_name): 429 | """ 430 | Runs survey for this filter capturing discovery functions and reputation score results 431 | 432 | :param source_name: name of the source to survey 433 | :return survey_dir: location where survey results were saved 434 | """ 435 | 436 | print "...running survey for {}".format(self.name) 437 | 438 | resources = "resources" 439 | survey_file = "survey.html" 440 | survey_dir = "survey_{}_{}".format(self.name, source_name) 441 | 442 | resource_dir = os.path.join(survey_dir, resources) 443 | html_file = os.path.join(survey_dir, survey_file) 444 | 445 | try: 446 | shutil.rmtree(survey_dir) 447 | except: 448 | pass 449 | 450 | os.mkdir(survey_dir) 451 | os.mkdir(resource_dir) 452 | 453 | results = self.show_results("bottom", 100, source_name, None) 454 | 455 | with open(html_file, 'w') as f: 456 | 457 | f.write(""" 458 | 459 | 460 | 461 | 462 | 463 |

Locality Uniqueness Snapshot

464 | """) 465 | f.write("

The lowest 100 reputations for this filter

") 466 | f.write("") 467 | f.write("") 468 | f.write("") 469 | f.write("") 470 | i = 0 471 | lr = len(results) 472 | for r in results: 473 | if i == lr - 1: 474 | f.write("") 475 | f.write("".format(r[0], r[1], r[2])) 476 | else: 477 | f.write("".format(r[0], r[1], r[2])) 478 | i += 1 479 | f.write("
ScoreParent PathFilename
{}{}{}
{}{}{}
") 480 | 481 | f.write("") 482 | return survey_dir 483 | -------------------------------------------------------------------------------- /Filters/filter_prevalence.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | """ 19 | 20 | Created on 19 October 2013 21 | @author: Lab41 22 | """ 23 | 24 | import os 25 | import numpy as np 26 | import matplotlib.pyplot as plt 27 | import redwood.helpers.core as core 28 | import shutil 29 | 30 | from redwood.filters.redwood_filter import RedwoodFilter 31 | 32 | class FilterPrevalence(RedwoodFilter): 33 | """ 34 | This filter provides analysis and scoring based on the prevalence of files and directories across sources. The general idea is that a file with a higher prevalence would have a higher reputation than a file that occurs less often. 35 | """ 36 | 37 | def __init__(self, cnx=None): 38 | self.name = "Prevalence" 39 | self.score_table = "fp_scores" 40 | self.cnx = cnx 41 | 42 | def usage(self): 43 | """ 44 | Prints the usage statement 45 | """ 46 | 47 | print "[+] histogram_by_source " 48 | print "---view histogram of file distribution for a single source with name " 49 | print "\t- source_name: name of the source" 50 | print "[+] histogram_by_os " 51 | print "---view file distribution for an os" 52 | print "\t- os_name: name of the os" 53 | print "[+] detect_anomalies " 54 | print "---view the top anomalies for the given source" 55 | print "\t-out_file: file to write results to" 56 | 57 | 58 | def update(self, source): 59 | """ 60 | Updates the scores of the fp_scores table with the new data from the inputted source 61 | 62 | :param source: identifier for the source to be updated 63 | """ 64 | 65 | print "[+] Prevalence Filter running on {} ".format(source) 66 | 67 | #creates the basic tables if they do not exist 68 | self.build() 69 | 70 | cursor = self.cnx.cursor() 71 | 72 | src_info = core.get_source_info(self.cnx, source) 73 | 74 | if src_info is None: 75 | print "Error: Source {} not found".format(source) 76 | return 77 | 78 | #initial insert 79 | #query = """ 80 | # INSERT INTO fp_scores(id, score) 81 | # SELECT global_file_prevalence.unique_file_id, IF(num_systems < 3, .5, average) 82 | # FROM global_file_prevalence JOIN file_metadata 83 | # ON file_metadata.unique_file_id = global_file_prevalence.unique_file_id 84 | # where file_metadata.source_id = {} 85 | # ON DUPLICATE KEY UPDATE score = IF(num_systems < 3, .5, average) 86 | #""".format(src_info.source_id) 87 | 88 | cursor.execute(""" 89 | INSERT INTO fp_scores(id, score) 90 | SELECT global_file_prevalence.unique_file_id, 91 | IF(num_systems < 3, .5, average) 92 | FROM global_file_prevalence JOIN file_metadata 93 | ON file_metadata.unique_file_id = global_file_prevalence.unique_file_id 94 | where file_metadata.source_id = %s 95 | ON DUPLICATE KEY UPDATE 
score =
96 |             IF(num_systems < 3, .5, average)
97 |             """, (src_info.source_id,))
98 |         self.cnx.commit()
99 | 
100 |         #adjustment for low outliers in high prevalent directories... This could probably be done better by taking the std dev of each
101 |         #dir, but this will have to work for now.
102 |         #query = """
103 |         #    UPDATE global_file_prevalence left join file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
104 |         #    LEFT JOIN global_dir_prevalence on file_metadata.unique_path_id = global_dir_prevalence.unique_path_id
105 |         #    LEFT JOIN global_dir_combined_prevalence on file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id
106 |         #    LEFT JOIN fp_scores ON fp_scores.id = global_file_prevalence.unique_file_id
107 |         #    SET fp_scores.score = fp_scores.score * .5
108 |         #    where file_metadata.source_id = {} AND global_file_prevalence.count = 1 and global_file_prevalence.num_systems > 2
109 |         #    and global_dir_combined_prevalence.average - global_file_prevalence.average > .6
110 |         #""".format(src_info.source_id)
111 | 
112 |         cursor.execute("""
113 |             UPDATE global_file_prevalence left join file_metadata
114 |             ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
115 |             LEFT JOIN global_dir_prevalence
116 |             on file_metadata.unique_path_id = global_dir_prevalence.unique_path_id
117 |             LEFT JOIN global_dir_combined_prevalence
118 |             on file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id
119 |             LEFT JOIN fp_scores
120 |             ON fp_scores.id = global_file_prevalence.unique_file_id
121 |             SET fp_scores.score = fp_scores.score * .5
122 |             where file_metadata.source_id = %s
123 |             AND global_file_prevalence.count = 1
124 |             and global_file_prevalence.num_systems > 2
125 |             and global_dir_combined_prevalence.average - global_file_prevalence.average > .6
126 |             """, (src_info.source_id,))
127 |         self.cnx.commit()
128 | 
129 |         #adjustments for low prevalent scored directories which occur often... hopefully this will exclude the caches
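        #(Illustrative aside, not part of the original module.) Taken together, the
        #three statements in update() form a small scoring pipeline: start from the
        #prevalence average (neutral .5 when fewer than 3 systems are loaded), halve
        #the score of one-off files sitting in otherwise highly prevalent
        #directories, and nudge scores toward 1 for files in directories that are
        #common but whose individual contents churn (e.g. caches). The same
        #arithmetic in plain Python:
        #
        #    def base_score(average, num_systems):
        #        return .5 if num_systems < 3 else average
        #
        #    def dampen_outlier(score):   # lone file in a common directory
        #        return score * .5
        #
        #    def boost_churn(score):      # common dir, individually rare files
        #        return (1 - score) * .25 + score   # boost_churn(.2) -> .4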
130 |         #query = """
131 |         #    UPDATE file_metadata
132 |         #    LEFT JOIN global_dir_prevalence ON file_metadata.unique_path_id = global_dir_prevalence.unique_path_id
133 |         #    LEFT JOIN global_dir_combined_prevalence ON global_dir_combined_prevalence.unique_path_id = global_dir_prevalence.unique_path_id
134 |         #    LEFT JOIN fp_scores ON file_metadata.unique_file_id = fp_scores.id
135 |         #    SET fp_scores.score = (1 - fp_scores.score) * .25 + fp_scores.score
136 |         #    where file_metadata.source_id = {} AND global_dir_prevalence.average > .8 AND global_dir_combined_prevalence.average < .5
137 |         #""".format(src_info.source_id)
138 | 
139 |         cursor.execute("""
140 |             UPDATE file_metadata
141 |             LEFT JOIN global_dir_prevalence
142 |             ON file_metadata.unique_path_id = global_dir_prevalence.unique_path_id
143 |             LEFT JOIN global_dir_combined_prevalence
144 |             ON global_dir_combined_prevalence.unique_path_id = global_dir_prevalence.unique_path_id
145 |             LEFT JOIN fp_scores
146 |             ON file_metadata.unique_file_id = fp_scores.id
147 |             SET fp_scores.score = (1 - fp_scores.score) * .25 + fp_scores.score
148 |             where file_metadata.source_id = %s
149 |             AND global_dir_prevalence.average > .8
150 |             AND global_dir_combined_prevalence.average < .5
151 |             """, (src_info.source_id,))
152 |         self.cnx.commit()
153 |         cursor.close()
154 | 
155 |     def clean(self):
156 |         """
157 |         Cleans all tables associated with this filter
158 |         """
159 |         cursor = self.cnx.cursor()
160 |         cursor.execute("DROP TABLE IF EXISTS fp_scores")
161 |         self.cnx.commit()
162 | 
163 |     def build(self):
164 |         """
165 |         Builds all persistent tables associated with this filter
166 |         """
167 | 
168 |         cursor = self.cnx.cursor()
169 | 
170 |         query = """
171 |             CREATE TABLE IF NOT EXISTS `fp_scores` (
172 |             id BIGINT UNSIGNED NOT NULL,
173 |             score double DEFAULT NULL,
174 |             PRIMARY KEY(id),
175 |             CONSTRAINT `fk_unique_file1_id` FOREIGN KEY (`id`)
176 |             REFERENCES `unique_file` (`id`) ON DELETE NO ACTION ON UPDATE NO ACTION
177 |             ) ENGINE=InnoDB
178 |         """
179 | 
180 |         cursor.execute(query)
181 |         self.cnx.commit()
182 |         cursor.close()
183 | 
184 | 
185 | 
186 |     ##################################################
187 |     #
188 |     # DISCOVERY FUNCTIONS
189 |     #
190 |     ##################################################
191 | 
192 |     def discover_histogram_by_os(self, os_name, output=None):
193 |         """
194 |         Displays a histogram of the file distributions across all systems
195 |         of the specified OS
196 | 
197 |         :param os_name: name of the operating system
198 |         :param output: (optional) output filename in PNG format
199 |         """
200 | 
201 |         print '[+] Running \"Histogram by OS\"...'
202 |         cursor = self.cnx.cursor()
203 | 
204 |         num_systems = core.get_num_systems(self.cnx, os_name)
205 | 
206 |         print "NUM: {}".format(num_systems)
207 |         if num_systems is None or num_systems == 0:
208 |             print "Error: OS {} does not exist".format(os_name)
209 |             return
210 | 
211 |         bins = range(1, num_systems+2)
212 | 
213 |         #query = """
214 |         #    SELECT COUNT(file_metadata.os_id), global_file_prevalence.count FROM global_file_prevalence
215 |         #    LEFT JOIN file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
216 |         #    WHERE file_metadata.os_id = (SELECT os.id FROM os WHERE os.name = "{}")
217 |         #    GROUP BY global_file_prevalence.count ORDER BY global_file_prevalence.count ASC;
218 |         #""".format(os_name)
219 | 
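        #(Illustrative aside, not part of the original module.) The histogram below
        #uses one bin per possible system count: a file seen on k of N systems falls
        #in bin k, so bins = range(1, num_systems + 2) yields edges 1..N+1. The
        #query returns (occurrences, k) pairs that are fed to hist() as weights.
        #E.g. for N = 3 systems:
        #
        #    bins = range(1, 3 + 2)    # [1, 2, 3, 4] -> bars for k = 1, 2, 3
        #    counts, ranges = zip(*[(500, 1), (80, 2), (40, 3)])
        #    # ax.hist(ranges, weights=counts, bins=bins) draws bars of 500/80/40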
    ##################################################
    #
    # DISCOVERY FUNCTIONS
    #
    ##################################################

    def discover_histogram_by_os(self, os_name, output=None):
        """
        Displays a histogram of the file distributions across all systems
        of the specified OS

        :param os_name: name of the operating system
        :param output: (optional) output filename in PNG format
        """

        print '[+] Running "Histogram by OS"...'
        cursor = self.cnx.cursor()

        num_systems = core.get_num_systems(self.cnx, os_name)

        print "Number of systems for {}: {}".format(os_name, num_systems)
        if num_systems is None or num_systems == 0:
            print "Error: OS {} does not exist".format(os_name)
            return

        bins = range(1, num_systems + 2)

        #query = """
        #    SELECT COUNT(file_metadata.os_id), global_file_prevalence.count FROM global_file_prevalence
        #    LEFT JOIN file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
        #    WHERE file_metadata.os_id = (SELECT os.id FROM os WHERE os.name = "{}")
        #    GROUP BY global_file_prevalence.count ORDER BY global_file_prevalence.count ASC;
        #""".format(os_name)

        cursor.execute("""
            SELECT COUNT(file_metadata.os_id), global_file_prevalence.count
            FROM global_file_prevalence
            LEFT JOIN file_metadata
                ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
            WHERE file_metadata.os_id =
                (SELECT os.id FROM os WHERE os.name = %s)
            GROUP BY global_file_prevalence.count
            ORDER BY global_file_prevalence.count ASC;
        """, (os_name,))
        data = cursor.fetchall()

        if data is None or len(data) == 0:
            print "No prevalence data found for OS {}".format(os_name)
            return

        counts, ranges = zip(*data)

        fig = plt.figure()
        perc = int(float(sum(counts[1:])) / sum(counts) * 100)
        ax = fig.add_subplot(111, title="File Prevalence of {} with {}% > 1".format(os_name, perc))
        ax.hist(ranges, weights=counts, bins=bins)
        ax.set_xlabel("Num of Systems")
        ax.set_ylabel("File Occurrences")

        if output is None:
            plt.show()
        else:
            print "Saving histogram to {}".format(output)
            plt.savefig(output)

    def discover_histogram_by_source(self, source_name, output=None):
        """
        Displays a histogram of the file distribution of a single source as it
        relates to all occurrences of that file across all systems

        :param source_name: the name of the source
        :param output: (optional) output filename in PNG format
        """

        print '[+] Running "Histogram by Source"...'
        cursor = self.cnx.cursor()

        src_info = core.get_source_info(self.cnx, source_name)

        if src_info is None:
            print "Source {} does not exist".format(source_name)
            return

        num_systems = core.get_num_systems(self.cnx, src_info.os_id)
        bins = range(1, num_systems + 2)

        #query = """
        #    SELECT COUNT(file_metadata.id), global_file_prevalence.count FROM global_file_prevalence
        #    LEFT JOIN file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
        #    WHERE file_metadata.source_id = (SELECT media_source.id FROM media_source WHERE media_source.name = "{}")
        #    GROUP BY global_file_prevalence.count ORDER BY global_file_prevalence.count ASC;
        #""".format(source_name)

        cursor.execute("""
            SELECT COUNT(file_metadata.id), global_file_prevalence.count
            FROM global_file_prevalence
            LEFT JOIN file_metadata
                ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
            WHERE file_metadata.source_id =
                (SELECT media_source.id
                 FROM media_source
                 WHERE media_source.name = %s)
            GROUP BY global_file_prevalence.count
            ORDER BY global_file_prevalence.count ASC;
        """, (source_name,))

        data = cursor.fetchall()

        if data is None or len(data) == 0:
            print "No prevalence data found for source {}".format(source_name)
            return

        counts, ranges = zip(*data)

        fig = plt.figure()
        perc = int(float(sum(counts[1:])) / sum(counts) * 100)
        ax = fig.add_subplot(111, title="File Prevalence of {} with {}% > 1".format(src_info.source_name, perc))
        ax.hist(ranges, weights=counts, bins=bins)
        ax.set_xlabel("Num of Systems")
        ax.set_ylabel("File Occurrences")

        if output is None:
            plt.show()
        else:
            print "Saving histogram to {}".format(output)
            plt.savefig(output)
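    # A hedged example of driving the two histograms above (the source and OS
    # names are hypothetical, and `flt` again stands for a constructed filter
    # instance):
    #
    #   flt.discover_histogram_by_os("Windows 7")                      # interactive window
    #   flt.discover_histogram_by_source("laptop-042", "laptop.png")   # saved as PNG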
    def discover_detect_anomalies(self, source, out=None):
        """
        Conducts an anomaly search on a given source

        :param source: name of the source
        :param out: (optional) output filename
        """

        cursor = self.cnx.cursor()

        src_info = core.get_source_info(self.cnx, source)

        if src_info is None:
            print "*** Error: Source not found"
            return

        # anomaly type: low prevalence files in normally high prevalence directories
        print "...Anomaly Detection: Unique files in common areas"

        #query = """
        #    SELECT (global_dir_combined_prevalence.average - global_file_prevalence.average) as difference,
        #    unique_path.full_path, file_metadata.file_name
        #    FROM global_file_prevalence
        #    LEFT JOIN file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
        #    LEFT JOIN global_dir_combined_prevalence ON file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id
        #    LEFT JOIN unique_path ON file_metadata.unique_path_id = unique_path.id
        #    where file_metadata.source_id = {}
        #    HAVING difference > 0
        #    ORDER BY difference desc limit 0, 100
        #""".format(src_info.source_id)

        cursor.execute("""
            SELECT (global_dir_combined_prevalence.average - global_file_prevalence.average) AS difference,
                unique_path.full_path, file_metadata.file_name
            FROM global_file_prevalence
            LEFT JOIN file_metadata
                ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
            LEFT JOIN global_dir_combined_prevalence
                ON file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id
            LEFT JOIN unique_path
                ON file_metadata.unique_path_id = unique_path.id
            WHERE file_metadata.source_id = %s
            HAVING difference > 0
            ORDER BY difference DESC LIMIT 0, 100
        """, (src_info.source_id,))

        if out is None:
            results = cursor.fetchall()
            cursor.close()
            if results is None or len(results) == 0:
                print "No anomalies found"
            else:
                print "Showing top {} results".format(len(results))
                for x in results:
                    print x
            return results

        print "Writing results to {}".format(out)

        with open(out, "w") as f:
            v = 0
            for x in cursor.fetchall():
                f.write("{}: {} {}/{}\n".format(v, x[0], x[1], x[2]))
                v += 1

        cursor.close()
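    # Sketch of the two output modes above (the source name is hypothetical):
    # with out=None the rows print to the console and are returned; with a
    # filename each row is written as "index: difference path/name" instead.
    #
    #   rows = flt.discover_detect_anomalies("laptop-042")
    #   flt.discover_detect_anomalies("laptop-042", out="anomalies.txt")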

    def run_survey(self, source_name):
        """
        Generates an HTML snapshot of this filter's results for a source

        :param source_name: name of the source
        :return: path of the directory containing the generated survey
        """

        print "...running survey for {}".format(self.name)

        resources = "resources"
        img_by_src = "hist_by_src.png"
        img_by_os = "hist_by_os.png"
        survey_file = "survey.html"
        survey_dir = "survey_{}_{}".format(self.name, source_name)

        resource_dir = os.path.join(survey_dir, resources)
        html_file = os.path.join(survey_dir, survey_file)

        try:
            shutil.rmtree(survey_dir)
        except OSError:
            pass

        os.mkdir(survey_dir)
        os.mkdir(resource_dir)

        src_info = core.get_source_info(self.cnx, source_name)

        self.discover_histogram_by_source(source_name, os.path.join(resource_dir, img_by_src))
        self.discover_histogram_by_os(src_info.os_name, os.path.join(resource_dir, img_by_os))
        anomalies = self.discover_detect_anomalies(source_name, None)
        results = self.show_results("bottom", 100, source_name, None)

        # NOTE: heading levels and the "last" row class below are assumptions
        # reconstructed from the rendered survey text; style.css may expect
        # different markup.
        with open(html_file, 'w') as f:
            f.write("""
                <html>
                <body>
                <h1>Filter Prevalence Snapshot</h1>

                <h2>Histogram for {}</h2>
                <img src="{}" />

                <h2>Histogram for Operating System - {}</h2>
                <img src="{}" />
            """.format(source_name,
                       os.path.join(resources, img_by_src),
                       src_info.os_name,
                       os.path.join(resources, img_by_os)))

            f.write("<h3>The lowest 100 reputations for this filter</h3>")
            f.write("<table>")
            f.write("<tr><th>Score</th><th>Parent Path</th><th>Filename</th></tr>")
            i = 0
            lr = len(results)
            for r in results:
                if i == lr - 1:
                    f.write("<tr class=\"last\">")
                    f.write("<td>{}</td><td>{}</td><td>{}</td></tr>".format(r[0], r[1], r[2]))
                else:
                    f.write("<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(r[0], r[1], r[2]))
                i += 1
            f.write("</table>")

            f.write("<h3>The top 100 anomalous files</h3>")
            f.write("<table>")
            f.write("<tr><th>Anomaly Value</th><th>Parent Path</th><th>Filename</th></tr>")
            i = 0
            lr = len(anomalies)
            for r in anomalies:
                if i == lr - 1:
                    f.write("<tr class=\"last\">")
                    f.write("<td>{}</td><td>{}</td><td>{}</td></tr>".format(r[0], r[1], r[2]))
                else:
                    f.write("<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(r[0], r[1], r[2]))
                i += 1
            #for r in anomalies:
            #    f.write("<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(r[0], r[1], r[2]))
            f.write("</table></body></html>")

        return survey_dir
--------------------------------------------------------------------------------