├── img ├── filters.png ├── inspector.png ├── partition.png ├── node_colors.png └── give_colors_to_nodes.png ├── doc ├── Files Graph.png ├── Flow Graph.png ├── HTTP Graph.png ├── Flow Graph.graffle ├── HTTP Graph.graffle ├── Files Graph.graffle ├── DNS Transaction Graph.png └── DNS Transaction Graph.graffle ├── gh ├── __init__.py ├── host.py ├── account.py ├── file.py ├── flow.py ├── dns.py ├── http.py ├── connect.py └── util.py ├── db_stats.py ├── .gitignore ├── db_clear.py ├── db_path.py ├── groovy └── gremlin.groovy ├── db_graph.py ├── README.md └── db_load.py /img/filters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/img/filters.png -------------------------------------------------------------------------------- /img/inspector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/img/inspector.png -------------------------------------------------------------------------------- /img/partition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/img/partition.png -------------------------------------------------------------------------------- /doc/Files Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/doc/Files Graph.png -------------------------------------------------------------------------------- /doc/Flow Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/doc/Flow Graph.png -------------------------------------------------------------------------------- /doc/HTTP Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/doc/HTTP Graph.png -------------------------------------------------------------------------------- /img/node_colors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/img/node_colors.png -------------------------------------------------------------------------------- /doc/Flow Graph.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/doc/Flow Graph.graffle -------------------------------------------------------------------------------- /doc/HTTP Graph.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/doc/HTTP Graph.graffle -------------------------------------------------------------------------------- /doc/Files Graph.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/doc/Files Graph.graffle -------------------------------------------------------------------------------- /doc/DNS Transaction Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/doc/DNS Transaction Graph.png -------------------------------------------------------------------------------- /img/give_colors_to_nodes.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/img/give_colors_to_nodes.png -------------------------------------------------------------------------------- /doc/DNS Transaction Graph.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/doc/DNS Transaction Graph.graffle -------------------------------------------------------------------------------- /gh/__init__.py: -------------------------------------------------------------------------------- 1 | import util 2 | import file 3 | import account 4 | import flow 5 | import dns 6 | import connect 7 | import http 8 | import host 9 | -------------------------------------------------------------------------------- /gh/host.py: -------------------------------------------------------------------------------- 1 | from bulbs.model import Node, Relationship 2 | from bulbs.property import String, Integer, DateTime 3 | from bulbs.utils import current_datetime 4 | 5 | class Host(Node): 6 | 7 | element_type = "host" 8 | 9 | name = String(nullable=False) 10 | color = String(default="#DA456B") 11 | 12 | -------------------------------------------------------------------------------- /gh/account.py: -------------------------------------------------------------------------------- 1 | from bulbs.model import Node, Relationship 2 | from bulbs.property import String, Integer, DateTime 3 | from bulbs.utils import current_datetime 4 | 5 | class Account(Node): 6 | 7 | element_type = "account" 8 | 9 | name = String(nullable=False) 10 | color = String(default="#00FFFF") 11 | 12 | class Requested(Relationship): 13 | label = "requested" 14 | element_type = label 15 | 16 | class Uses(Relationship): 17 | label = "uses" 18 | element_type = label 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /gh/file.py: -------------------------------------------------------------------------------- 1 | from bulbs.model import Node, Relationship 2 | from bulbs.property import String, Integer, DateTime 3 | from bulbs.utils import current_datetime 4 | 5 | class File(Node): 6 | 7 | element_type = "file" 8 | name = String(nullable=False) 9 | color = String(default="#E6E658") 10 | 11 | class Transferred(Relationship): 12 | label = "transferred" 13 | element_type = label 14 | 15 | class SentTo(Relationship): 16 | label = "sentTo" 17 | element_type = label 18 | 19 | class SentBy(Relationship): 20 | label = "sentBy" 21 | element_type = label 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /gh/flow.py: -------------------------------------------------------------------------------- 1 | from bulbs.model import Node, Relationship 2 | from bulbs.property import String, Integer, DateTime 3 | from bulbs.utils import current_datetime 4 | 5 | class Flow(Node): 6 | 7 | element_type = "flow" 8 | name = String(nullable=False) 9 | color = String(default="#5B87F2") 10 | 11 | class Source(Relationship): 12 | label = "source" 13 | element_type = label 14 | 15 | class Dest(Relationship): 16 | label = "dest" 17 | element_type = label 18 | 19 | class ConnectedTo(Relationship): 20 | label = "connectedTo" 21 | element_type = label 22 | 23 | class Contains(Relationship): 24 | label = "contains" 25 | element_type = label 26 | 27 | 28 | 
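The classes above (and in the other gh/ model files) are declarative Bulbs models: each Node subclass names a vertex type and its properties, and each Relationship subclass names an edge label. They only become usable once Connect() (in gh/connect.py, below) registers each one as a proxy on the Graph object. A minimal sketch of how the Host and Flow models are exercised, assuming a running Rexster instance and hypothetical example values:

    from gh.connect import Connect

    g = Connect()

    # get_or_create() keys on the "name" property, so repeated loads do
    # not duplicate vertices.
    src = g.host.get_or_create("name", "10.0.0.1", {"name": "10.0.0.1"})
    flow = g.flow.get_or_create("name", "CxExample1", {"name": "CxExample1"})

    # Relationship proxies create typed edges between existing vertices.
    g.source.create(src, flow)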
-------------------------------------------------------------------------------- /db_stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Our interface to the GraphDB 4 | from bulbs.rexster import Graph, Config, DEBUG 5 | 6 | # Our own modules 7 | from gh.connect import Connect 8 | from gh.util import graph_info 9 | 10 | def graph_stats (g): 11 | info = graph_info(g) 12 | 13 | print 14 | print "**** Graph Stats" 15 | print 16 | print " **** Totals" 17 | print " %15s\t%d" % ("Vertices", info["numv"]) 18 | print " %15s\t%d" % ("Edges", info["nume"]) 19 | print 20 | print " **** Vertices by type:" 21 | for v in info["vinfo"]: 22 | print " %15s\t%d" % (v, info["vinfo"][v]) 23 | print 24 | print " **** Edges by type:" 25 | for e in info["einfo"]: 26 | print " %15s\t%d" % (e, info["einfo"][e]) 27 | 28 | if __name__ == "__main__": 29 | g = Connect() 30 | 31 | graph_stats(g) 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # Emacs 6 | *~ 7 | \#* 8 | .#* 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *,cover 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | 58 | # Sphinx documentation 59 | docs/_build/ 60 | 61 | # PyBuilder 62 | target/ 63 | -------------------------------------------------------------------------------- /db_clear.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from gh.connect import Connect 4 | 5 | from optparse import OptionParser 6 | import sys 7 | import re 8 | 9 | def parse_options() : 10 | parser = OptionParser() 11 | parser.add_option("-f", "--force", dest="force",default=False, 12 | action="store_true", 13 | help="Force clear the DB, without prompting for confirmation.") 14 | (options, args) = parser.parse_args() 15 | return(options, args) 16 | 17 | ### MAIN ### 18 | (options, args) = parse_options() 19 | 20 | g = Connect() 21 | 22 | # don't prompt if we used --force 23 | if options.force: 24 | g.clear() 25 | else: 26 | print "WARNING: This will DELETE ALL DATA in the graph. Are you sure? (y/N) ", 27 | 28 | answer = sys.stdin.readline() 29 | 30 | # Do a case-insensitive match to see if the first char of the response is "y" 31 | if re.match('y', answer, re.I): 32 | g.clear() 33 | else: 34 | print "Graph data NOT DELETED." 
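# Usage sketch (hypothetical session): running "./db_clear.py" prompts for
# confirmation before wiping the graph; "./db_clear.py --force" clears it
# immediately, which is convenient in scripts but unforgiving elsewhere.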
35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /gh/dns.py: -------------------------------------------------------------------------------- 1 | from bulbs.model import Node, Relationship 2 | from bulbs.property import String, Integer, DateTime 3 | from bulbs.utils import current_datetime 4 | 5 | class FQDN(Node): 6 | 7 | element_type = "fqdn" 8 | 9 | name = String(nullable=False) 10 | color = String(default="#8CBC1C") 11 | 12 | class DNSTransaction(Node): 13 | element_type = "dnsTransaction" 14 | 15 | name = String(nullable=False) 16 | color = String(default="#FFBF56") 17 | 18 | class Resolved(Relationship): 19 | label = "resolved" 20 | element_type = label 21 | 22 | class Answer(Relationship): 23 | label = "answer" 24 | element_type = label 25 | 26 | class Queried(Relationship): 27 | label = "queried" 28 | element_type = label 29 | 30 | class QueriedServer(Relationship): 31 | label = "queriedServer" 32 | element_type = label 33 | 34 | class LookedUp(Relationship): 35 | label = "lookedUp" 36 | element_type = label 37 | 38 | class ResolvedTo(Relationship): 39 | label = "resolvedTo" 40 | element_type = label 41 | 42 | -------------------------------------------------------------------------------- /gh/http.py: -------------------------------------------------------------------------------- 1 | from bulbs.model import Node, Relationship 2 | from bulbs.property import String, Integer, DateTime 3 | from bulbs.utils import current_datetime 4 | 5 | class HTTPTransaction(Node): 6 | 7 | element_type = "http_transaction" 8 | name = String(nullable=False) 9 | color = String(default="#5FBD71") 10 | 11 | class UserAgent(Node): 12 | 13 | element_type = "userAgent" 14 | name = String(nullable=False) 15 | color = String(default="#BE844A") 16 | 17 | class URI(Node): 18 | 19 | element_type = "uri" 20 | name = String(nullable=False) 21 | color = String(default="#71985E") 22 | 23 | class Referrer(Relationship): 24 | label = "referrer" 25 | element_type = label 26 | 27 | class HostedBy(Relationship): 28 | label = "hostedBy" 29 | element_type = label 30 | 31 | class RequestedBy(Relationship): 32 | label = "requestedBy" 33 | element_type = label 34 | 35 | class RequestedOf(Relationship): 36 | label = "requestedOf" 37 | element_type = label 38 | 39 | class IdentifiedBy(Relationship): 40 | label = "identifiedBy" 41 | element_type = label 42 | 43 | class Agent(Relationship): 44 | label = "agent" 45 | element_type = label 46 | 47 | class Sent(Relationship): 48 | label = "sent" 49 | element_type = label 50 | 51 | class Received(Relationship): 52 | label = "received" 53 | element_type = label 54 | 55 | 56 | -------------------------------------------------------------------------------- /db_path.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from optparse import OptionParser 4 | import sys 5 | import re 6 | 7 | import gh 8 | 9 | def parse_options() : 10 | parser = OptionParser() 11 | parser.add_option("-v", "--verbose", dest="verbose",default=False, 12 | action="store_true", 13 | help="Print more details about nodes and edges.") 14 | parser.add_option("-d", "--directed", dest="directed", default=False, 15 | action="store_true", 16 | help="Respect relationship direction when finding paths.") 17 | parser.add_option("-m", "--max_hops", dest="max_hops", default=5, type="int", 18 | help="Max number of hops in path.")  # type="int" so CLI values aren't passed to Gremlin as strings 19 | (options, args) = parser.parse_args() 20 | return(options, args) 21 | 22 | #### MAIN
#### 23 | 24 | (options, args) = parse_options() 25 | 26 | if len(args) != 2: 27 | print "ERROR: You must specify both the beginning and ending nodes." 28 | sys.exit(-1) 29 | 30 | g = gh.connect.Connect() 31 | 32 | src = args[0] 33 | dst = args[1] 34 | 35 | # If the destination is all digits, treat it as a specific vertex ID; otherwise treat it as a node type 36 | if re.match("\d+$", dst): 37 | print "Looking up by destination vertex ID" 38 | lookup = gh.util.shortest_path 39 | else: 40 | print "Looking up by destination vertex type" 41 | lookup = gh.util.shortest_path_to_type 42 | 43 | res = lookup(g, src, dst, 44 | directed=options.directed, 45 | max_hops=options.max_hops) 46 | 47 | if res: 48 | for r in res: 49 | print gh.util._v2s(r,verbose=options.verbose), 50 | else: 51 | print "No path." 52 | 53 | -------------------------------------------------------------------------------- /groovy/gremlin.groovy: -------------------------------------------------------------------------------- 1 | // Return info about the count of different types of vertices and edges 2 | def graph_info() { 3 | 4 | numv = g.V.count() 5 | nume = g.E.count() 6 | vinfo = g.V.groupBy{it.element_type}{1}{it.size}.cap.next() 7 | einfo = g.E.groupBy{it.element_type}{1}{it.size}.cap.next() 8 | 9 | return [numv: numv, nume: nume, vinfo: vinfo, einfo: einfo] 10 | } 11 | 12 | // Shortest path between two vertices 13 | def shortest_path(node1_id, node2_id, hops, directed) { 14 | 15 | if (directed) { 16 | p = g.v(node1_id).as("x").outE.inV.dedup.loop("x"){it.loops < hops}{it.object.id == node2_id.toString()}.path.sort{a,b -> a.size() <=> b.size()}.take(1) 17 | } else { 18 | p = g.v(node1_id).as("x").bothE.bothV.dedup.loop("x"){it.loops < hops}{it.object.id == node2_id.toString()}.path.sort{a,b -> a.size() <=> b.size()}.take(1) 19 | } 20 | } 21 | 22 | // Shortest path between two vertices, where the destination is a node 23 | // type, not a specific node ID 24 | def shortest_path_to_type(node1_id, node2_type, hops, directed) { 25 | 26 | if (directed) { 27 | p = g.v(node1_id).as("x").outE.inV.dedup.loop("x"){it.loops < hops}{it.object.element_type == node2_type}.path.sort{a,b -> a.size() <=> b.size()}.take(1) 28 | } else { 29 | p = g.v(node1_id).as("x").bothE.bothV.dedup.loop("x"){it.loops < hops}{it.object.element_type == node2_type}.path.sort{a,b -> a.size() <=> b.size()}.take(1) 30 | } 31 | } 32 | 33 | // Test script to get simple node info 34 | def node_info(node_id) { 35 | n = g.v(node_id).map 36 | n 37 | } 38 | 39 | // Return the list of edges between two given nodes 40 | def edge_list(node1_id, node2_id, edge_type) { 41 | e = g.v(node1_id).outE(edge_type).as("x").inV.filter{it.id == node2_id.toString()}.back("x") 42 | 43 | e 44 | } 45 | -------------------------------------------------------------------------------- /db_graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from gh.connect import Connect 4 | import gh 5 | 6 | from GephiStreamer import Node, Edge, GephiStreamerManager 7 | 8 | from random import choice 9 | 10 | def display_graph(t, nodes, edges): 11 | 12 | if nodes != None: 13 | print "*** Graphing Nodes" 14 | for n in nodes: 15 | node_temp = Node(n._id) 16 | 17 | properties = n.map() 18 | for key in properties: 19 | node_temp.property[key] = properties[key] 20 | 21 | node_temp.property["colour"] = node_temp.property["color"] 22 | node_temp.property["label"] = node_temp.property["name"] 23 | t.add_node(node_temp) 24 | 25 | if edges != None: 26 | print "*** Graphing Edges" 27 | for e in edges: 28
| src = e._outV 29 | dst = e._inV 30 | edge_temp = Edge(src, dst, directed=True) 31 | 32 | properties = e.map() 33 | for key in properties: 34 | edge_temp.property[key] = properties[key] 35 | 36 | t.add_edge(edge_temp) 37 | 38 | t.commit() 39 | 40 | def display_graph_from_list(t, l=None): 41 | if l == None: 42 | return 43 | 44 | for element in l: 45 | if element["_type"] == "vertex": 46 | node_temp = Node(element["_id"]) 47 | 48 | for key in element: 49 | node_temp.property[key] = element[key] 50 | 51 | node_temp.property["label"] = node_temp.property["name"] 52 | node_temp.property["colour"] = node_temp.property["color"] 53 | 54 | t.add_node(node_temp) 55 | elif element["_type"] == "edge": 56 | src = element["_outV"] 57 | dst = element["_inV"] 58 | edge_temp = Edge(src, dst, directed=True) 59 | 60 | for key in element: 61 | edge_temp.property[key] = element[key] 62 | 63 | t.add_edge(edge_temp) 64 | 65 | t.commit() 66 | 67 | if __name__ == "__main__": 68 | g = Connect() 69 | t = GephiStreamerManager() 70 | 71 | print "Getting nodes..." 72 | nodes = g.V 73 | print "Getting edges..." 74 | edges = g.E 75 | 76 | display_graph(t, nodes, edges) 77 | 78 | 79 | -------------------------------------------------------------------------------- /gh/connect.py: -------------------------------------------------------------------------------- 1 | from bulbs.rexster import Graph, Config 2 | from host import Host 3 | from flow import Flow, Source, Dest, Contains, ConnectedTo 4 | from dns import FQDN, DNSTransaction, LookedUp, Queried, Answer, QueriedServer, Resolved, ResolvedTo 5 | from file import File, Transferred, SentTo, SentBy 6 | from http import HTTPTransaction, URI, UserAgent, Referrer, HostedBy, RequestedBy, RequestedOf, IdentifiedBy, Agent, Sent, Received 7 | from account import Account, Requested, Uses 8 | 9 | DEFAULT_URI = "http://localhost:8182/graphs/hunting" 10 | 11 | def Connect(uri=DEFAULT_URI): 12 | """ 13 | Establishes a connection to the graph database backend. It also does 14 | a few standard tasks to set up the models and server side scripts we 15 | depend on, so every utility that calls Connect() has a consistent 16 | environment. 17 | 18 | Returns a Graph() object.
19 | 20 | Example: 21 | 22 | g = Connect() # Connect using the standard default database info 23 | g = Connect("http://localhost:8182/graphs/myDB") # Use a custom DB 24 | """ 25 | config = Config(uri) 26 | g = Graph(config) 27 | 28 | # Set up the node and relationship proxies 29 | g.add_proxy("host", Host) 30 | g.add_proxy("flow", Flow) 31 | g.add_proxy("source", Source) 32 | g.add_proxy("contains", Contains) 33 | g.add_proxy("dest", Dest) 34 | g.add_proxy("connectedTo", ConnectedTo) 35 | g.add_proxy("fqdn", FQDN) 36 | g.add_proxy("dnsTransaction", DNSTransaction) 37 | g.add_proxy("resolved", Resolved) 38 | g.add_proxy("answer", Answer) 39 | g.add_proxy("queried", Queried) 40 | g.add_proxy("queriedServer", QueriedServer) 41 | g.add_proxy("lookedUp", LookedUp) 42 | g.add_proxy("resolvedTo", ResolvedTo) 43 | g.add_proxy("file", File) 44 | g.add_proxy("transferred", Transferred) 45 | g.add_proxy("sentTo", SentTo) 46 | g.add_proxy("sentBy", SentBy) 47 | g.add_proxy("httpTransaction", HTTPTransaction) 48 | g.add_proxy("uri", URI) 49 | g.add_proxy("userAgent", UserAgent) 50 | g.add_proxy("requestedBy", RequestedBy) 51 | g.add_proxy("requestedOf", RequestedOf) 52 | g.add_proxy("hostedBy", HostedBy) 53 | g.add_proxy("identifiedBy", IdentifiedBy) 54 | g.add_proxy("agent", Agent) 55 | g.add_proxy("sent", Sent) 56 | g.add_proxy("received", Received) 57 | g.add_proxy("account", Account) 58 | g.add_proxy("requested", Requested) 59 | g.add_proxy("uses", Uses) 60 | # Load in our groovy scripts 61 | g.scripts.update("groovy/gremlin.groovy") 62 | return g 63 | -------------------------------------------------------------------------------- /gh/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | def _v2s(v, verbose=False): 4 | if not "_type" in v: 5 | return "-" 6 | elif v["_type"] == "vertex": 7 | if verbose: 8 | return "v[%s][%s %s]" % (v["_id"], v["element_type"], v["name"]) 9 | else: 10 | return "v[%s]" % v["_id"] 11 | elif v["_type"] == "edge": 12 | if verbose: 13 | return "e[%s][%s-%s->%s]" % (v["_id"], v["_outV"], v["element_type"], v["_inV"]) 14 | else: 15 | return "e[%s][%s-%s->%s]" % (v["_id"], v["_outV"], v["element_type"], v["_inV"]) 16 | else: 17 | return "[??]" 18 | 19 | def write_graphml(g, filename="/tmp/graph.graphml"): 20 | ''' 21 | Given a Graph object (g), write the GraphML representation to the 22 | given filename. If no filename is given, use the default 23 | "/tmp/graph.graphml". This can be loaded into Gephi or some other 24 | visualization tool. 25 | ''' 26 | gml = g.get_graphml() 27 | f = open(filename,"w") 28 | f.write(gml) 29 | f.close() 30 | 31 | def shortest_path(g, node1_id, node2_id, max_hops=4, directed=False): 32 | ''' 33 | Calls a Groovy script to compute the shortest path between two 34 | nodes that is less than or equal to "max_hops" long. In the event 35 | that there are multiple paths of the same length, it only returns 36 | one of them. Which one it returns is undefined. If the "directed" 37 | attribute is True, the function will follow relationships only in 38 | the direction in which they occur on the graph. If set to False, 39 | it will find paths regardless of the direction of the 40 | relationships. 41 | 42 | Return value is either a list of nodes and edges, or None if no path 43 | was found.
44 | 45 | ''' 46 | script = g.scripts.get("shortest_path") 47 | res = g.gremlin.execute(script, dict(node1_id=node1_id, node2_id=node2_id, hops=max_hops, directed=directed)) 48 | if res: 49 | lst = list(res.results) 50 | # Results will be a list-of-lists. If there are any results, return 51 | # the first list. 52 | if len(lst) > 0: 53 | return lst[0].data 54 | # If we got here, there were no results, so we couldn't find a path. 55 | return None 56 | 57 | def shortest_path_to_type(g, node1_id, node2_type, max_hops=4, directed=False): 58 | ''' 59 | Calls a Groovy script to compute the shortest path between two 60 | nodes that is less than or equal to "max_hops" long, where the destination 61 | node is any node of the type specified in "node2_type". In the event 62 | that there are multiple paths of the same length, it only returns 63 | one of them. Which one it returns is undefined. If the "directed" 64 | attribute is True, the function will follow relationships only in 65 | the direction in which they occur on the graph. If set to False, 66 | it will find paths regardless of the direction of the 67 | relationships. 68 | 69 | Return value is either a list of nodes and edges, or None if no path 70 | was found. 71 | 72 | ''' 73 | script = g.scripts.get("shortest_path_to_type") 74 | res = g.gremlin.execute(script, dict(node1_id=node1_id, node2_type=node2_type, hops=max_hops, directed=directed)) 75 | if res: 76 | lst = list(res.results) 77 | # Results will be a list-of-lists. If there are any results, return 78 | # the first list. 79 | if len(lst) > 0: 80 | return lst[0].data 81 | # If we got here, there were no results, so we couldn't find a path. 82 | return None 83 | 84 | def graph_info(g): 85 | script = g.scripts.get("graph_info") 86 | res = g.gremlin.execute(script) 87 | 88 | return res.results.next().data 89 | 90 | def node_info(g, node_id): 91 | script = g.scripts.get("node_info") 92 | res = g.gremlin.execute(script, dict(node_id=node_id)) 93 | return res.results.next().data 94 | 95 | def edge_list(g, node1_id, node2_id, edge_type): 96 | script = g.scripts.get("edge_list") 97 | res = g.gremlin.query(script, dict(node1_id=node1_id, node2_id=node2_id, edge_type=edge_type)) 98 | return res 99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bro2Graph 2 | 3 | ## Prerequisites 4 | Bro2Graph relies on a few third-party packages, namely the Rexster server (https://github.com/tinkerpop/rexster/wiki/Downloads) and the 'Bulbs' Python interface (http://bulbflow.com). 5 | 6 | To render and interact with the graph, you'll need to install Gephi (http://gephi.github.io). You'll also need the "Give Colors to Nodes" and "Graph Streaming" plugins (see the *Tools -> Plugins -> Available Plugins* menu to download and install these from within Gephi). 7 | 8 | ### Installing & Configuring Rexster 9 | Installation is very simple. Just download the latest version of the Rexster Server package from the above URL. At the time of this writing, that would be v2.6.0. It's all one big Zip file, so just extract it somewhere convenient. 10 | 11 | Once unzipped, you will need to edit the _config/rexster.xml_ file to create the database used by our scripts.
Find the beginning of the `<graphs>` stanza (where, obviously, all the graphs are defined) and insert the following: 12 | 13 | <graph> 14 | <graph-name>hunting</graph-name> 15 | <graph-type>tinkergraph</graph-type> 16 | <graph-mock-tx>true</graph-mock-tx> 17 | <extensions> 18 | <allows> 19 | <allow>tp:gremlin</allow> 20 | </allows> 21 | </extensions> 22 | </graph> 23 | 24 | ### Installing Bulbs 25 | Bulbs is available through PyPI, so you can install it quite easily: 26 | 27 | # pip install bulbs 28 | 29 | ## Getting Data Into the Graph 30 | To load Bro data into the graph, you must first start the graph database backend. After that, you simply run the script to load your Bro log files into that database. This section details the process. 31 | 32 | ### Starting the Graph Database Backend 33 | When you begin your hunting session, the first thing you'll need to do is to start the graph database backend, like so: 34 | 35 | [...]/rexster-server-2.6.0> ./bin/rexster.sh --start 36 | 37 | You'll get a lot of output, but after a few seconds, the database will be initialized and ready for action. 38 | 39 | ### Loading Bro Data Into the Graph 40 | 41 | [...]/Bro2Graph> ./db_load.py -l ~/BroLogDir 42 | 43 | This should go pretty quickly for smaller datasets, but if you have a lot of Bro logs, it could take quite a long time. Hours, even, for larger datasets. 44 | 45 | When it's finished, you'll see something like the following: 46 | 47 | [...]/Bro2Graph> ./db_load.py -l ~/BroLogDir 48 | Reading log files from /Users/bro/BroLogDir 49 | Graphing Flows... 50 | Reading /Users/bro/BroLogDir/conn.log... 51 | Number of events: 18 52 | Graphing Files... 53 | Reading /Users/bro/BroLogDir/files.log... 54 | Number of events: 22 55 | Graphing DNS Transactions... 56 | Reading /Users/bro/BroLogDir/dns.log... 57 | Number of events: 11 58 | Graphing HTTP Transactions... 59 | Reading /Users/bro/BroLogDir/http.log... 60 | Number of events: 22 61 | 62 | **** Graph Stats 63 | 64 | **** Totals 65 | Vertices 144 66 | Edges 414 67 | 68 | **** Vertices by type: 69 | account 2 70 | flow 20 71 | fqdn 19 72 | uri 19 73 | host 27 74 | dnsTransaction 11 75 | file 22 76 | userAgent 2 77 | http_transaction 22 78 | 79 | **** Edges by type: 80 | queried 11 81 | received 21 82 | hostedBy 22 83 | contains 55 84 | requested 2 85 | dest 18 86 | resolved 11 87 | resolvedTo 32 88 | queriedServer 11 89 | connectedTo 14 90 | agent 44 91 | identifiedBy 22 92 | source 18 93 | uses 2 94 | answer 32 95 | sentTo 22 96 | sentBy 22 97 | requestedBy 22 98 | lookedUp 11 99 | requestedOf 22 100 | 101 | Notice that the last part is a summary of the numbers and types of nodes and edges in the graph. You can generate this report at any time by running the *db_stats.py* script. 102 | 103 | ### Deleting the Graph 104 | If for any reason you want to delete all the loaded data and start fresh, you have two options. The default Rexster configuration (above) only stores the data in RAM, so simply restarting Rexster will effectively erase all the data. 105 | 106 | On the other hand, if you have configured Rexster to save the data to disk, or if you just don't feel like restarting the database process, you can run *db_clear.py*. After confirming that you do indeed really want to delete everything, the script will do just that. At the end, you'll have a fresh new database, just as though you had never loaded anything into it. 107 | 108 | ## Visualizing the Graph 109 | After you have loaded your Bro data into the graph, you will naturally want to see what this looks like. In this section, you'll learn how to start Gephi, load the data in, and do some simple things to render the graph in a more readable fashion.
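If you would rather work from a static export instead of live streaming, the *write_graphml* helper in _gh/util.py_ can dump the entire graph to a GraphML file that Gephi opens directly. A minimal sketch, using only the repo's own helpers:

    from gh.connect import Connect
    from gh.util import write_graphml

    g = Connect()
    write_graphml(g, "/tmp/graph.graphml")  # then File -> Open in Gephi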
110 | 111 | Note that Gephi is a **very** full-featured system for interacting and computing with graphs. This document will barely scratch the surface of what you can do with Gephi, and I encourage you to come up with your own cool techniques (and to share them!). 112 | 113 | ### Loading the Data Into Gephi 114 | Start Gephi, and select "New Project" when prompted. This will give you a blank workspace (Gephi calls them "canvases"). 115 | 116 | If you have already installed the necessary plugins, you should see a tab in the left-hand column called *Streaming*. Click that, and then right-click on the *Master Server* entry and set it to *Start*. This makes Gephi listen on the local network for graph streaming connections. 117 | 118 | Now that Gephi is listening for graph data, run *db_graph.py* to send the data from Rexster into Gephi. There are no arguments necessary, as it will just stream the entire graph. This shouldn't take too long, and the output is minimal. If you look at the Gephi window, you'll see a bunch of black lines and dots. Don't worry, we'll make this look a lot better! 119 | 120 | Once you've loaded the data, click back over to Gephi's *Layout* tab, since we'll need that later. 121 | 122 | ### Making the Graph Readable 123 | 124 | To make this graph something approaching readable, we'll start with three simple operations: 125 | 126 | * Assign colors to the different types of nodes 127 | * Size the nodes according to some criteria 128 | * Apply a layout algorithm 129 | 130 | #### Node Colors 131 | The *db_load.py* script automatically assigned color values to different types of nodes when it loaded them into the database. Each type of node is color-coded, according to the chart below. 132 | 133 | ![](img/node_colors.png) 134 | 135 | By default, though, Gephi will not display these colors. The *Give Colors to Nodes* plugin you installed earlier makes this quite simple, though. Simply click on the plugin's icon to the left of the canvas. It looks like this: 136 | 137 | ![](img/give_colors_to_nodes.png) 138 | 139 | This will automatically color the nodes according to their type, though you may not immediately notice this since most of the nodes are still quite small. 140 | 141 | #### Resize the Nodes 142 | When working with graphs, it's very common to want to display the nodes at different sizes, depending on some criteria you compute. This gives you some immediate visual feedback about the nodes, and is quite useful. 143 | 144 | You can size your nodes by any numeric feature that Bro computed (for example, by the number of bytes transferred, if you are looking at network Flow nodes). However, the most common way is probably to size them by the number of edges they have with other nodes. The edge count of a node is referred to as its *degree*. We'll start with this. 145 | 146 | To resize your nodes, simply click the *Ranking* tab on the left side of the Gephi screen. The drop-down menu at the top of the ranking panel lists all of the criteria you can use to size your nodes. Some are computations that Gephi performs for you (like degree), while others are fields drawn from your own data. For now, though, select *Degree* and click *Apply*. 147 | 148 | Now you should start seeing nodes of various sizes, and you can probably also start to see their colors as well. Still, it's a bit of a mess, so let's fix that. 149 | 150 | #### Applying a Layout Algorithm 151 | By default, Gephi displays your graph in a pretty jumbled, hard-to-understand fashion.
You can easily fix this by applying a layout algorithm from the *Layout* panel on the left. 152 | 153 | Gephi comes with a number of predefined layout algorithms, and I'm not going to try to explain them in detail. Most of these are well-known algorithms (at least, if you are a computer scientist who deals with graphs a lot, I guess) and you can Google them if you want to know how they work. 154 | 155 | For now, though, select the *Yifan Hu* algorithm from the drop-down and click *Run*. You should see the nodes on your graph start to move around as the algorithm does its work. Yifan Hu will automatically stop when it thinks it's got everything right, but sometimes running it more than once may help make the graph clearer, with more separation between the clusters of nodes. 156 | 157 | ## Interacting With the Graph 158 | Now that your graph is formatted nicely, you can start to explore it. Gephi has **a lot** of nice functions for this, and I am not going to try to cover them all here. I recommend searching for "Gephi" on YouTube to find some really nice tutorials. 159 | 160 | For now, though, I want to show just two things: How to inspect the values for a specific node, and how to control which types of nodes and/or relationships you show on the graph. 161 | 162 | ### Inspecting the Values of a Node 163 | This is actually pretty simple. Just click the *Edit* icon, which can be found on the toolbar to the left of the canvas, and which looks like this: 164 | 165 | ![](img/inspector.png) 166 | 167 | When the edit control is selected, you can click on any node and Gephi will show you all the features and their associated values. As the name implies, you can also edit these values, but of course these edited values will be valid only inside this Gephi session, and will not be propagated back to the Rexster graph database. 168 | 169 | ### Working With Specific Types of Nodes or Relationships 170 | Although there are a lot of cases where you really do want to see *all* the nodes and relationships in your graph, in most cases you will probably want to view only specific types. Not only will this make Gephi faster (since it has to do less work to show fewer items), but it will also make your graphs easier to understand. 171 | 172 | Gephi makes it easy to show the types of nodes and relationships you want by using a custom filter. Start by locating the *Filters* pane on the right, and navigating to the *Attributes -> Partition* menu, which will look like this: 173 | 174 | ![](img/filters.png) 175 | 176 | You'll see a very long list of attributes by which you can partition the graph (BTW, *partitioning* just means that you can divide the graph up into pieces according to some criteria, and this is the list of the criteria you can use). Scroll down the list until you see *Element Type (Node)*, then drag it down below to where you see a red bulls-eye labeled *Drag filter here*. When you're done, it should look something like this: 177 | 178 | ![](img/partition.png) 179 | 180 | Notice that each node type in your graph is listed here. To control what you want to display on your graph, simply check the boxes next to the node types you want to work with and click *Filter*. After a short time (longer if you have a large graph), you'll see the results reflected in the main canvas. 181 | 182 | Note that when you add or subtract elements, you may want to re-run the layout algorithm.
183 | 184 | With a little work, you can also drag in the *Element Types (Edge)* filter to get more control over what relationships you show for the nodes in your graph, but I'll leave that to you to play with. 185 | -------------------------------------------------------------------------------- /db_load.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import subprocess 4 | import sys 5 | import os 6 | from optparse import OptionParser 7 | import re 8 | import StringIO 9 | import numpy 10 | import pandas 11 | import random 12 | import string 13 | 14 | # Our interface to the GraphDB 15 | from bulbs.rexster import Graph, Config, DEBUG 16 | 17 | # Our own modules 18 | from gh.connect import Connect 19 | from gh.util import graph_info, shortest_path, edge_list 20 | from db_stats import graph_stats 21 | 22 | # A per-log dict that contains the list of fields we want to extract, in order 23 | SUPPORTED_BRO_FIELDS = { 24 | "conn.log": ["ts","uid","id.orig_h","id.orig_p","id.resp_h","id.resp_p","proto","service","duration","orig_bytes","resp_bytes","conn_state","local_orig","missed_bytes","history","orig_pkts","orig_ip_bytes","resp_pkts","resp_ip_bytes","tunnel_parents"], 25 | "dns.log":["ts","uid","id.orig_h","id.orig_p","id.resp_h","id.resp_p","proto","trans_id","query","qclass","qclass_name","qtype","qtype_name","rcode","rcode_name","AA","TC","RD","RA","Z","answers","TTLs","rejected"], 26 | "dpd.log": ["ts","uid","id.orig_h","id.orig_p","id.resp_h","id.resp_p","proto","analyzer","failure_reason"], 27 | "files.log": ["ts","fuid","tx_hosts","rx_hosts","conn_uids","source","depth","analyzers","mime_type","filename","duration","local_orig","is_orig","seen_bytes","total_bytes","missing_bytes","overflow_bytes","timedout","parent_fuid","md5","sha1","sha256","extracted"], 28 | "ftp.log": ["ts","uid","id.orig_h","id.orig_p","id.resp_h","id.resp_p","user","password","command","arg","mime_type","file_size","reply_code","reply_msg","data_channel.passive","data_channel.orig_h","data_channel.resp_h","data_channel.resp_p","fuid"], 29 | "http.log": ["ts","uid","id.orig_h","id.orig_p","id.resp_h","id.resp_p","trans_depth","method","host","uri","referrer","user_agent","request_body_len","response_body_len","status_code","status_msg","info_code","info_msg","filename","tags","username","password","proxied","orig_fuids","orig_mime_types","resp_fuids","resp_mime_types"], 30 | "irc.log": ["ts","uid","id.orig_h","id.orig_p","id.resp_h","id.resp_p","nick","user","command","value","addl","dcc_file_name","dcc_file_size","dcc_mime_type","fuid"], 31 | "notice.log": ["ts","uid","id.orig_h","id.orig_p","id.resp_h","id.resp_p","fuid","file_mime_type","file_desc","proto","note","msg","sub","src","dst","p","n","peer_descr","actions","suppress_for","dropped","remote_location.country_code","remote_location.region","remote_location.city","remote_location.latitude","remote_location.longitude"], 32 | "smtp.log": ["ts","uid","id.orig_h","id.orig_p","id.resp_h","id.resp_p","trans_depth","helo","mailfrom","rcptto","date","from","to","reply_to","msg_id","in_reply_to","subject","x_originating_ip","first_received","second_received","last_reply","path","user_agent","tls","fuids","is_webmail"], 33 | "snmp.log": ["ts","uid","id.orig_h","id.orig_p","id.resp_h","id.resp_p","duration","version","community","get_requests","get_bulk_requests","get_responses","set_requests","display_string","up_since"], 34 | "ssh.log": 
["ts","uid","id.orig_h","id.orig_p","id.resp_h","id.resp_p","status","direction","client","server","remote_location.country_code","remote_location.region","remote_location.city","remote_location.latitude","remote_location.longitude"] 35 | } 36 | 37 | FIELDS_STRING = ["TTLs"] 38 | FIELDS_INTEGER = ["id.orig_p","id.resp_p","orig_bytes","resp_bytes","missed_bytes","orig_pkts","orig_ip_bytes","resp_pkts","resp_ip_bytes","qclass","qtype","trans_id","rcode","Z","depth","seen_bytes","total_bytes","missing_bytes","file_size","reply_code","data_channel.resp_p","trans_depth","request_body_len","response_body_len","status_code","info_code","dcc_file_size"] 39 | FIELDS_FLOAT = ["duration","lease_type"] 40 | 41 | # Output date format for timestamps 42 | DATE_FMT="%FT%H:%M:%SZ" 43 | 44 | BRO_CUT_CMD=["bro-cut","-U",DATE_FMT] 45 | 46 | def unique_id(size=17): 47 | return ''.join(random.choice(string.ascii_lowercase + string.ascii_uppercase + string.digits) for _ in range(size)) 48 | 49 | def is_IP(s): 50 | # this is pretty dumb. If it looks like an IPv4 address, fine. But a 51 | # good IPv6 regex is ridiculously complex. I took a shortcut, since I 52 | # this routine is only ever called to disambiguate IPs from hostnames or 53 | # FQDNs. If there's even a single ":", we'll just assume this must be 54 | # IPv6, since neither hostnames nor FQDNs can contain that char. 55 | # 56 | # Sorry. 57 | return( re.match("\d+.\d+.\d+.\d+$", s) != None or re.search(":",s) != None) 58 | 59 | 60 | def extend_list(lst, val, length): 61 | ''' 62 | Given a list "lst", extend it to length "length". Each new item will 63 | be composed of the value "val". Of course, if "lst" is already "length" 64 | size or longer, just return and do nothing. 65 | ''' 66 | if len(lst) >= length: 67 | return lst 68 | else: 69 | lst.extend([val] * (length - len(lst))) 70 | return lst 71 | 72 | def parse_options() : 73 | parser = OptionParser() 74 | parser.add_option("-l", "--log-dir", dest="logdir", 75 | help="Bro log file directory to parse.") 76 | parser.add_option("-q", "--quiet", dest="quiet", 77 | help="Suppress unecessary output (run quietly)") 78 | parser.add_option("-o", "--output", dest="outputdir",default=".", 79 | help="Output directory (will be created if necessary)") 80 | parser.add_option("-s", "--sample", dest="sample",default=False,type="int", 81 | help="Randomly select SAMPLE # of connections and associated log entries.") 82 | 83 | (options, args) = parser.parse_args() 84 | return(options, args) 85 | 86 | def readlog(file, connection_ids=False): 87 | 88 | output = "" 89 | 90 | logtype = file 91 | 92 | logfile = "%s/%s" % (options.logdir,file) 93 | 94 | print "Reading %s..." % logfile 95 | 96 | tmp_bro_cut_cmd = BRO_CUT_CMD 97 | tmp_bro_cut_cmd = tmp_bro_cut_cmd + SUPPORTED_BRO_FIELDS[logtype] 98 | 99 | # Create a job that just cats the log file 100 | p1 = subprocess.Popen(['cat',logfile], stdout=subprocess.PIPE) 101 | 102 | # This is the bro-cut job, reading the "cat" command output 103 | p2 = subprocess.Popen(tmp_bro_cut_cmd, stdin=p1.stdout, stdout=subprocess.PIPE) 104 | 105 | p1.stdout.close() 106 | 107 | # Now we're going to use the "pandas" package to create a dataframe 108 | # out of the log data. Dataframes greatly simplify the tasks of cleaning 109 | # the data. 110 | # 111 | # StringIO treats the string as a fake file, so we can use pandas to 112 | # create a dataframe out of the string directly, without having to write 113 | # it to disk first. 
114 | brodata = StringIO.StringIO(p2.communicate()[0]) 115 | 116 | df = pandas.DataFrame.from_csv(brodata, sep="\t", parse_dates=False, header=None, index_col=None) 117 | 118 | df.columns = SUPPORTED_BRO_FIELDS[logtype] 119 | 120 | # If this is the connection log, and if we've requested a random sample, 121 | # cut the dataframe down to ONLY contain that random sample 122 | if logtype == "conn.log" and options.sample: 123 | print "Size before sampling: %d" % len(df.index) 124 | df = df.sample(n=options.sample) 125 | df.reset_index(drop=True, inplace=True) 126 | print "Size after sampling: %d" % len(df.index) 127 | elif logtype == "files.log" and connection_ids: 128 | df = df[df.conn_uids.isin(connection_ids)] 129 | df.reset_index(drop=True, inplace=True) 130 | elif logtype != "conn.log" and connection_ids and "uid" in df.columns: 131 | # If this is any other type of log AND we have an explicit list of 132 | # connection IDs we sampled AND this is a file that has the "uid" 133 | # data to pair it with the conn.log, pare down the dataframe to 134 | # only include those rows with the right uids 135 | df = df[df.uid.isin(connection_ids)] 136 | df.reset_index(drop=True, inplace=True) 137 | # It is entirely possible that this sampling may mean that some 138 | # log files no longer have any output (for example, you only sampled 139 | # a list of connections, none of which were DHCP). 140 | 141 | 142 | df.replace(to_replace=["(empty)","-"], value=["",""], inplace=True) 143 | 144 | # Some columns need to be forced into type String, primarily because they 145 | # may contain lists and we always call split() on them, but they look like 146 | # integers, so numpy tries to store them that way. 147 | for field in FIELDS_STRING: 148 | if field in df.columns: 149 | df[field] = df[field].astype(str) 150 | 151 | # Likewise, many rows need to be stored as Integers, but numpy thinks 152 | # they may be strings (probably because a legal value is "-"). This is 153 | # the list of the fields we know need to be converted 154 | for field in FIELDS_INTEGER: 155 | if field in df.columns: 156 | df[field] = df[field].replace("",-1) 157 | df[field] = df[field].astype(int) 158 | 159 | # Finally, convert the Float fields 160 | for field in FIELDS_FLOAT: 161 | if field in df.columns: 162 | df[field] = df[field].replace("",numpy.nan) 163 | df[field] = df[field].astype(float) 164 | 165 | if logtype == "conn.log": 166 | # if we're processing the conn.log AND we've requested random samples, 167 | # create a list of the sampled connection IDs and update the 168 | # connection_ids parameter. Otherwise, leave it the same. 169 | if options.sample: 170 | for id in df["uid"].tolist(): 171 | connection_ids.append(id) 172 | 173 | return df 174 | 175 | def graph_flows(g, df_conn): 176 | # Iterate through all the flows 177 | for con in df_conn.index: 178 | # For each flow, create new Host objects if necessary. Then create a 179 | # new Flow, and add the relationships between the Hosts and the Flow 180 | 181 | # Create the source & dest nodes 182 | src_host = g.host.get_or_create("name", 183 | df_conn.loc[con]["id.orig_h"], 184 | {"name": df_conn.loc[con]["id.orig_h"], 185 | "address":df_conn.loc[con]["id.orig_h"] 186 | }) 187 | dst_host = g.host.get_or_create("name", 188 | df_conn.loc[con]["id.resp_h"], 189 | {"name": df_conn.loc[con]["id.resp_h"], 190 | "address":df_conn.loc[con]["id.resp_h"] 191 | }) 192 | 193 | # If the flow is marked "local_orig", we need to update this feature 194 | # on the source host. 
We can't do this at creation time because we 195 | # might have seen this host before in another context, and created a 196 | # node for it without knowing it was a local host. 197 | if df_conn.loc[con]["local_orig"] == "T": 198 | src_host.local = "T" 199 | src_host.save() 200 | 201 | # Create the Flow object. Since we can run the same log file through 202 | # multiple times, or observe the same flow from different log files, 203 | # assume flows with the same name are actually the same flow. 204 | 205 | flowname = df_conn.loc[con]["uid"] 206 | # Create the flow node, with all the rich data 207 | properties = dict(df_conn.loc[con]) 208 | # Manually assign the "name" property 209 | properties["name"] = flowname 210 | # Take out the info about the source & dest IPs, since we should be 211 | # getting them from the connected host nodes 212 | del properties["id.orig_h"] 213 | del properties["id.resp_h"] 214 | 215 | flow = g.flow.get_or_create("name", flowname, properties) 216 | 217 | # Create the edges for this flow, if they don't already exist 218 | nodes = flow.inV("source") 219 | if nodes == None or not (src_host in nodes): 220 | g.source.create(src_host, flow) 221 | 222 | nodes = flow.outV("dest") 223 | if nodes == None or not (dst_host in nodes): 224 | g.dest.create(flow, dst_host) 225 | 226 | # Make a direct link between the src and dest hosts, as this 227 | # is a common analysis task. It doesn't *always* make sense 228 | # to go through the flows. 229 | neighbors = src_host.outV("connectedTo") 230 | if neighbors == None or not (dst_host in neighbors): 231 | e = g.connectedTo.create(src_host, dst_host) 232 | e.weight=1 233 | e.save() 234 | else: 235 | edges = edge_list(g, src_host._id, dst_host._id, "connectedTo") 236 | # There should only be one of these edges, and we already know 237 | # it exists, so it's safe to just take the first one 238 | edge = edges.next() 239 | g.connectedTo.update(edge._id, weight=(edge.weight + 1)) 240 | 241 | def graph_dns(g, df_dns): 242 | # Iterate through all the DNS transactions 243 | for i in df_dns.index: 244 | # Create the DNSTransaction node 245 | # name = str(df_dns.loc[i]["trans_id"]) 246 | name = "%d - %s - %s" % (df_dns.loc[i]["trans_id"], 247 | df_dns.loc[i]["qtype_name"], 248 | df_dns.loc[i]["query"]) 249 | timestamp = df_dns.loc[i]["ts"] 250 | flowname = df_dns.loc[i]["uid"] 251 | 252 | # Pick out the properties that belong on the transaction and add 253 | # them 254 | transaction = g.dnsTransaction.create(name=name, 255 | ts=df_dns.loc[i]["ts"], 256 | proto=df_dns.loc[i]["proto"], 257 | orig_p=df_dns.loc[i]["id.orig_p"], 258 | resp_p=df_dns.loc[i]["id.resp_p"], 259 | qclass=df_dns.loc[i]["qclass"], 260 | qclass_name=df_dns.loc[i]["qclass_name"], 261 | qtype=df_dns.loc[i]["qtype"], 262 | qtype_name=df_dns.loc[i]["qtype_name"], 263 | rcode=df_dns.loc[i]["rcode"], 264 | rcode_name=df_dns.loc[i]["rcode_name"], 265 | AA=df_dns.loc[i]["AA"], 266 | TC=df_dns.loc[i]["TC"], 267 | RD=df_dns.loc[i]["RD"], 268 | RA=df_dns.loc[i]["RA"], 269 | Z=df_dns.loc[i]["Z"], 270 | rejected=df_dns.loc[i]["rejected"]) 271 | 272 | # Create a node + edge for the query, if there is one in the log 273 | if df_dns.loc[i]["query"]: 274 | fqdn = g.fqdn.get_or_create("name", df_dns.loc[i]["query"], 275 | {"name":df_dns.loc[i]["query"], 276 | "domain":df_dns.loc[i]["query"]}) 277 | g.lookedUp.create(transaction,fqdn) 278 | 279 | # Now create the nodes and edges for the domains or addresses in 280 |
There can be multiple 281 | # answers, so split this into a list and create one node + edge 282 | # for each. 283 | # 284 | # There should also be one TTL per answer, so we'll split those and 285 | # use array indices to tie them together. The arrays are supposed 286 | # to always be the same length, but maybe sometimes they are 287 | # not. We'll force the issue by extending the TTL list to be 288 | # the same size as the address list. 289 | if df_dns.loc[i]["answers"]: 290 | addrs = df_dns.loc[i]["answers"].split(",") 291 | ttls = df_dns.loc[i]["TTLs"].split(",") 292 | ttls = extend_list(ttls, ttls[len(ttls)-1],len(addrs)) 293 | 294 | for i in range(len(addrs)): 295 | ans = addrs[i] 296 | ttl = float(ttls[i]) 297 | # DNS answers can be either IPs or other names. Figure 298 | # out which type of node to create for each answer. 299 | if is_IP(ans): 300 | node = g.host.get_or_create("name",ans,{"name":ans, 301 | "address":ans}) 302 | else: 303 | node = g.fqdn.get_or_create("name",ans,{"name":ans, 304 | "address":ans}) 305 | 306 | g.resolvedTo.create(fqdn, node, {"ts":timestamp}) 307 | g.answer.create(transaction, node, {"TTL": ttl}) 308 | 309 | # Create a node + edge for the source of the DNS transaction 310 | # (the client host) 311 | if df_dns.loc[i]["id.orig_h"]: 312 | src = g.host.get_or_create("name", df_dns.loc[i]["id.orig_h"], 313 | {"name": df_dns.loc[i]["id.orig_h"], 314 | "address":df_dns.loc[i]["id.orig_h"]}) 315 | g.queried.create(src, transaction) 316 | 317 | # Create a node + edge for the destination of the DNS transaction 318 | # (the DNS server) 319 | if df_dns.loc[i]["id.resp_h"]: 320 | dst = g.host.get_or_create("name", df_dns.loc[i]["id.resp_h"], 321 | {"name": df_dns.loc[i]["id.resp_h"], 322 | "address":df_dns.loc[i]["id.resp_h"]}) 323 | g.queriedServer.create(transaction,dst) 324 | 325 | 326 | # Now connect this transaction to the correct flow 327 | flows = g.flow.index.lookup(name=flowname) 328 | if flows == None: 329 | # print "ERROR: Flow '%s' does not exist" % flowname 330 | pass 331 | else: 332 | # lookup returns a generator, but since there should only be one 333 | # flow with this name, just take the first one 334 | flow = flows.next() 335 | nodes = flow.outV("contains") 336 | if nodes == None or not (transaction in nodes): 337 | edge = g.contains.create(flow, transaction) 338 | 339 | 340 | # Associate the src host with the FQDN it resolved. Since a host 341 | # can resolve a domain multiple times, we'll also keep track of a 342 | # "weight" feature to count how many times this happened. 343 | if df_dns.loc[i]["query"]: 344 | neighbors = src.outV("resolved") 345 | if neighbors == None or not (fqdn in neighbors): 346 | e = g.resolved.create(src, fqdn) 347 | e.weight=1 348 | e.save() 349 | else: 350 | edges = edge_list(g, src._id, fqdn._id, "resolved") 351 | # There should only be one of these edges, and we already know 352 | # it exists, so it's safe to just take the first one 353 | edge = edges.next() 354 | g.resolved.update(edge._id, weight=(edge.weight + 1)) 355 | 356 | def graph_files(g, df_files): 357 | # Iterate through all the flows 358 | for i in df_files.index: 359 | # Create the file node 360 | name = str(df_files.loc[i]["fuid"]) 361 | timestamp = df_files.loc[i]["ts"] 362 | flows = df_files.loc[i]["conn_uids"] 363 | 364 | # Create the file object. Note that this is more like a file transfer 365 | # transaction than a static object just for that file. There can be 366 | # more than one node with the same MD5 hash, for example. 
Clearly, 367 | # those are the same file in the real world, but not in our graph. 368 | # 369 | # However, it is possible to actually have the same file transaction 370 | # show up in the Bro logs multiple times. AFAICT, this is mostly 371 | # due to things like timeouts, where Bro records the file transfer 372 | # start and then sends another log later that says that the xfer 373 | # failed. We need to always check that there is only one 374 | # File node for each actual transaction, but we'll use 375 | # the fields from the most recent log, assuming things that Bro 376 | # logs last will be more accurate. 377 | fileobj = g.file.get_or_create("name", name, {"name":name}) 378 | 379 | fileobj.fuid=df_files.loc[i]["fuid"] 380 | fileobj.source=df_files.loc[i]["source"] 381 | fileobj.depth=df_files.loc[i]["depth"] 382 | fileobj.analyzers=df_files.loc[i]["analyzers"] 383 | fileobj.mime_type=df_files.loc[i]["mime_type"] 384 | fileobj.filename=df_files.loc[i]["filename"] 385 | fileobj.duration=df_files.loc[i]["duration"] 386 | fileobj.seen_bytes=df_files.loc[i]["seen_bytes"] 387 | fileobj.total_bytes=df_files.loc[i]["total_bytes"] 388 | fileobj.missing_bytes=df_files.loc[i]["missing_bytes"] 389 | fileobj.overflow_bytes=df_files.loc[i]["overflow_bytes"] 390 | fileobj.timedout=df_files.loc[i]["timedout"] 391 | fileobj.md5=df_files.loc[i]["md5"] 392 | fileobj.sha1=df_files.loc[i]["sha1"] 393 | fileobj.sha256=df_files.loc[i]["sha256"] 394 | fileobj.extracted=df_files.loc[i]["extracted"] 395 | fileobj.save() 396 | 397 | # Now connect this to the flow(s) it is associated with. 398 | for f in flows.split(","): 399 | flow = g.flow.get_or_create("name", f, {"name":f}) 400 | g.contains.create(flow, fileobj) 401 | 402 | # Connect it to the src and dest hosts in the file xfer. Note that 403 | # there can be more than one host listed for each side of the 404 | # xfer (don't ask me how). 405 | for h in df_files.loc[i]["tx_hosts"].split(","): 406 | src = g.host.get_or_create("name", h, 407 | {"name":h, 408 | "address":h}) 409 | g.sentBy.create(fileobj,src,{"ts":timestamp, 410 | "is_orig":df_files.loc[i]["is_orig"]}) 411 | # Also have this extra bit of info about whether the originating 412 | # host is part of a local subnet. We should make sure that is 413 | # recorded on the host object.
414 | src.local = df_files.loc[i]["local_orig"] 415 | src.save() 416 | 417 | for h in df_files.loc[i]["rx_hosts"].split(","): 418 | dst = g.host.get_or_create("name", h, 419 | {"name":h, 420 | "address":h}) 421 | g.sentTo.create(dst, fileobj,{"ts":timestamp}) 422 | 423 | def graph_http(g, df_http): 424 | # Iterate through all the HTTP log entries 425 | for i in df_http.index: 426 | # Create the HTTPTransaction node 427 | http = g.httpTransaction.create(name="H" + unique_id(), 428 | ts=df_http.loc[i]["ts"], 429 | resp_p=df_http.loc[i]["id.resp_p"], 430 | trans_depth=df_http.loc[i]["trans_depth"], 431 | method=df_http.loc[i]["method"].upper(), 432 | request_body_len=df_http.loc[i]["request_body_len"], 433 | response_body_len=df_http.loc[i]["response_body_len"], 434 | status_code=df_http.loc[i]["status_code"], 435 | status_msg=df_http.loc[i]["status_msg"], 436 | info_code=df_http.loc[i]["info_code"], 437 | info_msg=df_http.loc[i]["info_msg"], 438 | filename=df_http.loc[i]["filename"], 439 | tags=df_http.loc[i]["tags"], 440 | proxied=df_http.loc[i]["proxied"]) 441 | 442 | # Now connect this to the flow it's associated with 443 | flowname = df_http.loc[i]["uid"] 444 | flow = g.flow.get_or_create("name", flowname, {"name":flowname}) 445 | g.contains.create(flow, http) 446 | 447 | # Now connect it to the hosts on each side of the transaction 448 | src_addr = df_http.loc[i]["id.orig_h"] 449 | dst_addr = df_http.loc[i]["id.resp_h"] 450 | 451 | src_host = g.host.get_or_create("name", src_addr, {"name":src_addr}) 452 | dst_host = g.host.get_or_create("name", dst_addr, {"name":dst_addr}) 453 | 454 | g.requestedBy.create(src_host, http) 455 | g.requestedOf.create(http, dst_host) 456 | 457 | # Connect to the server host. This can be either a domain name or 458 | # an IP address. If it's a domain, we need to attach to an FQDN node. 459 | # If it's an IP, we need a Host node. 460 | h = df_http.loc[i]["host"] 461 | if is_IP(h): 462 | host = g.host.get_or_create("name", h, {"name":h}) 463 | else: 464 | host = g.fqdn.get_or_create("name", h, {"name":h}) 465 | 466 | g.hostedBy.create(http, host) 467 | 468 | # Now create and link to a URI node for the requested resource 469 | u = df_http.loc[i]["uri"] 470 | uri = g.uri.get_or_create("name", u, {"name":u}) 471 | g.identifiedBy.create(http, uri) 472 | 473 | # Link to the UserAgent node 474 | ua = df_http.loc[i]["user_agent"] 475 | user_agent = g.userAgent.get_or_create("name", ua, {"name":ua}) 476 | 477 | # Link to the HTTP transaction 478 | g.agent.create(http, user_agent) 479 | # Link to the host that sent the request 480 | g.agent.create(src_host, user_agent) 481 | 482 | # Now link to the File objects transferred by this transaction. 483 | # Each file object also has an associated MIME type. These are 484 | # encoded as two sets of paired lists: orig_fuids/orig_mime_types 485 | # and resp_fuids/resp_mime_types. In the event that the fuid list 486 | # is longer than the MIME type list (indicating that the last values 487 | # in the fuid list all have the same MIME type), we will extend the 488 | # mime type list to explicitly name all the mime types. It makes it 489 | # simpler to process the paired lists if we know they are the same 490 | # size.
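# (Worked example with hypothetical fuids: orig_fuids = ["Fa1","Fb2","Fc3"]
# and orig_mime_types = ["text/html"] yield
# orig_mime_types == ["text/html","text/html","text/html"] after
# extend_list(), so one index can safely walk both lists below.)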
491 | orig_fuids = df_http.loc[i]["orig_fuids"].split(",") 492 | orig_mime_types = df_http.loc[i]["orig_mime_types"].split(",") 493 | orig_mime_types = extend_list(orig_mime_types, 494 | orig_mime_types[len(orig_mime_types)-1], 495 | len(orig_fuids)) 496 | 497 | resp_fuids = df_http.loc[i]["resp_fuids"].split(",") 498 | resp_mime_types = df_http.loc[i]["resp_mime_types"].split(",") 499 | resp_mime_types = extend_list(resp_mime_types, 500 | resp_mime_types[len(resp_mime_types)-1], 501 | len(resp_fuids)) 502 | 503 | if orig_fuids != ['']: 504 | for x in range(len(orig_fuids)): 505 | fuid = orig_fuids[x] 506 | mime_type = orig_mime_types[x] 507 | 508 | f = g.file.get_or_create("name", fuid, {"name":fuid}) 509 | g.sent.create(http, f, {"mime_type": mime_type}) 510 | 511 | if resp_fuids != ['']: 512 | for x in range(len(resp_fuids)): 513 | try: 514 | fuid = resp_fuids[x] 515 | mime_type = resp_mime_types[x] 516 | f = g.file.get_or_create("name", fuid, {"name":fuid}) 517 | g.received.create(http, f, {"mime_type": mime_type}) 518 | except Exception, e: 519 | print "****" 520 | print "Exception: %s" % e 521 | print 522 | print resp_fuids 523 | print 524 | print "x: %s fuid: %s" % (x, fuid) 525 | sys.exit(-1) 526 | 527 | # Create the user account object and relationship 528 | username = df_http.loc[i]["username"] 529 | password = df_http.loc[i]["password"] 530 | if username: 531 | account = g.account.get_or_create("name", username, {"name":username}) 532 | g.requested.create(account, http, {"password":password}) 533 | g.uses.create(account, src_host) 534 | 535 | ##### Main ##### 536 | 537 | (options, args) = parse_options() 538 | 539 | if not options.logdir: 540 | print "Error: Must specify the log directory with -l or --log-dir" 541 | sys.exit(-1) 542 | 543 | if not os.path.exists(options.logdir): 544 | print "Error: Directory %s does not exist" % options.logdir 545 | sys.exit(-1) 546 | 547 | if not os.path.exists(options.outputdir): 548 | os.mkdir(options.outputdir) 549 | 550 | if not options.quiet: 551 | print "Reading log files from %s" % options.logdir 552 | 553 | # Now we can start to read data and populate the graph. 554 | 555 | g = Connect() 556 | 557 | # Now read the types of logs we know how to process, extract the relevant 558 | # data and add it to the graph 559 | 560 | connection_ids = list() 561 | 562 | print "Graphing Flows..." 563 | df_conn = readlog("conn.log", connection_ids) 564 | print "Number of events: %d" % len(df_conn.index) 565 | graph_flows(g, df_conn) 566 | 567 | print "Graphing Files..." 568 | df_files = readlog("files.log", connection_ids) 569 | print "Number of events: %d" % len(df_files.index) 570 | graph_files(g, df_files) 571 | 572 | print "Graphing DNS Transactions..." 573 | df_dns = readlog("dns.log", connection_ids) 574 | print "Number of events: %d" % len(df_dns.index) 575 | graph_dns(g, df_dns) 576 | 577 | print "Graphing HTTP Transactions..." 578 | df_http = readlog("http.log", connection_ids) 579 | print "Number of events: %d" % len(df_http.index) 580 | graph_http(g, df_http) 581 | 582 | # Print some basic info about the graph so we know we did some real work 583 | graph_stats(g) 584 | 585 | 586 | 587 | 588 | 589 | 590 | --------------------------------------------------------------------------------