├── img ├── filters.png ├── inspector.png ├── partition.png ├── node_colors.png └── give_colors_to_nodes.png ├── doc ├── Files Graph.png ├── Flow Graph.png ├── HTTP Graph.png ├── Flow Graph.graffle ├── HTTP Graph.graffle ├── Files Graph.graffle ├── DNS Transaction Graph.png └── DNS Transaction Graph.graffle ├── gh ├── __init__.py ├── host.py ├── account.py ├── file.py ├── flow.py ├── dns.py ├── http.py ├── connect.py └── util.py ├── db_stats.py ├── .gitignore ├── db_clear.py ├── db_path.py ├── groovy └── gremlin.groovy ├── db_graph.py ├── README.md └── db_load.py /img/filters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/img/filters.png -------------------------------------------------------------------------------- /img/inspector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/img/inspector.png -------------------------------------------------------------------------------- /img/partition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/img/partition.png -------------------------------------------------------------------------------- /doc/Files Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/doc/Files Graph.png -------------------------------------------------------------------------------- /doc/Flow Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/doc/Flow Graph.png -------------------------------------------------------------------------------- /doc/HTTP Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/doc/HTTP Graph.png -------------------------------------------------------------------------------- /img/node_colors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/img/node_colors.png -------------------------------------------------------------------------------- /doc/Flow Graph.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/doc/Flow Graph.graffle -------------------------------------------------------------------------------- /doc/HTTP Graph.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/doc/HTTP Graph.graffle -------------------------------------------------------------------------------- /doc/Files Graph.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/doc/Files Graph.graffle -------------------------------------------------------------------------------- /doc/DNS Transaction Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/doc/DNS Transaction Graph.png -------------------------------------------------------------------------------- /img/give_colors_to_nodes.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/img/give_colors_to_nodes.png -------------------------------------------------------------------------------- /doc/DNS Transaction Graph.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidJBianco/Bro2Graph/HEAD/doc/DNS Transaction Graph.graffle -------------------------------------------------------------------------------- /gh/__init__.py: -------------------------------------------------------------------------------- 1 | import util 2 | import file 3 | import account 4 | import flow 5 | import dns 6 | import connect 7 | import http 8 | import host 9 | -------------------------------------------------------------------------------- /gh/host.py: -------------------------------------------------------------------------------- 1 | from bulbs.model import Node, Relationship 2 | from bulbs.property import String, Integer, DateTime 3 | from bulbs.utils import current_datetime 4 | 5 | class Host(Node): 6 | 7 | element_type = "host" 8 | 9 | name = String(nullable=False) 10 | color = String(default="#DA456B") 11 | 12 | -------------------------------------------------------------------------------- /gh/account.py: -------------------------------------------------------------------------------- 1 | from bulbs.model import Node, Relationship 2 | from bulbs.property import String, Integer, DateTime 3 | from bulbs.utils import current_datetime 4 | 5 | class Account(Node): 6 | 7 | element_type = "account" 8 | 9 | name = String(nullable=False) 10 | color = String(default="#00FFFF") 11 | 12 | class Requested(Relationship): 13 | label = "requested" 14 | element_type = label 15 | 16 | class Uses(Relationship): 17 | label = "uses" 18 | element_type = label 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /gh/file.py: -------------------------------------------------------------------------------- 1 | from bulbs.model import Node, Relationship 2 | from bulbs.property import String, Integer, DateTime 3 | from bulbs.utils import current_datetime 4 | 5 | class File(Node): 6 | 7 | element_type = "file" 8 | name = String(nullable=False) 9 | color = String(default="#E6E658") 10 | 11 | class Transferred(Relationship): 12 | label = "transferred" 13 | element_type = label 14 | 15 | class SentTo(Relationship): 16 | label = "sentTo" 17 | element_type = label 18 | 19 | class SentBy(Relationship): 20 | label = "sentBy" 21 | element_type = label 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /gh/flow.py: -------------------------------------------------------------------------------- 1 | from bulbs.model import Node, Relationship 2 | from bulbs.property import String, Integer, DateTime 3 | from bulbs.utils import current_datetime 4 | 5 | class Flow(Node): 6 | 7 | element_type = "flow" 8 | name = String(nullable=False) 9 | color = String(default="#5B87F2") 10 | 11 | class Source(Relationship): 12 | label = "source" 13 | element_type = label 14 | 15 | class Dest(Relationship): 16 | label = "dest" 17 | element_type = label 18 | 19 | class ConnectedTo(Relationship): 20 | label = "connectedTo" 21 | element_type = label 22 | 23 | class Contains(Relationship): 24 | label = "contains" 25 | element_type = label 26 | 27 | 28 | 
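The classes above (and in the other gh/ model files) are declarative Bulbs models: each Node subclass names a vertex type and its properties, and each Relationship subclass names an edge label. They only become usable once Connect() (in gh/connect.py, below) registers each one as a proxy on the Graph object. A minimal sketch of how the Host and Flow models are exercised, assuming a running Rexster instance and hypothetical example values:

    from gh.connect import Connect

    g = Connect()

    # get_or_create() keys on the "name" property, so repeated loads do
    # not duplicate vertices.
    src = g.host.get_or_create("name", "10.0.0.1", {"name": "10.0.0.1"})
    flow = g.flow.get_or_create("name", "CxExample1", {"name": "CxExample1"})

    # Relationship proxies create typed edges between existing vertices.
    g.source.create(src, flow)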
-------------------------------------------------------------------------------- /db_stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Our interface to the GraphDB 4 | from bulbs.rexster import Graph, Config, DEBUG 5 | 6 | # Our own modules 7 | from gh.connect import Connect 8 | from gh.util import graph_info 9 | 10 | def graph_stats (g): 11 | info = graph_info(g) 12 | 13 | print 14 | print "**** Graph Stats" 15 | print 16 | print " **** Totals" 17 | print " %15s\t%d" % ("Vertices", info["numv"]) 18 | print " %15s\t%d" % ("Edges", info["nume"]) 19 | print 20 | print " **** Vertices by type:" 21 | for v in info["vinfo"]: 22 | print " %15s\t%d" % (v, info["vinfo"][v]) 23 | print 24 | print " **** Edges by type:" 25 | for e in info["einfo"]: 26 | print " %15s\t%d" % (e, info["einfo"][e]) 27 | 28 | if __name__ == "__main__": 29 | g = Connect() 30 | 31 | graph_stats(g) 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # Emacs 6 | *~ 7 | \#* 8 | .#* 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *,cover 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | 58 | # Sphinx documentation 59 | docs/_build/ 60 | 61 | # PyBuilder 62 | target/ 63 | -------------------------------------------------------------------------------- /db_clear.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from gh.connect import Connect 4 | 5 | from optparse import OptionParser 6 | import sys 7 | import re 8 | 9 | def parse_options() : 10 | parser = OptionParser() 11 | parser.add_option("-f", "--force", dest="force",default=False, 12 | action="store_true", 13 | help="Force clear the DB, without prompting for confirmation.") 14 | (options, args) = parser.parse_args() 15 | return(options, args) 16 | 17 | ### MAIN ### 18 | (options, args) = parse_options() 19 | 20 | g = Connect() 21 | 22 | # don't prompt if we used --force 23 | if options.force: 24 | g.clear() 25 | else: 26 | print "WARNING: This will DELETE ALL DATA in the graph. Are you sure? (y/N) ", 27 | 28 | answer = sys.stdin.readline() 29 | 30 | # Do a case-insensitive match to see if the first char of the response is "y" 31 | if re.match('y', answer, re.I): 32 | g.clear() 33 | else: 34 | print "Graph data NOT DELETED." 
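# Usage sketch (hypothetical session): running "./db_clear.py" prompts for
# confirmation before wiping the graph; "./db_clear.py --force" clears it
# immediately, which is convenient in scripts but unforgiving elsewhere.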
35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /gh/dns.py: -------------------------------------------------------------------------------- 1 | from bulbs.model import Node, Relationship 2 | from bulbs.property import String, Integer, DateTime 3 | from bulbs.utils import current_datetime 4 | 5 | class FQDN(Node): 6 | 7 | element_type = "fqdn" 8 | 9 | name = String(nullable=False) 10 | color = String(default="#8CBC1C") 11 | 12 | class DNSTransaction(Node): 13 | element_type = "dnsTransaction" 14 | 15 | name = String(nullable=False) 16 | color = String(default="#FFBF56") 17 | 18 | class Resolved(Relationship): 19 | label = "resolved" 20 | element_type = label 21 | 22 | class Answer(Relationship): 23 | label = "answer" 24 | element_type = label 25 | 26 | class Queried(Relationship): 27 | label = "queried" 28 | element_type = label 29 | 30 | class QueriedServer(Relationship): 31 | label = "queriedServer" 32 | element_type = label 33 | 34 | class LookedUp(Relationship): 35 | label = "lookedUp" 36 | element_type = label 37 | 38 | class ResolvedTo(Relationship): 39 | label = "resolvedTo" 40 | element_type = label 41 | 42 | -------------------------------------------------------------------------------- /gh/http.py: -------------------------------------------------------------------------------- 1 | from bulbs.model import Node, Relationship 2 | from bulbs.property import String, Integer, DateTime 3 | from bulbs.utils import current_datetime 4 | 5 | class HTTPTransaction(Node): 6 | 7 | element_type = "http_transaction" 8 | name = String(nullable=False) 9 | color = String(default="#5FBD71") 10 | 11 | class UserAgent(Node): 12 | 13 | element_type = "userAgent" 14 | name = String(nullable=False) 15 | color = String(default="#BE844A") 16 | 17 | class URI(Node): 18 | 19 | element_type = "uri" 20 | name = String(nullable=False) 21 | color = String(default="#71985E") 22 | 23 | class Referrer(Relationship): 24 | label = "referrer" 25 | element_type = label 26 | 27 | class HostedBy(Relationship): 28 | label = "hostedBy" 29 | element_type = label 30 | 31 | class RequestedBy(Relationship): 32 | label = "requestedBy" 33 | element_type = label 34 | 35 | class RequestedOf(Relationship): 36 | label = "requestedOf" 37 | element_type = label 38 | 39 | class IdentifiedBy(Relationship): 40 | label = "identifiedBy" 41 | element_type = label 42 | 43 | class Agent(Relationship): 44 | label = "agent" 45 | element_type = label 46 | 47 | class Sent(Relationship): 48 | label = "sent" 49 | element_type = label 50 | 51 | class Received(Relationship): 52 | label = "received" 53 | element_type = label 54 | 55 | 56 | -------------------------------------------------------------------------------- /db_path.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from optparse import OptionParser 4 | import sys 5 | import re 6 | 7 | import gh 8 | 9 | def parse_options() : 10 | parser = OptionParser() 11 | parser.add_option("-v", "--verbose", dest="verbose",default=False, 12 | action="store_true", 13 | help="Print more details about nodes and edges.") 14 | parser.add_option("-d", "--directed", dest="directed", default=False, 15 | action="store_true", 16 | help="Respect relationship direction when finding paths.") 17 | parser.add_option("-m", "--max_hops", dest="max_hops", default=5, type="int", 18 | help="Max number of hops in path.")  # type="int" so CLI values aren't passed to Gremlin as strings 19 | (options, args) = parser.parse_args() 20 | return(options, args) 21 | 22 | #### MAIN
#### 23 | 24 | (options, args) = parse_options() 25 | 26 | if len(args) != 2: 27 | print "ERROR: You must specify both the beginning and ending nodes." 28 | sys.exit(-1) 29 | 30 | g = gh.connect.Connect() 31 | 32 | src = args[0] 33 | dst = args[1] 34 | 35 | # If the destination is all digits, treat it as a specific vertex ID; otherwise treat it as a node type 36 | if re.match("\d+$", dst): 37 | print "Looking up by destination vertex ID" 38 | lookup = gh.util.shortest_path 39 | else: 40 | print "Looking up by destination vertex type" 41 | lookup = gh.util.shortest_path_to_type 42 | 43 | res = lookup(g, src, dst, 44 | directed=options.directed, 45 | max_hops=options.max_hops) 46 | 47 | if res: 48 | for r in res: 49 | print gh.util._v2s(r,verbose=options.verbose), 50 | else: 51 | print "No path." 52 | 53 | -------------------------------------------------------------------------------- /groovy/gremlin.groovy: -------------------------------------------------------------------------------- 1 | // Return info about the count of different types of vertices and edges 2 | def graph_info() { 3 | 4 | numv = g.V.count() 5 | nume = g.E.count() 6 | vinfo = g.V.groupBy{it.element_type}{1}{it.size}.cap.next() 7 | einfo = g.E.groupBy{it.element_type}{1}{it.size}.cap.next() 8 | 9 | return [numv: numv, nume: nume, vinfo: vinfo, einfo: einfo] 10 | } 11 | 12 | // Shortest path between two vertices 13 | def shortest_path(node1_id, node2_id, hops, directed) { 14 | 15 | if (directed) { 16 | p = g.v(node1_id).as("x").outE.inV.dedup.loop("x"){it.loops < hops}{it.object.id == node2_id.toString()}.path.sort{a,b -> a.size() <=> b.size()}.take(1) 17 | } else { 18 | p = g.v(node1_id).as("x").bothE.bothV.dedup.loop("x"){it.loops < hops}{it.object.id == node2_id.toString()}.path.sort{a,b -> a.size() <=> b.size()}.take(1) 19 | } 20 | } 21 | 22 | // Shortest path between two vertices, where the destination is a node 23 | // type, not a specific node ID 24 | def shortest_path_to_type(node1_id, node2_type, hops, directed) { 25 | 26 | if (directed) { 27 | p = g.v(node1_id).as("x").outE.inV.dedup.loop("x"){it.loops < hops}{it.object.element_type == node2_type}.path.sort{a,b -> a.size() <=> b.size()}.take(1) 28 | } else { 29 | p = g.v(node1_id).as("x").bothE.bothV.dedup.loop("x"){it.loops < hops}{it.object.element_type == node2_type}.path.sort{a,b -> a.size() <=> b.size()}.take(1) 30 | } 31 | } 32 | 33 | // Test script to get simple node info 34 | def node_info(node_id) { 35 | n = g.v(node_id).map 36 | n 37 | } 38 | 39 | // Return the list of edges between two given nodes 40 | def edge_list(node1_id, node2_id, edge_type) { 41 | e = g.v(node1_id).outE(edge_type).as("x").inV.filter{it.id == node2_id.toString()}.back("x") 42 | 43 | e 44 | } 45 | -------------------------------------------------------------------------------- /db_graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from gh.connect import Connect 4 | import gh 5 | 6 | from GephiStreamer import Node, Edge, GephiStreamerManager 7 | 8 | from random import choice 9 | 10 | def display_graph(t, nodes, edges): 11 | 12 | if nodes != None: 13 | print "*** Graphing Nodes" 14 | for n in nodes: 15 | node_temp = Node(n._id) 16 | 17 | properties = n.map() 18 | for key in properties: 19 | node_temp.property[key] = properties[key] 20 | 21 | node_temp.property["colour"] = node_temp.property["color"] 22 | node_temp.property["label"] = node_temp.property["name"] 23 | t.add_node(node_temp) 24 | 25 | if edges != None: 26 | print "*** Graphing Edges" 27 | for e in edges: 28
| src = e._outV 29 | dst = e._inV 30 | edge_temp = Edge(src, dst, directed=True) 31 | 32 | properties = e.map() 33 | for key in properties: 34 | edge_temp.property[key] = properties[key] 35 | 36 | t.add_edge(edge_temp) 37 | 38 | t.commit() 39 | 40 | def display_graph_from_list(t, l=None): 41 | if l == None: 42 | return 43 | 44 | for element in l: 45 | if element["_type"] == "vertex": 46 | node_temp = Node(element["_id"]) 47 | 48 | for key in element: 49 | node_temp.property[key] = element[key] 50 | 51 | node_temp.property["label"] = node_temp.property["name"] 52 | node_temp.property["colour"] = node_temp.property["color"] 53 | 54 | t.add_node(node_temp) 55 | elif element["_type"] == "edge": 56 | src = element["_outV"] 57 | dst = element["_inV"] 58 | edge_temp = Edge(src, dst, directed=True) 59 | 60 | for key in element: 61 | edge_temp.property[key] = element[key] 62 | 63 | t.add_edge(edge_temp) 64 | 65 | t.commit() 66 | 67 | if __name__ == "__main__": 68 | g = Connect() 69 | t = GephiStreamerManager() 70 | 71 | print "Getting nodes..." 72 | nodes = g.V 73 | print "Getting edges..." 74 | edges = g.E 75 | 76 | display_graph(t, nodes, edges) 77 | 78 | 79 | -------------------------------------------------------------------------------- /gh/connect.py: -------------------------------------------------------------------------------- 1 | from bulbs.rexster import Graph, Config 2 | from host import Host 3 | from flow import Flow, Source, Dest, Contains, ConnectedTo 4 | from dns import FQDN, DNSTransaction, LookedUp, Queried, Answer, QueriedServer, Resolved, ResolvedTo 5 | from file import File, Transferred, SentTo, SentBy 6 | from http import HTTPTransaction, URI, UserAgent, Referrer, HostedBy, RequestedBy, RequestedOf, IdentifiedBy, Agent, Sent, Received 7 | from account import Account, Requested, Uses 8 | 9 | DEFAULT_URI = "http://localhost:8182/graphs/hunting" 10 | 11 | def Connect(uri=DEFAULT_URI): 12 | """ 13 | Establishes a connection to the graph database backend. It also does 14 | a few standard tasks to set up the models and server side scripts we 15 | depend on, so every utility that calls Connect() has a consistent 16 | environment. 17 | 18 | Returns a Graph() object.
19 | 20 | Example: 21 | 22 | g = Connect() # Connect using the standard default database info 23 | g = Connect("http://localhost:8182/graphs/myDB") # Use a custom DB 24 | """ 25 | config = Config(uri) 26 | g = Graph(config) 27 | 28 | # Set up the node and relationship proxies 29 | g.add_proxy("host", Host) 30 | g.add_proxy("flow", Flow) 31 | g.add_proxy("source", Source) 32 | g.add_proxy("contains", Contains) 33 | g.add_proxy("dest", Dest) 34 | g.add_proxy("connectedTo", ConnectedTo) 35 | g.add_proxy("fqdn", FQDN) 36 | g.add_proxy("dnsTransaction", DNSTransaction) 37 | g.add_proxy("resolved", Resolved) 38 | g.add_proxy("answer", Answer) 39 | g.add_proxy("queried", Queried) 40 | g.add_proxy("queriedServer", QueriedServer) 41 | g.add_proxy("lookedUp", LookedUp) 42 | g.add_proxy("resolvedTo", ResolvedTo) 43 | g.add_proxy("file", File) 44 | g.add_proxy("transferred", Transferred) 45 | g.add_proxy("sentTo", SentTo) 46 | g.add_proxy("sentBy", SentBy) 47 | g.add_proxy("httpTransaction", HTTPTransaction) 48 | g.add_proxy("uri", URI) 49 | g.add_proxy("userAgent", UserAgent) 50 | g.add_proxy("requestedBy", RequestedBy) 51 | g.add_proxy("requestedOf", RequestedOf) 52 | g.add_proxy("hostedBy", HostedBy) 53 | g.add_proxy("identifiedBy", IdentifiedBy) 54 | g.add_proxy("agent", Agent) 55 | g.add_proxy("sent", Sent) 56 | g.add_proxy("received", Received) 57 | g.add_proxy("account", Account) 58 | g.add_proxy("requested", Requested) 59 | g.add_proxy("uses", Uses) 60 | # Load in our groovy scripts 61 | g.scripts.update("groovy/gremlin.groovy") 62 | return g 63 | -------------------------------------------------------------------------------- /gh/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | def _v2s(v, verbose=False): 4 | if not "_type" in v: 5 | return "-" 6 | elif v["_type"] == "vertex": 7 | if verbose: 8 | return "v[%s][%s %s]" % (v["_id"], v["element_type"], v["name"]) 9 | else: 10 | return "v[%s]" % v["_id"] 11 | elif v["_type"] == "edge": 12 | if verbose: 13 | return "e[%s][%s-%s->%s]" % (v["_id"], v["_outV"], v["element_type"], v["_inV"]) 14 | else: 15 | return "e[%s][%s-%s->%s]" % (v["_id"], v["_outV"], v["element_type"], v["_inV"]) 16 | else: 17 | return "[??]" 18 | 19 | def write_graphml(g, filename="/tmp/graph.graphml"): 20 | ''' 21 | Given a Graph object (g), write the GraphML representation to the 22 | given filename. If no filename is given, use the default 23 | "/tmp/graph.graphml". This can be loaded into Gephi or some other 24 | visualization tool. 25 | ''' 26 | gml = g.get_graphml() 27 | f = open(filename,"w") 28 | f.write(gml) 29 | f.close() 30 | 31 | def shortest_path(g, node1_id, node2_id, max_hops=4, directed=False): 32 | ''' 33 | Calls a Groovy script to compute the shortest path between two 34 | nodes that is less than or equal to "max_hops" long. In the event 35 | that there are multiple paths of the same length, it only returns 36 | one of them. Which one it returns is undefined. If the "directed" 37 | attribute is True, the function will follow relationships only in 38 | the direction in which they occur on the graph. If set to False, 39 | it will find paths regardless of the direction of the 40 | relationships. 41 | 42 | Return value is either a list of nodes and edges, or None if no path 43 | was found.
44 | 45 | ''' 46 | script = g.scripts.get("shortest_path") 47 | res = g.gremlin.execute(script, dict(node1_id=node1_id, node2_id=node2_id, hops=max_hops, directed=directed)) 48 | if res: 49 | lst = list(res.results) 50 | # Results will be a list-of-lists. If there are any results, return 51 | # the first list. 52 | if len(lst) > 0: 53 | return lst[0].data 54 | # If we got here, there were no results, so we couldn't find a path. 55 | return None 56 | 57 | def shortest_path_to_type(g, node1_id, node2_type, max_hops=4, directed=False): 58 | ''' 59 | Calls a Groovy script to compute the shortest path between two 60 | nodes that is less than or equal to "max_hops" long, where the destination 61 | node is any node of the type specified in "node2_type". In the event 62 | that there are multiple paths of the same length, it only returns 63 | one of them. Which one it returns is undefined. If the "directed" 64 | attribute is True, the function will follow relationships only in 65 | the direction in which they occur on the graph. If set to False, 66 | it will find paths regardless of the direction of the 67 | relationships. 68 | 69 | Return value is either a list of nodes and edges, or None if no path 70 | was found. 71 | 72 | ''' 73 | script = g.scripts.get("shortest_path_to_type") 74 | res = g.gremlin.execute(script, dict(node1_id=node1_id, node2_type=node2_type, hops=max_hops, directed=directed)) 75 | if res: 76 | lst = list(res.results) 77 | # Results will be a list-of-lists. If there are any results, return 78 | # the first list. 79 | if len(lst) > 0: 80 | return lst[0].data 81 | # If we got here, there were no results, so we couldn't find a path. 82 | return None 83 | 84 | def graph_info(g): 85 | script = g.scripts.get("graph_info") 86 | res = g.gremlin.execute(script) 87 | 88 | return res.results.next().data 89 | 90 | def node_info(g, node_id): 91 | script = g.scripts.get("node_info") 92 | res = g.gremlin.execute(script, dict(node_id=node_id)) 93 | return res.results.next().data 94 | 95 | def edge_list(g, node1_id, node2_id, edge_type): 96 | script = g.scripts.get("edge_list") 97 | res = g.gremlin.query(script, dict(node1_id=node1_id, node2_id=node2_id, edge_type=edge_type)) 98 | return res 99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bro2Graph 2 | 3 | ## Prerequisites 4 | Bro2Graph relies on a few third-party packages, namely the Rexster server (https://github.com/tinkerpop/rexster/wiki/Downloads) and the 'Bulbs' Python interface (http://bulbflow.com). 5 | 6 | To render and interact with the graph, you'll need to install Gephi (http://gephi.github.io). You'll also need the "Give Colors to Nodes" and "Graph Streaming" plugins (see the *Tools -> Plugins -> Available Plugins* menu to download and install these from within Gephi). 7 | 8 | ### Installing & Configuring Rexster 9 | Installation is very simple. Just download the latest version of the Rexster Server package from the above URL. At the time of this writing, that would be v2.6.0. It's all one big Zip file, so just extract it somewhere convenient. 10 | 11 | Once unzipped, you will need to edit the _config/rexster.xml_ file to create the database used by our scripts.
Find the beginning of the `<graphs>` stanza (where, obviously, all the graphs are defined) and insert the following: 12 | 13 | <graph> 14 | <graph-name>hunting</graph-name> 15 | <graph-type>tinkergraph</graph-type> 16 | <graph-mock-tx>true</graph-mock-tx> 17 | <extensions> 18 | <allows> 19 | <allow>tp:gremlin</allow> 20 | </allows> 21 | </extensions> 22 | </graph> 23 | 24 | ### Installing Bulbs 25 | Bulbs is available through PyPI, so you can install it quite easily: 26 | 27 | # pip install bulbs 28 | 29 | ## Getting Data Into the Graph 30 | To load Bro data into the graph, you must first start the graph database backend. After that, you simply run the script to load your Bro log files into that database. This section details the process. 31 | 32 | ### Starting the Graph Database Backend 33 | When you begin your hunting session, the first thing you'll need to do is to start the graph database backend, like so: 34 | 35 | [...]/rexster-server-2.6.0> ./bin/rexster.sh --start 36 | 37 | You'll get a lot of output, but after a few seconds, the database will be initialized and ready for action. 38 | 39 | ### Loading Bro Data Into the Graph 40 | 41 | [...]/Bro2Graph> ./db_load.py -l ~/BroLogDir 42 | 43 | This should go pretty quickly for smaller datasets, but if you have a lot of Bro logs, it could take quite a long time. Hours, even, for larger datasets. 44 | 45 | When it's finished, you'll see something like the following: 46 | 47 | [...]/Bro2Graph> ./db_load.py -l ~/BroLogDir 48 | Reading log files from /Users/bro/BroLogDir 49 | Graphing Flows... 50 | Reading /Users/bro/BroLogDir/conn.log... 51 | Number of events: 18 52 | Graphing Files... 53 | Reading /Users/bro/BroLogDir/files.log... 54 | Number of events: 22 55 | Graphing DNS Transactions... 56 | Reading /Users/bro/BroLogDir/dns.log... 57 | Number of events: 11 58 | Graphing HTTP Transactions... 59 | Reading /Users/bro/BroLogDir/http.log... 60 | Number of events: 22 61 | 62 | **** Graph Stats 63 | 64 | **** Totals 65 | Vertices 144 66 | Edges 414 67 | 68 | **** Vertices by type: 69 | account 2 70 | flow 20 71 | fqdn 19 72 | uri 19 73 | host 27 74 | dnsTransaction 11 75 | file 22 76 | userAgent 2 77 | http_transaction 22 78 | 79 | **** Edges by type: 80 | queried 11 81 | received 21 82 | hostedBy 22 83 | contains 55 84 | requested 2 85 | dest 18 86 | resolved 11 87 | resolvedTo 32 88 | queriedServer 11 89 | connectedTo 14 90 | agent 44 91 | identifiedBy 22 92 | source 18 93 | uses 2 94 | answer 32 95 | sentTo 22 96 | sentBy 22 97 | requestedBy 22 98 | lookedUp 11 99 | requestedOf 22 100 | 101 | Notice that the last part is a summary of the numbers and types of nodes and edges in the graph. You can generate this report at any time by running the *db_stats.py* script. 102 | 103 | ### Deleting the Graph 104 | If for any reason you want to delete all the loaded data and start fresh, you have two options. The default Rexster configuration (above) only stores the data in RAM, so simply restarting Rexster will effectively erase all the data. 105 | 106 | On the other hand, if you have configured Rexster to save the data to disk, or if you just don't feel like restarting the database process, you can run *db_clear.py*. After confirming that you do indeed really want to delete everything, the script will do just that. At the end, you'll have a fresh new database, just as though you had never loaded anything into it. 107 | 108 | ## Visualizing the Graph 109 | After you have loaded your Bro data into the graph, you will naturally want to see what this looks like. In this section, you'll learn how to start Gephi, load the data in, and do some simple things to render the graph in a more readable fashion.
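If you would rather work from a static export instead of live streaming, the *write_graphml* helper in _gh/util.py_ can dump the entire graph to a GraphML file that Gephi opens directly. A minimal sketch, using only the repo's own helpers:

    from gh.connect import Connect
    from gh.util import write_graphml

    g = Connect()
    write_graphml(g, "/tmp/graph.graphml")  # then File -> Open in Gephi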
110 | 111 | Note that Gephi is a **very** full-featured system for interacting and computing with graphs. This document will barely scratch the surface of what you can do with Gephi, and I encourage you to come up with your own cool techniques (and to share them!). 112 | 113 | ### Loading the Data Into Gephi 114 | Start Gephi, and select "New Project" when prompted. This will give you a blank workspace (Gephi calls them "canvases"). 115 | 116 | If you have already installed the necessary plugins, you should see a tab in the left-hand column called *Streaming*. Click that, and then right-click on the *Master Server* entry and set it to *Start*. This makes Gephi listen on the local network for graph streaming connections. 117 | 118 | Now that Gephi is listening for graph data, run *db_graph.py* to send the data from Rexster into Gephi. There are no arguments necessary, as it will just stream the entire graph. This shouldn't take too long, and the output is minimal. If you look at the Gephi window, you'll see a bunch of black lines and dots. Don't worry, we'll make this look a lot better! 119 | 120 | Once you've loaded the data, click back over to Gephi's *Layout* tab, since we'll need that later. 121 | 122 | ### Making the Graph Readable 123 | 124 | To make this graph something approaching readable, we'll start with three simple operations: 125 | 126 | * Assign colors to the different types of nodes 127 | * Size the nodes according to some criteria 128 | * Apply a layout algorithm 129 | 130 | #### Node Colors 131 | The *db_load.py* script automatically assigned color values to different types of nodes when it loaded them into the database. Each type of node is color-coded, according to the chart below. 132 | 133 | ![](img/node_colors.png) 134 | 135 | By default, though, Gephi will not display these colors. The *Give Colors to Nodes* plugin you installed earlier makes this quite simple, though. Simply click on the plugin's icon to the left of the canvas. It looks like this: 136 | 137 | ![](img/give_colors_to_nodes.png) 138 | 139 | This will automatically color the nodes according to their type, though you may not immediately notice this since most of the nodes are still quite small. 140 | 141 | #### Resize the Nodes 142 | When working with graphs, it's very common to want to display the nodes at different sizes, depending on some criteria you compute. This gives you some immediate visual feedback about the nodes, and is quite useful. 143 | 144 | You can size your nodes by any numeric feature that Bro computed (for example, by the number of bytes transferred, if you are looking at network Flow nodes). However, the most common way is probably to size them by the number of edges they have with other nodes. The edge count of a node is referred to as its *degree*. We'll start with this. 145 | 146 | To resize your nodes, simply click the *Ranking* tab on the left side of the Gephi screen. The drop-down menu at the top of the ranking panel lists all of the criteria you can use to size your nodes. Some are computations that Gephi performs for you (like degree), while others are fields drawn from your own data. For now, though, select *Degree* and click *Apply*. 147 | 148 | Now you should start seeing nodes of various sizes, and you can probably also start to see their colors as well. Still, it's a bit of a mess, so let's fix that. 149 | 150 | #### Applying a Layout Algorithm 151 | By default, Gephi displays your graph in a pretty jumbled, hard-to-understand fashion.
You can easily fix this by applying a layout algorithm from the *Layout* panel on the left. 152 | 153 | Gephi comes with a number of predefined layout algorithms, and I'm not going to try to explain them in detail. Most of these are well-known algorithms (at least, if you are a computer scientist who deals with graphs a lot, I guess) and you can Google them if you want to know how they work. 154 | 155 | For now, though, select the *Yifan Hu* algorithm from the drop-down and click *Run*. You should see the nodes on your graph start to move around as the algorithm does its work. Yifan Hu will automatically stop when it thinks it's got everything right, but sometimes running it more than once may help make the graph clearer, with more separation between the clusters of nodes. 156 | 157 | ## Interacting With the Graph 158 | Now that your graph is formatted nicely, you can start to explore it. Gephi has **a lot** of nice functions for this, and I am not going to try to cover them all here. I recommend searching for "Gephi" on YouTube to find some really nice tutorials. 159 | 160 | For now, though, I want to show just two things: How to inspect the values for a specific node, and how to control which types of nodes and/or relationships you show on the graph. 161 | 162 | ### Inspecting the Values of a Node 163 | This is actually pretty simple. Just click the *Edit* icon, which can be found on the toolbar to the left of the canvas, and which looks like this: 164 | 165 | ![](img/inspector.png) 166 | 167 | When the edit control is selected, you can click on any node and Gephi will show you all the features and their associated values. As the name implies, you can also edit these values, but of course these edited values will be valid only inside this Gephi session, and will not be propagated back to the Rexster graph database. 168 | 169 | ### Working With Specific Types of Nodes or Relationships 170 | Although there are a lot of cases where you really do want to see *all* the nodes and relationships in your graph, in most cases you will probably want to view only specific types. Not only will this make Gephi faster (since it has to do less work to show fewer items), but it will also make your graphs easier to understand. 171 | 172 | Gephi makes it easy to show the types of nodes and relationships you want by using a custom filter. Start by locating the *Filters* pane on the right, and navigating to the *Attributes -> Partition* menu, which will look like this: 173 | 174 | ![](img/filters.png) 175 | 176 | You'll see a very long list of attributes by which you can partition the graph (BTW, *partitioning* just means that you can divide the graph up into pieces according to some criteria, and this is the list of the criteria you can use). Scroll down the list until you see *Element Type (Node)*, then drag it down below to where you see a red bulls-eye labeled *Drag filter here*. When you're done, it should look something like this: 177 | 178 | ![](img/partition.png) 179 | 180 | Notice that each node type in your graph is listed here. To control what you want to display on your graph, simply check the boxes next to the node types you want to work with and click *Filter*. After a short time (longer if you have a large graph), you'll see the results reflected in the main canvas. 181 | 182 | Note that when you add or subtract elements, you may want to re-run the layout algorithm.
183 | 184 | With a little work, you can also drag in the *Element Types (Edge)* filter to get more control over what relationships you show for the nodes in your graph, but I'll leave that to you to play with. 185 | -------------------------------------------------------------------------------- /db_load.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import subprocess 4 | import sys 5 | import os 6 | from optparse import OptionParser 7 | import re 8 | import StringIO 9 | import numpy 10 | import pandas 11 | import random 12 | import string 13 | 14 | # Our interface to the GraphDB 15 | from bulbs.rexster import Graph, Config, DEBUG 16 | 17 | # Our own modules 18 | from gh.connect import Connect 19 | from gh.util import graph_info, shortest_path, edge_list 20 | from db_stats import graph_stats 21 | 22 | # A per-log dict that contains the list of fields we want to extract, in order 23 | SUPPORTED_BRO_FIELDS = { 24 | "conn.log": ["ts","uid","id.orig_h","id.orig_p","id.resp_h","id.resp_p","proto","service","duration","orig_bytes","resp_bytes","conn_state","local_orig","missed_bytes","history","orig_pkts","orig_ip_bytes","resp_pkts","resp_ip_bytes","tunnel_parents"], 25 | "dns.log":["ts","uid","id.orig_h","id.orig_p","id.resp_h","id.resp_p","proto","trans_id","query","qclass","qclass_name","qtype","qtype_name","rcode","rcode_name","AA","TC","RD","RA","Z","answers","TTLs","rejected"], 26 | "dpd.log": ["ts","uid","id.orig_h","id.orig_p","id.resp_h","id.resp_p","proto","analyzer","failure_reason"], 27 | "files.log": ["ts","fuid","tx_hosts","rx_hosts","conn_uids","source","depth","analyzers","mime_type","filename","duration","local_orig","is_orig","seen_bytes","total_bytes","missing_bytes","overflow_bytes","timedout","parent_fuid","md5","sha1","sha256","extracted"], 28 | "ftp.log": ["ts","uid","id.orig_h","id.orig_p","id.resp_h","id.resp_p","user","password","command","arg","mime_type","file_size","reply_code","reply_msg","data_channel.passive","data_channel.orig_h","data_channel.resp_h","data_channel.resp_p","fuid"], 29 | "http.log": ["ts","uid","id.orig_h","id.orig_p","id.resp_h","id.resp_p","trans_depth","method","host","uri","referrer","user_agent","request_body_len","response_body_len","status_code","status_msg","info_code","info_msg","filename","tags","username","password","proxied","orig_fuids","orig_mime_types","resp_fuids","resp_mime_types"], 30 | "irc.log": ["ts","uid","id.orig_h","id.orig_p","id.resp_h","id.resp_p","nick","user","command","value","addl","dcc_file_name","dcc_file_size","dcc_mime_type","fuid"], 31 | "notice.log": ["ts","uid","id.orig_h","id.orig_p","id.resp_h","id.resp_p","fuid","file_mime_type","file_desc","proto","note","msg","sub","src","dst","p","n","peer_descr","actions","suppress_for","dropped","remote_location.country_code","remote_location.region","remote_location.city","remote_location.latitude","remote_location.longitude"], 32 | "smtp.log": ["ts","uid","id.orig_h","id.orig_p","id.resp_h","id.resp_p","trans_depth","helo","mailfrom","rcptto","date","from","to","reply_to","msg_id","in_reply_to","subject","x_originating_ip","first_received","second_received","last_reply","path","user_agent","tls","fuids","is_webmail"], 33 | "snmp.log": ["ts","uid","id.orig_h","id.orig_p","id.resp_h","id.resp_p","duration","version","community","get_requests","get_bulk_requests","get_responses","set_requests","display_string","up_since"], 34 | "ssh.log": 
["ts","uid","id.orig_h","id.orig_p","id.resp_h","id.resp_p","status","direction","client","server","remote_location.country_code","remote_location.region","remote_location.city","remote_location.latitude","remote_location.longitude"] 35 | } 36 | 37 | FIELDS_STRING = ["TTLs"] 38 | FIELDS_INTEGER = ["id.orig_p","id.resp_p","orig_bytes","resp_bytes","missed_bytes","orig_pkts","orig_ip_bytes","resp_pkts","resp_ip_bytes","qclass","qtype","trans_id","rcode","Z","depth","seen_bytes","total_bytes","missing_bytes","file_size","reply_code","data_channel.resp_p","trans_depth","request_body_len","response_body_len","status_code","info_code","dcc_file_size"] 39 | FIELDS_FLOAT = ["duration","lease_type"] 40 | 41 | # Output date format for timestamps 42 | DATE_FMT="%FT%H:%M:%SZ" 43 | 44 | BRO_CUT_CMD=["bro-cut","-U",DATE_FMT] 45 | 46 | def unique_id(size=17): 47 | return ''.join(random.choice(string.ascii_lowercase + string.ascii_uppercase + string.digits) for _ in range(size)) 48 | 49 | def is_IP(s): 50 | # this is pretty dumb. If it looks like an IPv4 address, fine. But a 51 | # good IPv6 regex is ridiculously complex. I took a shortcut, since I 52 | # this routine is only ever called to disambiguate IPs from hostnames or 53 | # FQDNs. If there's even a single ":", we'll just assume this must be 54 | # IPv6, since neither hostnames nor FQDNs can contain that char. 55 | # 56 | # Sorry. 57 | return( re.match("\d+.\d+.\d+.\d+$", s) != None or re.search(":",s) != None) 58 | 59 | 60 | def extend_list(lst, val, length): 61 | ''' 62 | Given a list "lst", extend it to length "length". Each new item will 63 | be composed of the value "val". Of course, if "lst" is already "length" 64 | size or longer, just return and do nothing. 65 | ''' 66 | if len(lst) >= length: 67 | return lst 68 | else: 69 | lst.extend([val] * (length - len(lst))) 70 | return lst 71 | 72 | def parse_options() : 73 | parser = OptionParser() 74 | parser.add_option("-l", "--log-dir", dest="logdir", 75 | help="Bro log file directory to parse.") 76 | parser.add_option("-q", "--quiet", dest="quiet", 77 | help="Suppress unecessary output (run quietly)") 78 | parser.add_option("-o", "--output", dest="outputdir",default=".", 79 | help="Output directory (will be created if necessary)") 80 | parser.add_option("-s", "--sample", dest="sample",default=False,type="int", 81 | help="Randomly select SAMPLE # of connections and associated log entries.") 82 | 83 | (options, args) = parser.parse_args() 84 | return(options, args) 85 | 86 | def readlog(file, connection_ids=False): 87 | 88 | output = "" 89 | 90 | logtype = file 91 | 92 | logfile = "%s/%s" % (options.logdir,file) 93 | 94 | print "Reading %s..." % logfile 95 | 96 | tmp_bro_cut_cmd = BRO_CUT_CMD 97 | tmp_bro_cut_cmd = tmp_bro_cut_cmd + SUPPORTED_BRO_FIELDS[logtype] 98 | 99 | # Create a job that just cats the log file 100 | p1 = subprocess.Popen(['cat',logfile], stdout=subprocess.PIPE) 101 | 102 | # This is the bro-cut job, reading the "cat" command output 103 | p2 = subprocess.Popen(tmp_bro_cut_cmd, stdin=p1.stdout, stdout=subprocess.PIPE) 104 | 105 | p1.stdout.close() 106 | 107 | # Now we're going to use the "pandas" package to create a dataframe 108 | # out of the log data. Dataframes greatly simplify the tasks of cleaning 109 | # the data. 110 | # 111 | # StringIO treats the string as a fake file, so we can use pandas to 112 | # create a dataframe out of the string directly, without having to write 113 | # it to disk first. 
114 | brodata = StringIO.StringIO(p2.communicate()[0]) 115 | 116 | df = pandas.DataFrame.from_csv(brodata, sep="\t", parse_dates=False, header=None, index_col=None) 117 | 118 | df.columns = SUPPORTED_BRO_FIELDS[logtype] 119 | 120 | # If this is the connection log, and if we've requested a random sample, 121 | # cut the dataframe down to ONLY contain that random sample 122 | if logtype == "conn.log" and options.sample: 123 | print "Size before sampling: %d" % len(df.index) 124 | df = df.sample(n=options.sample) 125 | df.reset_index(drop=True, inplace=True) 126 | print "Size after sampling: %d" % len(df.index) 127 | elif logtype == "files.log" and connection_ids: 128 | df = df[df.conn_uids.isin(connection_ids)] 129 | df.reset_index(drop=True, inplace=True) 130 | elif logtype != "conn.log" and connection_ids and "uid" in df.columns: 131 | # If this is any other type of log AND we have an explicit list of 132 | # connection IDs we sampled AND this is a file that has the "uid" 133 | # data to pair it with the conn.log, pare down the dataframe to 134 | # only include those rows with the right uids 135 | df = df[df.uid.isin(connection_ids)] 136 | df.reset_index(drop=True, inplace=True) 137 | # It is entirely possible that this sampling may mean that some 138 | # log files no longer have any output (for example, you only sampled 139 | # a list of connections, none of which were DHCP). 140 | 141 | 142 | df.replace(to_replace=["(empty)","-"], value=["",""], inplace=True) 143 | 144 | # Some columns need to be forced into type String, primarily because they 145 | # may contain lists and we always call split() on them, but they look like 146 | # integers, so numpy tries to store them that way. 147 | for field in FIELDS_STRING: 148 | if field in df.columns: 149 | df[field] = df[field].astype(str) 150 | 151 | # Likewise, many rows need to be stored as Integers, but numpy thinks 152 | # they may be strings (probably because a legal value is "-"). This is 153 | # the list of the fields we know need to be converted 154 | for field in FIELDS_INTEGER: 155 | if field in df.columns: 156 | df[field] = df[field].replace("",-1) 157 | df[field] = df[field].astype(int) 158 | 159 | # Finally, convert the Float fields 160 | for field in FIELDS_FLOAT: 161 | if field in df.columns: 162 | df[field] = df[field].replace("",numpy.nan) 163 | df[field] = df[field].astype(float) 164 | 165 | if logtype == "conn.log": 166 | # if we're processing the conn.log AND we've requested random samples, 167 | # create a list of the sampled connection IDs and update the 168 | # connection_ids parameter. Otherwise, leave it the same. 169 | if options.sample: 170 | for id in df["uid"].tolist(): 171 | connection_ids.append(id) 172 | 173 | return df 174 | 175 | def graph_flows(g, df_conn): 176 | # Iterate through all the flows 177 | for con in df_conn.index: 178 | # For each flow, create new Host objects if necessary. Then create a 179 | # new Flow, and add the relationships between the Hosts and the Flow 180 | 181 | # Create the source & dest nodes 182 | src_host = g.host.get_or_create("name", 183 | df_conn.loc[con]["id.orig_h"], 184 | {"name": df_conn.loc[con]["id.orig_h"], 185 | "address":df_conn.loc[con]["id.orig_h"] 186 | }) 187 | dst_host = g.host.get_or_create("name", 188 | df_conn.loc[con]["id.resp_h"], 189 | {"name": df_conn.loc[con]["id.resp_h"], 190 | "address":df_conn.loc[con]["id.resp_h"] 191 | }) 192 | 193 | # If the flow is marked "local_orig", we need to update this feature 194 | # on the source host. 
We can't do this at creation time because we 195 | # might have seen this host before in another context, and created a 196 | # node for it without knowing it was a local host. 197 | if df_conn.loc[con]["local_orig"] == "T": 198 | src_host.local = "T" 199 | src_host.save() 200 | 201 | # Create the Flow object. Since we can run the same log file through 202 | # multiple times, or observe the same flow from different log files, 203 | # assume flows with the same name are actually the same flow. 204 | 205 | flowname = df_conn.loc[con]["uid"] 206 | # Create the flow node, with all the rich data 207 | properties = dict(df_conn.loc[con]) 208 | # Manually assign the "name" property 209 | properties["name"] = flowname 210 | # Take out the info about the source & dest IPs, since we should be 211 | # getting them from the connected host nodes 212 | del properties["id.orig_h"] 213 | del properties["id.resp_h"] 214 | 215 | flow = g.flow.get_or_create("name", flowname, properties) 216 | 217 | # Create the edges for this flow, if they don't already exist 218 | nodes = flow.inV("source") 219 | if nodes == None or not (src_host in nodes): 220 | g.source.create(src_host, flow) 221 | 222 | nodes = flow.outV("dest") 223 | if nodes == None or not (dst_host in nodes): 224 | g.dest.create(flow, dst_host) 225 | 226 | # Make a direct link between the src and dest hosts, as this 227 | # is a common analysis task. It doesn't *always* make sense 228 | # to go through the flows. 229 | neighbors = src_host.outV("connectedTo") 230 | if neighbors == None or not (dst_host in neighbors): 231 | e = g.connectedTo.create(src_host, dst_host) 232 | e.weight=1 233 | e.save() 234 | else: 235 | edges = edge_list(g, src_host._id, dst_host._id, "connectedTo") 236 | # There should only be one of these edges, and we already know 237 | # it exists, so it's safe to just take the first one 238 | edge = edges.next() 239 | g.connectedTo.update(edge._id, weight=(edge.weight + 1)) 240 | 241 | def graph_dns(g, df_dns): 242 | # Iterate through all the DNS transactions 243 | for i in df_dns.index: 244 | # Create the DNSTransaction node 245 | # name = str(df_dns.loc[i]["trans_id"]) 246 | name = "%d - %s - %s" % (df_dns.loc[i]["trans_id"], 247 | df_dns.loc[i]["qtype_name"], 248 | df_dns.loc[i]["query"]) 249 | timestamp = df_dns.loc[i]["ts"] 250 | flowname = df_dns.loc[i]["uid"] 251 | 252 | # Pick out the properties that belong on the transaction and add 253 | # them 254 | transaction = g.dnsTransaction.create(name=name, 255 | ts=df_dns.loc[i]["ts"], 256 | proto=df_dns.loc[i]["proto"], 257 | orig_p=df_dns.loc[i]["id.orig_p"], 258 | resp_p=df_dns.loc[i]["id.resp_p"], 259 | qclass=df_dns.loc[i]["qclass"], 260 | qclass_name=df_dns.loc[i]["qclass_name"], 261 | qtype=df_dns.loc[i]["qtype"], 262 | qtype_name=df_dns.loc[i]["qtype_name"], 263 | rcode=df_dns.loc[i]["rcode"], 264 | rcode_name=df_dns.loc[i]["rcode_name"], 265 | AA=df_dns.loc[i]["AA"], 266 | TC=df_dns.loc[i]["TC"], 267 | RD=df_dns.loc[i]["RD"], 268 | RA=df_dns.loc[i]["RA"], 269 | Z=df_dns.loc[i]["Z"], 270 | rejected=df_dns.loc[i]["rejected"]) 271 | 272 | # Create a node + edge for the query, if there is one in the log 273 | if df_dns.loc[i]["query"]: 274 | fqdn = g.fqdn.get_or_create("name", df_dns.loc[i]["query"], 275 | {"name":df_dns.loc[i]["query"], 276 | "domain":df_dns.loc[i]["query"]}) 277 | g.lookedUp.create(transaction,fqdn) 278 | 279 | # Now create the nodes and edges for the domains or addresses in 280 |
There can be multiple 281 | # answers, so split this into a list and create one node + edge 282 | # for each. 283 | # 284 | # There should also be one TTL per answer, so we'll split those and 285 | # use array indices to tie them together. The arrays are supposed 286 | # to always be the same length, but maybe sometimes they are 287 | # not. We'll force the issue by extending the TTL list to be 288 | # the same size as the address list. 289 | if df_dns.loc[i]["answers"]: 290 | addrs = df_dns.loc[i]["answers"].split(",") 291 | ttls = df_dns.loc[i]["TTLs"].split(",") 292 | ttls = extend_list(ttls, ttls[len(ttls)-1],len(addrs)) 293 | 294 | for i in range(len(addrs)): 295 | ans = addrs[i] 296 | ttl = float(ttls[i]) 297 | # DNS answers can be either IPs or other names. Figure 298 | # out which type of node to create for each answer. 299 | if is_IP(ans): 300 | node = g.host.get_or_create("name",ans,{"name":ans, 301 | "address":ans}) 302 | else: 303 | node = g.fqdn.get_or_create("name",ans,{"name":ans, 304 | "address":ans}) 305 | 306 | g.resolvedTo.create(fqdn, node, {"ts":timestamp}) 307 | g.answer.create(transaction, node, {"TTL": ttl}) 308 | 309 | # Create a node + edge for the source of the DNS transaction 310 | # (the client host) 311 | if df_dns.loc[i]["id.orig_h"]: 312 | src = g.host.get_or_create("name", df_dns.loc[i]["id.orig_h"], 313 | {"name": df_dns.loc[i]["id.orig_h"], 314 | "address":df_dns.loc[i]["id.orig_h"]}) 315 | g.queried.create(src, transaction) 316 | 317 | # Create a node + edge for the destination of the DNS transaction 318 | # (the DNS server) 319 | if df_dns.loc[i]["id.resp_h"]: 320 | dst = g.host.get_or_create("name", df_dns.loc[i]["id.resp_h"], 321 | {"name": df_dns.loc[i]["id.resp_h"], 322 | "address":df_dns.loc[i]["id.resp_h"]}) 323 | g.queriedServer.create(transaction,dst) 324 | 325 | 326 | # Now connect this transaction to the correct flow 327 | flows = g.flow.index.lookup(name=flowname) 328 | if flows == None: 329 | # print "ERROR: Flow '%s' does not exist" % flowname 330 | pass 331 | else: 332 | # lookup returns a generator, but since there should only be one 333 | # flow with this name, just take the first one 334 | flow = flows.next() 335 | nodes = flow.outV("contains") 336 | if nodes == None or not (transaction in nodes): 337 | edge = g.contains.create(flow, transaction) 338 | 339 | 340 | # Associate the src host with the FQDN it resolved. Since a host 341 | # can resolve a domain multiple times, we'll also keep track of a 342 | # "weight" feature to count how many times this happened. 343 | if df_dns.loc[i]["query"]: 344 | neighbors = src.outV("resolved") 345 | if neighbors == None or not (fqdn in neighbors): 346 | e = g.resolved.create(src, fqdn) 347 | e.weight=1 348 | e.save() 349 | else: 350 | edges = edge_list(g, src._id, fqdn._id, "resolved") 351 | # There should only be one of these edges, and we already know 352 | # it exists, so it's safe to just take the first one 353 | edge = edges.next() 354 | g.resolved.update(edge._id, weight=(edge.weight + 1)) 355 | 356 | def graph_files(g, df_files): 357 | # Iterate through all the flows 358 | for i in df_files.index: 359 | # Create the file node 360 | name = str(df_files.loc[i]["fuid"]) 361 | timestamp = df_files.loc[i]["ts"] 362 | flows = df_files.loc[i]["conn_uids"] 363 | 364 | # Create the file object. Note that this is more like a file transfer 365 | # transaction than a static object just for that file. There can be 366 | # more than one node with the same MD5 hash, for example. 
Clearly, 367 | # those are the same file in the real world, but not in our graph. 368 | # 369 | # However, it is possible to actually have the same file transaction 370 | # show up in the Bro logs multiple times. AFAICT, this is mostly 371 | # due to things like timeouts, where Bro records the file transfer 372 | # start and then sends another log later that says that the xfer 373 | # failed. We need to always check that there is only one 374 | # File node for each actual transaction, but we'll use 375 | # the fields from the most recent log, assuming things that Bro 376 | # logs last will be more accurate. 377 | fileobj = g.file.get_or_create("name", name, {"name":name}) 378 | 379 | fileobj.fuid=df_files.loc[i]["fuid"] 380 | fileobj.source=df_files.loc[i]["source"] 381 | fileobj.depth=df_files.loc[i]["depth"] 382 | fileobj.analyzers=df_files.loc[i]["analyzers"] 383 | fileobj.mime_type=df_files.loc[i]["mime_type"] 384 | fileobj.filename=df_files.loc[i]["filename"] 385 | fileobj.duration=df_files.loc[i]["duration"] 386 | fileobj.seen_bytes=df_files.loc[i]["seen_bytes"] 387 | fileobj.total_bytes=df_files.loc[i]["total_bytes"] 388 | fileobj.missing_bytes=df_files.loc[i]["missing_bytes"] 389 | fileobj.overflow_bytes=df_files.loc[i]["overflow_bytes"] 390 | fileobj.timedout=df_files.loc[i]["timedout"] 391 | fileobj.md5=df_files.loc[i]["md5"] 392 | fileobj.sha1=df_files.loc[i]["sha1"] 393 | fileobj.sha256=df_files.loc[i]["sha256"] 394 | fileobj.extracted=df_files.loc[i]["extracted"] 395 | fileobj.save() 396 | 397 | # Now connect this to the flow(s) it is associated with. 398 | for f in flows.split(","): 399 | flow = g.flow.get_or_create("name", f, {"name":f}) 400 | g.contains.create(flow, fileobj) 401 | 402 | # Connect it to the src and dest hosts in the file xfer. Note that 403 | # there can be more than one host listed for each side of the 404 | # xfer (don't ask me how). 405 | for h in df_files.loc[i]["tx_hosts"].split(","): 406 | src = g.host.get_or_create("name", h, 407 | {"name":h, 408 | "address":h}) 409 | g.sentBy.create(fileobj,src,{"ts":timestamp, 410 | "is_orig":df_files.loc[i]["is_orig"]}) 411 | # Also have this extra bit of info about whether the originating 412 | # host is part of a local subnet. We should make sure that is 413 | # recorded on the host object.
414 | src.local = df_files.loc[i]["local_orig"] 415 | src.save() 416 | 417 | for h in df_files.loc[i]["rx_hosts"].split(","): 418 | dst = g.host.get_or_create("name", h, 419 | {"name":h, 420 | "address":h}) 421 | g.sentTo.create(dst, fileobj,{"ts":timestamp}) 422 | 423 | def graph_http(g, df_http): 424 | # Iterate through all the HTTP log entries 425 | for i in df_http.index: 426 | # Create the HTTPTransaction node 427 | http = g.httpTransaction.create(name="H" + unique_id(), 428 | ts=df_http.loc[i]["ts"], 429 | resp_p=df_http.loc[i]["id.resp_p"], 430 | trans_depth=df_http.loc[i]["trans_depth"], 431 | method=df_http.loc[i]["method"].upper(), 432 | request_body_len=df_http.loc[i]["request_body_len"], 433 | response_body_len=df_http.loc[i]["response_body_len"], 434 | status_code=df_http.loc[i]["status_code"], 435 | status_msg=df_http.loc[i]["status_msg"], 436 | info_code=df_http.loc[i]["info_code"], 437 | info_msg=df_http.loc[i]["info_msg"], 438 | filename=df_http.loc[i]["filename"], 439 | tags=df_http.loc[i]["tags"], 440 | proxied=df_http.loc[i]["proxied"]) 441 | 442 | # Now connect this to the flow it's associated with 443 | flowname = df_http.loc[i]["uid"] 444 | flow = g.flow.get_or_create("name", flowname, {"name":flowname}) 445 | g.contains.create(flow, http) 446 | 447 | # Now connect it to the hosts on each side of the transaction 448 | src_addr = df_http.loc[i]["id.orig_h"] 449 | dst_addr = df_http.loc[i]["id.resp_h"] 450 | 451 | src_host = g.host.get_or_create("name", src_addr, {"name":src_addr}) 452 | dst_host = g.host.get_or_create("name", dst_addr, {"name":dst_addr}) 453 | 454 | g.requestedBy.create(src_host, http) 455 | g.requestedOf.create(http, dst_host) 456 | 457 | # Connect to the server host. This can be either a domain name or 458 | # an IP address. If it's a domain, we need to attach to an FQDN node. 459 | # If it's an IP, we need a Host node. 460 | h = df_http.loc[i]["host"] 461 | if is_IP(h): 462 | host = g.host.get_or_create("name", h, {"name":h}) 463 | else: 464 | host = g.fqdn.get_or_create("name", h, {"name":h}) 465 | 466 | g.hostedBy.create(http, host) 467 | 468 | # Now create and link to a URI node for the requested resource 469 | u = df_http.loc[i]["uri"] 470 | uri = g.uri.get_or_create("name", u, {"name":u}) 471 | g.identifiedBy.create(http, uri) 472 | 473 | # Link to the UserAgent node 474 | ua = df_http.loc[i]["user_agent"] 475 | user_agent = g.userAgent.get_or_create("name", ua, {"name":ua}) 476 | 477 | # Link to the HTTP transaction 478 | g.agent.create(http, user_agent) 479 | # Link to the host that sent the request 480 | g.agent.create(src_host, user_agent) 481 | 482 | # Now link to the File objects transferred by this transaction. 483 | # Each file object also has an associated MIME type. These are 484 | # encoded as two sets of paired lists: orig_fuids/orig_mime_types 485 | # and resp_fuids/resp_mime_types. In the event that the fuid list 486 | # is longer than the MIME type list (indicating that the last values 487 | # in the fuid list all have the same MIME type), we will extend the 488 | # mime type list to explicitly name all the mime types. It makes it 489 | # simpler to process the paired lists if we know they are the same 490 | # size.
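# (Worked example with hypothetical fuids: orig_fuids = ["Fa1","Fb2","Fc3"]
# and orig_mime_types = ["text/html"] yield
# orig_mime_types == ["text/html","text/html","text/html"] after
# extend_list(), so one index can safely walk both lists below.)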
491 | orig_fuids = df_http.loc[i]["orig_fuids"].split(",") 492 | orig_mime_types = df_http.loc[i]["orig_mime_types"].split(",") 493 | orig_mime_types = extend_list(orig_mime_types, 494 | orig_mime_types[len(orig_mime_types)-1], 495 | len(orig_fuids)) 496 | 497 | resp_fuids = df_http.loc[i]["resp_fuids"].split(",") 498 | resp_mime_types = df_http.loc[i]["resp_mime_types"].split(",") 499 | resp_mime_types = extend_list(resp_mime_types, 500 | resp_mime_types[len(resp_mime_types)-1], 501 | len(resp_fuids)) 502 | 503 | if orig_fuids != ['']: 504 | for x in range(len(orig_fuids)): 505 | fuid = orig_fuids[x] 506 | mime_type = orig_mime_types[x] 507 | 508 | f = g.file.get_or_create("name", fuid, {"name":fuid}) 509 | g.sent.create(http, f, {"mime_type": mime_type}) 510 | 511 | if resp_fuids != ['']: 512 | for x in range(len(resp_fuids)): 513 | try: 514 | fuid = resp_fuids[x] 515 | mime_type = resp_mime_types[x] 516 | f = g.file.get_or_create("name", fuid, {"name":fuid}) 517 | g.received.create(http, f, {"mime_type": mime_type}) 518 | except Exception, e: 519 | print "****" 520 | print "Exception: %s" % e 521 | print 522 | print resp_fuids 523 | print 524 | print "x: %s fuid: %s" % (x, fuid) 525 | sys.exit(-1) 526 | 527 | # Create the user account object and relationship 528 | username = df_http.loc[i]["username"] 529 | password = df_http.loc[i]["password"] 530 | if username: 531 | account = g.account.get_or_create("name", username, {"name":username}) 532 | g.requested.create(account, http, {"password":password}) 533 | g.uses.create(account, src_host) 534 | 535 | ##### Main ##### 536 | 537 | (options, args) = parse_options() 538 | 539 | if not options.logdir: 540 | print "Error: Must specify the log directory with -l or --log-dir" 541 | sys.exit(-1) 542 | 543 | if not os.path.exists(options.logdir): 544 | print "Error: Directory %s does not exist" % options.logdir 545 | sys.exit(-1) 546 | 547 | if not os.path.exists(options.outputdir): 548 | os.mkdir(options.outputdir) 549 | 550 | if not options.quiet: 551 | print "Reading log files from %s" % options.logdir 552 | 553 | # Now we can start to read data and populate the graph. 554 | 555 | g = Connect() 556 | 557 | # Now read the types of logs we know how to process, extract the relevant 558 | # data and add it to the graph 559 | 560 | connection_ids = list() 561 | 562 | print "Graphing Flows..." 563 | df_conn = readlog("conn.log", connection_ids) 564 | print "Number of events: %d" % len(df_conn.index) 565 | graph_flows(g, df_conn) 566 | 567 | print "Graphing Files..." 568 | df_files = readlog("files.log", connection_ids) 569 | print "Number of events: %d" % len(df_files.index) 570 | graph_files(g, df_files) 571 | 572 | print "Graphing DNS Transactions..." 573 | df_dns = readlog("dns.log", connection_ids) 574 | print "Number of events: %d" % len(df_dns.index) 575 | graph_dns(g, df_dns) 576 | 577 | print "Graphing HTTP Transactions..." 578 | df_http = readlog("http.log", connection_ids) 579 | print "Number of events: %d" % len(df_http.index) 580 | graph_http(g, df_http) 581 | 582 | # Print some basic info about the graph so we know we did some real work 583 | graph_stats(g) 584 | 585 | 586 | 587 | 588 | 589 | 590 | --------------------------------------------------------------------------------