├── LICENSE.txt
├── README.md
├── examples
    ├── conn.log
    ├── load_zeek_log_csv.py
    ├── load_zeek_log_into_elk.py
    ├── load_zeek_log_json.py
    ├── zeek_to_csv.py
    └── zeek_to_json.py
├── parsezeeklogs
    └── __init__.py
├── setup.cfg
└── setup.py


/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Dan Gunter
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # ParseZeekLogs
 2 | A lightweight utility for programmatically reading and manipulating Zeek (Bro) NSM log files and outputting into JSON or CSV format. This library works on both Python 2 and Python 3.
 3 | 
 4 | ## Examples
 5 | The following example first loads records from the Zeek connection log named conn.log. The data is the written out to a file named out.json. The name of the log file to read must be provided when creating the ParseBroLog class. You can use the safe_headers=True option in the to_json method to replace all instances of a dot with an underscore.
 6 | ```python
 7 | from parsezeeklogs import ParseZeekLogs
 8 | 
 9 | with open('out.json',"w") as outfile:
10 |     for log_record in ParseZeekLogs("conn.log", output_format="json", safe_headers=False):
11 |         if log_record is not None:
12 |             outfile.write(log_record + "\n")
13 | ```
14 | 
15 | This is another example that instead uses the csv output method to write the data out to a file named out.csv. This example shows filtering on specific fields within the log file. Field names should be provided as list elements.
16 | 
17 | ```python
18 | from parsezeeklogs import ParseZeekLogs
19 | 
20 | with open('out.csv',"w") as outfile:
21 |     for log_record in ParseZeekLogs("conn.log", output_format="csv", safe_headers=False, fields=["ts","id.orig_h","id.orig_p","id.resp_h","id.resp_p"]):
22 |         if log_record is not None:
23 |             outfile.write(log_record + "\n")
24 | ```
25 | 
26 | ## Special Thanks
27 | * [@geekscrapy](https://github.com/geekscrapy): For bug fixes and the safe header feature addition
28 | 


--------------------------------------------------------------------------------
/examples/load_zeek_log_csv.py:
--------------------------------------------------------------------------------
1 | from parsezeeklogs import ParseZeekLogs
2 | 
3 | log_iterator = ParseZeekLogs("conn.log", output_format="csv", safe_headers=False)
4 | # Print the field line out
5 | print(log_iterator.get_fields())
6 | for log_record in log_iterator:
7 |     if log_record is not None:
8 |         print(str(log_record))


--------------------------------------------------------------------------------
/examples/load_zeek_log_into_elk.py:
--------------------------------------------------------------------------------
1 | from parsezeeklogs import ParseZeekLogs
2 | 
3 | ParseZeekLogs.batch_to_elk("http.log", meta={"source": "http"})
4 | 


--------------------------------------------------------------------------------
/examples/load_zeek_log_json.py:
--------------------------------------------------------------------------------
1 | from parsezeeklogs import ParseZeekLogs
2 | 
3 | # Print the field line out
4 | for log_record in ParseZeekLogs("conn.log", output_format="json", safe_headers=False):
5 |     if log_record is not None:
6 |         print(str(log_record))
7 | 


--------------------------------------------------------------------------------
/examples/zeek_to_csv.py:
--------------------------------------------------------------------------------
1 | from parsezeeklogs import ParseZeekLogs
2 | 
3 | with open('out.csv',"w") as outfile:
4 |     for log_record in ParseZeekLogs("conn.log", output_format="csv", safe_headers=False, fields=["ts","id.orig_h","id.orig_p","id.resp_h","id.resp_p"]):
5 |         if log_record is not None:
6 |             outfile.write(log_record + "\n")
7 | 


--------------------------------------------------------------------------------
/examples/zeek_to_json.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # File: zeek_to_json.py
 4 | # Description: Convert multiple bro files to json
 5 | #
 6 | # Usage: 
 7 | #   ./zeek_to_json.py *.log
 8 | #  ^^ will convert all zeek logs in this directory to same filename + .json (example: conn.log -> conn.json)
 9 | 
10 | from parsezeeklogs import ParseZeekLogs
11 | import argparse
12 | import os
13 | 
14 | if __name__ == "__main__":
15 |     # Create argument parser
16 |     parser = argparse.ArgumentParser()
17 |     parser.add_argument('zeekfile',  nargs='+', help="Zeek file file to parse")
18 | 
19 |     # Parse arguments 
20 |     args = parser.parse_args()
21 | 
22 |     # Loop through each Bro file and convert to json
23 |     for zeek_file in args.zeekfile:
24 |         outfilename = os.path.splitext(zeek_file)[0] + '.json'
25 | 
26 |         #log_data = ParseBroLogs(bro_file)
27 |         with open(outfilename, "w") as outfile:
28 |             for log_data in ParseZeekLogs(zeek_file, output_format="json"):
29 |                 outfile.write(log_data)
30 | 


--------------------------------------------------------------------------------
/parsezeeklogs/__init__.py:
--------------------------------------------------------------------------------
  1 | from json import loads, dumps
  2 | from collections import OrderedDict
  3 | from elasticsearch import Elasticsearch, helpers
  4 | from datetime import datetime
  5 | from traceback import print_exc
  6 | 
  7 | 
  8 | class ParseZeekLogs(object):
  9 |     """Class that parses Zeek logs and allows log data to be output in CSV or json format.
 10 | 
 11 |     Attributes:
 12 |         filepath: Path of Zeek log file to read
 13 | 
 14 |     """
 15 | 
 16 |     def __init__(self, filepath, batchsize=500, fields=None, output_format=None, ignore_keys=[], meta={}, safe_headers=False):
 17 |         self.fd = open(filepath,"r")
 18 |         self.options = OrderedDict()
 19 |         self.firstRun = True
 20 |         self.filtered_fields = fields
 21 |         self.batchsize = batchsize
 22 |         self.output_format = output_format
 23 |         self.ignore_keys = ignore_keys
 24 |         self.meta = meta
 25 |         self.safe_headers = safe_headers
 26 | 
 27 |         # Convert ' to " in meta string
 28 |         meta = loads(dumps(meta).replace("'", '"'))
 29 | 
 30 |         # Read the header option lines
 31 |         l = self.fd.readline().strip("\n")
 32 |         while l.strip().startswith("#"):
 33 |             # Parse the options out
 34 |             if l.startswith("#separator"):
 35 |                 key = str(l[1:].split(" ")[0])
 36 |                 value = str.encode(l[1:].split(" ")[1].strip()).decode('unicode_escape')
 37 |                 self.options[key] = value
 38 |             elif l.startswith("#"):
 39 |                 key = str(l[1:].split(self.options.get('separator'))[0])
 40 |                 value = l[1:].split(self.options.get('separator'))[1:]
 41 |                 self.options[key] = value
 42 | 
 43 |             # Read the next line
 44 |             l = self.fd.readline().strip("\n")
 45 | 
 46 |         self.firstLine = l
 47 | 
 48 |         # Save mapping of fields to values:
 49 |         self.fields = self.options.get('fields')
 50 |         self.types = self.options.get('types')
 51 | 
 52 |         # Convert field names if safe_headers is enabled
 53 |         #if self.safe_headers is True:
 54 |         #    for i, val in enumerate(self.fields):
 55 |         #        self.fields[i] = self.fields[i].replace(".", "_")
 56 | 
 57 |         self.data_types = {}
 58 |         for i, val in enumerate(self.fields):
 59 |             # Convert field names if safe_headers is enabled
 60 |             if self.safe_headers is True:
 61 |                 self.fields[i] = self.fields[i].replace(".", "_")
 62 | 
 63 |             # Match types with each other
 64 |             self.data_types[self.fields[i]] = self.types[i]
 65 | 
 66 |     def __del__(self):
 67 |         self.fd.close()
 68 | 
 69 |     def __iter__(self):
 70 |         return self
 71 | 
 72 |     def __next__(self):
 73 |         retVal = ""
 74 |         if self.firstRun is True:
 75 |             retVal = self.firstLine
 76 |             self.firstRun = False
 77 |         else:
 78 |             retVal = self.fd.readline().strip("\n")
 79 | 
 80 |         # If an empty string is returned, readline is done reading
 81 |         if retVal == "" or retVal is None:
 82 |             raise StopIteration
 83 | 
 84 |         # Split out the data we are going to return
 85 |         retVal = retVal.split(self.options.get('separator'))
 86 | 
 87 |         record = None
 88 |         # Make sure we aren't dealing with a comment line
 89 |         if len(retVal) > 0 and not str(retVal[0]).strip().startswith("#") \
 90 |                 and len(retVal) is len(self.options.get("fields")):
 91 |             record = OrderedDict()
 92 |             # Prepare fields for conversion
 93 |             for x in range(0, len(retVal)):
 94 |                 if self.safe_headers is True:
 95 |                     converted_field_name = self.options.get("fields")[x].replace(".", "_")
 96 |                 else:
 97 |                     converted_field_name = self.options.get("fields")[x]
 98 |                 if self.filtered_fields is None or converted_field_name in self.filtered_fields:
 99 |                     # Translate - to "" to fix a conversation error
100 |                     if retVal[x] == "-":
101 |                         retVal[x] = ""
102 |                     # Save the record field if the field isn't filtered out
103 |                     record[converted_field_name] = retVal[x]
104 | 
105 |             # Convert values to the appropriate record type
106 |             record = self.convert_values(record, self.ignore_keys, self.data_types)
107 | 
108 |             if record is not None and self.output_format == "json":
109 |                 # Output will be json
110 | 
111 |                 # Add metadata to json
112 |                 for k, v in self.meta.items():
113 |                     record[k] = v
114 | 
115 |                 retVal = dumps(record)
116 |             elif record is not None and self.output_format == "csv":
117 |                 retVal = ""
118 |                 # Add escaping to csv format
119 |                 for k, v in record.items():
120 |                     # Add escaping to string values
121 |                     if isinstance(v, str):
122 |                         retVal += str("\"" + str(v).strip() + "\"" + ",")
123 |                     else:
124 |                         retVal += str(str(v).strip() + ",")
125 |                 # Remove the trailing comma
126 |                 retVal = retVal[:-1]
127 |         else:
128 |             retVal = None
129 | 
130 |         return retVal
131 | 
132 |     def convert_values(self, data, ignore_keys=[], data_types={}):
133 |         keys_to_delete = []
134 |         for k, v in data.items():
135 |             # print("evaluating k: " + str(k) + " v: " + str(v))
136 | 
137 |             if isinstance(v, dict):
138 |                 data[k] = self.convert_values(v)
139 |             else:
140 |                 if data_types.get(k) is not None:
141 |                     if (data_types.get(k) == "port" or data_types.get(k) == "count"):
142 |                         if v != "":
143 |                             data[k] = int(v)
144 |                         else:
145 |                             keys_to_delete.append(k)
146 |                     elif (data_types.get(k) == "double" or data_types.get(k) == "interval"):
147 |                         if v != "":
148 |                             data[k] = float(v)
149 |                         else:
150 |                             keys_to_delete.append(k)
151 |                     elif data_types.get(k) == "bool":
152 |                         data[k] = bool(v)
153 |                     else:
154 |                         data[k] = v
155 | 
156 |         for k in keys_to_delete:
157 |             del data[k]
158 | 
159 |         return data
160 | 
161 |     def get_fields(self):
162 |         """Returns all fields present in the log file
163 | 
164 |         Returns:
165 |             A python list containing all field names in the log file
166 |         """
167 |         field_names = ""
168 |         if self.output_format == "csv":
169 |             for i, v in enumerate(self.fields):
170 |                 if self.filtered_fields is None or v in self.filtered_fields:
171 |                     field_names += str(v) + ","
172 |             # Remove the trailing comma
173 |             field_names = field_names[:-1].strip()
174 |         else:
175 |             field_names = []
176 |             for i, v in enumerate(self.fields):
177 |                 if self.filtered_fields is None or v in self.filtered_fields:
178 |                     field_names.append(v)
179 |         return field_names
180 | 
181 |     @staticmethod
182 |     def bulk_to_elasticsearch(es, bulk_queue):
183 |         try:
184 |             helpers.bulk(es, bulk_queue)
185 |             return True
186 |         except:
187 |             print(print_exc())
188 |             return False
189 | 
190 |     @staticmethod
191 |     def batch_to_elk(filepath=None, batch_size=500, fields=None, elk_ip="127.0.0.1", index="zeeklogs", meta={},
192 |                      ignore_keys=[]):
193 |         # Create handle to ELK
194 |         es = Elasticsearch([elk_ip])
195 | 
196 |         # Create a handle to the log data
197 |         dataHandle = ParseZeekLogs(filepath, fields=fields, output_format="json", meta=meta)
198 | 
199 |         # Begin to process and output data
200 |         dataBatch = []
201 |         for record in dataHandle:
202 |             try:
203 |                 record = loads(record)
204 | 
205 |                 if isinstance(record, dict):
206 |                     record["_index"] = index
207 |                     record["_type"] = index
208 |                     try:
209 |                         record['timestamp'] = datetime.utcfromtimestamp(float(record['ts'])).isoformat()
210 |                     except:
211 |                         pass
212 | 
213 |                     dataBatch.append(record)
214 | 
215 |                     if len(dataBatch) >= batch_size:
216 |                         # Batch the queue to ELK
217 |                         # print("Batching to elk: " + str(len(dataBatch)))
218 |                         dataHandle.bulk_to_elasticsearch(es, dataBatch)
219 |                         # Clear the data queue
220 |                         dataBatch = []
221 |             except:
222 |                 pass
223 | 
224 |         # Batch the final data to ELK
225 |         # print("Batching final data to elk: " + str(len(dataBatch)))
226 |         dataHandle.bulk_to_elasticsearch(es, dataBatch)
227 |         # Clear the data queue
228 |         dataBatch = []
229 | 
230 |     def __str__(self):
231 |         return dumps(self.data)
232 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | setup(
 3 |     name='parsezeeklogs',
 4 |     version='2.0.1',
 5 |     description='A lightweight utility for programmatically reading and manipulating Zeek IDS (Bro IDS) log files and outputting into JSON or CSV format.',
 6 |     author='Dan Gunter',
 7 |     author_email='dangunter@gmail.com',
 8 |     url='https://github.com/dgunter/parsezeeklogs',
 9 |     packages=find_packages(include=['parsezeeklogs', 'parsezeeklogs.*']),
10 |     install_requires=[
11 |         'elasticsearch==7.16.1'
12 |     ],
13 | )
14 | 
15 | 


--------------------------------------------------------------------------------