├── .gitignore ├── LICENSE ├── README.md ├── README.txt ├── run_tests.py ├── setup.cfg ├── setup.py ├── src ├── __init__.py └── log_scraper │ ├── __init__.py │ ├── base.py │ └── consts.py └── tests └── test_log_scraper.py /.gitignore: -------------------------------------------------------------------------------- 1 | # ignore compiled python 2 | *.pyc 3 | 4 | # ignore eggs 5 | *.egg-info 6 | 7 | # ignore vim tmp files 8 | .*.swp 9 | *~ 10 | .*~ 11 | 12 | # ignore output of build / tests 13 | .coverage 14 | build/* 15 | dist/* 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, RohitK89 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LogScraper 2 | A generic library for gathering stats from log files by running regexes on them. 3 | Things you can do: 4 | * Create and run any number of regexes on any number of files in parallel. 5 | * Aggregate stats by creating named regex groups in your regexes 6 | * Grab archived logs (so long as you tell it where your archives live) 7 | * Grab files from remote boxes 8 | * Print stats to console 9 | * Print regex matches to console 10 | * Search on gzipped files 11 | 12 | ## Installation 13 | The easiest manner of installation is to grab the package from the PyPI repository. 14 | 15 | ``` 16 | pip install log_scraper 17 | ``` 18 | 19 | ## Usage 20 | #### Base Usage 21 | For off the cuff usage, you can just create a LogScraper object and tell it what regexes to run 22 | and where to look for files. Eg. 
23 | 24 | ```python 25 | from log_scraper.base import LogScraper 26 | import log_scraper.consts as LSC 27 | 28 | filepath = '/path/to/file' 29 | filename = 'filename.ext' 30 | scraper = LogScraper(default_filepath={LSC.DEFAULT_PATH : filepath, LSC.DEFAULT_FILENAME : filename}) 31 | scraper.add_regex(name='regex1', pattern=r'your_regex_here') 32 | 33 | # To get aggregated stats 34 | data = scraper.get_log_data() 35 | 36 | # To print all the stats 37 | scraper.print_total_stats(data) 38 | 39 | # To print each file's individual stats 40 | scraper.print_stats_per_file(data) 41 | 42 | # To view log lines matching the regex 43 | scraper.view_regex_matches(scraper.get_regex_matches()) 44 | ``` 45 | 46 | The real power, though, is in creating your own class deriving from LogScraper that presets 47 | the paths and the regexes to run so that anyone can then use that anywhere to mine data from 48 | a process' logs. 49 | 50 | 51 | ## Development 52 | ### Dependencies 53 | * Python 2.7 54 | * [paramiko](http://paramiko-www.readthedocs.org/en/latest/index.html) 55 | 56 | ### Testing 57 | To test successfully, you must set up a virtual environment 58 | On Unix, in the root folder for the package, do the following: 59 | ``` 60 | python -m virtualenv . 61 | source ./bin/activate 62 | ./bin/python setup.py develop 63 | ``` 64 | 65 | Now you can make any changes you want and then run the unit-tests by doing: 66 | 67 | ``` 68 | ./bin/python setup.py test 69 | ``` 70 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | LogScraper 2 | ========== 3 | 4 | A generic library for gathering stats from log files by running regexes 5 | on them. Things you can do: \* Create and run any number of regexes on 6 | any number of files in parallel. \* Aggregate stats by creating named 7 | regex groups in your regexes \* Grab archived logs (so long as you tell 8 | it where your archives live) \* Grab files from remote boxes \* Print 9 | stats to console \* Print regex matches to console \* Search on gzipped 10 | files 11 | 12 | Installation 13 | ------------ 14 | 15 | The easiest manner of installation is to grab the package from the PyPI 16 | repository. 17 | 18 | :: 19 | 20 | pip install log_scraper 21 | 22 | Usage 23 | ----- 24 | 25 | Base Usage 26 | ^^^^^^^^^^ 27 | 28 | For off the cuff usage, you can just create a LogScraper object and tell 29 | it what regexes to run and where to look for files. Eg. 30 | 31 | :: 32 | 33 | from log_scraper.base import LogScraper 34 | import log_scraper.consts as LSC 35 | 36 | filepath = '/path/to/file' 37 | filename = 'filename.ext' 38 | scraper = LogScraper(default_filepath={LSC.DEFAULT_PATH : filepath, LSC.DEFAULT_FILENAME : filename}) 39 | scraper.add_regex(name='regex1', pattern=r'your_regex_here') 40 | 41 | # To get aggregated stats 42 | data = scraper.get_log_data() 43 | 44 | # To print all the stats 45 | scraper.print_total_stats(data) 46 | 47 | # To print each file's individual stats 48 | scraper.print_stats_per_file(data) 49 | 50 | # To view log lines matching the regex 51 | scraper.view_regex_matches(scraper.get_regex_matches()) 52 | 53 | The real power, though, is in creating your own class deriving from 54 | LogScraper that presets the paths and the regexes to run so that anyone 55 | can then use that anywhere to mine data from a process' logs. 
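A minimal sketch of such a subclass (the class name, log path and regex
below are illustrative, not part of the library):

::

    from log_scraper.base import LogScraper
    import log_scraper.consts as LSC

    class MyAppScraper(LogScraper):
        '''Scraper with the paths and regexes for my_app preset.'''

        def __init__(self, user_params=None):
            default_filepath = {LSC.DEFAULT_PATH: '/var/log/my_app',
                                LSC.DEFAULT_FILENAME: 'my_app*.log'}
            super(MyAppScraper, self).__init__(default_filepath=default_filepath,
                                               user_params=user_params)

        def _init_regexes(self):
            '''Regexes every invocation of this scraper should run.'''
            self.add_regex(name='errors', pattern=r'ERROR (?P<error_type>\w+)')

    data = MyAppScraper().get_log_data()

Callers then get the same get_log_data / print_total_stats interface without
needing to know where the logs live. Other hooks such as
_get_archived_file_path and _validate_user_params can be overridden in the
same way.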
56 | 57 | Development 58 | ----------- 59 | 60 | Dependencies 61 | ~~~~~~~~~~~~ 62 | 63 | - Python 2.7 64 | - `paramiko `_ 65 | 66 | Testing 67 | ~~~~~~~ 68 | 69 | To test successfully, you must set up a virtual environment On Unix, in 70 | the root folder for the package, do the following: 71 | ``python -m virtualenv . source ./bin/activate ./bin/python setup.py develop`` 72 | 73 | Now you can make any changes you want and then run the unit-tests by 74 | doing: 75 | 76 | :: 77 | 78 | ./bin/python setup.py test 79 | 80 | -------------------------------------------------------------------------------- /run_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | 6 | import nose 7 | 8 | def main(argv=None): 9 | if argv is None: 10 | argv = ['nosetests', '--cover-erase', '--with-coverage', 11 | '--cover-package=fds.log_scraper'] 12 | 13 | nose.run_exit(argv=argv, 14 | defaultTest=os.path.join(os.path.dirname(__file__), 'tests')) 15 | 16 | if __name__ == '__main__': 17 | main(sys.argv) 18 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.txt 3 | 4 | [bdist_rpm] 5 | doc_files = README.txt 6 | LICENSE 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open('README.txt') as file: 4 | long_description = file.read() 5 | 6 | setup( 7 | name ='log_scraper', 8 | version ='0.9.9', 9 | install_requires =['paramiko'], 10 | package_dir ={'': 'src'}, 11 | packages =find_packages('src'), 12 | namespace_packages =['log_scraper'], 13 | 14 | # metadata for upload to PyPI 15 | author ='Rohit Kapur', 16 | author_email ='rohitkapur@rohitkapur.com', 17 | maintainer ='Rohit Kapur', 18 | maintainer_email ='rohitkapur@rohitkapur.com', 19 | description =( 20 | 'A base library for writing your own log scraper, ' 21 | 'i.e. something that can run regexes over files ' 22 | 'and give you meaningful information like stats. ' 23 | 'Add your own regexes and plug and play. ' 24 | 'See the readme for more information.' 
25 | ), 26 | long_description = long_description, 27 | license ='Simplified BSD License', 28 | platforms =['UNIX', 'OS X', 'Windows'], 29 | url ='https://github.com/RohitK89/LogScraper/', 30 | download_url ='https://github.com/RohitK89/LogScraper/tarball/0.9.6', 31 | keywords =['log scraper','logs','regex','stats','grep'], 32 | test_suite ='run_tests.main', 33 | tests_require =['coverage', 'nose'] 34 | ) 35 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | __import__('pkg_resources').declare_namespace(__name__) 2 | -------------------------------------------------------------------------------- /src/log_scraper/__init__.py: -------------------------------------------------------------------------------- 1 | __import__('pkg_resources').declare_namespace(__name__) 2 | -------------------------------------------------------------------------------- /src/log_scraper/base.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Log Scraper 3 | 4 | Dependencies: 5 | * Python 2.7 6 | * paramiko for SSHing to remote hosts 7 | 8 | The LogScraper class provides a plug and play experience to mine data from logs. 9 | So long as you give it a regex and a file to run on, you'll be good to go. 10 | You can also build your own scraper on top of the LogScraper class. 11 | For example, if you want to create a scraper that always runs the same regexes 12 | on the same files (unless the user requests otherwise), you can override 13 | the _init_regexes method in your own derived class. 14 | 15 | There are a host of extra parameters you can set when creating your scraper. 16 | For a full list with explanations, please see the log_scraper_consts module. 17 | 18 | For stats aggregation, just create named groups in your regexes. 19 | The scraper will then aggregate all hits for each value of each named group. 20 | 21 | If run on several files, the returned dataset will be the aggregated data 22 | from all the files for each regex, along with each individual file's data. 23 | 24 | For usage examples, please see the unit-tests. 25 | ''' 26 | 27 | from Crypto.pct_warnings import CryptoRuntimeWarning 28 | from datetime import date 29 | from glob import glob 30 | from multiprocessing import Pool 31 | from operator import itemgetter 32 | import collections 33 | import contextlib 34 | import copy_reg 35 | import gzip 36 | import logging 37 | import os 38 | import re 39 | import socket 40 | import sys 41 | import threading 42 | import time 43 | import types 44 | import warnings 45 | import log_scraper.consts as LSC 46 | 47 | # C'est la vie... 48 | warnings.filterwarnings('ignore', category=CryptoRuntimeWarning) 49 | import paramiko 50 | 51 | LOGGER = logging.getLogger('log_scraper') 52 | _LOGGING_SETUP_LOCK = threading.Lock() 53 | 54 | TIMEOUT = 99999999 55 | 56 | class LogScraperException(Exception): 57 | '''Base LogScraper Exception class''' 58 | pass 59 | 60 | class BadRegexException(Exception): 61 | '''Use for anything to do with bad regexes''' 62 | pass 63 | 64 | class MissingArgumentException(LogScraperException): 65 | '''Use to let caller know they didn't provide a required argument''' 66 | pass 67 | 68 | class InvalidArgumentException(LogScraperException): 69 | '''Use to let the caller know of any bad arguments''' 70 | pass 71 | 72 | class RegexObject(object): 73 | ''' 74 | A wrapper around a regex pattern. 
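The pattern is compiled as soon as the object is constructed, so a bad pattern is reported immediately as a BadRegexException.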
75 | Also provides an easy way to get all the named groups from the regex, 76 | which you can then use for aggregation or what have you 77 | ''' 78 | def __init__(self, name=None, pattern=None): 79 | ''' 80 | Initialize the object. 81 | Throws BadRegexException if the user gives a bad pattern. 82 | ''' 83 | self.name = name 84 | self._pattern = pattern 85 | self._matcher = None 86 | self._create_matcher() 87 | 88 | def __repr__(self): 89 | return 'RegexObject(name={}, pattern={})'.format(self.name, self._pattern) 90 | 91 | def __str__(self): 92 | '''Pretty print info about self''' 93 | return 'Pattern: {}, Groups: {}'.format(self._pattern, self._matcher.groupindex.keys()) 94 | 95 | def _create_matcher(self): 96 | ''' 97 | Compile the regex pattern and update the matcher member variable. 98 | Throws BadRegexException if the user gives a bad pattern. 99 | ''' 100 | try: 101 | self._matcher = re.compile(self._pattern) 102 | except Exception: 103 | raise BadRegexException('Invalid pattern: {}. ' 104 | 'Could not create matcher'.format(self._pattern)) 105 | 106 | def get_matcher(self): 107 | '''Returns the matcher object''' 108 | return self._matcher 109 | 110 | def get_pattern(self): 111 | '''Returns the pattern''' 112 | return self._pattern 113 | 114 | def update_pattern(self, pattern): 115 | ''' 116 | Reset the regex pattern, the compiled matcher and the group dicts. 117 | Throws BadRegexException if the user gives a bad pattern. 118 | ''' 119 | self._pattern = pattern 120 | self._create_matcher() 121 | 122 | def get_groups(self): 123 | '''Returns a list of all named groups found in the regex''' 124 | return self._matcher.groupindex.keys() 125 | 126 | 127 | def _pickle_method(method): 128 | ''' 129 | Define pickling for a custom method and register it. 130 | This is needed because multiprocessing needs to be able to 131 | pickle/unpickle the various LogScraper methods while 132 | processing files in parallel. 133 | ''' 134 | if method.im_self is None: 135 | return getattr, (method.im_class, method.im_func.func_name) 136 | else: 137 | return getattr, (method.im_self, method.im_func.func_name) 138 | 139 | copy_reg.pickle(types.MethodType, _pickle_method) 140 | 141 | class LogScraper(object): 142 | ''' 143 | Base class for a log scraper. 144 | Takes care of everything for you, so long as you provide regexes to run. 145 | If your regexes have named groups in them, it will aggregate stats for 146 | each value found for each named group. 147 | You can set defaults for where the logfile(s) live, 148 | as well as where the archived files live. 149 | You can specify how many days worth of data is kept before archiving, 150 | so that it knows where to look for the files. 151 | Wildcards are acceptable in filepaths. 152 | If files are to be grabbed from various boxes, 153 | it will copy them over to your specified temporary space; 154 | this is faster than just keeping the file open over SSH. 155 | By default, it will refresh the temporary files if they are older than an hour. 156 | Data is returned as a python dict mapping the regexes run to the stats found. 157 | You can print to console by setting the PRINT_STATS key in the options dict, 158 | or just request the data as a dict to work with. 
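A minimal illustration (the path, filename and pattern here are hypothetical):

    scraper = LogScraper(default_filepath={LSC.DEFAULT_PATH: '/var/log/my_app',
                                           LSC.DEFAULT_FILENAME: 'my_app*.log'})
    scraper.add_regex(name='status', pattern=r'status=(?P<code>\d+)')
    data = scraper.get_log_data()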
159 | ''' 160 | 161 | COLORS = { 162 | 'HEADER' : '\033[95m', 163 | 'BLUE' : '\033[94m', 164 | 'GREEN' : '\033[92m', 165 | 'WARNING' : '\033[93m', 166 | 'RED' : '\033[91m', 167 | 'ENDC' : '\033[0m' 168 | } 169 | 170 | def __init__(self, default_filepath=None, optional_params=None, user_params=None): 171 | ''' 172 | default_filepath - Should be a dict containing key-value pairs for: 173 | {LSC.DEFAULT_PATH : , 174 | LSC.DEFAULT_FILENAME : } 175 | Wildcards are allowed in the filename. 176 | 177 | optional_params - Dict that specifies some optional stuff for your scraper. 178 | These should be values that are constant for your scraper 179 | and are not invocation dependent. 180 | For the common optional params, see LSC.OPTIONAL_PARAMS. 181 | 182 | user_params - Dict of all values that are invocation-dependent 183 | ''' 184 | if default_filepath is None: 185 | default_filepath = {} 186 | self._default_path = default_filepath.get(LSC.DEFAULT_PATH, '') 187 | default_filename = default_filepath.get(LSC.DEFAULT_FILENAME, '') 188 | 189 | self._default_filename, self._default_ext = os.path.splitext(default_filename) 190 | self._user_params = user_params if user_params else {} 191 | self._optional_params = {} 192 | 193 | self._init_logger() 194 | 195 | self._init_optional_params(optional_params) 196 | self._validate_user_params() 197 | 198 | self._regexes = [] 199 | self._init_regexes() 200 | 201 | self._file_list = [] 202 | 203 | def __repr__(self): 204 | return ('LogScraper(default_filename={}, default_filepath={}, ' 205 | 'optional_params={}, ' 206 | 'user_params={}'.format(self._default_filename, self._default_path, 207 | self._optional_params, self._user_params)) 208 | 209 | 210 | def __str__(self): 211 | return ('Regexes: {}\n' 212 | 'Default filename: {}\n' 213 | 'Default filepath: {}\n' 214 | 'Optional params: {}\n' 215 | 'User params: {}'.format(self._regexes, self._default_filename, 216 | self._default_path, self._optional_params, 217 | self._user_params)) 218 | 219 | # public: 220 | 221 | def add_regex(self, name, pattern): 222 | ''' 223 | Add a regex to the list of regexes to run. 224 | Throws BadRegexException if the user gives a bad pattern. 225 | ''' 226 | self._regexes.append(RegexObject(name=name, pattern=pattern)) 227 | 228 | def clear_regexes(self): 229 | '''Resets the list of regexes to run''' 230 | self._regexes = [] 231 | 232 | def get_log_data(self): 233 | ''' 234 | Main driver function for scraping logs. 
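Runs every regex registered via add_regex over every file in the final file list, processing the files in parallel.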
235 | Returns the data as a dict 236 | ''' 237 | 238 | #Make sure there's some files to run on 239 | self._file_list = self._get_file_list() 240 | try: 241 | self._validate_file_list() 242 | except InvalidArgumentException as err: 243 | LOGGER.error('InvalidArgumentException: %s', err) 244 | return None 245 | 246 | regex_hits = {} 247 | regex_hits[LSC.REGEXES] = {} 248 | 249 | for regex in self._regexes: 250 | regex_hits[LSC.REGEXES][regex.name] = {} 251 | regex_hits[LSC.REGEXES][regex.name][LSC.GROUP_HITS] = {} 252 | for group in regex.get_groups(): 253 | regex_hits[LSC.REGEXES][regex.name][LSC.GROUP_HITS][group] = \ 254 | collections.OrderedDict() 255 | 256 | if self._user_params.get(LSC.DEBUG): 257 | self._print_regex_patterns() 258 | 259 | results = self._multiprocess_files(self._process_file_for_aggregates) 260 | 261 | if results is None: 262 | return None 263 | 264 | for result in results: 265 | for regex_name, hits in result[LSC.REGEXES].items(): 266 | self._combine_hits(hits, regex_hits[LSC.REGEXES][regex_name]) 267 | 268 | #Sort the group data 269 | for hits in regex_hits[LSC.REGEXES].values(): 270 | if LSC.GROUP_HITS in hits: 271 | for group, group_hits in hits[LSC.GROUP_HITS].items(): 272 | hits[LSC.GROUP_HITS][group] = \ 273 | collections.OrderedDict(sorted(group_hits.iteritems())) 274 | 275 | if len(results) > 1: 276 | regex_hits[LSC.FILE_HITS] = results 277 | 278 | return regex_hits 279 | 280 | def get_regexes(self): 281 | '''Returns the list of regexes stored''' 282 | return self._regexes 283 | 284 | def get_regex_matches(self): 285 | ''' 286 | Returns a dict with all the regex matches found for each file 287 | ''' 288 | 289 | #Make sure there's some files to run on 290 | self._file_list = self._get_file_list() 291 | try: 292 | self._validate_file_list() 293 | except InvalidArgumentException as err: 294 | LOGGER.error('InvalidArgumentException: %s', err) 295 | return None 296 | 297 | if self._user_params.get(LSC.DEBUG, None): 298 | self._print_regex_patterns() 299 | matches = self._multiprocess_files(self._process_file_for_matches) 300 | return matches 301 | 302 | def get_user_params(self): 303 | '''Getter for user_params''' 304 | return self._user_params 305 | 306 | def print_stats_per_file(self, regex_hits, out=sys.stdout): 307 | '''Prints stats for each file separately''' 308 | if regex_hits is None: 309 | return 310 | for result in regex_hits[LSC.FILE_HITS]: 311 | out.write('File: {}\n'.format(result[LSC.FILENAME])) 312 | self._pretty_print(result[LSC.REGEXES], self._user_params, out) 313 | 314 | def print_total_stats(self, regex_hits, out=sys.stdout): 315 | '''Prints the total stats''' 316 | if regex_hits is None: 317 | return 318 | self._pretty_print(regex_hits[LSC.REGEXES], self._user_params, out) 319 | for regex_name, hits in regex_hits[LSC.REGEXES].items(): 320 | out.write(self.COLORS['GREEN']) 321 | out.write('Total hits for regex {}: {:,}\n'.format(regex_name.capitalize(), 322 | hits[LSC.TOTAL_HITS])) 323 | out.write(self.COLORS['ENDC']) 324 | 325 | 326 | def set_user_params(self, user_params): 327 | ''' 328 | Setter for the user params 329 | Throws: InvalidArgumentException 330 | ''' 331 | self._user_params = user_params 332 | self._validate_user_params() 333 | 334 | def view_regex_matches(self, out=sys.stdout): 335 | ''' 336 | Prints out all the lines that match the regexes in the file list properly 337 | ''' 338 | matches = self.get_regex_matches() 339 | for file_matches in matches: 340 | for regex_name, regex_data in file_matches[LSC.REGEXES].items(): 341 | 
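# regex_data[LSC.MATCHES] holds the raw log lines this regex matched in the current file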
out.write('Regex: {}\n'.format(regex_name)) 342 | out.write('Matches:\n') 343 | if regex_data[LSC.MATCHES] == []: 344 | out.write('{}-No matches\n'.format(file_matches[LSC.FILENAME])) 345 | else: 346 | for match in regex_data[LSC.MATCHES]: 347 | out.write('{}-{}'.format(file_matches[LSC.FILENAME], match)) 348 | 349 | # private: 350 | 351 | def _init_base_logger(self): 352 | '''Creates the base logger''' 353 | log_level = logging.INFO 354 | if self._user_params.get(LSC.DEBUG, None): 355 | log_level = logging.DEBUG 356 | 357 | LOGGER.setLevel(log_level) 358 | # create console handler 359 | handler = logging.StreamHandler(sys.stdout) 360 | handler.setLevel(log_level) 361 | # create formatter 362 | formatter = logging.Formatter('[%(asctime)s][%(levelname)s] %(message)s', 363 | datefmt='%Y%m%d %H:%M:%S') 364 | # add formatter to ch 365 | handler.setFormatter(formatter) 366 | # add ch to logger 367 | LOGGER.addHandler(handler) 368 | 369 | def _init_logger(self): 370 | '''Sets the format of the logging''' 371 | with _LOGGING_SETUP_LOCK: 372 | if not LOGGER.handlers: 373 | self._init_base_logger() 374 | LOGGER.propagate = False 375 | # Set paramiko logging to only show warnings and higher 376 | paramiko_logger = logging.getLogger("paramiko") 377 | if paramiko_logger.level == logging.NOTSET: 378 | paramiko_logger.setLevel(logging.WARNING) 379 | 380 | def _init_optional_params(self, opt): 381 | ''' 382 | Initializes all optional params with given values or defaults. 383 | ''' 384 | if opt is None: 385 | opt = {} 386 | for param, default in LSC.OPTIONAL_PARAMS.items(): 387 | self._optional_params[param] = opt.get(param, default) 388 | 389 | # Methods you should implement for your own scraper 390 | def _init_regexes(self): 391 | '''This is where you write the logic for what regexes to run''' 392 | pass 393 | 394 | 395 | def _get_archived_file_path(self): 396 | '''Should return where your archived files live''' 397 | pass 398 | 399 | def _validate_user_params(self): 400 | ''' 401 | Make sure that all user-given values make sense. 402 | Should throw InvalidArgumentException with a descriptive message otherwise. 403 | Call this in your derived class constructor.''' 404 | pass 405 | 406 | def _are_logs_archived(self, log_date): 407 | ''' 408 | Returns whether logs are on netapp or on local box. 409 | Always returns false if days_before_archiving is zero. 
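log_date is expected as a YYYYMMDD string (e.g. '20150301'); it is compared against today's date to decide whether the archive path should be searched.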
410 | ''' 411 | 412 | if self._optional_params[LSC.DAYS_BEFORE_ARCHIVING] == 0 or log_date is None: 413 | return False 414 | 415 | today = date.today() 416 | date_obj = date(int(log_date[:-4]), int(log_date[-4:-2]), int(log_date[-2:])) 417 | 418 | delta = today - date_obj 419 | if delta.days >= self._optional_params[LSC.DAYS_BEFORE_ARCHIVING]: 420 | return True 421 | 422 | return False 423 | 424 | @classmethod 425 | def _calc_stats(cls, items): 426 | '''Calculates the min, max and average items processed per key''' 427 | 428 | ret_dict = {LSC.MAX_KEY : 0, LSC.MIN_KEY : 0, LSC.MAX_COUNT : 0, 429 | LSC.MIN_COUNT : 0, LSC.AVG_COUNT : 0} 430 | 431 | if items is None or len(items) == 0: 432 | return ret_dict 433 | 434 | max_key, max_count = max(items.iteritems(), key=itemgetter(1)) 435 | min_key, min_count = min(items.iteritems(), key=itemgetter(1)) 436 | total = sum(items.itervalues()) 437 | count = len(items) 438 | 439 | avg_count = float(total)/count 440 | 441 | ret_dict[LSC.MAX_KEY] = max_key 442 | ret_dict[LSC.MIN_KEY] = min_key 443 | ret_dict[LSC.MAX_COUNT] = max_count 444 | ret_dict[LSC.MIN_COUNT] = min_count 445 | ret_dict[LSC.AVG_COUNT] = avg_count 446 | 447 | return ret_dict 448 | 449 | @classmethod 450 | def _combine_hits(cls, match_groups, combining_dict): 451 | ''' 452 | Aggregates all matches for each key found in match_groups 453 | into combining_dict 454 | ''' 455 | 456 | for group, hits in match_groups.items(): 457 | if isinstance(hits, collections.Mapping): 458 | cls._combine_hits(match_groups[group], combining_dict[group]) 459 | else: 460 | if combining_dict.get(group, None): 461 | combining_dict[group] += hits 462 | else: 463 | combining_dict[group] = hits 464 | 465 | @classmethod 466 | def _copy_remote_file(cls, filepath, local_file, box): 467 | '''Creates an SSH connection and copies filepath to local_file''' 468 | ssh = cls._open_ssh_connection(box) 469 | if ssh is None: 470 | return '' 471 | with contextlib.closing(ssh): 472 | with contextlib.closing(ssh.open_sftp()) as sftp: 473 | #Temporarily copy file to current box. 474 | #This is being done because reading the file over SSH 475 | #slows everything down insanely. 476 | sftp.get(filepath, local_file) 477 | 478 | def _gen_lines(self, filename): 479 | '''Generator that yields one line at a time from a file''' 480 | with self._get_file_handle(filename) as handle: 481 | for line in handle: 482 | yield line 483 | 484 | def _get_box_from_level(self, level): 485 | '''Returns the mapped box name for the given production level''' 486 | return self._optional_params[LSC.LEVELS_TO_BOXES].get(level, None) 487 | 488 | @classmethod 489 | def _get_file_handle(cls, log_file): 490 | ''' 491 | Returns a handle connected to the given file. 492 | Needed because it grabs over ssh if needed, 493 | and also checks to see if the given file is a gzip file, 494 | in which case, some fancy stuff is needed to open it properly. 495 | The first two characters of the header are inspected to see 496 | whether the file is a gzipped file or plaintext. 497 | ''' 498 | LOGGER.info('Opening file %s', log_file) 499 | 500 | handle = open(log_file, 'rb') 501 | if handle.read(2) == '\x1f\x8b': 502 | handle.seek(0) 503 | handle = gzip.GzipFile(fileobj=handle) 504 | else: 505 | handle.seek(0) 506 | return handle 507 | 508 | def _get_file_list(self): 509 | '''Checks the default filename or wildcard search and the prod level set, 510 | and returns a list of all files found on the relevant box at the 511 | given path. 
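Remote directory listings are filtered with the FILENAME_REGEX optional param, while local paths are expanded with glob.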
If no level value is given, looks on current box''' 512 | 513 | file_list = list() 514 | level = self._user_params.get(LSC.LEVEL, None) 515 | log_date = self._user_params.get(LSC.DATE, None) 516 | filename = self._user_params.get(LSC.FILENAME, None) 517 | 518 | if (level is not None 519 | and not self._are_logs_archived(log_date)): 520 | if (self._optional_params.get(LSC.FORCE_COPY, False) 521 | or socket.gethostname() != self._get_box_from_level(level)): 522 | ssh = self._open_ssh_connection(self._get_box_from_level(level)) 523 | if ssh is None: 524 | return file_list 525 | with contextlib.closing(ssh): 526 | with contextlib.closing(ssh.open_sftp()) as sftp: 527 | filename_regex = \ 528 | self._make_file_name(self._optional_params[LSC.FILENAME_REGEX], 529 | log_date, level) 530 | 531 | files = sftp.listdir(self._default_path) 532 | for name in files: 533 | match = re.match(filename_regex, str(name)) 534 | if match is not None: 535 | file_list.append(os.path.join(self._default_path, match.group())) 536 | sftp.close() 537 | ssh.close() 538 | return file_list 539 | 540 | #By default, let's look at the default_filepath 541 | if filename is None: 542 | filename = self._make_file_path() 543 | file_list = glob(filename) 544 | 545 | else: 546 | files = filename.split(',') 547 | for file_iter in files: 548 | file_list += glob(file_iter) 549 | file_list = sorted([f for f in file_list if os.path.isfile(f)]) 550 | 551 | return file_list 552 | 553 | def _get_log_file(self, log_file): 554 | ''' 555 | Copies the log file from the appropriate box to local temp space. 556 | Returns path to local file. 557 | Doesn't copy if it finds a local file already that is less than 558 | local_copy_lifetime_in_hours, 559 | which is an int value specifying how many hours before we recopy. 560 | ''' 561 | level = self._user_params.get(LSC.LEVEL, None) 562 | debug = self._user_params.get(LSC.DEBUG, None) 563 | 564 | remote_file = os.path.split(log_file)[1] 565 | local_filepath = os.path.join(self._optional_params[LSC.TMP_PATH], 566 | '_'.join([level, remote_file])) 567 | 568 | mtime = 0 569 | if os.path.exists(local_filepath): 570 | mtime = os.path.getmtime(local_filepath) 571 | 572 | try: 573 | now = time.time() 574 | max_time_before_recopy = now - self._optional_params[LSC.LOCAL_COPY_LIFETIME]*60*60 575 | 576 | if mtime < max_time_before_recopy: 577 | if os.path.exists(local_filepath): 578 | os.remove(local_filepath) 579 | if debug: 580 | LOGGER.debug('Copying file from %s:%s to %s temporarily', 581 | self._get_box_from_level(level), 582 | log_file, 583 | local_filepath) 584 | self._copy_remote_file(log_file, local_filepath, 585 | self._get_box_from_level(level)) 586 | if debug: 587 | LOGGER.debug('Done copying file') 588 | except IOError as err: 589 | LOGGER.error('Couldn\'t copy %s from %s. 
Error: %s', log_file, 590 | self._get_box_from_level(level), str(err)) 591 | return '' 592 | 593 | return local_filepath 594 | 595 | def _make_file_path(self): 596 | '''Creates and returns the path where files should be globbed for 597 | for a given date and production level''' 598 | log_date = self._user_params.get(LSC.DATE, None) 599 | level = self._user_params.get(LSC.LEVEL, None) 600 | if log_date is None and level is None: 601 | return os.path.join(self._default_path, 602 | self._make_file_name(self._default_filename)) 603 | 604 | if not self._are_logs_archived(log_date): 605 | return os.path.join(self._default_path, 606 | self._make_file_name(self._default_filename, 607 | log_date, 608 | self._get_box_from_level(level))) 609 | 610 | return os.path.join(self._get_archived_file_path(), 611 | self._make_file_name(self._default_filename, 612 | log_date, self._get_box_from_level(level)) 613 | + '*') 614 | 615 | def _make_file_name(self, base_name, log_date=None, box=None): 616 | ''' 617 | Basic implementation: --. 618 | Override if necessary 619 | Returns the filename that would be appropriate for your logs, 620 | based on the given base_name, date and level. 621 | ''' 622 | parts = [base_name] 623 | if box is not None: 624 | parts.append(box) 625 | if log_date is not None: 626 | parts.append(log_date) 627 | return '-'.join(parts) + self._default_ext 628 | 629 | def _multiprocess_files(self, func): 630 | ''' 631 | Creates a pool to run the given function func 632 | through several files at once. 633 | ''' 634 | 635 | # First copy any remote files as needed and create final file list 636 | if (self._user_params.get(LSC.LEVEL, None) 637 | and not self._are_logs_archived(self._user_params.get(LSC.DATE, None))): 638 | if (self._optional_params.get(LSC.FORCE_COPY, False) 639 | or socket.gethostname() != \ 640 | self._get_box_from_level(self._user_params.get(LSC.LEVEL, None))): 641 | pool = Pool(processes=self._optional_params[LSC.PROCESSOR_COUNT]) 642 | # Why is there a crazy timeout value at the end of this call? 643 | # Because python has a bug in it that's been open for years and has not been fixed 644 | # outside of v3.3 and above, wherein a KeyboardInterruption is never delivered 645 | # when a thread is waiting for a condition, which leads to a hang 646 | # if a user hits ^C. 647 | # However, if you set a timeout on the call, Condition.wait() will receive 648 | # the interrupt immediately. 649 | # See: http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool 650 | file_list = pool.map_async(self._get_log_file, self._file_list).get(TIMEOUT) 651 | self._file_list = sorted(filter(lambda x: x != '', file_list)) 652 | pool = None 653 | 654 | LOGGER.debug('Final file list: %s', self._file_list) 655 | 656 | if self._file_list == []: 657 | LOGGER.error('No files found to process.') 658 | return None 659 | 660 | pool = Pool(processes=self._optional_params[LSC.PROCESSOR_COUNT]) 661 | results = pool.map_async(func, self._file_list).get(TIMEOUT) 662 | return results 663 | 664 | @classmethod 665 | def _open_ssh_connection(cls, server): 666 | '''Creates and returns an SSH connection to the appropriate box''' 667 | ssh = paramiko.SSHClient() 668 | ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) 669 | try: 670 | ssh.connect(hostname=server, timeout=300) 671 | return ssh 672 | except socket.gaierror as err: 673 | LOGGER.error('Could not create SSH connection on server %s. 
Error: %s', 674 | server, str(err)) 675 | except paramiko.BadHostKeyException as err: 676 | LOGGER.error('Could not verify hostkey for server: %s. Error: %s', server, str(err)) 677 | except paramiko.AuthenticationException as err: 678 | LOGGER.error('Could not authenticate on server: %s. Error: %s', server, str(err)) 679 | except paramiko.SSHException as err: 680 | LOGGER.error('Could not create SSH connection on server %s. Error: %s', 681 | server, str(err)) 682 | 683 | return None 684 | 685 | @classmethod 686 | def _pretty_print(cls, result, options, out=sys.stdout): 687 | ''' 688 | Pretty prints the stats 689 | ''' 690 | 691 | out.write(cls.COLORS["BLUE"]) 692 | if options.get(LSC.DEBUG): 693 | for regex_name, hits in result.items(): 694 | regex_name = regex_name.capitalize() 695 | for group, group_hits in hits[LSC.GROUP_HITS].items(): 696 | if group == LSC.TOTAL_HITS: 697 | continue 698 | out.write('\n{} hits per {}:\n'.format(regex_name, group.capitalize())) 699 | cls._pretty_print_dict(group_hits) 700 | out.write('\n{} max, min and average:\n'.format(regex_name)) 701 | cls._print_max_min_avg(group, cls._calc_stats(group_hits)) 702 | 703 | out.write('\nTotal {} hits: {:,}\n'.format(regex_name, hits[LSC.TOTAL_HITS])) 704 | 705 | out.write(cls.COLORS['ENDC']) 706 | 707 | @classmethod 708 | def _pretty_print_dict(cls, results): 709 | ''' 710 | Pretty self-explanatory. 711 | ''' 712 | if results is None: 713 | return 714 | for key, val in results.iteritems(): 715 | print "{} : {}\n".format(key, val) 716 | 717 | @classmethod 718 | def _print_max_min_avg(cls, group, stats): 719 | '''Prints the min, max and average stats''' 720 | print '\nAggregator: {}\n'.format(group) 721 | print '\nMax requests processed : {:,}, stat value: {}\n'.format(stats[LSC.MAX_COUNT], 722 | stats[LSC.MAX_KEY]) 723 | print 'Min requests processed : {:,}, stat value: {}\n'.format(stats[LSC.MIN_COUNT], 724 | stats[LSC.MIN_KEY]) 725 | print 'Average requests processed : {:,}\n'.format(stats[LSC.AVG_COUNT]) 726 | 727 | def _print_regex_patterns(self): 728 | '''Prints all the regex patterns''' 729 | for regex in self._regexes: 730 | LOGGER.debug('Running regex: %s', regex.get_pattern()) 731 | 732 | def _process_file_for_matches(self, log_file): 733 | '''Extracts all regex matches in the given log file and returns. 734 | Override if you need to run several regexes or do any special 735 | processing on the files.''' 736 | 737 | regex_hits = {LSC.FILENAME : log_file, LSC.REGEXES : {}} 738 | with self._get_file_handle(log_file) as file_handle: 739 | for regex in self._regexes: 740 | regex_hits[LSC.REGEXES][regex.name] = {} 741 | regex_hits[LSC.REGEXES][regex.name][LSC.MATCHES] = [] 742 | 743 | for line in file_handle: 744 | for regex in self._regexes: 745 | matcher = regex.get_matcher() 746 | if matcher.match(line) != None: 747 | regex_hits[LSC.REGEXES][regex.name][LSC.MATCHES].append(line) 748 | 749 | return regex_hits 750 | 751 | 752 | def _process_file_for_aggregates(self, log_file): 753 | '''Extracts the data from the given log_file and returns. 
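The result maps LSC.FILENAME to the file's path and LSC.REGEXES to each regex's total hit count and per-group hit counts.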
754 | Override if you need to run several regexes or do any special 755 | processing on the files.''' 756 | 757 | regex_hits = {LSC.FILENAME : log_file, LSC.REGEXES : {}} 758 | for regex in self._regexes: 759 | regex_hits[LSC.REGEXES][regex.name] = {} 760 | regex_hits[LSC.REGEXES][regex.name][LSC.TOTAL_HITS] = 0 761 | regex_hits[LSC.REGEXES][regex.name][LSC.GROUP_HITS] = {} 762 | for group in regex.get_groups(): 763 | regex_hits[LSC.REGEXES][regex.name][LSC.GROUP_HITS][group] = {} 764 | 765 | for line in self._gen_lines(log_file): 766 | for regex in self._regexes: 767 | group_hits = regex_hits[LSC.REGEXES][regex.name][LSC.GROUP_HITS] 768 | regex_hits[LSC.REGEXES][regex.name][LSC.TOTAL_HITS] += \ 769 | self._run_regex_and_do_aggregation(line, 770 | regex.get_matcher(), 771 | group_hits) 772 | #Sort the group data 773 | for hits in regex_hits[LSC.REGEXES].values(): 774 | for group, group_hits in hits[LSC.GROUP_HITS].items(): 775 | hits[LSC.GROUP_HITS][group] = \ 776 | collections.OrderedDict(sorted(group_hits.iteritems())) 777 | 778 | return regex_hits 779 | 780 | @classmethod 781 | def _run_regex_and_do_aggregation(cls, line, matcher, aggregators): 782 | ''' 783 | Given the text and a regular expression, 784 | adds found values for each regex group to the aggregators dict, 785 | and returns 1. 786 | If no match is found, returns 0 787 | ''' 788 | try: 789 | match = matcher.match(line) 790 | if match is not None: 791 | for agg_key, agg_dict in aggregators.items(): 792 | cls._sum_group_matches(agg_dict, match, agg_key) 793 | return 1 794 | except AttributeError as err: 795 | LOGGER.error('Regex Exception %s: %s, Line: %s', type(err), err, line) 796 | return None 797 | return 0 798 | 799 | @classmethod 800 | def _sum_group_matches(cls, group_sums, match, regex_group): 801 | ''' 802 | Takes a regex match and a group value and populates the given dict 803 | with counts for each unique value for the regex group in the match. 804 | If the regex match fails, returns silently. 805 | ''' 806 | 807 | try: 808 | key = match.group(regex_group) 809 | if not key in group_sums: 810 | group_sums[key] = 1 811 | else: 812 | group_sums[key] += 1 813 | except IndexError: 814 | return 815 | 816 | def _validate_file_list(self): 817 | '''Makes sure that there are files to process''' 818 | 819 | if self._file_list == []: 820 | if self._user_params.get(LSC.FILENAME, None): 821 | raise InvalidArgumentException('File does not exist at {} ' 822 | 'Please provide a valid path to a ' 823 | 'log file.'.format(self._user_params[LSC.FILENAME])) 824 | else: 825 | raise InvalidArgumentException(('No files found at {} on {}. ' 826 | 'Please provide a valid path to a log file.' 827 | ).format(self._make_file_path(), 828 | 'the current box' 829 | if not 830 | self._user_params.get(LSC.LEVEL, None) 831 | else 832 | self._get_box_from_level(self._user_params[LSC.LEVEL]))) 833 | 834 | -------------------------------------------------------------------------------- /src/log_scraper/consts.py: -------------------------------------------------------------------------------- 1 | ''' 2 | All the consts used by the LogScraper library. 
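They are the dictionary keys LogScraper understands for default_filepath, optional_params and user_params, and the keys used in the result dicts it returns.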
3 | ''' 4 | 5 | # Where your scraper should look in for files if the user doesn't provide a path themselves 6 | DEFAULT_PATH = 'default_path' 7 | DEFAULT_FILENAME = 'default_filename' 8 | 9 | # How many days worth of data is kept in the default_filepath before being moved for archival 10 | DAYS_BEFORE_ARCHIVING = 'days_before_archiving' 11 | 12 | # This is needed because files are grabbed from remote boxes over paramiko, 13 | # and the way that's done is by getting a list of all files in the default_path using listdir(), 14 | # and then running a regex over that list to get the files we care about. 15 | # All this because paramiko has no way to get a list of files with a wildcard in the filename 16 | FILENAME_REGEX = 'filename_regex' 17 | 18 | # If this key is set to true, the scraper will copy files if a value for 'level' is specified, 19 | # even if the box mapping for 'level' is the same as the host we're currently on. 20 | # Mostly, I'm adding this so that I can write unit-tests for copying 21 | FORCE_COPY = 'force_copy' 22 | 23 | # Mapping of what level corresponds to what boxname, so that users can just say things like 24 | # --sandbox or --production. 25 | 26 | LEVELS_TO_BOXES = 'levels_to_boxes' 27 | 28 | # Files copied over remotely are automatically refreshed if the timestamp on the local copy 29 | # is older than the value for LOCAL_COPY_LIFETIME, which is specified in hours 30 | # Defaults to 0, so that remote files are always refreshed 31 | LOCAL_COPY_LIFETIME = 'local_copy_lifetime' 32 | 33 | # Where to copy over any files grabbed over SSH 34 | TMP_PATH = 'tmp_path' 35 | 36 | # How many processors to use while doing multiprocessing on the files 37 | PROCESSOR_COUNT = 'processor_count' 38 | 39 | # Defaults 40 | OPTIONAL_PARAMS = {DAYS_BEFORE_ARCHIVING : 0, FILENAME_REGEX : '', 41 | LEVELS_TO_BOXES : {}, LOCAL_COPY_LIFETIME : 0, 42 | TMP_PATH : '', PROCESSOR_COUNT : 4, 43 | FORCE_COPY : False} 44 | 45 | # Misc useful params you could query the user for 46 | DATE = 'date' 47 | 48 | # Runs logger in debug mode 49 | DEBUG = 'debug' 50 | 51 | # Override any default filelist in favor of whatever the user gives 52 | FILENAME = 'filename' 53 | 54 | FILE_HITS = 'file_hits' 55 | 56 | # What production level box to look on 57 | LEVEL = 'level' 58 | 59 | # The scraper only prints stats to console if this key is set to True 60 | PRINT_STATS = 'print_stats' 61 | 62 | # The keys used in the dicts that store the extracted data 63 | REGEXES = 'regexes' 64 | MATCHES = 'matches' 65 | GROUP_HITS = 'group_hits' 66 | TOTAL_HITS = 'total_hits' 67 | 68 | # Stats dict 69 | MAX_KEY = 'max_key' 70 | MIN_KEY = 'min_key' 71 | MAX_COUNT = 'max_count' 72 | MIN_COUNT = 'min_count' 73 | AVG_COUNT = 'avg_count' 74 | -------------------------------------------------------------------------------- /tests/test_log_scraper.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Unit-tests for the Log Scraper library 3 | ''' 4 | 5 | from collections import OrderedDict 6 | from datetime import datetime, timedelta 7 | from StringIO import StringIO 8 | import gzip 9 | import os 10 | import shutil 11 | import socket 12 | import sys 13 | import unittest 14 | 15 | BASE_PATH = os.path.normpath(os.path.join(os.path.dirname(__file__), os.pardir)) 16 | sys.path.append(BASE_PATH) 17 | 18 | from src.log_scraper.base import LogScraper, RegexObject 19 | from src.log_scraper.base import BadRegexException, MissingArgumentException, InvalidArgumentException 20 | import src.log_scraper.consts 
as LSC 21 | 22 | #DIRS 23 | ARCHIVE_DIR = 'archived' 24 | LOG_DIR = './logs' 25 | LOG_FILE = 'log*.log' 26 | LOG_FILE_REGEX = r'log\d+' 27 | 28 | #Log files 29 | LOG_FILE_1 = ("log1.log", 30 | '''My name is Judge. 31 | My name is Franklin. 32 | Judge my name? 33 | My name is Judge. 34 | ''' 35 | ) 36 | LOG_FILE_2 = ("log2.log", 37 | '''The weather is sunny. 38 | The time is noon. 39 | My name is Judge. 40 | What's my name? 41 | My name is Franklin. 42 | The weather is rainy. 43 | The weather is icy. 44 | ''' 45 | ) 46 | # Remote coyping needs full path 47 | REMOTE_LOG_FILE_PATH = os.path.join(BASE_PATH, 'logs', LOG_FILE_1[0]) 48 | REMOTE_FILE_1 = 'log1-this_box.log' 49 | REMOTE_FILE_2 = 'log2-this_box.log' 50 | TMP_REMOTE_DIR = './tmp_remote' 51 | 52 | def _clean_dir(xdir=LOG_DIR): 53 | '''Set up directory structure. Will delete any existing directories and files.''' 54 | if os.path.exists(xdir): 55 | shutil.rmtree(xdir) 56 | os.makedirs(xdir) 57 | os.mkdir(os.path.join(xdir, ARCHIVE_DIR)) 58 | 59 | def _write_file(filename, contents, inc_dir=LOG_DIR): 60 | ''' 61 | Takes a filename, contents and path and writes out file 62 | ''' 63 | with open(os.path.join(inc_dir, filename), 'w') as mfile: 64 | mfile.write(contents) 65 | 66 | def _write_file_from_pair(filename, inc_dir=LOG_DIR): 67 | ''' 68 | Takes a tuple that has as its first element a filename, and the second as the contents, 69 | and writes out the file to the path provided in the inc_dir param. 70 | ''' 71 | with open(os.path.join(inc_dir, filename[0]), 'w') as mfile: 72 | mfile.write(filename[1]) 73 | 74 | class LogScraperWithOptions(LogScraper): 75 | '''A sample implementation of the log scraper library that sets some of the optional params''' 76 | 77 | def __init__(self, user_params): 78 | default_filepath = {} 79 | optional_params = {} 80 | 81 | default_filepath[LSC.DEFAULT_PATH] = LOG_DIR 82 | default_filepath[LSC.DEFAULT_FILENAME] = LOG_FILE 83 | optional_params[LSC.DAYS_BEFORE_ARCHIVING] = 1 84 | optional_params[LSC.LEVELS_TO_BOXES] = {'this_box' : socket.gethostname()} 85 | 86 | super(LogScraperWithOptions, self).__init__(default_filepath=default_filepath, 87 | optional_params=optional_params, 88 | user_params=user_params) 89 | 90 | def _init_regexes(self): 91 | '''Sample regexes''' 92 | no_group_regex = r'My name is Judge\.$' 93 | group_regex = r'My name is (?P\w+)\.$' 94 | self._regexes.append(RegexObject(name='no_group', pattern=no_group_regex)) 95 | self._regexes.append(RegexObject(name='group', pattern=group_regex)) 96 | 97 | def _get_archived_file_path(self): 98 | '''Where logs are archived''' 99 | return os.path.join(LOG_DIR, ARCHIVE_DIR) 100 | 101 | class TestLogScraper(unittest.TestCase): 102 | '''Creates a simple log scraper and tests out all the functionality''' 103 | 104 | def setUp(self): 105 | '''Create the log scraper to use, write out the test log files''' 106 | #Write out the sample log files 107 | _clean_dir() 108 | _write_file_from_pair(LOG_FILE_1) 109 | _write_file_from_pair(LOG_FILE_2) 110 | _write_file_from_pair((LOG_FILE_1[0].split('.')[0] + '-20150301.log', LOG_FILE_1[1]), 111 | os.path.join(LOG_DIR, ARCHIVE_DIR)) 112 | _write_file_from_pair((LOG_FILE_2[0].split('.')[0] + '-20150301.log', LOG_FILE_2[1]), 113 | os.path.join(LOG_DIR, ARCHIVE_DIR)) 114 | 115 | def test_setting_user_params(self): 116 | '''Tests to make sure that the user params dict is set correctly''' 117 | _log_scraper = LogScraper() 118 | self.assertEquals(_log_scraper.get_user_params(), {}) 119 | 120 | user_params = {'TEST' : 'TEST1'} 
121 | 122 | _log_scraper.set_user_params(user_params) 123 | self.assertEquals(_log_scraper.get_user_params(), user_params) 124 | 125 | def test_base_scraper(self): 126 | '''Test the no-nonsense simple scraper''' 127 | 128 | _log_scraper = LogScraper() 129 | expected = ("LogScraper(default_filename=, default_filepath=, " 130 | "optional_params={'levels_to_boxes': {}, 'filename_regex': '', " 131 | "'processor_count': 4, 'local_copy_lifetime': 0, 'tmp_path': '', " 132 | "'force_copy': False, 'days_before_archiving': 0}, user_params={}") 133 | self.assertEquals(repr(_log_scraper), expected) 134 | 135 | expected = ("Regexes: []\n" 136 | "Default filename: \n" 137 | "Default filepath: \n" 138 | "Optional params: {'levels_to_boxes': {}, 'filename_regex': '', " 139 | "'processor_count': 4, 'local_copy_lifetime': 0, 'tmp_path': '', " 140 | "'force_copy': False, 'days_before_archiving': 0}\n" 141 | "User params: {}") 142 | self.assertEquals(str(_log_scraper), expected) 143 | 144 | #Set file list 145 | user_params = {} 146 | user_params[LSC.DEBUG] = True 147 | user_params[LSC.FILENAME] = os.path.join(LOG_DIR, LOG_FILE) 148 | _log_scraper.set_user_params(user_params) 149 | 150 | #Add some regexes 151 | no_group_regex = r'My name is Judge\.$' 152 | group_regex = r'The (?P\w+) is (?P\w+)\.$' 153 | _log_scraper.add_regex(name='name_is_judge', pattern=no_group_regex) 154 | _log_scraper.add_regex(name='key_value_regex', pattern=group_regex) 155 | 156 | # Validate user params (should do nothing) 157 | _log_scraper._validate_user_params() 158 | 159 | # Should give back nothing 160 | self.assertEquals(_log_scraper._get_archived_file_path(), None) 161 | 162 | #Finally, get some data 163 | results = _log_scraper.get_log_data() 164 | 165 | expected = {'regexes' : {'key_value_regex': {'group_hits': {'key': OrderedDict([('time', 1), 166 | ('weather', 3)]), 167 | 'value': OrderedDict([('icy', 1), 168 | ('noon', 1), 169 | ('rainy', 1), 170 | ('sunny', 1)])}, 171 | 'total_hits': 4}, 172 | 'name_is_judge': {'group_hits': {}, 'total_hits': 3}}, 173 | 'file_hits': [{'regexes': {'key_value_regex': {'group_hits': {'value': OrderedDict(), 174 | 'key': OrderedDict()}, 175 | 'total_hits': 0}, 176 | 'name_is_judge': {'group_hits': {}, 'total_hits': 2}}, 177 | 'filename': './logs/log1.log'}, 178 | {'regexes': {'key_value_regex': {'group_hits': {'value': OrderedDict([('icy', 1), 179 | ('noon', 1), 180 | ('rainy', 1), 181 | ('sunny', 1)]), 182 | 'key': OrderedDict([('time', 1), 183 | ('weather', 3)])}, 184 | 'total_hits': 4}, 185 | 'name_is_judge': {'group_hits': {}, 186 | 'total_hits': 1}}, 187 | 'filename': './logs/log2.log'}]} 188 | 189 | self.maxDiff = None 190 | self.assertDictEqual(results, expected) 191 | 192 | # Test the min/max/avg 193 | # Test with no data 194 | stats = _log_scraper._calc_stats([]) 195 | expected = {'max_key': 0, 196 | 'max_count': 0, 197 | 'avg_count': 0, 198 | 'min_count': 0, 199 | 'min_key': 0} 200 | self.assertDictEqual(stats, expected) 201 | 202 | stats = _log_scraper._calc_stats(results['regexes']['key_value_regex'][LSC.GROUP_HITS]['key']) 203 | expected = {'max_key': 'weather', 204 | 'max_count': 3, 205 | 'avg_count': 2.0, 206 | 'min_count': 1, 207 | 'min_key': 'time'} 208 | self.assertDictEqual(stats, expected) 209 | 210 | # Test viewing regex hits 211 | matches = _log_scraper.get_regex_matches() 212 | self.assertEquals(len(matches), 2) 213 | self.assertEquals(matches[0][LSC.FILENAME], os.path.join(LOG_DIR, LOG_FILE_1[0])) 214 | 
self.assertEquals(len(matches[0][LSC.REGEXES]['key_value_regex'][LSC.MATCHES]), 0) 215 | self.assertEquals(len(matches[0][LSC.REGEXES]['name_is_judge'][LSC.MATCHES]), 2) 216 | self.assertEquals(matches[1][LSC.FILENAME], os.path.join(LOG_DIR, LOG_FILE_2[0])) 217 | self.assertEquals(len(matches[1][LSC.REGEXES]['key_value_regex'][LSC.MATCHES]), 4) 218 | self.assertEquals(len(matches[1][LSC.REGEXES]['name_is_judge'][LSC.MATCHES]), 1) 219 | 220 | def test_no_match_regex(self): 221 | '''How regexes that don't match anything are handled''' 222 | 223 | _log_scraper = LogScraper() 224 | 225 | #Set file list 226 | user_params = {} 227 | user_params[LSC.FILENAME] = os.path.join(LOG_DIR, LOG_FILE) 228 | _log_scraper.set_user_params(user_params) 229 | 230 | #Add some regexes 231 | no_group_regex = r'This should not match\.$' 232 | _log_scraper.add_regex(name='no_match', pattern=no_group_regex) 233 | 234 | results = _log_scraper.get_log_data() 235 | self.assertEquals(results[LSC.REGEXES]['no_match'][LSC.TOTAL_HITS], 0) 236 | 237 | def test_archived_scraping(self): 238 | '''Test the scraper that fetches archived files''' 239 | 240 | user_params = {} 241 | user_params[LSC.DATE] = '20150301' 242 | user_params[LSC.DEBUG] = True 243 | _option_scraper = LogScraperWithOptions(user_params=user_params) 244 | 245 | results = _option_scraper.get_log_data() 246 | expected = {'regexes' : {'no_group': {'group_hits': {}, 'total_hits': 3}, 247 | 'group': {'group_hits': {'name': OrderedDict([('Franklin', 2), ('Judge', 3)])}, 248 | 'total_hits': 5}}, 249 | 'file_hits': [{'regexes': {'no_group': {'group_hits': {}, 250 | 'total_hits': 2}, 251 | 'group': {'group_hits': {'name': OrderedDict([('Franklin', 1), 252 | ('Judge', 2)])}, 253 | 'total_hits': 3}}, 254 | 'filename': './logs/archived/log1-20150301.log'}, 255 | {'regexes': {'no_group': {'group_hits': {}, 256 | 'total_hits': 1}, 257 | 'group': {'group_hits': {'name': OrderedDict([('Franklin', 1), 258 | ('Judge', 1)])}, 259 | 'total_hits': 2}}, 260 | 'filename': './logs/archived/log2-20150301.log'}]} 261 | self.assertDictEqual(results, expected) 262 | 263 | test_date = datetime.today().strftime('%Y%m%d') 264 | # Yes, yes, it's testing a private method directly. 
Let's move on 265 | self.assertFalse(_option_scraper._are_logs_archived(test_date)) 266 | 267 | def test_regexes(self): 268 | '''Test the logic for adding and removing regexes from the scraper''' 269 | _log_scraper = LogScraper() 270 | 271 | pattern = 'Very Specific Regex' 272 | regex_obj = RegexObject(name='test_regex', pattern=pattern) 273 | self.assertEqual(regex_obj.get_pattern(), pattern) 274 | matcher = regex_obj.get_matcher() 275 | self.assertEqual(matcher.match(pattern).group(), pattern) 276 | 277 | new_pattern = 'New (?P(Pattern))' 278 | regex_obj.update_pattern(new_pattern) 279 | self.assertEqual(regex_obj.get_pattern(), new_pattern) 280 | matcher = regex_obj.get_matcher() 281 | self.assertEqual(regex_obj.__repr__(), 282 | 'RegexObject(name=test_regex, pattern=New (?P(Pattern)))') 283 | 284 | self.assertEqual(regex_obj.__str__(), 285 | "Pattern: New (?P(Pattern)), Groups: ['group']") 286 | 287 | self.assertEqual(regex_obj.get_groups(), ['group']) 288 | 289 | 290 | _log_scraper.add_regex(name='test_regex', pattern='.*') 291 | 292 | self.assertEqual(1, len(_log_scraper.get_regexes())) 293 | 294 | _log_scraper.add_regex(name='test_regex_2', pattern='^.*$') 295 | 296 | self.assertEqual(2, len(_log_scraper.get_regexes())) 297 | 298 | # Clear regexes and test size 299 | _log_scraper.clear_regexes() 300 | 301 | self.assertEqual(0, len(_log_scraper.get_regexes())) 302 | 303 | # Give a bad pattern 304 | with self.assertRaises(BadRegexException): 305 | _log_scraper.add_regex(name='bad_regex', pattern='?P