├── .gitignore ├── LICENSE ├── README.md ├── README.txt ├── run_tests.py ├── setup.cfg ├── setup.py ├── src ├── __init__.py └── log_scraper │ ├── __init__.py │ ├── base.py │ └── consts.py └── tests └── test_log_scraper.py /.gitignore: -------------------------------------------------------------------------------- 1 | # ignore compiled python 2 | *.pyc 3 | 4 | # ignore eggs 5 | *.egg-info 6 | 7 | # ignore vim tmp files 8 | .*.swp 9 | *~ 10 | .*~ 11 | 12 | # ignore output of build / tests 13 | .coverage 14 | build/* 15 | dist/* 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, RohitK89 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LogScraper 2 | A generic library for gathering stats from log files by running regexes on them. 3 | Things you can do: 4 | * Create and run any number of regexes on any number of files in parallel. 5 | * Aggregate stats by creating named regex groups in your regexes 6 | * Grab archived logs (so long as you tell it where your archives live) 7 | * Grab files from remote boxes 8 | * Print stats to console 9 | * Print regex matches to console 10 | * Search on gzipped files 11 | 12 | ## Installation 13 | The easiest manner of installation is to grab the package from the PyPI repository. 14 | 15 | ``` 16 | pip install log_scraper 17 | ``` 18 | 19 | ## Usage 20 | #### Base Usage 21 | For off the cuff usage, you can just create a LogScraper object and tell it what regexes to run 22 | and where to look for files. Eg. 
23 | 24 | ```python 25 | from log_scraper.base import LogScraper 26 | import log_scraper.consts as LSC 27 | 28 | filepath = '/path/to/file' 29 | filename = 'filename.ext' 30 | scraper = LogScraper(default_filepath={LSC.DEFAULT_PATH : filepath, LSC.DEFAULT_FILENAME : filename}) 31 | scraper.add_regex(name='regex1', pattern=r'your_regex_here') 32 | 33 | # To get aggregated stats 34 | data = scraper.get_log_data() 35 | 36 | # To print all the stats 37 | scraper.print_total_stats(data) 38 | 39 | # To print each file's individual stats 40 | scraper.print_stats_per_file(data) 41 | 42 | # To view log lines matching the regex 43 | scraper.view_regex_matches(scraper.get_regex_matches()) 44 | ``` 45 | 46 | The real power, though, is in creating your own class deriving from LogScraper that presets 47 | the paths and the regexes to run so that anyone can then use that anywhere to mine data from 48 | a process' logs. 49 | 50 | 51 | ## Development 52 | ### Dependencies 53 | * Python 2.7 54 | * [paramiko](http://paramiko-www.readthedocs.org/en/latest/index.html) 55 | 56 | ### Testing 57 | To test successfully, you must set up a virtual environment 58 | On Unix, in the root folder for the package, do the following: 59 | ``` 60 | python -m virtualenv . 61 | source ./bin/activate 62 | ./bin/python setup.py develop 63 | ``` 64 | 65 | Now you can make any changes you want and then run the unit-tests by doing: 66 | 67 | ``` 68 | ./bin/python setup.py test 69 | ``` 70 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | LogScraper 2 | ========== 3 | 4 | A generic library for gathering stats from log files by running regexes 5 | on them. Things you can do: \* Create and run any number of regexes on 6 | any number of files in parallel. \* Aggregate stats by creating named 7 | regex groups in your regexes \* Grab archived logs (so long as you tell 8 | it where your archives live) \* Grab files from remote boxes \* Print 9 | stats to console \* Print regex matches to console \* Search on gzipped 10 | files 11 | 12 | Installation 13 | ------------ 14 | 15 | The easiest manner of installation is to grab the package from the PyPI 16 | repository. 17 | 18 | :: 19 | 20 | pip install log_scraper 21 | 22 | Usage 23 | ----- 24 | 25 | Base Usage 26 | ^^^^^^^^^^ 27 | 28 | For off the cuff usage, you can just create a LogScraper object and tell 29 | it what regexes to run and where to look for files. Eg. 30 | 31 | :: 32 | 33 | from log_scraper.base import LogScraper 34 | import log_scraper.consts as LSC 35 | 36 | filepath = '/path/to/file' 37 | filename = 'filename.ext' 38 | scraper = LogScraper(default_filepath={LSC.DEFAULT_PATH : filepath, LSC.DEFAULT_FILENAME : filename}) 39 | scraper.add_regex(name='regex1', pattern=r'your_regex_here') 40 | 41 | # To get aggregated stats 42 | data = scraper.get_log_data() 43 | 44 | # To print all the stats 45 | scraper.print_total_stats(data) 46 | 47 | # To print each file's individual stats 48 | scraper.print_stats_per_file(data) 49 | 50 | # To view log lines matching the regex 51 | scraper.view_regex_matches(scraper.get_regex_matches()) 52 | 53 | The real power, though, is in creating your own class deriving from 54 | LogScraper that presets the paths and the regexes to run so that anyone 55 | can then use that anywhere to mine data from a process' logs. 
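A minimal sketch of such a subclass (the class name, log path and regex
below are illustrative, not part of the library):

::

    from log_scraper.base import LogScraper
    import log_scraper.consts as LSC

    class MyAppScraper(LogScraper):
        '''Scraper with the paths and regexes for my_app preset.'''

        def __init__(self, user_params=None):
            default_filepath = {LSC.DEFAULT_PATH: '/var/log/my_app',
                                LSC.DEFAULT_FILENAME: 'my_app*.log'}
            super(MyAppScraper, self).__init__(default_filepath=default_filepath,
                                               user_params=user_params)

        def _init_regexes(self):
            '''Regexes every invocation of this scraper should run.'''
            self.add_regex(name='errors', pattern=r'ERROR (?P<error_type>\w+)')

    data = MyAppScraper().get_log_data()

Callers then get the same get_log_data / print_total_stats interface without
needing to know where the logs live. Other hooks such as
_get_archived_file_path and _validate_user_params can be overridden in the
same way.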
56 | 57 | Development 58 | ----------- 59 | 60 | Dependencies 61 | ~~~~~~~~~~~~ 62 | 63 | - Python 2.7 64 | - `paramiko `_ 65 | 66 | Testing 67 | ~~~~~~~ 68 | 69 | To test successfully, you must set up a virtual environment On Unix, in 70 | the root folder for the package, do the following: 71 | ``python -m virtualenv . source ./bin/activate ./bin/python setup.py develop`` 72 | 73 | Now you can make any changes you want and then run the unit-tests by 74 | doing: 75 | 76 | :: 77 | 78 | ./bin/python setup.py test 79 | 80 | -------------------------------------------------------------------------------- /run_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | 6 | import nose 7 | 8 | def main(argv=None): 9 | if argv is None: 10 | argv = ['nosetests', '--cover-erase', '--with-coverage', 11 | '--cover-package=fds.log_scraper'] 12 | 13 | nose.run_exit(argv=argv, 14 | defaultTest=os.path.join(os.path.dirname(__file__), 'tests')) 15 | 16 | if __name__ == '__main__': 17 | main(sys.argv) 18 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.txt 3 | 4 | [bdist_rpm] 5 | doc_files = README.txt 6 | LICENSE 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open('README.txt') as file: 4 | long_description = file.read() 5 | 6 | setup( 7 | name ='log_scraper', 8 | version ='0.9.9', 9 | install_requires =['paramiko'], 10 | package_dir ={'': 'src'}, 11 | packages =find_packages('src'), 12 | namespace_packages =['log_scraper'], 13 | 14 | # metadata for upload to PyPI 15 | author ='Rohit Kapur', 16 | author_email ='rohitkapur@rohitkapur.com', 17 | maintainer ='Rohit Kapur', 18 | maintainer_email ='rohitkapur@rohitkapur.com', 19 | description =( 20 | 'A base library for writing your own log scraper, ' 21 | 'i.e. something that can run regexes over files ' 22 | 'and give you meaningful information like stats. ' 23 | 'Add your own regexes and plug and play. ' 24 | 'See the readme for more information.' 
25 | ), 26 | long_description = long_description, 27 | license ='Simplified BSD License', 28 | platforms =['UNIX', 'OS X', 'Windows'], 29 | url ='https://github.com/RohitK89/LogScraper/', 30 | download_url ='https://github.com/RohitK89/LogScraper/tarball/0.9.6', 31 | keywords =['log scraper','logs','regex','stats','grep'], 32 | test_suite ='run_tests.main', 33 | tests_require =['coverage', 'nose'] 34 | ) 35 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | __import__('pkg_resources').declare_namespace(__name__) 2 | -------------------------------------------------------------------------------- /src/log_scraper/__init__.py: -------------------------------------------------------------------------------- 1 | __import__('pkg_resources').declare_namespace(__name__) 2 | -------------------------------------------------------------------------------- /src/log_scraper/base.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Log Scraper 3 | 4 | Dependencies: 5 | * Python 2.7 6 | * paramiko for SSHing to remote hosts 7 | 8 | The LogScraper class provides a plug and play experience to mine data from logs. 9 | So long as you give it a regex and a file to run on, you'll be good to go. 10 | You can also build your own scraper on top of the LogScraper class. 11 | For example, if you want to create a scraper that always runs the same regexes 12 | on the same files (unless the user requests otherwise), you can override 13 | the _init_regexes method in your own derived class. 14 | 15 | There are a host of extra parameters you can set when creating your scraper. 16 | For a full list with explanations, please see the log_scraper_consts module. 17 | 18 | For stats aggregation, just create named groups in your regexes. 19 | The scraper will then aggregate all hits for each value of each named group. 20 | 21 | If run on several files, the returned dataset will be the aggregated data 22 | from all the files for each regex, along with each individual file's data. 23 | 24 | For usage examples, please see the unit-tests. 25 | ''' 26 | 27 | from Crypto.pct_warnings import CryptoRuntimeWarning 28 | from datetime import date 29 | from glob import glob 30 | from multiprocessing import Pool 31 | from operator import itemgetter 32 | import collections 33 | import contextlib 34 | import copy_reg 35 | import gzip 36 | import logging 37 | import os 38 | import re 39 | import socket 40 | import sys 41 | import threading 42 | import time 43 | import types 44 | import warnings 45 | import log_scraper.consts as LSC 46 | 47 | # C'est la vie... 48 | warnings.filterwarnings('ignore', category=CryptoRuntimeWarning) 49 | import paramiko 50 | 51 | LOGGER = logging.getLogger('log_scraper') 52 | _LOGGING_SETUP_LOCK = threading.Lock() 53 | 54 | TIMEOUT = 99999999 55 | 56 | class LogScraperException(Exception): 57 | '''Base LogScraper Exception class''' 58 | pass 59 | 60 | class BadRegexException(Exception): 61 | '''Use for anything to do with bad regexes''' 62 | pass 63 | 64 | class MissingArgumentException(LogScraperException): 65 | '''Use to let caller know they didn't provide a required argument''' 66 | pass 67 | 68 | class InvalidArgumentException(LogScraperException): 69 | '''Use to let the caller know of any bad arguments''' 70 | pass 71 | 72 | class RegexObject(object): 73 | ''' 74 | A wrapper around a regex pattern. 
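The pattern is compiled as soon as the object is constructed, so a bad pattern is reported immediately as a BadRegexException.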
75 | Also provides an easy way to get all the named groups from the regex, 76 | which you can then use for aggregation or what have you 77 | ''' 78 | def __init__(self, name=None, pattern=None): 79 | ''' 80 | Initialize the object. 81 | Throws BadRegexException if the user gives a bad pattern. 82 | ''' 83 | self.name = name 84 | self._pattern = pattern 85 | self._matcher = None 86 | self._create_matcher() 87 | 88 | def __repr__(self): 89 | return 'RegexObject(name={}, pattern={})'.format(self.name, self._pattern) 90 | 91 | def __str__(self): 92 | '''Pretty print info about self''' 93 | return 'Pattern: {}, Groups: {}'.format(self._pattern, self._matcher.groupindex.keys()) 94 | 95 | def _create_matcher(self): 96 | ''' 97 | Compile the regex pattern and update the matcher member variable. 98 | Throws BadRegexException if the user gives a bad pattern. 99 | ''' 100 | try: 101 | self._matcher = re.compile(self._pattern) 102 | except Exception: 103 | raise BadRegexException('Invalid pattern: {}. ' 104 | 'Could not create matcher'.format(self._pattern)) 105 | 106 | def get_matcher(self): 107 | '''Returns the matcher object''' 108 | return self._matcher 109 | 110 | def get_pattern(self): 111 | '''Returns the pattern''' 112 | return self._pattern 113 | 114 | def update_pattern(self, pattern): 115 | ''' 116 | Reset the regex pattern, the compiled matcher and the group dicts. 117 | Throws BadRegexException if the user gives a bad pattern. 118 | ''' 119 | self._pattern = pattern 120 | self._create_matcher() 121 | 122 | def get_groups(self): 123 | '''Returns a list of all named groups found in the regex''' 124 | return self._matcher.groupindex.keys() 125 | 126 | 127 | def _pickle_method(method): 128 | ''' 129 | Define pickling for a custom method and register it. 130 | This is needed because multiprocessing needs to be able to 131 | pickle/unpickle the various LogScraper methods while 132 | processing files in parallel. 133 | ''' 134 | if method.im_self is None: 135 | return getattr, (method.im_class, method.im_func.func_name) 136 | else: 137 | return getattr, (method.im_self, method.im_func.func_name) 138 | 139 | copy_reg.pickle(types.MethodType, _pickle_method) 140 | 141 | class LogScraper(object): 142 | ''' 143 | Base class for a log scraper. 144 | Takes care of everything for you, so long as you provide regexes to run. 145 | If your regexes have named groups in them, it will aggregate stats for 146 | each value found for each named group. 147 | You can set defaults for where the logfile(s) live, 148 | as well as where the archived files live. 149 | You can specify how many days worth of data is kept before archiving, 150 | so that it knows where to look for the files. 151 | Wildcards are acceptable in filepaths. 152 | If files are to be grabbed from various boxes, 153 | it will copy them over to your specified temporary space; 154 | this is faster than just keeping the file open over SSH. 155 | By default, it will refresh the temporary files if they are older than an hour. 156 | Data is returned as a python dict mapping the regexes run to the stats found. 157 | You can print to console by setting the PRINT_STATS key in the options dict, 158 | or just request the data as a dict to work with. 
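A minimal illustration (the path, filename and pattern here are hypothetical):

    scraper = LogScraper(default_filepath={LSC.DEFAULT_PATH: '/var/log/my_app',
                                           LSC.DEFAULT_FILENAME: 'my_app*.log'})
    scraper.add_regex(name='status', pattern=r'status=(?P<code>\d+)')
    data = scraper.get_log_data()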
159 | ''' 160 | 161 | COLORS = { 162 | 'HEADER' : '\033[95m', 163 | 'BLUE' : '\033[94m', 164 | 'GREEN' : '\033[92m', 165 | 'WARNING' : '\033[93m', 166 | 'RED' : '\033[91m', 167 | 'ENDC' : '\033[0m' 168 | } 169 | 170 | def __init__(self, default_filepath=None, optional_params=None, user_params=None): 171 | ''' 172 | default_filepath - Should be a dict containing key-value pairs for: 173 | {LSC.DEFAULT_PATH : , 174 | LSC.DEFAULT_FILENAME : } 175 | Wildcards are allowed in the filename. 176 | 177 | optional_params - Dict that specifies some optional stuff for your scraper. 178 | These should be values that are constant for your scraper 179 | and are not invocation dependent. 180 | For the common optional params, see LSC.OPTIONAL_PARAMS. 181 | 182 | user_params - Dict of all values that are invocation-dependent 183 | ''' 184 | if default_filepath is None: 185 | default_filepath = {} 186 | self._default_path = default_filepath.get(LSC.DEFAULT_PATH, '') 187 | default_filename = default_filepath.get(LSC.DEFAULT_FILENAME, '') 188 | 189 | self._default_filename, self._default_ext = os.path.splitext(default_filename) 190 | self._user_params = user_params if user_params else {} 191 | self._optional_params = {} 192 | 193 | self._init_logger() 194 | 195 | self._init_optional_params(optional_params) 196 | self._validate_user_params() 197 | 198 | self._regexes = [] 199 | self._init_regexes() 200 | 201 | self._file_list = [] 202 | 203 | def __repr__(self): 204 | return ('LogScraper(default_filename={}, default_filepath={}, ' 205 | 'optional_params={}, ' 206 | 'user_params={}'.format(self._default_filename, self._default_path, 207 | self._optional_params, self._user_params)) 208 | 209 | 210 | def __str__(self): 211 | return ('Regexes: {}\n' 212 | 'Default filename: {}\n' 213 | 'Default filepath: {}\n' 214 | 'Optional params: {}\n' 215 | 'User params: {}'.format(self._regexes, self._default_filename, 216 | self._default_path, self._optional_params, 217 | self._user_params)) 218 | 219 | # public: 220 | 221 | def add_regex(self, name, pattern): 222 | ''' 223 | Add a regex to the list of regexes to run. 224 | Throws BadRegexException if the user gives a bad pattern. 225 | ''' 226 | self._regexes.append(RegexObject(name=name, pattern=pattern)) 227 | 228 | def clear_regexes(self): 229 | '''Resets the list of regexes to run''' 230 | self._regexes = [] 231 | 232 | def get_log_data(self): 233 | ''' 234 | Main driver function for scraping logs. 
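Runs every regex registered via add_regex over every file in the final file list, processing the files in parallel.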
235 | Returns the data as a dict 236 | ''' 237 | 238 | #Make sure there's some files to run on 239 | self._file_list = self._get_file_list() 240 | try: 241 | self._validate_file_list() 242 | except InvalidArgumentException as err: 243 | LOGGER.error('InvalidArgumentException: %s', err) 244 | return None 245 | 246 | regex_hits = {} 247 | regex_hits[LSC.REGEXES] = {} 248 | 249 | for regex in self._regexes: 250 | regex_hits[LSC.REGEXES][regex.name] = {} 251 | regex_hits[LSC.REGEXES][regex.name][LSC.GROUP_HITS] = {} 252 | for group in regex.get_groups(): 253 | regex_hits[LSC.REGEXES][regex.name][LSC.GROUP_HITS][group] = \ 254 | collections.OrderedDict() 255 | 256 | if self._user_params.get(LSC.DEBUG): 257 | self._print_regex_patterns() 258 | 259 | results = self._multiprocess_files(self._process_file_for_aggregates) 260 | 261 | if results is None: 262 | return None 263 | 264 | for result in results: 265 | for regex_name, hits in result[LSC.REGEXES].items(): 266 | self._combine_hits(hits, regex_hits[LSC.REGEXES][regex_name]) 267 | 268 | #Sort the group data 269 | for hits in regex_hits[LSC.REGEXES].values(): 270 | if LSC.GROUP_HITS in hits: 271 | for group, group_hits in hits[LSC.GROUP_HITS].items(): 272 | hits[LSC.GROUP_HITS][group] = \ 273 | collections.OrderedDict(sorted(group_hits.iteritems())) 274 | 275 | if len(results) > 1: 276 | regex_hits[LSC.FILE_HITS] = results 277 | 278 | return regex_hits 279 | 280 | def get_regexes(self): 281 | '''Returns the list of regexes stored''' 282 | return self._regexes 283 | 284 | def get_regex_matches(self): 285 | ''' 286 | Returns a dict with all the regex matches found for each file 287 | ''' 288 | 289 | #Make sure there's some files to run on 290 | self._file_list = self._get_file_list() 291 | try: 292 | self._validate_file_list() 293 | except InvalidArgumentException as err: 294 | LOGGER.error('InvalidArgumentException: %s', err) 295 | return None 296 | 297 | if self._user_params.get(LSC.DEBUG, None): 298 | self._print_regex_patterns() 299 | matches = self._multiprocess_files(self._process_file_for_matches) 300 | return matches 301 | 302 | def get_user_params(self): 303 | '''Getter for user_params''' 304 | return self._user_params 305 | 306 | def print_stats_per_file(self, regex_hits, out=sys.stdout): 307 | '''Prints stats for each file separately''' 308 | if regex_hits is None: 309 | return 310 | for result in regex_hits[LSC.FILE_HITS]: 311 | out.write('File: {}\n'.format(result[LSC.FILENAME])) 312 | self._pretty_print(result[LSC.REGEXES], self._user_params, out) 313 | 314 | def print_total_stats(self, regex_hits, out=sys.stdout): 315 | '''Prints the total stats''' 316 | if regex_hits is None: 317 | return 318 | self._pretty_print(regex_hits[LSC.REGEXES], self._user_params, out) 319 | for regex_name, hits in regex_hits[LSC.REGEXES].items(): 320 | out.write(self.COLORS['GREEN']) 321 | out.write('Total hits for regex {}: {:,}\n'.format(regex_name.capitalize(), 322 | hits[LSC.TOTAL_HITS])) 323 | out.write(self.COLORS['ENDC']) 324 | 325 | 326 | def set_user_params(self, user_params): 327 | ''' 328 | Setter for the user params 329 | Throws: InvalidArgumentException 330 | ''' 331 | self._user_params = user_params 332 | self._validate_user_params() 333 | 334 | def view_regex_matches(self, out=sys.stdout): 335 | ''' 336 | Prints out all the lines that match the regexes in the file list properly 337 | ''' 338 | matches = self.get_regex_matches() 339 | for file_matches in matches: 340 | for regex_name, regex_data in file_matches[LSC.REGEXES].items(): 341 | 
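# regex_data[LSC.MATCHES] holds the raw log lines this regex matched in the current file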
out.write('Regex: {}\n'.format(regex_name)) 342 | out.write('Matches:\n') 343 | if regex_data[LSC.MATCHES] == []: 344 | out.write('{}-No matches\n'.format(file_matches[LSC.FILENAME])) 345 | else: 346 | for match in regex_data[LSC.MATCHES]: 347 | out.write('{}-{}'.format(file_matches[LSC.FILENAME], match)) 348 | 349 | # private: 350 | 351 | def _init_base_logger(self): 352 | '''Creates the base logger''' 353 | log_level = logging.INFO 354 | if self._user_params.get(LSC.DEBUG, None): 355 | log_level = logging.DEBUG 356 | 357 | LOGGER.setLevel(log_level) 358 | # create console handler 359 | handler = logging.StreamHandler(sys.stdout) 360 | handler.setLevel(log_level) 361 | # create formatter 362 | formatter = logging.Formatter('[%(asctime)s][%(levelname)s] %(message)s', 363 | datefmt='%Y%m%d %H:%M:%S') 364 | # add formatter to ch 365 | handler.setFormatter(formatter) 366 | # add ch to logger 367 | LOGGER.addHandler(handler) 368 | 369 | def _init_logger(self): 370 | '''Sets the format of the logging''' 371 | with _LOGGING_SETUP_LOCK: 372 | if not LOGGER.handlers: 373 | self._init_base_logger() 374 | LOGGER.propagate = False 375 | # Set paramiko logging to only show warnings and higher 376 | paramiko_logger = logging.getLogger("paramiko") 377 | if paramiko_logger.level == logging.NOTSET: 378 | paramiko_logger.setLevel(logging.WARNING) 379 | 380 | def _init_optional_params(self, opt): 381 | ''' 382 | Initializes all optional params with given values or defaults. 383 | ''' 384 | if opt is None: 385 | opt = {} 386 | for param, default in LSC.OPTIONAL_PARAMS.items(): 387 | self._optional_params[param] = opt.get(param, default) 388 | 389 | # Methods you should implement for your own scraper 390 | def _init_regexes(self): 391 | '''This is where you write the logic for what regexes to run''' 392 | pass 393 | 394 | 395 | def _get_archived_file_path(self): 396 | '''Should return where your archived files live''' 397 | pass 398 | 399 | def _validate_user_params(self): 400 | ''' 401 | Make sure that all user-given values make sense. 402 | Should throw InvalidArgumentException with a descriptive message otherwise. 403 | Call this in your derived class constructor.''' 404 | pass 405 | 406 | def _are_logs_archived(self, log_date): 407 | ''' 408 | Returns whether logs are on netapp or on local box. 409 | Always returns false if days_before_archiving is zero. 
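log_date is expected as a YYYYMMDD string (e.g. '20150301'); it is compared against today's date to decide whether the archive path should be searched.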
410 | ''' 411 | 412 | if self._optional_params[LSC.DAYS_BEFORE_ARCHIVING] == 0 or log_date is None: 413 | return False 414 | 415 | today = date.today() 416 | date_obj = date(int(log_date[:-4]), int(log_date[-4:-2]), int(log_date[-2:])) 417 | 418 | delta = today - date_obj 419 | if delta.days >= self._optional_params[LSC.DAYS_BEFORE_ARCHIVING]: 420 | return True 421 | 422 | return False 423 | 424 | @classmethod 425 | def _calc_stats(cls, items): 426 | '''Calculates the min, max and average items processed per key''' 427 | 428 | ret_dict = {LSC.MAX_KEY : 0, LSC.MIN_KEY : 0, LSC.MAX_COUNT : 0, 429 | LSC.MIN_COUNT : 0, LSC.AVG_COUNT : 0} 430 | 431 | if items is None or len(items) == 0: 432 | return ret_dict 433 | 434 | max_key, max_count = max(items.iteritems(), key=itemgetter(1)) 435 | min_key, min_count = min(items.iteritems(), key=itemgetter(1)) 436 | total = sum(items.itervalues()) 437 | count = len(items) 438 | 439 | avg_count = float(total)/count 440 | 441 | ret_dict[LSC.MAX_KEY] = max_key 442 | ret_dict[LSC.MIN_KEY] = min_key 443 | ret_dict[LSC.MAX_COUNT] = max_count 444 | ret_dict[LSC.MIN_COUNT] = min_count 445 | ret_dict[LSC.AVG_COUNT] = avg_count 446 | 447 | return ret_dict 448 | 449 | @classmethod 450 | def _combine_hits(cls, match_groups, combining_dict): 451 | ''' 452 | Aggregates all matches for each key found in match_groups 453 | into combining_dict 454 | ''' 455 | 456 | for group, hits in match_groups.items(): 457 | if isinstance(hits, collections.Mapping): 458 | cls._combine_hits(match_groups[group], combining_dict[group]) 459 | else: 460 | if combining_dict.get(group, None): 461 | combining_dict[group] += hits 462 | else: 463 | combining_dict[group] = hits 464 | 465 | @classmethod 466 | def _copy_remote_file(cls, filepath, local_file, box): 467 | '''Creates an SSH connection and copies filepath to local_file''' 468 | ssh = cls._open_ssh_connection(box) 469 | if ssh is None: 470 | return '' 471 | with contextlib.closing(ssh): 472 | with contextlib.closing(ssh.open_sftp()) as sftp: 473 | #Temporarily copy file to current box. 474 | #This is being done because reading the file over SSH 475 | #slows everything down insanely. 476 | sftp.get(filepath, local_file) 477 | 478 | def _gen_lines(self, filename): 479 | '''Generator that yields one line at a time from a file''' 480 | with self._get_file_handle(filename) as handle: 481 | for line in handle: 482 | yield line 483 | 484 | def _get_box_from_level(self, level): 485 | '''Returns the mapped box name for the given production level''' 486 | return self._optional_params[LSC.LEVELS_TO_BOXES].get(level, None) 487 | 488 | @classmethod 489 | def _get_file_handle(cls, log_file): 490 | ''' 491 | Returns a handle connected to the given file. 492 | Needed because it grabs over ssh if needed, 493 | and also checks to see if the given file is a gzip file, 494 | in which case, some fancy stuff is needed to open it properly. 495 | The first two characters of the header are inspected to see 496 | whether the file is a gzipped file or plaintext. 497 | ''' 498 | LOGGER.info('Opening file %s', log_file) 499 | 500 | handle = open(log_file, 'rb') 501 | if handle.read(2) == '\x1f\x8b': 502 | handle.seek(0) 503 | handle = gzip.GzipFile(fileobj=handle) 504 | else: 505 | handle.seek(0) 506 | return handle 507 | 508 | def _get_file_list(self): 509 | '''Checks the default filename or wildcard search and the prod level set, 510 | and returns a list of all files found on the relevant box at the 511 | given path. 
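Remote directory listings are filtered with the FILENAME_REGEX optional param, while local paths are expanded with glob.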
If no level value is given, looks on current box''' 512 | 513 | file_list = list() 514 | level = self._user_params.get(LSC.LEVEL, None) 515 | log_date = self._user_params.get(LSC.DATE, None) 516 | filename = self._user_params.get(LSC.FILENAME, None) 517 | 518 | if (level is not None 519 | and not self._are_logs_archived(log_date)): 520 | if (self._optional_params.get(LSC.FORCE_COPY, False) 521 | or socket.gethostname() != self._get_box_from_level(level)): 522 | ssh = self._open_ssh_connection(self._get_box_from_level(level)) 523 | if ssh is None: 524 | return file_list 525 | with contextlib.closing(ssh): 526 | with contextlib.closing(ssh.open_sftp()) as sftp: 527 | filename_regex = \ 528 | self._make_file_name(self._optional_params[LSC.FILENAME_REGEX], 529 | log_date, level) 530 | 531 | files = sftp.listdir(self._default_path) 532 | for name in files: 533 | match = re.match(filename_regex, str(name)) 534 | if match is not None: 535 | file_list.append(os.path.join(self._default_path, match.group())) 536 | sftp.close() 537 | ssh.close() 538 | return file_list 539 | 540 | #By default, let's look at the default_filepath 541 | if filename is None: 542 | filename = self._make_file_path() 543 | file_list = glob(filename) 544 | 545 | else: 546 | files = filename.split(',') 547 | for file_iter in files: 548 | file_list += glob(file_iter) 549 | file_list = sorted([f for f in file_list if os.path.isfile(f)]) 550 | 551 | return file_list 552 | 553 | def _get_log_file(self, log_file): 554 | ''' 555 | Copies the log file from the appropriate box to local temp space. 556 | Returns path to local file. 557 | Doesn't copy if it finds a local file already that is less than 558 | local_copy_lifetime_in_hours, 559 | which is an int value specifying how many hours before we recopy. 560 | ''' 561 | level = self._user_params.get(LSC.LEVEL, None) 562 | debug = self._user_params.get(LSC.DEBUG, None) 563 | 564 | remote_file = os.path.split(log_file)[1] 565 | local_filepath = os.path.join(self._optional_params[LSC.TMP_PATH], 566 | '_'.join([level, remote_file])) 567 | 568 | mtime = 0 569 | if os.path.exists(local_filepath): 570 | mtime = os.path.getmtime(local_filepath) 571 | 572 | try: 573 | now = time.time() 574 | max_time_before_recopy = now - self._optional_params[LSC.LOCAL_COPY_LIFETIME]*60*60 575 | 576 | if mtime < max_time_before_recopy: 577 | if os.path.exists(local_filepath): 578 | os.remove(local_filepath) 579 | if debug: 580 | LOGGER.debug('Copying file from %s:%s to %s temporarily', 581 | self._get_box_from_level(level), 582 | log_file, 583 | local_filepath) 584 | self._copy_remote_file(log_file, local_filepath, 585 | self._get_box_from_level(level)) 586 | if debug: 587 | LOGGER.debug('Done copying file') 588 | except IOError as err: 589 | LOGGER.error('Couldn\'t copy %s from %s. 
Error: %s', log_file, 590 | self._get_box_from_level(level), str(err)) 591 | return '' 592 | 593 | return local_filepath 594 | 595 | def _make_file_path(self): 596 | '''Creates and returns the path where files should be globbed for 597 | for a given date and production level''' 598 | log_date = self._user_params.get(LSC.DATE, None) 599 | level = self._user_params.get(LSC.LEVEL, None) 600 | if log_date is None and level is None: 601 | return os.path.join(self._default_path, 602 | self._make_file_name(self._default_filename)) 603 | 604 | if not self._are_logs_archived(log_date): 605 | return os.path.join(self._default_path, 606 | self._make_file_name(self._default_filename, 607 | log_date, 608 | self._get_box_from_level(level))) 609 | 610 | return os.path.join(self._get_archived_file_path(), 611 | self._make_file_name(self._default_filename, 612 | log_date, self._get_box_from_level(level)) 613 | + '*') 614 | 615 | def _make_file_name(self, base_name, log_date=None, box=None): 616 | ''' 617 | Basic implementation: --. 618 | Override if necessary 619 | Returns the filename that would be appropriate for your logs, 620 | based on the given base_name, date and level. 621 | ''' 622 | parts = [base_name] 623 | if box is not None: 624 | parts.append(box) 625 | if log_date is not None: 626 | parts.append(log_date) 627 | return '-'.join(parts) + self._default_ext 628 | 629 | def _multiprocess_files(self, func): 630 | ''' 631 | Creates a pool to run the given function func 632 | through several files at once. 633 | ''' 634 | 635 | # First copy any remote files as needed and create final file list 636 | if (self._user_params.get(LSC.LEVEL, None) 637 | and not self._are_logs_archived(self._user_params.get(LSC.DATE, None))): 638 | if (self._optional_params.get(LSC.FORCE_COPY, False) 639 | or socket.gethostname() != \ 640 | self._get_box_from_level(self._user_params.get(LSC.LEVEL, None))): 641 | pool = Pool(processes=self._optional_params[LSC.PROCESSOR_COUNT]) 642 | # Why is there a crazy timeout value at the end of this call? 643 | # Because python has a bug in it that's been open for years and has not been fixed 644 | # outside of v3.3 and above, wherein a KeyboardInterruption is never delivered 645 | # when a thread is waiting for a condition, which leads to a hang 646 | # if a user hits ^C. 647 | # However, if you set a timeout on the call, Condition.wait() will receive 648 | # the interrupt immediately. 649 | # See: http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool 650 | file_list = pool.map_async(self._get_log_file, self._file_list).get(TIMEOUT) 651 | self._file_list = sorted(filter(lambda x: x != '', file_list)) 652 | pool = None 653 | 654 | LOGGER.debug('Final file list: %s', self._file_list) 655 | 656 | if self._file_list == []: 657 | LOGGER.error('No files found to process.') 658 | return None 659 | 660 | pool = Pool(processes=self._optional_params[LSC.PROCESSOR_COUNT]) 661 | results = pool.map_async(func, self._file_list).get(TIMEOUT) 662 | return results 663 | 664 | @classmethod 665 | def _open_ssh_connection(cls, server): 666 | '''Creates and returns an SSH connection to the appropriate box''' 667 | ssh = paramiko.SSHClient() 668 | ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) 669 | try: 670 | ssh.connect(hostname=server, timeout=300) 671 | return ssh 672 | except socket.gaierror as err: 673 | LOGGER.error('Could not create SSH connection on server %s. 
Error: %s', 674 | server, str(err)) 675 | except paramiko.BadHostKeyException as err: 676 | LOGGER.error('Could not verify hostkey for server: %s. Error: %s', server, str(err)) 677 | except paramiko.AuthenticationException as err: 678 | LOGGER.error('Could not authenticate on server: %s. Error: %s', server, str(err)) 679 | except paramiko.SSHException as err: 680 | LOGGER.error('Could not create SSH connection on server %s. Error: %s', 681 | server, str(err)) 682 | 683 | return None 684 | 685 | @classmethod 686 | def _pretty_print(cls, result, options, out=sys.stdout): 687 | ''' 688 | Pretty prints the stats 689 | ''' 690 | 691 | out.write(cls.COLORS["BLUE"]) 692 | if options.get(LSC.DEBUG): 693 | for regex_name, hits in result.items(): 694 | regex_name = regex_name.capitalize() 695 | for group, group_hits in hits[LSC.GROUP_HITS].items(): 696 | if group == LSC.TOTAL_HITS: 697 | continue 698 | out.write('\n{} hits per {}:\n'.format(regex_name, group.capitalize())) 699 | cls._pretty_print_dict(group_hits) 700 | out.write('\n{} max, min and average:\n'.format(regex_name)) 701 | cls._print_max_min_avg(group, cls._calc_stats(group_hits)) 702 | 703 | out.write('\nTotal {} hits: {:,}\n'.format(regex_name, hits[LSC.TOTAL_HITS])) 704 | 705 | out.write(cls.COLORS['ENDC']) 706 | 707 | @classmethod 708 | def _pretty_print_dict(cls, results): 709 | ''' 710 | Pretty self-explanatory. 711 | ''' 712 | if results is None: 713 | return 714 | for key, val in results.iteritems(): 715 | print "{} : {}\n".format(key, val) 716 | 717 | @classmethod 718 | def _print_max_min_avg(cls, group, stats): 719 | '''Prints the min, max and average stats''' 720 | print '\nAggregator: {}\n'.format(group) 721 | print '\nMax requests processed : {:,}, stat value: {}\n'.format(stats[LSC.MAX_COUNT], 722 | stats[LSC.MAX_KEY]) 723 | print 'Min requests processed : {:,}, stat value: {}\n'.format(stats[LSC.MIN_COUNT], 724 | stats[LSC.MIN_KEY]) 725 | print 'Average requests processed : {:,}\n'.format(stats[LSC.AVG_COUNT]) 726 | 727 | def _print_regex_patterns(self): 728 | '''Prints all the regex patterns''' 729 | for regex in self._regexes: 730 | LOGGER.debug('Running regex: %s', regex.get_pattern()) 731 | 732 | def _process_file_for_matches(self, log_file): 733 | '''Extracts all regex matches in the given log file and returns. 734 | Override if you need to run several regexes or do any special 735 | processing on the files.''' 736 | 737 | regex_hits = {LSC.FILENAME : log_file, LSC.REGEXES : {}} 738 | with self._get_file_handle(log_file) as file_handle: 739 | for regex in self._regexes: 740 | regex_hits[LSC.REGEXES][regex.name] = {} 741 | regex_hits[LSC.REGEXES][regex.name][LSC.MATCHES] = [] 742 | 743 | for line in file_handle: 744 | for regex in self._regexes: 745 | matcher = regex.get_matcher() 746 | if matcher.match(line) != None: 747 | regex_hits[LSC.REGEXES][regex.name][LSC.MATCHES].append(line) 748 | 749 | return regex_hits 750 | 751 | 752 | def _process_file_for_aggregates(self, log_file): 753 | '''Extracts the data from the given log_file and returns. 
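The result maps LSC.FILENAME to the file's path and LSC.REGEXES to each regex's total hit count and per-group hit counts.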
754 | Override if you need to run several regexes or do any special 755 | processing on the files.''' 756 | 757 | regex_hits = {LSC.FILENAME : log_file, LSC.REGEXES : {}} 758 | for regex in self._regexes: 759 | regex_hits[LSC.REGEXES][regex.name] = {} 760 | regex_hits[LSC.REGEXES][regex.name][LSC.TOTAL_HITS] = 0 761 | regex_hits[LSC.REGEXES][regex.name][LSC.GROUP_HITS] = {} 762 | for group in regex.get_groups(): 763 | regex_hits[LSC.REGEXES][regex.name][LSC.GROUP_HITS][group] = {} 764 | 765 | for line in self._gen_lines(log_file): 766 | for regex in self._regexes: 767 | group_hits = regex_hits[LSC.REGEXES][regex.name][LSC.GROUP_HITS] 768 | regex_hits[LSC.REGEXES][regex.name][LSC.TOTAL_HITS] += \ 769 | self._run_regex_and_do_aggregation(line, 770 | regex.get_matcher(), 771 | group_hits) 772 | #Sort the group data 773 | for hits in regex_hits[LSC.REGEXES].values(): 774 | for group, group_hits in hits[LSC.GROUP_HITS].items(): 775 | hits[LSC.GROUP_HITS][group] = \ 776 | collections.OrderedDict(sorted(group_hits.iteritems())) 777 | 778 | return regex_hits 779 | 780 | @classmethod 781 | def _run_regex_and_do_aggregation(cls, line, matcher, aggregators): 782 | ''' 783 | Given the text and a regular expression, 784 | adds found values for each regex group to the aggregators dict, 785 | and returns 1. 786 | If no match is found, returns 0 787 | ''' 788 | try: 789 | match = matcher.match(line) 790 | if match is not None: 791 | for agg_key, agg_dict in aggregators.items(): 792 | cls._sum_group_matches(agg_dict, match, agg_key) 793 | return 1 794 | except AttributeError as err: 795 | LOGGER.error('Regex Exception %s: %s, Line: %s', type(err), err, line) 796 | return None 797 | return 0 798 | 799 | @classmethod 800 | def _sum_group_matches(cls, group_sums, match, regex_group): 801 | ''' 802 | Takes a regex match and a group value and populates the given dict 803 | with counts for each unique value for the regex group in the match. 804 | If the regex match fails, returns silently. 805 | ''' 806 | 807 | try: 808 | key = match.group(regex_group) 809 | if not key in group_sums: 810 | group_sums[key] = 1 811 | else: 812 | group_sums[key] += 1 813 | except IndexError: 814 | return 815 | 816 | def _validate_file_list(self): 817 | '''Makes sure that there are files to process''' 818 | 819 | if self._file_list == []: 820 | if self._user_params.get(LSC.FILENAME, None): 821 | raise InvalidArgumentException('File does not exist at {} ' 822 | 'Please provide a valid path to a ' 823 | 'log file.'.format(self._user_params[LSC.FILENAME])) 824 | else: 825 | raise InvalidArgumentException(('No files found at {} on {}. ' 826 | 'Please provide a valid path to a log file.' 827 | ).format(self._make_file_path(), 828 | 'the current box' 829 | if not 830 | self._user_params.get(LSC.LEVEL, None) 831 | else 832 | self._get_box_from_level(self._user_params[LSC.LEVEL]))) 833 | 834 | -------------------------------------------------------------------------------- /src/log_scraper/consts.py: -------------------------------------------------------------------------------- 1 | ''' 2 | All the consts used by the LogScraper library. 
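They are the dictionary keys LogScraper understands for default_filepath, optional_params and user_params, and the keys used in the result dicts it returns.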
3 | ''' 4 | 5 | # Where your scraper should look in for files if the user doesn't provide a path themselves 6 | DEFAULT_PATH = 'default_path' 7 | DEFAULT_FILENAME = 'default_filename' 8 | 9 | # How many days worth of data is kept in the default_filepath before being moved for archival 10 | DAYS_BEFORE_ARCHIVING = 'days_before_archiving' 11 | 12 | # This is needed because files are grabbed from remote boxes over paramiko, 13 | # and the way that's done is by getting a list of all files in the default_path using listdir(), 14 | # and then running a regex over that list to get the files we care about. 15 | # All this because paramiko has no way to get a list of files with a wildcard in the filename 16 | FILENAME_REGEX = 'filename_regex' 17 | 18 | # If this key is set to true, the scraper will copy files if a value for 'level' is specified, 19 | # even if the box mapping for 'level' is the same as the host we're currently on. 20 | # Mostly, I'm adding this so that I can write unit-tests for copying 21 | FORCE_COPY = 'force_copy' 22 | 23 | # Mapping of what level corresponds to what boxname, so that users can just say things like 24 | # --sandbox or --production. 25 | 26 | LEVELS_TO_BOXES = 'levels_to_boxes' 27 | 28 | # Files copied over remotely are automatically refreshed if the timestamp on the local copy 29 | # is older than the value for LOCAL_COPY_LIFETIME, which is specified in hours 30 | # Defaults to 0, so that remote files are always refreshed 31 | LOCAL_COPY_LIFETIME = 'local_copy_lifetime' 32 | 33 | # Where to copy over any files grabbed over SSH 34 | TMP_PATH = 'tmp_path' 35 | 36 | # How many processors to use while doing multiprocessing on the files 37 | PROCESSOR_COUNT = 'processor_count' 38 | 39 | # Defaults 40 | OPTIONAL_PARAMS = {DAYS_BEFORE_ARCHIVING : 0, FILENAME_REGEX : '', 41 | LEVELS_TO_BOXES : {}, LOCAL_COPY_LIFETIME : 0, 42 | TMP_PATH : '', PROCESSOR_COUNT : 4, 43 | FORCE_COPY : False} 44 | 45 | # Misc useful params you could query the user for 46 | DATE = 'date' 47 | 48 | # Runs logger in debug mode 49 | DEBUG = 'debug' 50 | 51 | # Override any default filelist in favor of whatever the user gives 52 | FILENAME = 'filename' 53 | 54 | FILE_HITS = 'file_hits' 55 | 56 | # What production level box to look on 57 | LEVEL = 'level' 58 | 59 | # The scraper only prints stats to console if this key is set to True 60 | PRINT_STATS = 'print_stats' 61 | 62 | # The keys used in the dicts that store the extracted data 63 | REGEXES = 'regexes' 64 | MATCHES = 'matches' 65 | GROUP_HITS = 'group_hits' 66 | TOTAL_HITS = 'total_hits' 67 | 68 | # Stats dict 69 | MAX_KEY = 'max_key' 70 | MIN_KEY = 'min_key' 71 | MAX_COUNT = 'max_count' 72 | MIN_COUNT = 'min_count' 73 | AVG_COUNT = 'avg_count' 74 | -------------------------------------------------------------------------------- /tests/test_log_scraper.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Unit-tests for the Log Scraper library 3 | ''' 4 | 5 | from collections import OrderedDict 6 | from datetime import datetime, timedelta 7 | from StringIO import StringIO 8 | import gzip 9 | import os 10 | import shutil 11 | import socket 12 | import sys 13 | import unittest 14 | 15 | BASE_PATH = os.path.normpath(os.path.join(os.path.dirname(__file__), os.pardir)) 16 | sys.path.append(BASE_PATH) 17 | 18 | from src.log_scraper.base import LogScraper, RegexObject 19 | from src.log_scraper.base import BadRegexException, MissingArgumentException, InvalidArgumentException 20 | import src.log_scraper.consts 
as LSC 21 | 22 | #DIRS 23 | ARCHIVE_DIR = 'archived' 24 | LOG_DIR = './logs' 25 | LOG_FILE = 'log*.log' 26 | LOG_FILE_REGEX = r'log\d+' 27 | 28 | #Log files 29 | LOG_FILE_1 = ("log1.log", 30 | '''My name is Judge. 31 | My name is Franklin. 32 | Judge my name? 33 | My name is Judge. 34 | ''' 35 | ) 36 | LOG_FILE_2 = ("log2.log", 37 | '''The weather is sunny. 38 | The time is noon. 39 | My name is Judge. 40 | What's my name? 41 | My name is Franklin. 42 | The weather is rainy. 43 | The weather is icy. 44 | ''' 45 | ) 46 | # Remote coyping needs full path 47 | REMOTE_LOG_FILE_PATH = os.path.join(BASE_PATH, 'logs', LOG_FILE_1[0]) 48 | REMOTE_FILE_1 = 'log1-this_box.log' 49 | REMOTE_FILE_2 = 'log2-this_box.log' 50 | TMP_REMOTE_DIR = './tmp_remote' 51 | 52 | def _clean_dir(xdir=LOG_DIR): 53 | '''Set up directory structure. Will delete any existing directories and files.''' 54 | if os.path.exists(xdir): 55 | shutil.rmtree(xdir) 56 | os.makedirs(xdir) 57 | os.mkdir(os.path.join(xdir, ARCHIVE_DIR)) 58 | 59 | def _write_file(filename, contents, inc_dir=LOG_DIR): 60 | ''' 61 | Takes a filename, contents and path and writes out file 62 | ''' 63 | with open(os.path.join(inc_dir, filename), 'w') as mfile: 64 | mfile.write(contents) 65 | 66 | def _write_file_from_pair(filename, inc_dir=LOG_DIR): 67 | ''' 68 | Takes a tuple that has as its first element a filename, and the second as the contents, 69 | and writes out the file to the path provided in the inc_dir param. 70 | ''' 71 | with open(os.path.join(inc_dir, filename[0]), 'w') as mfile: 72 | mfile.write(filename[1]) 73 | 74 | class LogScraperWithOptions(LogScraper): 75 | '''A sample implementation of the log scraper library that sets some of the optional params''' 76 | 77 | def __init__(self, user_params): 78 | default_filepath = {} 79 | optional_params = {} 80 | 81 | default_filepath[LSC.DEFAULT_PATH] = LOG_DIR 82 | default_filepath[LSC.DEFAULT_FILENAME] = LOG_FILE 83 | optional_params[LSC.DAYS_BEFORE_ARCHIVING] = 1 84 | optional_params[LSC.LEVELS_TO_BOXES] = {'this_box' : socket.gethostname()} 85 | 86 | super(LogScraperWithOptions, self).__init__(default_filepath=default_filepath, 87 | optional_params=optional_params, 88 | user_params=user_params) 89 | 90 | def _init_regexes(self): 91 | '''Sample regexes''' 92 | no_group_regex = r'My name is Judge\.$' 93 | group_regex = r'My name is (?P\w+)\.$' 94 | self._regexes.append(RegexObject(name='no_group', pattern=no_group_regex)) 95 | self._regexes.append(RegexObject(name='group', pattern=group_regex)) 96 | 97 | def _get_archived_file_path(self): 98 | '''Where logs are archived''' 99 | return os.path.join(LOG_DIR, ARCHIVE_DIR) 100 | 101 | class TestLogScraper(unittest.TestCase): 102 | '''Creates a simple log scraper and tests out all the functionality''' 103 | 104 | def setUp(self): 105 | '''Create the log scraper to use, write out the test log files''' 106 | #Write out the sample log files 107 | _clean_dir() 108 | _write_file_from_pair(LOG_FILE_1) 109 | _write_file_from_pair(LOG_FILE_2) 110 | _write_file_from_pair((LOG_FILE_1[0].split('.')[0] + '-20150301.log', LOG_FILE_1[1]), 111 | os.path.join(LOG_DIR, ARCHIVE_DIR)) 112 | _write_file_from_pair((LOG_FILE_2[0].split('.')[0] + '-20150301.log', LOG_FILE_2[1]), 113 | os.path.join(LOG_DIR, ARCHIVE_DIR)) 114 | 115 | def test_setting_user_params(self): 116 | '''Tests to make sure that the user params dict is set correctly''' 117 | _log_scraper = LogScraper() 118 | self.assertEquals(_log_scraper.get_user_params(), {}) 119 | 120 | user_params = {'TEST' : 'TEST1'} 
121 | 122 | _log_scraper.set_user_params(user_params) 123 | self.assertEquals(_log_scraper.get_user_params(), user_params) 124 | 125 | def test_base_scraper(self): 126 | '''Test the no-nonsense simple scraper''' 127 | 128 | _log_scraper = LogScraper() 129 | expected = ("LogScraper(default_filename=, default_filepath=, " 130 | "optional_params={'levels_to_boxes': {}, 'filename_regex': '', " 131 | "'processor_count': 4, 'local_copy_lifetime': 0, 'tmp_path': '', " 132 | "'force_copy': False, 'days_before_archiving': 0}, user_params={}") 133 | self.assertEquals(repr(_log_scraper), expected) 134 | 135 | expected = ("Regexes: []\n" 136 | "Default filename: \n" 137 | "Default filepath: \n" 138 | "Optional params: {'levels_to_boxes': {}, 'filename_regex': '', " 139 | "'processor_count': 4, 'local_copy_lifetime': 0, 'tmp_path': '', " 140 | "'force_copy': False, 'days_before_archiving': 0}\n" 141 | "User params: {}") 142 | self.assertEquals(str(_log_scraper), expected) 143 | 144 | #Set file list 145 | user_params = {} 146 | user_params[LSC.DEBUG] = True 147 | user_params[LSC.FILENAME] = os.path.join(LOG_DIR, LOG_FILE) 148 | _log_scraper.set_user_params(user_params) 149 | 150 | #Add some regexes 151 | no_group_regex = r'My name is Judge\.$' 152 | group_regex = r'The (?P\w+) is (?P\w+)\.$' 153 | _log_scraper.add_regex(name='name_is_judge', pattern=no_group_regex) 154 | _log_scraper.add_regex(name='key_value_regex', pattern=group_regex) 155 | 156 | # Validate user params (should do nothing) 157 | _log_scraper._validate_user_params() 158 | 159 | # Should give back nothing 160 | self.assertEquals(_log_scraper._get_archived_file_path(), None) 161 | 162 | #Finally, get some data 163 | results = _log_scraper.get_log_data() 164 | 165 | expected = {'regexes' : {'key_value_regex': {'group_hits': {'key': OrderedDict([('time', 1), 166 | ('weather', 3)]), 167 | 'value': OrderedDict([('icy', 1), 168 | ('noon', 1), 169 | ('rainy', 1), 170 | ('sunny', 1)])}, 171 | 'total_hits': 4}, 172 | 'name_is_judge': {'group_hits': {}, 'total_hits': 3}}, 173 | 'file_hits': [{'regexes': {'key_value_regex': {'group_hits': {'value': OrderedDict(), 174 | 'key': OrderedDict()}, 175 | 'total_hits': 0}, 176 | 'name_is_judge': {'group_hits': {}, 'total_hits': 2}}, 177 | 'filename': './logs/log1.log'}, 178 | {'regexes': {'key_value_regex': {'group_hits': {'value': OrderedDict([('icy', 1), 179 | ('noon', 1), 180 | ('rainy', 1), 181 | ('sunny', 1)]), 182 | 'key': OrderedDict([('time', 1), 183 | ('weather', 3)])}, 184 | 'total_hits': 4}, 185 | 'name_is_judge': {'group_hits': {}, 186 | 'total_hits': 1}}, 187 | 'filename': './logs/log2.log'}]} 188 | 189 | self.maxDiff = None 190 | self.assertDictEqual(results, expected) 191 | 192 | # Test the min/max/avg 193 | # Test with no data 194 | stats = _log_scraper._calc_stats([]) 195 | expected = {'max_key': 0, 196 | 'max_count': 0, 197 | 'avg_count': 0, 198 | 'min_count': 0, 199 | 'min_key': 0} 200 | self.assertDictEqual(stats, expected) 201 | 202 | stats = _log_scraper._calc_stats(results['regexes']['key_value_regex'][LSC.GROUP_HITS]['key']) 203 | expected = {'max_key': 'weather', 204 | 'max_count': 3, 205 | 'avg_count': 2.0, 206 | 'min_count': 1, 207 | 'min_key': 'time'} 208 | self.assertDictEqual(stats, expected) 209 | 210 | # Test viewing regex hits 211 | matches = _log_scraper.get_regex_matches() 212 | self.assertEquals(len(matches), 2) 213 | self.assertEquals(matches[0][LSC.FILENAME], os.path.join(LOG_DIR, LOG_FILE_1[0])) 214 | 
self.assertEquals(len(matches[0][LSC.REGEXES]['key_value_regex'][LSC.MATCHES]), 0) 215 | self.assertEquals(len(matches[0][LSC.REGEXES]['name_is_judge'][LSC.MATCHES]), 2) 216 | self.assertEquals(matches[1][LSC.FILENAME], os.path.join(LOG_DIR, LOG_FILE_2[0])) 217 | self.assertEquals(len(matches[1][LSC.REGEXES]['key_value_regex'][LSC.MATCHES]), 4) 218 | self.assertEquals(len(matches[1][LSC.REGEXES]['name_is_judge'][LSC.MATCHES]), 1) 219 | 220 | def test_no_match_regex(self): 221 | '''How regexes that don't match anything are handled''' 222 | 223 | _log_scraper = LogScraper() 224 | 225 | #Set file list 226 | user_params = {} 227 | user_params[LSC.FILENAME] = os.path.join(LOG_DIR, LOG_FILE) 228 | _log_scraper.set_user_params(user_params) 229 | 230 | #Add some regexes 231 | no_group_regex = r'This should not match\.$' 232 | _log_scraper.add_regex(name='no_match', pattern=no_group_regex) 233 | 234 | results = _log_scraper.get_log_data() 235 | self.assertEquals(results[LSC.REGEXES]['no_match'][LSC.TOTAL_HITS], 0) 236 | 237 | def test_archived_scraping(self): 238 | '''Test the scraper that fetches archived files''' 239 | 240 | user_params = {} 241 | user_params[LSC.DATE] = '20150301' 242 | user_params[LSC.DEBUG] = True 243 | _option_scraper = LogScraperWithOptions(user_params=user_params) 244 | 245 | results = _option_scraper.get_log_data() 246 | expected = {'regexes' : {'no_group': {'group_hits': {}, 'total_hits': 3}, 247 | 'group': {'group_hits': {'name': OrderedDict([('Franklin', 2), ('Judge', 3)])}, 248 | 'total_hits': 5}}, 249 | 'file_hits': [{'regexes': {'no_group': {'group_hits': {}, 250 | 'total_hits': 2}, 251 | 'group': {'group_hits': {'name': OrderedDict([('Franklin', 1), 252 | ('Judge', 2)])}, 253 | 'total_hits': 3}}, 254 | 'filename': './logs/archived/log1-20150301.log'}, 255 | {'regexes': {'no_group': {'group_hits': {}, 256 | 'total_hits': 1}, 257 | 'group': {'group_hits': {'name': OrderedDict([('Franklin', 1), 258 | ('Judge', 1)])}, 259 | 'total_hits': 2}}, 260 | 'filename': './logs/archived/log2-20150301.log'}]} 261 | self.assertDictEqual(results, expected) 262 | 263 | test_date = datetime.today().strftime('%Y%m%d') 264 | # Yes, yes, it's testing a private method directly. 
Let's move on 265 | self.assertFalse(_option_scraper._are_logs_archived(test_date)) 266 | 267 | def test_regexes(self): 268 | '''Test the logic for adding and removing regexes from the scraper''' 269 | _log_scraper = LogScraper() 270 | 271 | pattern = 'Very Specific Regex' 272 | regex_obj = RegexObject(name='test_regex', pattern=pattern) 273 | self.assertEqual(regex_obj.get_pattern(), pattern) 274 | matcher = regex_obj.get_matcher() 275 | self.assertEqual(matcher.match(pattern).group(), pattern) 276 | 277 | new_pattern = 'New (?P(Pattern))' 278 | regex_obj.update_pattern(new_pattern) 279 | self.assertEqual(regex_obj.get_pattern(), new_pattern) 280 | matcher = regex_obj.get_matcher() 281 | self.assertEqual(regex_obj.__repr__(), 282 | 'RegexObject(name=test_regex, pattern=New (?P(Pattern)))') 283 | 284 | self.assertEqual(regex_obj.__str__(), 285 | "Pattern: New (?P(Pattern)), Groups: ['group']") 286 | 287 | self.assertEqual(regex_obj.get_groups(), ['group']) 288 | 289 | 290 | _log_scraper.add_regex(name='test_regex', pattern='.*') 291 | 292 | self.assertEqual(1, len(_log_scraper.get_regexes())) 293 | 294 | _log_scraper.add_regex(name='test_regex_2', pattern='^.*$') 295 | 296 | self.assertEqual(2, len(_log_scraper.get_regexes())) 297 | 298 | # Clear regexes and test size 299 | _log_scraper.clear_regexes() 300 | 301 | self.assertEqual(0, len(_log_scraper.get_regexes())) 302 | 303 | # Give a bad pattern 304 | with self.assertRaises(BadRegexException): 305 | _log_scraper.add_regex(name='bad_regex', pattern='?P