├── tests ├── __init__.py ├── test-s3-put.json └── test.py ├── pmdp ├── parser │ ├── __init__.py │ ├── line_parser.py │ └── file_parser.py ├── writer │ ├── __init__.py │ └── writer.py ├── __init__.py ├── parse_elb_log_local.py └── parse_elb_log_lambda.py ├── setup.cfg ├── img ├── pmdp-elb.png └── pmdp-lambda.png ├── MANIFEST ├── requirements.txt ├── setup.py ├── LICENSE ├── .gitignore ├── README.md └── .github └── workflows └── codeql-analysis.yml /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pmdp/parser/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pmdp/writer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /img/pmdp-elb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dangoldin/poor-mans-data-pipeline/HEAD/img/pmdp-elb.png -------------------------------------------------------------------------------- /img/pmdp-lambda.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dangoldin/poor-mans-data-pipeline/HEAD/img/pmdp-lambda.png -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | setup.cfg 3 | setup.py 4 | pmdp/__init__.py 5 | pmdp/parse_elb_log_lambda.py 6 | pmdp/parse_elb_log_local.py 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | boto3==1.4.1 2 | botocore==1.4.67 3 | docutils==0.12 4 | futures==3.0.5 5 | jmespath==0.9.0 6 | python-dateutil==2.5.3 7 | s3transfer==0.1.9 8 | six==1.10.0 9 | wsgiref==0.1.2 10 | -------------------------------------------------------------------------------- /pmdp/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __title__ = 'pmdp' 4 | __version__ = '0.3' 5 | __author__ = 'Dan Goldin' 6 | __license__ = 'MIT' 7 | __copyright__ = 'Copyright 2016 Dan Goldin' 8 | 9 | from . import parser 10 | from . import writer 11 | 12 | from . import parse_elb_log_local 13 | from . import parse_elb_log_lambda 14 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Mostly from http://peterdowns.com/posts/first-time-with-pypi.html 2 | 3 | from distutils.core import setup 4 | setup( 5 | name = 'pmdp', 6 | packages = ['pmdp'], 7 | version = '0.3', 8 | description = 'A poor man\'s data pipeline', 9 | author = 'Dan Goldin', 10 | author_email = 'dangoldin@gmail.com', 11 | url = 'https://github.com/dangoldin/poor-mans-data-pipeline', 12 | download_url = 'https://github.com/dangoldin/poor-mans-data-pipeline/tarball/0.3', 13 | keywords = ['data', 'data-pipeline'], 14 | classifiers = [], 15 | ) 16 | -------------------------------------------------------------------------------- /pmdp/parse_elb_log_local.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | 6 | from parser.line_parser import DatePathLogLineParser 7 | from parser.file_parser import FileParser, DirectoryParser 8 | from writer.csv_writer import CSVFileWriter 9 | 10 | if __name__ == '__main__': 11 | if len(sys.argv) < 2: 12 | print 'Specify a file or directory' 13 | sys.exit(1) 14 | 15 | fn = sys.argv[1] 16 | lp = DatePathLogLineParser() 17 | 18 | if os.path.isdir(fn): 19 | p = DirectoryParser(lp, fn) 20 | else: 21 | p = FileParser(lp, fn) 22 | out = p.parse() 23 | 24 | out_fn = 'out.csv' 25 | if len(sys.argv) == 3: 26 | out_fn = sys.argv[2] 27 | 28 | w = CSVFileWriter(out_fn) 29 | w.write_summary(out) 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Dan Goldin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /pmdp/parse_elb_log_lambda.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import print_function 4 | 5 | import urllib 6 | 7 | import datetime 8 | import uuid 9 | 10 | from parser.line_parser import DatePathLogLineParser 11 | from parser.file_parser import S3Parser 12 | from writer.s3_csv_writer import S3CSVFileWriter 13 | 14 | print('Loading function') 15 | 16 | def generate_filename(): 17 | out_path = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 18 | return out_path + '/' + str(uuid.uuid4()) 19 | 20 | def lambda_handler(event, context): 21 | bucket = event['Records'][0]['s3']['bucket']['name'] 22 | key = urllib.unquote_plus(event['Records'][0]['s3']['object']['key'].encode('utf8')) 23 | try: 24 | # print('Getting', bucket, key) 25 | lp = DatePathLogLineParser() 26 | sp = S3Parser(lp, bucket, key) 27 | summary = sp.parse() 28 | print(summary) 29 | writer = S3CSVFileWriter(bucket, generate_filename()) 30 | writer.write_summary(summary) 31 | return {'success': True} 32 | except Exception as e: 33 | print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket)) 34 | raise e 35 | -------------------------------------------------------------------------------- /tests/test-s3-put.json: -------------------------------------------------------------------------------- 1 | { 2 | "Records": [ 3 | { 4 | "eventVersion": "2.0", 5 | "eventTime": "1970-01-01T00:00:00.000Z", 6 | "requestParameters": { 7 | "sourceIPAddress": "127.0.0.1" 8 | }, 9 | "s3": { 10 | "configurationId": "testConfigRule", 11 | "object": { 12 | "eTag": "0123456789abcdef0123456789abcdef", 13 | "sequencer": "0A1B2C3D4E5F678901", 14 | "key": "AWSLogs/663647347154/elasticloadbalancing/us-east-1/2016/10/24/663647347154_elasticloadbalancing_us-east-1_poor-mans-data-pipeline_20161024T0200Z_107.22.223.245_4vf56hab.log" 15 | }, 16 | "bucket": { 17 | "arn": "arn:aws:s3:::poor-mans-data-pipeline.dangoldin.com", 18 | "name": "poor-mans-data-pipeline.dangoldin.com", 19 | "ownerIdentity": { 20 | "principalId": "EXAMPLE" 21 | } 22 | }, 23 | "s3SchemaVersion": "1.0" 24 | }, 25 | "responseElements": { 26 | "x-amz-id-2": "EXAMPLE123/5678abcdefghijklambdaisawesome/mnopqrstuvwxyzABCDEFGH", 27 | "x-amz-request-id": "EXAMPLE123456789" 28 | }, 29 | "awsRegion": "us-east-1", 30 | "eventName": "ObjectCreated:Put", 31 | "userIdentity": { 32 | "principalId": "EXAMPLE" 33 | }, 34 | "eventSource": "aws:s3" 35 | } 36 | ] 37 | } 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | # Temp files 92 | *~ 93 | 94 | # Lambda archive 95 | *zip 96 | 97 | # OS X files 98 | .DS_Store 99 | -------------------------------------------------------------------------------- /pmdp/writer/writer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import csv 4 | import io 5 | import boto3 6 | 7 | from abc import ABCMeta, abstractmethod 8 | 9 | class BaseSummaryWriter: 10 | """ 11 | BaseSummaryWriter is the abstract class that handles writing/exporting 12 | a generated summary. 13 | """ 14 | __metaclass__ = ABCMeta 15 | 16 | @abstractmethod 17 | def write_summary(self, summary): pass 18 | 19 | class CSVFileWriter(BaseSummaryWriter): 20 | """ 21 | CSVFileWriter is an implementation that writes a summary to a CSV file 22 | """ 23 | 24 | def __init__(self, fn): 25 | self.fn = fn 26 | 27 | def write_summary(self, summary): 28 | with open(self.fn, 'w') as f: 29 | w = csv.writer(f) 30 | # TODO: Figure out header 31 | # w.writerow(headers + ['cnt',]) 32 | w.writerows([i + (cnt,) for i, cnt in summary.iteritems()]) 33 | 34 | class S3CSVFileWriter(BaseSummaryWriter): 35 | """ 36 | S3CSVFileWriter is an implementation that writes a summary to a CSV file 37 | and uploads it to S3 38 | """ 39 | 40 | def __init__(self, bucket, key): 41 | self.bucket = bucket 42 | self.key = key 43 | self.s3 = boto3.client('s3') 44 | 45 | def write_summary(self, summary): 46 | output = io.BytesIO() 47 | w = csv.writer(output) 48 | for k, v in summary.iteritems(): 49 | w.writerow(k + (v,)) 50 | # print 'Writing to',self.bucket,self.key 51 | self.s3.put_object(Bucket=self.bucket, Key=self.key, Body=output.getvalue()) 52 | -------------------------------------------------------------------------------- /pmdp/parser/line_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from abc import ABCMeta, abstractmethod 4 | from operator import itemgetter 5 | 6 | # TODO: Parse ip, user agent 7 | 8 | class LogLineParser: 9 | """ 10 | LogLineParser is the abstract class that handles parsing individal 11 | rows in the AWS ELB log file format. 12 | 13 | http://docs.aws.amazon.com/elasticloadbalancing/latest/classic/access-log-collection.html 14 | """ 15 | 16 | __metaclass__ = ABCMeta 17 | 18 | @abstractmethod 19 | def parse_line(self, line): pass 20 | 21 | class DatePathLogLineParser(LogLineParser): 22 | """ 23 | DatePathLogLineParser is a simple log line parser that returns 24 | a date in ymd and the path. 25 | """ 26 | 27 | def parse_ymd(self, dt_str): 28 | return dt_str[:10] # Just the ymd 29 | 30 | def parse_path(self, url): 31 | return url.split('/')[-1] # Skip the host name 32 | 33 | def parse_line(self, line): 34 | dt, url = itemgetter(0,12)(line.split(' ')) 35 | dt = self.parse_ymd(dt) 36 | path = self.parse_path(url) 37 | return (dt, path) 38 | 39 | class DatePathKeyLogLineParser(DatePathLogLineParser): 40 | """ 41 | DatePathKeyLogLineParser is a log line parser that returns 42 | a date in ymd, the path, as well as the GET arg valus for 43 | the specified keys. 44 | """ 45 | 46 | def __init__(self, keys): 47 | self.keys = keys 48 | 49 | def parse_path_and_keys(self, path): 50 | query_args = {} 51 | parts = path.split('?') 52 | if len(parts) == 2: 53 | for q in parts[1].split('&'): 54 | kv = q.split('=') 55 | if len(kv) == 2: # Only handle valid ones 56 | query_args[kv[0]] = kv[1] 57 | 58 | return (parts[0], ) + tuple(query_args.get(k, '') for k in self.keys) 59 | 60 | def parse_line(self, line): 61 | dt, url = itemgetter(0,12)(line.split(' ')) 62 | dt = super(DatePathKeyLogLineParser, self).parse_ymd(dt) 63 | path = super(DatePathKeyLogLineParser, self).parse_path(url) 64 | url_pieces = self.parse_path_and_keys(path) 65 | return (dt, ) + url_pieces 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Poor Man's Data Pipeline 2 | A minimal way to get an extremely robust, scalable, and cheap data pipeline up and running. 3 | 4 | ## Motivation 5 | Most data pipelines require a large infrastructure to get up and running. The intent of Poor Man's Data Pipeline is to use a variety of "serverless" components in order to build a data pipeline that has very few points of failure while still scaling to large volumes at low cost. 6 | 7 | ## How it works 8 | The pipeline works by using Amazon's Elastic Load Balancer with access logs enabled. These logs are then stored on S3 and are parsed and aggregated via simple Lambda functions. 9 | 10 | ## Components 11 | This project is still a work in progress but the components are listed below. A simple way to see how they're wired together is to take a look at the parse_elb_log_lambda.py file which is configured to be called by an AWS lambda function. 12 | - Line parser: This is responsible for parsing each line of an S3 access log file. This is what you will need to change in order to support configure your logging levels. 13 | - File parser: This just runs the line parser across each line of the log file. There are few options here for testing but generally you'll want to use the S3Parser class. 14 | - Summary writer: This takes the result of the parse and exports it. At the moment I only have a simple writer back to S3 but one can build additional functionality to write it to a database or send the summary to another service. 15 | 16 | ## How to get it working 17 | You will need to do two things. 18 | 19 | 1. Set up an Elastic Load Balancer and enable access logging. Note that you don't need to connect any instances to it since it will still be able to log every request. Note that responses to the ELB will have a 503 status code. 20 | ![alt text](https://github.com/dangoldin/poor-mans-data-pipeline/raw/master/img/pmdp-elb.png "PMDP ELB setup") 21 | 22 | 2. Set up an AWS Lambda function to parse the resulting access logs. The default code will do a simple count grouped by date and path and then upload them back to the original bucket. You can set up AWS Lambda by creating a zip archive and setting it up in the AWS console. 23 | 24 | ``` 25 | cd pmdp 26 | zip -r lambda.zip * 27 | ``` 28 | 29 | ![alt text](https://github.com/dangoldin/poor-mans-data-pipeline/raw/master/img/pmdp-lambda.png "PMDP Lambda setup") 30 | -------------------------------------------------------------------------------- /pmdp/parser/file_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from abc import ABCMeta, abstractmethod 4 | 5 | import os 6 | import io 7 | import boto3 8 | 9 | from collections import Counter 10 | 11 | class LogFileParser: 12 | """ 13 | LogFileParser is the abstract class that handles parsing log files 14 | in an AWS ELB log file format. 15 | 16 | http://docs.aws.amazon.com/elasticloadbalancing/latest/classic/access-log-collection.html 17 | """ 18 | __metaclass__ = ABCMeta 19 | 20 | @abstractmethod 21 | def parse(self): pass 22 | 23 | class StringParser(LogFileParser): 24 | """ 25 | StringParser parses a log file that's already been loaded into a string. 26 | Primarily used for testing. 27 | """ 28 | 29 | def __init__(self, llp, st): 30 | self.llp = llp 31 | self.st = st 32 | 33 | def parse(self): 34 | llp = self.llp 35 | summary = Counter() 36 | for line in self.st.strip().split("\n"): 37 | summary[llp.parse_line(line)] += 1 38 | return summary 39 | 40 | class S3Parser(LogFileParser): 41 | """ 42 | S3Parser parses a log file that's stored on S3. 43 | """ 44 | 45 | def __init__(self, llp, bucket, key): 46 | self.llp = llp 47 | self.bucket = bucket 48 | self.key = key 49 | 50 | def parse(self): 51 | s3 = boto3.client('s3') 52 | response = s3.get_object(Bucket=self.bucket, Key=self.key) 53 | sp = StringParser(self.llp, response['Body'].read()) 54 | return sp.parse() 55 | 56 | class FileParser(LogFileParser): 57 | """ 58 | FileParser parses a log file that's stored locally. 59 | """ 60 | 61 | def __init__(self, llp, fn): 62 | self.llp = llp 63 | self.fn = fn 64 | 65 | def parse(self): 66 | llp = self.llp 67 | summary = Counter() 68 | with io.open(self.fn, 'r') as f: 69 | for line in f: 70 | summary[llp.parse_line(line)] += 1 71 | return summary 72 | 73 | class DirectoryParser(LogFileParser): 74 | """ 75 | DirectoryParser parses a local directory containing log files. 76 | """ 77 | 78 | def __init__(self, llp, dr): 79 | self.llp = llp 80 | self.dr = dr 81 | 82 | def parse(self): 83 | llp = self.llp 84 | summary = Counter() 85 | for f in os.listdir(self.dr): 86 | fn = os.path.join(self.dr, f) 87 | if os.path.isfile(fn): 88 | fp = FileParser(llp, fn) 89 | summary += fp.parse() 90 | return summary 91 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master ] 20 | schedule: 21 | - cron: '25 4 * * 6' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v2 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v1 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 52 | 53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 54 | # If this step fails, then you should remove it and run the build manually (see below) 55 | - name: Autobuild 56 | uses: github/codeql-action/autobuild@v1 57 | 58 | # ℹ️ Command-line programs to run using the OS shell. 59 | # 📚 https://git.io/JvXDl 60 | 61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 62 | # and modify them (or add more) to build your code if your project 63 | # uses a compiled language 64 | 65 | #- run: | 66 | # make bootstrap 67 | # make release 68 | 69 | - name: Perform CodeQL Analysis 70 | uses: github/codeql-action/analyze@v1 71 | -------------------------------------------------------------------------------- /tests/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | 5 | import json 6 | import os 7 | 8 | from pmdp.parse_elb_log_lambda import lambda_handler 9 | from pmdp.parser.line_parser import DatePathLogLineParser, DatePathKeyLogLineParser 10 | 11 | class TestLambdaFunction(unittest.TestCase): 12 | # Valid end to end 13 | def test_valid_lambda_parse(self): 14 | dir_path = os.path.dirname(os.path.realpath(__file__)) 15 | 16 | with open(os.path.join(dir_path, 'test-s3-put.json'), 'r') as f: 17 | d = json.loads(f.read()) 18 | out = lambda_handler(d, None) 19 | self.assertTrue(out['success']) 20 | 21 | class TestDatePathLogLineParser(unittest.TestCase): 22 | def test_valid_line_parse(self): 23 | line = """2016-10-22T22:35:17.425648Z poor-mans-data-pipeline 1.2.3.4:5 - -1 -1 -1 503 0 0 0 "GET http://poor-mans-data-pipeline.us-east-1.elb.amazonaws.com:80/PATH123 HTTP/1.1" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36" - -""" 24 | lp = DatePathLogLineParser() 25 | results = lp.parse_line(line) 26 | self.assertEquals(2, len(results)) 27 | self.assertEquals('2016-10-22', results[0]) 28 | self.assertEquals('PATH123', results[1]) 29 | 30 | # Should throw exception 31 | def test_invalid_line_parse(self): 32 | line = "BAD LINE" 33 | lp = DatePathLogLineParser() 34 | try: 35 | results = lp.parse_line(line) 36 | self.fail("Should have thrown an exception") 37 | except Exception, e: 38 | pass 39 | 40 | class TestDatePathKeyLogLineParser(unittest.TestCase): 41 | def test_valid_line_parse_no_get_args(self): 42 | line = """2016-10-22T22:35:17.425648Z poor-mans-data-pipeline 1.2.3.4:5 - -1 -1 -1 503 0 0 0 "GET http://poor-mans-data-pipeline.us-east-1.elb.amazonaws.com:80/PATH123 HTTP/1.1" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36" - -""" 43 | lp = DatePathKeyLogLineParser(('key1', 'key2')) 44 | results = lp.parse_line(line) 45 | 46 | self.assertEquals(4, len(results)) 47 | self.assertEquals('2016-10-22', results[0]) 48 | self.assertEquals('PATH123', results[1]) 49 | self.assertEquals('', results[2]) 50 | self.assertEquals('', results[3]) 51 | 52 | def test_valid_line_parse_missing_get_arg(self): 53 | line = """2016-10-22T22:35:17.425648Z poor-mans-data-pipeline 1.2.3.4:5 - -1 -1 -1 503 0 0 0 "GET http://poor-mans-data-pipeline.us-east-1.elb.amazonaws.com:80/PATH123?key1=value1 HTTP/1.1" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36" - -""" 54 | lp = DatePathKeyLogLineParser(('key1', 'key2')) 55 | results = lp.parse_line(line) 56 | 57 | self.assertEquals(4, len(results)) 58 | self.assertEquals('2016-10-22', results[0]) 59 | self.assertEquals('PATH123', results[1]) 60 | self.assertEquals('value1', results[2]) 61 | self.assertEquals('', results[3]) 62 | 63 | def test_valid_line_parse_complete_get_args_ordered(self): 64 | line = """2016-10-22T22:35:17.425648Z poor-mans-data-pipeline 1.2.3.4:5 - -1 -1 -1 503 0 0 0 "GET http://poor-mans-data-pipeline.us-east-1.elb.amazonaws.com:80/PATH123?key1=value1&key2=value2 HTTP/1.1" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36" - -""" 65 | lp = DatePathKeyLogLineParser(('key1', 'key2')) 66 | results = lp.parse_line(line) 67 | 68 | self.assertEquals(4, len(results)) 69 | self.assertEquals('2016-10-22', results[0]) 70 | self.assertEquals('PATH123', results[1]) 71 | self.assertEquals('value1', results[2]) 72 | self.assertEquals('value2', results[3]) 73 | 74 | def test_valid_line_parse_complete_get_args_unordered(self): 75 | line = """2016-10-22T22:35:17.425648Z poor-mans-data-pipeline 1.2.3.4:5 - -1 -1 -1 503 0 0 0 "GET http://poor-mans-data-pipeline.us-east-1.elb.amazonaws.com:80/PATH123?key2=value2&key1=value1 HTTP/1.1" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36" - -""" 76 | lp = DatePathKeyLogLineParser(('key1', 'key2')) 77 | results = lp.parse_line(line) 78 | 79 | self.assertEquals(4, len(results)) 80 | self.assertEquals('2016-10-22', results[0]) 81 | self.assertEquals('PATH123', results[1]) 82 | self.assertEquals('value1', results[2]) 83 | self.assertEquals('value2', results[3]) 84 | 85 | def test_valid_line_parse_extra_get_args(self): 86 | line = """2016-10-22T22:35:17.425648Z poor-mans-data-pipeline 1.2.3.4:5 - -1 -1 -1 503 0 0 0 "GET http://poor-mans-data-pipeline.us-east-1.elb.amazonaws.com:80/PATH123?key1=value1&key2=value2&key3=value3 HTTP/1.1" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36" - -""" 87 | lp = DatePathKeyLogLineParser(('key1', 'key2')) 88 | results = lp.parse_line(line) 89 | 90 | self.assertEquals(4, len(results)) 91 | self.assertEquals('2016-10-22', results[0]) 92 | self.assertEquals('PATH123', results[1]) 93 | self.assertEquals('value1', results[2]) 94 | self.assertEquals('value2', results[3]) 95 | 96 | # Should throw exception 97 | def test_invalid_line_parse(self): 98 | line = "BAD LINE" 99 | lp = DatePathLogLineParser() 100 | try: 101 | results = lp.parse_line(line) 102 | self.fail("Should have thrown an exception") 103 | except Exception, e: 104 | pass 105 | 106 | if __name__ == '__main__': 107 | unittest.main() 108 | --------------------------------------------------------------------------------