├── tests
│   ├── __init__.py
│   ├── test-s3-put.json
│   └── test.py
├── pmdp
│   ├── parser
│   │   ├── __init__.py
│   │   ├── line_parser.py
│   │   └── file_parser.py
│   ├── writer
│   │   ├── __init__.py
│   │   └── writer.py
│   ├── __init__.py
│   ├── parse_elb_log_local.py
│   └── parse_elb_log_lambda.py
├── setup.cfg
├── img
│   ├── pmdp-elb.png
│   └── pmdp-lambda.png
├── MANIFEST
├── requirements.txt
├── setup.py
├── LICENSE
├── .gitignore
├── README.md
└── .github
    └── workflows
        └── codeql-analysis.yml

/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/pmdp/parser/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/pmdp/writer/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
description-file = README.md
--------------------------------------------------------------------------------
/img/pmdp-elb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dangoldin/poor-mans-data-pipeline/HEAD/img/pmdp-elb.png
--------------------------------------------------------------------------------
/img/pmdp-lambda.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dangoldin/poor-mans-data-pipeline/HEAD/img/pmdp-lambda.png
--------------------------------------------------------------------------------
/MANIFEST:
--------------------------------------------------------------------------------
# file GENERATED by distutils, do NOT edit
setup.cfg
setup.py
pmdp/__init__.py
pmdp/parse_elb_log_lambda.py
pmdp/parse_elb_log_local.py
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
boto3==1.4.1
botocore==1.4.67
docutils==0.12
futures==3.0.5
jmespath==0.9.0
python-dateutil==2.5.3
s3transfer==0.1.9
six==1.10.0
wsgiref==0.1.2
--------------------------------------------------------------------------------
/pmdp/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

__title__ = 'pmdp'
__version__ = '0.3'
__author__ = 'Dan Goldin'
__license__ = 'MIT'
__copyright__ = 'Copyright 2016 Dan Goldin'

from . import parser
from . import writer

from . import parse_elb_log_local
from . import parse_elb_log_lambda
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
# Mostly from http://peterdowns.com/posts/first-time-with-pypi.html

from distutils.core import setup
setup(
    name = 'pmdp',
    packages = ['pmdp'],
    version = '0.3',
    description = 'A poor man\'s data pipeline',
    author = 'Dan Goldin',
    author_email = 'dangoldin@gmail.com',
    url = 'https://github.com/dangoldin/poor-mans-data-pipeline',
    download_url = 'https://github.com/dangoldin/poor-mans-data-pipeline/tarball/0.3',
    keywords = ['data', 'data-pipeline'],
    classifiers = [],
)
--------------------------------------------------------------------------------
/pmdp/parse_elb_log_local.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os
import sys

from parser.line_parser import DatePathLogLineParser
from parser.file_parser import FileParser, DirectoryParser
from writer.writer import CSVFileWriter

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Specify a file or directory')
        sys.exit(1)

    fn = sys.argv[1]
    lp = DatePathLogLineParser()

    if os.path.isdir(fn):
        p = DirectoryParser(lp, fn)
    else:
        p = FileParser(lp, fn)
    out = p.parse()

    out_fn = 'out.csv'
    if len(sys.argv) == 3:
        out_fn = sys.argv[2]

    w = CSVFileWriter(out_fn)
    w.write_summary(out)
--------------------------------------------------------------------------------
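For a quick local sanity check, the same classes can also be driven programmatically rather than through the script above. A minimal sketch, assuming the dependencies are installed and `access.log` is a stand-in for a real ELB access log file:

```python
from pmdp.parser.line_parser import DatePathLogLineParser
from pmdp.parser.file_parser import FileParser

# Parse one local log file into a Counter mapping (date, path) -> request count.
summary = FileParser(DatePathLogLineParser(), 'access.log').parse()
for key, cnt in summary.items():
    print(key, cnt)  # e.g. ('2016-10-22', 'PATH123') 3
```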
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 Dan Goldin

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/pmdp/parse_elb_log_lambda.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from __future__ import print_function

import urllib

import datetime
import uuid

from parser.line_parser import DatePathLogLineParser
from parser.file_parser import S3Parser
from writer.writer import S3CSVFileWriter

print('Loading function')

def generate_filename():
    out_path = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
    return out_path + '/' + str(uuid.uuid4())

def lambda_handler(event, context):
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.unquote_plus(event['Records'][0]['s3']['object']['key'].encode('utf8'))
    try:
        # print('Getting', bucket, key)
        lp = DatePathLogLineParser()
        sp = S3Parser(lp, bucket, key)
        summary = sp.parse()
        print(summary)
        writer = S3CSVFileWriter(bucket, generate_filename())
        writer.write_summary(summary)
        return {'success': True}
    except Exception as e:
        print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
        raise e
--------------------------------------------------------------------------------
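The handler can be exercised without deploying by handing it a stripped-down S3 put event, which is the same approach tests/test.py takes with the full fixture below. A sketch with hypothetical bucket and key names — valid AWS credentials and a real log object in S3 are still required:

```python
from pmdp.parse_elb_log_lambda import lambda_handler

# Minimal S3 put event; only the fields the handler actually reads.
event = {
    'Records': [{
        's3': {
            'bucket': {'name': 'my-elb-log-bucket'},
            'object': {'key': 'AWSLogs/123456789012/elasticloadbalancing/example.log'},
        }
    }]
}
print(lambda_handler(event, None))  # {'success': True} if the object parses
```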
/tests/test-s3-put.json:
--------------------------------------------------------------------------------
{
  "Records": [
    {
      "eventVersion": "2.0",
      "eventTime": "1970-01-01T00:00:00.000Z",
      "requestParameters": {
        "sourceIPAddress": "127.0.0.1"
      },
      "s3": {
        "configurationId": "testConfigRule",
        "object": {
          "eTag": "0123456789abcdef0123456789abcdef",
          "sequencer": "0A1B2C3D4E5F678901",
          "key": "AWSLogs/663647347154/elasticloadbalancing/us-east-1/2016/10/24/663647347154_elasticloadbalancing_us-east-1_poor-mans-data-pipeline_20161024T0200Z_107.22.223.245_4vf56hab.log"
        },
        "bucket": {
          "arn": "arn:aws:s3:::poor-mans-data-pipeline.dangoldin.com",
          "name": "poor-mans-data-pipeline.dangoldin.com",
          "ownerIdentity": {
            "principalId": "EXAMPLE"
          }
        },
        "s3SchemaVersion": "1.0"
      },
      "responseElements": {
        "x-amz-id-2": "EXAMPLE123/5678abcdefghijklambdaisawesome/mnopqrstuvwxyzABCDEFGH",
        "x-amz-request-id": "EXAMPLE123456789"
      },
      "awsRegion": "us-east-1",
      "eventName": "ObjectCreated:Put",
      "userIdentity": {
        "principalId": "EXAMPLE"
      },
      "eventSource": "aws:s3"
    }
  ]
}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

# Temp files
*~

# Lambda archive
*.zip

# OS X files
.DS_Store
--------------------------------------------------------------------------------
/pmdp/writer/writer.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import csv
import io
import boto3

from abc import ABCMeta, abstractmethod

class BaseSummaryWriter:
    """
    BaseSummaryWriter is the abstract class that handles writing/exporting
    a generated summary.
    """
    __metaclass__ = ABCMeta

    @abstractmethod
    def write_summary(self, summary): pass

class CSVFileWriter(BaseSummaryWriter):
    """
    CSVFileWriter is an implementation that writes a summary to a CSV file
    """

    def __init__(self, fn):
        self.fn = fn

    def write_summary(self, summary):
        with open(self.fn, 'w') as f:
            w = csv.writer(f)
            # TODO: Figure out header
            # w.writerow(headers + ['cnt',])
            w.writerows([i + (cnt,) for i, cnt in summary.iteritems()])

class S3CSVFileWriter(BaseSummaryWriter):
    """
    S3CSVFileWriter is an implementation that writes a summary to a CSV file
    and uploads it to S3
    """

    def __init__(self, bucket, key):
        self.bucket = bucket
        self.key = key
        self.s3 = boto3.client('s3')

    def write_summary(self, summary):
        output = io.BytesIO()
        w = csv.writer(output)
        for k, v in summary.iteritems():
            w.writerow(k + (v,))
        # print 'Writing to', self.bucket, self.key
        self.s3.put_object(Bucket=self.bucket, Key=self.key, Body=output.getvalue())
--------------------------------------------------------------------------------
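The writer contract in one sketch: a summary is a Counter keyed by tuples, and write_summary emits one CSV row per key with the count appended. The output filename below is a placeholder, and this assumes the Python 2 runtime the project targets (write_summary uses iteritems):

```python
from collections import Counter
from pmdp.writer.writer import CSVFileWriter

# Two (date, path) keys with their request counts.
summary = Counter({('2016-10-22', 'PATH123'): 3, ('2016-10-22', 'other'): 1})
CSVFileWriter('summary.csv').write_summary(summary)
# summary.csv then contains rows such as: 2016-10-22,PATH123,3
```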
25 | """ 26 | 27 | def parse_ymd(self, dt_str): 28 | return dt_str[:10] # Just the ymd 29 | 30 | def parse_path(self, url): 31 | return url.split('/')[-1] # Skip the host name 32 | 33 | def parse_line(self, line): 34 | dt, url = itemgetter(0,12)(line.split(' ')) 35 | dt = self.parse_ymd(dt) 36 | path = self.parse_path(url) 37 | return (dt, path) 38 | 39 | class DatePathKeyLogLineParser(DatePathLogLineParser): 40 | """ 41 | DatePathKeyLogLineParser is a log line parser that returns 42 | a date in ymd, the path, as well as the GET arg valus for 43 | the specified keys. 44 | """ 45 | 46 | def __init__(self, keys): 47 | self.keys = keys 48 | 49 | def parse_path_and_keys(self, path): 50 | query_args = {} 51 | parts = path.split('?') 52 | if len(parts) == 2: 53 | for q in parts[1].split('&'): 54 | kv = q.split('=') 55 | if len(kv) == 2: # Only handle valid ones 56 | query_args[kv[0]] = kv[1] 57 | 58 | return (parts[0], ) + tuple(query_args.get(k, '') for k in self.keys) 59 | 60 | def parse_line(self, line): 61 | dt, url = itemgetter(0,12)(line.split(' ')) 62 | dt = super(DatePathKeyLogLineParser, self).parse_ymd(dt) 63 | path = super(DatePathKeyLogLineParser, self).parse_path(url) 64 | url_pieces = self.parse_path_and_keys(path) 65 | return (dt, ) + url_pieces 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Poor Man's Data Pipeline 2 | A minimal way to get an extremely robust, scalable, and cheap data pipeline up and running. 3 | 4 | ## Motivation 5 | Most data pipelines require a large infrastructure to get up and running. The intent of Poor Man's Data Pipeline is to use a variety of "serverless" components in order to build a data pipeline that has very few points of failure while still scaling to large volumes at low cost. 6 | 7 | ## How it works 8 | The pipeline works by using Amazon's Elastic Load Balancer with access logs enabled. These logs are then stored on S3 and are parsed and aggregated via simple Lambda functions. 9 | 10 | ## Components 11 | This project is still a work in progress but the components are listed below. A simple way to see how they're wired together is to take a look at the parse_elb_log_lambda.py file which is configured to be called by an AWS lambda function. 12 | - Line parser: This is responsible for parsing each line of an S3 access log file. This is what you will need to change in order to support configure your logging levels. 13 | - File parser: This just runs the line parser across each line of the log file. There are few options here for testing but generally you'll want to use the S3Parser class. 14 | - Summary writer: This takes the result of the parse and exports it. At the moment I only have a simple writer back to S3 but one can build additional functionality to write it to a database or send the summary to another service. 15 | 16 | ## How to get it working 17 | You will need to do two things. 18 | 19 | 1. Set up an Elastic Load Balancer and enable access logging. Note that you don't need to connect any instances to it since it will still be able to log every request. Note that responses to the ELB will have a 503 status code. 20 | ![alt text](https://github.com/dangoldin/poor-mans-data-pipeline/raw/master/img/pmdp-elb.png "PMDP ELB setup") 21 | 22 | 2. Set up an AWS Lambda function to parse the resulting access logs. 
/README.md:
--------------------------------------------------------------------------------
# Poor Man's Data Pipeline
A minimal way to get an extremely robust, scalable, and cheap data pipeline up and running.

## Motivation
Most data pipelines require a large infrastructure to get up and running. The intent of Poor Man's Data Pipeline is to use a variety of "serverless" components in order to build a data pipeline that has very few points of failure while still scaling to large volumes at low cost.

## How it works
The pipeline works by using Amazon's Elastic Load Balancer with access logs enabled. These logs are then stored on S3 and are parsed and aggregated via simple Lambda functions.

## Components
This project is still a work in progress but the components are listed below. A simple way to see how they're wired together is to take a look at the parse_elb_log_lambda.py file, which is configured to be called by an AWS Lambda function.
- Line parser: This is responsible for parsing each line of an S3 access log file. This is what you will need to change in order to control which fields get extracted from each line.
- File parser: This just runs the line parser across each line of the log file. There are a few options here for testing but generally you'll want to use the S3Parser class.
- Summary writer: This takes the result of the parse and exports it. At the moment I only have a simple writer back to S3 but one can build additional functionality to write it to a database or send the summary to another service.

## How to get it working
You will need to do two things.

1. Set up an Elastic Load Balancer and enable access logging. Note that you don't need to connect any instances to it since it will still be able to log every request. Note that responses from the ELB will have a 503 status code since nothing is attached to it.
![alt text](https://github.com/dangoldin/poor-mans-data-pipeline/raw/master/img/pmdp-elb.png "PMDP ELB setup")

2. Set up an AWS Lambda function to parse the resulting access logs. The default code will do a simple count grouped by date and path and then upload the results back to the original bucket. You can set up AWS Lambda by creating a zip archive and uploading it in the AWS console.

```
cd pmdp
zip -r lambda.zip *
```

![alt text](https://github.com/dangoldin/poor-mans-data-pipeline/raw/master/img/pmdp-lambda.png "PMDP Lambda setup")
--------------------------------------------------------------------------------
/pmdp/parser/file_parser.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from abc import ABCMeta, abstractmethod

import os
import io
import boto3

from collections import Counter

class LogFileParser:
    """
    LogFileParser is the abstract class that handles parsing log files
    in an AWS ELB log file format.

    http://docs.aws.amazon.com/elasticloadbalancing/latest/classic/access-log-collection.html
    """
    __metaclass__ = ABCMeta

    @abstractmethod
    def parse(self): pass

class StringParser(LogFileParser):
    """
    StringParser parses a log file that's already been loaded into a string.
    Primarily used for testing.
    """

    def __init__(self, llp, st):
        self.llp = llp
        self.st = st

    def parse(self):
        llp = self.llp
        summary = Counter()
        for line in self.st.strip().split("\n"):
            summary[llp.parse_line(line)] += 1
        return summary

class S3Parser(LogFileParser):
    """
    S3Parser parses a log file that's stored on S3.
    """

    def __init__(self, llp, bucket, key):
        self.llp = llp
        self.bucket = bucket
        self.key = key

    def parse(self):
        s3 = boto3.client('s3')
        response = s3.get_object(Bucket=self.bucket, Key=self.key)
        sp = StringParser(self.llp, response['Body'].read())
        return sp.parse()

class FileParser(LogFileParser):
    """
    FileParser parses a log file that's stored locally.
    """

    def __init__(self, llp, fn):
        self.llp = llp
        self.fn = fn

    def parse(self):
        llp = self.llp
        summary = Counter()
        with io.open(self.fn, 'r') as f:
            for line in f:
                summary[llp.parse_line(line)] += 1
        return summary

class DirectoryParser(LogFileParser):
    """
    DirectoryParser parses a local directory containing log files.
    """

    def __init__(self, llp, dr):
        self.llp = llp
        self.dr = dr

    def parse(self):
        llp = self.llp
        summary = Counter()
        for f in os.listdir(self.dr):
            fn = os.path.join(self.dr, f)
            if os.path.isfile(fn):
                fp = FileParser(llp, fn)
                summary += fp.parse()
        return summary
--------------------------------------------------------------------------------
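Since StringParser exists primarily for testing, a parser can be checked against an inline fixture without touching S3 or disk. A minimal sketch with two synthetic lines (the host and path are made up):

```python
from pmdp.parser.line_parser import DatePathLogLineParser
from pmdp.parser.file_parser import StringParser

log = (
    '2016-10-22T22:35:17.425648Z my-elb 1.2.3.4:5 - -1 -1 -1 503 0 0 0 "GET http://example.elb.amazonaws.com:80/PATH123 HTTP/1.1" "Mozilla/5.0" - -\n'
    '2016-10-22T22:36:01.000000Z my-elb 1.2.3.4:5 - -1 -1 -1 503 0 0 0 "GET http://example.elb.amazonaws.com:80/PATH123 HTTP/1.1" "Mozilla/5.0" - -\n'
)
# Two hits on the same date/path collapse into one counted key.
print(StringParser(DatePathLogLineParser(), log).parse())
# Counter({('2016-10-22', 'PATH123'): 2})
```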
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL"

on:
  push:
    branches: [ master ]
  pull_request:
    # The branches below must be a subset of the branches above
    branches: [ master ]
  schedule:
    - cron: '25 4 * * 6'

jobs:
  analyze:
    name: Analyze
    runs-on: ubuntu-latest
    permissions:
      actions: read
      contents: read
      security-events: write

    strategy:
      fail-fast: false
      matrix:
        language: [ 'python' ]
        # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
        # Learn more about CodeQL language support at https://git.io/codeql-language-support

    steps:
    - name: Checkout repository
      uses: actions/checkout@v2

    # Initializes the CodeQL tools for scanning.
    - name: Initialize CodeQL
      uses: github/codeql-action/init@v1
      with:
        languages: ${{ matrix.language }}
        # If you wish to specify custom queries, you can do so here or in a config file.
        # By default, queries listed here will override any specified in a config file.
        # Prefix the list here with "+" to use these queries and those in the config file.
        # queries: ./path/to/local/query, your-org/your-repo/queries@main

    # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
    # If this step fails, then you should remove it and run the build manually (see below)
    - name: Autobuild
      uses: github/codeql-action/autobuild@v1

    # ℹ️ Command-line programs to run using the OS shell.
    # 📚 https://git.io/JvXDl

    # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
    # and modify them (or add more) to build your code if your project
    # uses a compiled language

    #- run: |
    #   make bootstrap
    #   make release

    - name: Perform CodeQL Analysis
      uses: github/codeql-action/analyze@v1
--------------------------------------------------------------------------------
/tests/test.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import unittest

import json
import os

from pmdp.parse_elb_log_lambda import lambda_handler
from pmdp.parser.line_parser import DatePathLogLineParser, DatePathKeyLogLineParser

class TestLambdaFunction(unittest.TestCase):
    # Valid end to end
    def test_valid_lambda_parse(self):
        dir_path = os.path.dirname(os.path.realpath(__file__))

        with open(os.path.join(dir_path, 'test-s3-put.json'), 'r') as f:
            d = json.loads(f.read())
            out = lambda_handler(d, None)
            self.assertTrue(out['success'])

class TestDatePathLogLineParser(unittest.TestCase):
    def test_valid_line_parse(self):
        line = """2016-10-22T22:35:17.425648Z poor-mans-data-pipeline 1.2.3.4:5 - -1 -1 -1 503 0 0 0 "GET http://poor-mans-data-pipeline.us-east-1.elb.amazonaws.com:80/PATH123 HTTP/1.1" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36" - -"""
        lp = DatePathLogLineParser()
        results = lp.parse_line(line)
        self.assertEqual(2, len(results))
        self.assertEqual('2016-10-22', results[0])
        self.assertEqual('PATH123', results[1])

    # Should throw exception
    def test_invalid_line_parse(self):
        line = "BAD LINE"
        lp = DatePathLogLineParser()
        try:
            results = lp.parse_line(line)
            self.fail("Should have thrown an exception")
        except Exception:
            pass

class TestDatePathKeyLogLineParser(unittest.TestCase):
    def test_valid_line_parse_no_get_args(self):
        line = """2016-10-22T22:35:17.425648Z poor-mans-data-pipeline 1.2.3.4:5 - -1 -1 -1 503 0 0 0 "GET http://poor-mans-data-pipeline.us-east-1.elb.amazonaws.com:80/PATH123 HTTP/1.1" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36" - -"""
        lp = DatePathKeyLogLineParser(('key1', 'key2'))
        results = lp.parse_line(line)

        self.assertEqual(4, len(results))
        self.assertEqual('2016-10-22', results[0])
        self.assertEqual('PATH123', results[1])
        self.assertEqual('', results[2])
        self.assertEqual('', results[3])

    def test_valid_line_parse_missing_get_arg(self):
        line = """2016-10-22T22:35:17.425648Z poor-mans-data-pipeline 1.2.3.4:5 - -1 -1 -1 503 0 0 0 "GET http://poor-mans-data-pipeline.us-east-1.elb.amazonaws.com:80/PATH123?key1=value1 HTTP/1.1" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36" - -"""
        lp = DatePathKeyLogLineParser(('key1', 'key2'))
        results = lp.parse_line(line)

        self.assertEqual(4, len(results))
        self.assertEqual('2016-10-22', results[0])
        self.assertEqual('PATH123', results[1])
        self.assertEqual('value1', results[2])
        self.assertEqual('', results[3])

    def test_valid_line_parse_complete_get_args_ordered(self):
        line = """2016-10-22T22:35:17.425648Z poor-mans-data-pipeline 1.2.3.4:5 - -1 -1 -1 503 0 0 0 "GET http://poor-mans-data-pipeline.us-east-1.elb.amazonaws.com:80/PATH123?key1=value1&key2=value2 HTTP/1.1" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36" - -"""
        lp = DatePathKeyLogLineParser(('key1', 'key2'))
        results = lp.parse_line(line)

        self.assertEqual(4, len(results))
        self.assertEqual('2016-10-22', results[0])
        self.assertEqual('PATH123', results[1])
        self.assertEqual('value1', results[2])
        self.assertEqual('value2', results[3])

    def test_valid_line_parse_complete_get_args_unordered(self):
        line = """2016-10-22T22:35:17.425648Z poor-mans-data-pipeline 1.2.3.4:5 - -1 -1 -1 503 0 0 0 "GET http://poor-mans-data-pipeline.us-east-1.elb.amazonaws.com:80/PATH123?key2=value2&key1=value1 HTTP/1.1" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36" - -"""
        lp = DatePathKeyLogLineParser(('key1', 'key2'))
        results = lp.parse_line(line)

        self.assertEqual(4, len(results))
        self.assertEqual('2016-10-22', results[0])
        self.assertEqual('PATH123', results[1])
        self.assertEqual('value1', results[2])
        self.assertEqual('value2', results[3])

    def test_valid_line_parse_extra_get_args(self):
        line = """2016-10-22T22:35:17.425648Z poor-mans-data-pipeline 1.2.3.4:5 - -1 -1 -1 503 0 0 0 "GET http://poor-mans-data-pipeline.us-east-1.elb.amazonaws.com:80/PATH123?key1=value1&key2=value2&key3=value3 HTTP/1.1" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36" - -"""
        lp = DatePathKeyLogLineParser(('key1', 'key2'))
        results = lp.parse_line(line)

        self.assertEqual(4, len(results))
        self.assertEqual('2016-10-22', results[0])
        self.assertEqual('PATH123', results[1])
        self.assertEqual('value1', results[2])
        self.assertEqual('value2', results[3])

    # Should throw exception
    def test_invalid_line_parse(self):
        line = "BAD LINE"
        lp = DatePathKeyLogLineParser(('key1', 'key2'))
        try:
            results = lp.parse_line(line)
            self.fail("Should have thrown an exception")
        except Exception:
            pass

if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------