├── LICENSE
├── README.md
├── main_pipeline_batch.py
├── main_pipeline_stream.py
├── publish_logs.py
├── set_up_install.sh
└── stream_logs.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 DFoly

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# User_log_pipeline

Creating a streaming pipeline for user log data on Google Cloud Platform.
Fake user logs are generated by `stream_logs.py` and published to a Pub/Sub
topic by `publish_logs.py`; an Apache Beam pipeline then parses each line and
writes it to BigQuery (`main_pipeline_batch.py` reads a bounded text file,
`main_pipeline_stream.py` reads the Pub/Sub stream). `set_up_install.sh`
installs the dependencies and sets the environment variables; see the quick
start below.
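## Quick start

A minimal sketch of a run. It assumes a GCP project with a Pub/Sub topic and a
BigQuery dataset both named `userlogs` (the names hard-coded in the scripts),
and a service-account key at the path exported in `set_up_install.sh`; adjust
those hard-coded values for your own project first.

```sh
# Install dependencies and export credentials (edit the hard-coded values first).
source set_up_install.sh

# Terminal 1: publish a stream of fake log lines to the Pub/Sub topic.
python publish_logs.py

# Terminal 2: consume the topic and stream the parsed rows into BigQuery.
python main_pipeline_stream.py
```

The batch variant works the same way but reads `user_log_fileC.txt` from the
working directory instead of Pub/Sub: `python main_pipeline_batch.py`.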
--------------------------------------------------------------------------------
/main_pipeline_batch.py:
--------------------------------------------------------------------------------
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
import re
import logging

PROJECT = 'user-logs-237110'
schema = 'remote_addr:STRING, timelocal:STRING, request_type:STRING, status:STRING, body_bytes_sent:STRING, http_referer:STRING, http_user_agent:STRING'

src_path = "user_log_fileC.txt"


def regex_clean(data):
    """Extract each log field with a regex and join the results into a CSV line."""
    PATTERNS = [r'(^\S+\.[\S+\.]+\S+)\s', r'(?<=\[).+?(?=\])',
                r'\"(\S+)\s(\S+)\s*(\S*)\"', r'\s(\d+)\s', r"(?<=\[).\d+(?=\])",
                r'\"[A-Z][a-z]+', r'\"(http|https)://[a-z]+.[a-z]+.[a-z]+']
    result = []
    for pattern in PATTERNS:
        match = re.search(pattern, data)
        if match:
            result.append(match.group())
        else:
            # Append a placeholder so the column count stays stable
            # even when a field is missing from the line.
            result.append(" ")
    result = [x.strip() for x in result]
    result = [x.replace('"', "") for x in result]
    return ','.join(result)


class Split(beam.DoFn):
    """Turn a cleaned CSV line into a dict matching the BigQuery schema."""

    def process(self, element):
        from datetime import datetime
        element = element.split(",")
        d = datetime.strptime(element[1], "%d/%b/%Y:%H:%M:%S")
        date_string = d.strftime("%Y-%m-%d %H:%M:%S")

        return [{
            'remote_addr': element[0],
            'timelocal': date_string,
            'request_type': element[2],
            'body_bytes_sent': element[3],
            'status': element[4],
            'http_referer': element[5],
            'http_user_agent': element[6]
        }]


def main():
    # To run on Dataflow rather than locally, pass options along these lines
    # (BUCKET is assumed to be set, e.g. via set_up_install.sh):
    # argv = [
    #     '--project={0}'.format(PROJECT),
    #     '--staging_location=gs://{0}/staging/'.format(BUCKET),
    #     '--temp_location=gs://{0}/staging/'.format(BUCKET),
    #     '--runner=DataflowRunner'
    # ]

    p = beam.Pipeline()

    (p
     | 'ReadData' >> beam.io.textio.ReadFromText(src_path)
     | 'CleanAddress' >> beam.Map(regex_clean)
     | 'ParseCSV' >> beam.ParDo(Split())
     | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
           '{0}:userlogs.logdata'.format(PROJECT), schema=schema,
           write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
     )

    p.run().wait_until_finish()


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    main()
--------------------------------------------------------------------------------
/main_pipeline_stream.py:
--------------------------------------------------------------------------------
from apache_beam.options.pipeline_options import PipelineOptions
import apache_beam as beam
import logging
import argparse
import re

PROJECT = "user-logs-237110"
schema = 'remote_addr:STRING, timelocal:STRING, request_type:STRING, status:STRING, body_bytes_sent:STRING, http_referer:STRING, http_user_agent:STRING'
TOPIC = "projects/user-logs-237110/topics/userlogs"


def regex_clean(data):
    """Extract each log field with a regex and join the results into a CSV line."""
    PATTERNS = [r'(^\S+\.[\S+\.]+\S+)\s', r'(?<=\[).+?(?=\])',
                r'\"(\S+)\s(\S+)\s*(\S*)\"', r'\s(\d+)\s', r"(?<=\[).\d+(?=\])",
                r'\"[A-Z][a-z]+', r'\"(http|https)://[a-z]+.[a-z]+.[a-z]+']
    result = []
    for pattern in PATTERNS:
        match = re.search(pattern, data)
        if match:
            result.append(match.group())
        else:
            # Append a placeholder so the column count stays stable
            # even when a field is missing from the line.
            result.append(" ")
    result = [x.strip() for x in result]
    result = [x.replace('"', "") for x in result]
    return ','.join(result)
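
# For reference, the patterns above target lines in the format produced by
# stream_logs.py, e.g. (hypothetical values, shown for illustration only):
#   123.45.67.89 - - [01/Jan/2019:12:00:00] "GET /home HTTP/1.1" [200] 500 "http://example.com" "Mozilla/5.0"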

class Split(beam.DoFn):
    """Turn a cleaned CSV line into a dict matching the BigQuery schema."""

    def process(self, element):
        from datetime import datetime
        element = element.split(",")
        d = datetime.strptime(element[1], "%d/%b/%Y:%H:%M:%S")
        date_string = d.strftime("%Y-%m-%d %H:%M:%S")

        return [{
            'remote_addr': element[0],
            'timelocal': date_string,
            'request_type': element[2],
            'body_bytes_sent': element[3],
            'status': element[4],
            'http_referer': element[5],
            'http_user_agent': element[6]
        }]


def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_topic")
    parser.add_argument("--output")
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Pub/Sub is an unbounded source, so the pipeline has to run in streaming mode.
    p = beam.Pipeline(options=PipelineOptions(pipeline_args, streaming=True))

    (p
     | 'ReadData' >> beam.io.ReadFromPubSub(topic=TOPIC).with_output_types(bytes)
     | 'Decode' >> beam.Map(lambda x: x.decode('utf-8'))
     | 'CleanData' >> beam.Map(regex_clean)
     | 'ParseCSV' >> beam.ParDo(Split())
     | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
           '{0}:userlogs.streaminglogs'.format(PROJECT), schema=schema,
           write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
     )

    result = p.run()
    result.wait_until_finish()


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    main()
--------------------------------------------------------------------------------
/publish_logs.py:
--------------------------------------------------------------------------------
"""
This script imports a function to generate fake user log data
and then publishes each generated line to a Pub/Sub topic.
"""

from stream_logs import generate_log_line
from google.cloud import pubsub_v1
import random
import time

PROJECT_ID = "user-logs-237110"
TOPIC = "userlogs"

publisher = pubsub_v1.PublisherClient()
topic_path = publisher.topic_path(PROJECT_ID, TOPIC)


def publish(publisher, topic, message):
    data = message.encode('utf-8')
    return publisher.publish(topic, data=data)


def callback(message_future):
    # When timeout is unspecified, the exception method waits indefinitely.
    if message_future.exception(timeout=30):
        print('Publishing message on {} threw an Exception {}.'.format(
            topic_path, message_future.exception()))
    else:
        print(message_future.result())


if __name__ == '__main__':

    while True:
        line = generate_log_line()
        print(line)
        message_future = publish(publisher, topic_path, line)
        message_future.add_done_callback(callback)

        # Pause for one or two seconds between messages.
        sleep_time = random.choice(range(1, 3, 1))
        time.sleep(sleep_time)
--------------------------------------------------------------------------------
/set_up_install.sh:
--------------------------------------------------------------------------------
#!/bin/bash

export GOOGLE_APPLICATION_CREDENTIALS="/Users/danielfoley/desktop/blog/GCP_user_logs/pipeline_scripts/gcpkeys.json"
export PROJECT="user-logs-237110"
export BUCKET="gs://macbook_files"


sudo apt-get install -y python-pip
sudo pip install -U pip
sudo pip install 'apache-beam[gcp]' oauth2client==3.0.0
sudo pip install Faker==1.0.2
--------------------------------------------------------------------------------
/stream_logs.py:
--------------------------------------------------------------------------------
from faker import Faker
import random
import numpy as np
from datetime import datetime


# Template for a combined-log-style line; the bracketed fields line up with
# the regexes in the pipeline scripts.
LINE = """\
{remote_addr} - - [{time_local}] "{request_type} {request_path} HTTP/1.1" [{status}] {body_bytes_sent} "{http_referer}" "{http_user_agent}"\
"""

fake = Faker()


def generate_log_line():
    """Return one fake log line in the format defined by LINE."""
    now = datetime.now()
    remote_addr = fake.ipv4()
    time_local = now.strftime('%d/%b/%Y:%H:%M:%S')
    request_type = random.choice(["GET", "POST", "PUT"])
    request_path = "/" + fake.uri_path()

    # Mostly successful requests, with occasional 401s and 404s.
    status = np.random.choice([200, 401, 404], p=[0.9, 0.05, 0.05])
    body_bytes_sent = random.choice(range(5, 1000, 1))
    http_referer = fake.uri()
    http_user_agent = fake.user_agent()

    log_line = LINE.format(
        remote_addr=remote_addr,
        time_local=time_local,
        request_type=request_type,
        request_path=request_path,
        status=status,
        body_bytes_sent=body_bytes_sent,
        http_referer=http_referer,
        http_user_agent=http_user_agent
    )

    return log_line
--------------------------------------------------------------------------------