├── .gitignore ├── README.md ├── count_browsers.py ├── count_visitors.py ├── log_generator.py ├── requirements.txt └── store_logs.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | 93 | # Rope project settings 94 | .ropeproject 95 | 96 | .idea 97 | 98 | log_a.txt 99 | log_b.txt 100 | db.sqlite -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # Analytics Pipeline 2 | 3 | This repo contains the code for creating a data pipeline to calculate metrics for a fake webserver: 4 | 5 | * `log_generator.py` -- generates fake webserver logs. 6 | * `store_logs.py` -- parses the logs and stores them in a SQLite database. 7 | * `count_visitors.py` -- pulls from the database to count visitors to the site per day. 8 | 9 | # Installation 10 | 11 | To get this repo running: 12 | 13 | * Install Python 3. You can find instructions [here](https://wiki.python.org/moin/BeginnersGuide/Download). 14 | * Create a [virtual environment](https://docs.python.org/3/library/venv.html). 15 | * Clone this repo with `git clone git@github.com:dataquestio/analytics_pipeline.git` 16 | * Get into the folder with `cd analytics_pipeline` 17 | * Install the requirements with `pip install -r requirements.txt` 18 | 19 | # Usage 20 | 21 | * Execute the three scripts mentioned above, in order. 22 | 23 | You should see output from `count_visitors.py`. 
import time
import sys
import sqlite3
from datetime import datetime

DB_NAME = "db.sqlite"

def get_lines(time_obj):
    """Return (time_local, http_user_agent) rows inserted after *time_obj*.

    The connection is opened per call and always closed -- even when the
    query raises -- so the 5-second polling loop does not leak handles.
    """
    conn = sqlite3.connect(DB_NAME)
    try:
        cur = conn.cursor()
        # NOTE(review): the filter column is `created` (DB insert timestamp)
        # while the caller advances the cursor with parsed `time_local`
        # values; these are different clocks -- confirm rows are neither
        # skipped nor double-counted when they drift apart.
        cur.execute(
            "SELECT time_local,http_user_agent FROM logs WHERE created > ?",
            [time_obj],
        )
        return cur.fetchall()
    finally:
        conn.close()

def get_time_and_ip(lines):
    """Split DB rows into parallel lists: browser families and parsed times."""
    browsers = []
    times = []
    for time_local, user_agent in lines:
        times.append(parse_time(time_local))
        browsers.append(parse_user_agent(user_agent))
    return browsers, times

def parse_time(time_str):
    """Parse an nginx-style '[dd/Mon/YYYY:HH:MM:SS +zzzz]' stamp.

    Returns a timezone-aware datetime, or "" when the value cannot be
    parsed (kept for backward compatibility with existing callers).
    """
    try:
        return datetime.strptime(time_str, '[%d/%b/%Y:%H:%M:%S %z]')
    except (TypeError, ValueError):
        return ""

def parse_user_agent(user_agent):
    """Map a raw User-Agent string to a coarse browser family name.

    Order matters: Chrome UAs also contain "Safari", so Chrome is tested
    first. Anything unmatched is bucketed as "Other".
    """
    for browser in ("Firefox", "Chrome", "Opera", "Safari", "MSIE"):
        if browser in user_agent:
            return browser
    return "Other"

if __name__ == "__main__":
    browser_counts = {}
    start_time = datetime(year=2017, month=3, day=9)
    while True:
        lines = get_lines(start_time)
        browsers, times = get_time_and_ip(lines)
        if times:
            # Advance the polling cursor past the newest row we saw.
            start_time = times[-1]
        for browser in browsers:
            # dict.get() replaces the separate membership check + init.
            browser_counts[browser] = browser_counts.get(browser, 0) + 1

        count_list = sorted(browser_counts.items(), key=lambda x: x[0])
        print("")
        print(datetime.now())
        for item in count_list:
            print("{}: {}".format(*item))

        time.sleep(5)
import time
import sys
import sqlite3
from datetime import datetime

DB_NAME = "db.sqlite"

def get_lines(time_obj):
    """Return (remote_addr, time_local) rows inserted after *time_obj*.

    The connection is opened per call and always closed -- even when the
    query raises -- so the 5-second polling loop does not leak handles.
    """
    conn = sqlite3.connect(DB_NAME)
    try:
        cur = conn.cursor()
        cur.execute(
            "SELECT remote_addr,time_local FROM logs WHERE created > ?",
            [time_obj],
        )
        return cur.fetchall()
    finally:
        conn.close()

def get_time_and_ip(lines):
    """Split DB rows into parallel lists of IPs and parsed times."""
    ips = []
    times = []
    for remote_addr, time_local in lines:
        ips.append(remote_addr)
        times.append(parse_time(time_local))
    return ips, times

def parse_time(time_str):
    """Parse an nginx-style '[dd/Mon/YYYY:HH:MM:SS +zzzz]' stamp.

    Returns a timezone-aware datetime, or "" when the value cannot be
    parsed (kept for backward compatibility with existing callers).
    """
    try:
        return datetime.strptime(time_str, '[%d/%b/%Y:%H:%M:%S %z]')
    except (TypeError, ValueError):
        return ""

if __name__ == "__main__":
    unique_ips = {}
    counts = {}
    start_time = datetime(year=2017, month=3, day=9)
    while True:
        lines = get_lines(start_time)
        ips, times = get_time_and_ip(lines)
        if times:
            # Advance the polling cursor past the newest row we saw.
            start_time = times[-1]
        for ip, time_obj in zip(ips, times):
            if time_obj == "":
                # Unparseable timestamp: skip it. The original called
                # .strftime on the "" sentinel and crashed the daemon.
                continue
            day = time_obj.strftime("%d-%m-%Y")
            unique_ips.setdefault(day, set()).add(ip)

        # Distinct-visitor count per day = size of that day's IP set.
        for day, ip_set in unique_ips.items():
            counts[day] = len(ip_set)

        count_list = sorted(counts.items(), key=lambda x: x[0])

        print("")
        print(datetime.now())
        for item in count_list:
            print("{}: {}".format(*item))

        time.sleep(5)
from faker import Faker
from datetime import datetime
import random
import time

LINE = """\
{remote_addr} - - [{time_local} +0000] "{request_type} {request_path} HTTP/1.1" {status} {body_bytes_sent} "{http_referer}" "{http_user_agent}"\
"""

LOG_FILE_A = "log_a.txt"
LOG_FILE_B = "log_b.txt"
LOG_MAX = 100

# Faker() construction is expensive (loads locale providers); build it once
# at module level instead of once per generated line as the original did.
fake = Faker()

def generate_log_line():
    """Return one fake nginx-combined-format access-log line (no newline)."""
    now = datetime.now()
    log_line = LINE.format(
        remote_addr=fake.ipv4(),
        time_local=now.strftime('%d/%b/%Y:%H:%M:%S'),
        request_type=random.choice(["GET", "POST", "PUT"]),
        request_path="/" + fake.uri_path(),
        status=random.choice([200, 401, 404]),
        # randrange(5, 1000) draws the same values as choice(range(5, 1000, 1))
        # without materializing the list each call.
        body_bytes_sent=random.randrange(5, 1000),
        http_referer=fake.uri(),
        http_user_agent=fake.user_agent(),
    )
    return log_line

def write_log_line(log_file, line):
    """Append *line* plus a newline to *log_file*."""
    with open(log_file, "a") as f:
        f.write(line)
        f.write("\n")

def clear_log_file(log_file):
    """Truncate *log_file* to empty (creating it if absent)."""
    with open(log_file, "w+") as f:
        f.write("")

if __name__ == "__main__":
    current_log_file = LOG_FILE_A
    lines_written = 0

    clear_log_file(LOG_FILE_A)
    clear_log_file(LOG_FILE_B)

    while True:
        line = generate_log_line()

        write_log_line(current_log_file, line)
        lines_written += 1

        # Every LOG_MAX lines, rotate: truncate the *other* file and start
        # writing there, so the consumer always tails two bounded files.
        if lines_written % LOG_MAX == 0:
            new_log_file = LOG_FILE_B
            if current_log_file == LOG_FILE_B:
                new_log_file = LOG_FILE_A

            clear_log_file(new_log_file)
            current_log_file = new_log_file

        time.sleep(random.choice(range(1, 5, 1)))
import time
import sys
import sqlite3
from datetime import datetime

DB_NAME = "db.sqlite"

def create_table():
    """Create the logs table if it does not exist.

    raw_log is UNIQUE so the same physical log line is never stored twice;
    `created` records when the row was inserted (not when the request
    happened -- that is `time_local`). The connection is committed and
    closed even if the DDL fails.
    """
    conn = sqlite3.connect(DB_NAME)
    try:
        conn.execute("""
        CREATE TABLE IF NOT EXISTS logs (
          raw_log TEXT NOT NULL UNIQUE,
          remote_addr TEXT,
          time_local TEXT,
          request_type TEXT,
          request_path TEXT,
          status INTEGER,
          body_bytes_sent INTEGER,
          http_referer TEXT,
          http_user_agent TEXT,
          created DATETIME DEFAULT CURRENT_TIMESTAMP
          )
        """)
        # Explicit commit: does not rely on the sqlite3 module's implicit
        # DDL autocommit behaviour, which has changed across versions.
        conn.commit()
    finally:
        conn.close()

def parse_line(line):
    """Split one combined-format log line into its column values.

    Returns [] for lines that are too short to parse; otherwise a 9-item
    list matching the table columns after raw_log. Field positions follow
    the generator's format: index 7 is the 'HTTP/1.1"' token and is
    deliberately skipped; everything from index 11 on is the (quoted,
    space-containing) user agent.
    """
    split_line = line.split(" ")
    if len(split_line) < 12:
        return []
    remote_addr = split_line[0]
    time_local = split_line[3] + " " + split_line[4]
    request_type = split_line[5]
    request_path = split_line[6]
    status = split_line[8]
    body_bytes_sent = split_line[9]
    http_referer = split_line[10]
    http_user_agent = " ".join(split_line[11:])
    created = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")

    return [
        remote_addr,
        time_local,
        request_type,
        request_path,
        status,
        body_bytes_sent,
        http_referer,
        http_user_agent,
        created
    ]

def insert_record(line, parsed):
    """Insert one parsed line; silently skip duplicates; always close.

    raw_log is UNIQUE, so re-reading a line (e.g. after a restart) raises
    IntegrityError -- previously that escaped to the top-level handler and
    killed the whole pipeline. Duplicates are now ignored.
    """
    conn = sqlite3.connect(DB_NAME)
    try:
        cur = conn.cursor()
        args = [line] + parsed
        cur.execute('INSERT INTO logs VALUES (?,?,?,?,?,?,?,?,?,?)', args)
        conn.commit()
    except sqlite3.IntegrityError:
        pass
    finally:
        conn.close()

LOG_FILE_A = "log_a.txt"
LOG_FILE_B = "log_b.txt"

if __name__ == "__main__":
    create_table()
    # Pre-initialize so the finally block cannot hit a NameError when
    # open() itself fails before both handles exist.
    f_a = None
    f_b = None
    try:
        f_a = open(LOG_FILE_A, 'r')
        f_b = open(LOG_FILE_B, 'r')
        while True:
            # Remember offsets so we can rewind after a failed read and
            # re-poll the same position (tail -f style).
            where_a = f_a.tell()
            line_a = f_a.readline()
            where_b = f_b.tell()
            line_b = f_b.readline()

            if not line_a and not line_b:
                time.sleep(1)
                f_a.seek(where_a)
                f_b.seek(where_b)
                continue
            # Prefer file A when both have data; B is only read when A is idle.
            line = line_a if line_a else line_b

            line = line.strip()
            parsed = parse_line(line)
            if parsed:
                insert_record(line, parsed)
    except KeyboardInterrupt:
        pass
    except Exception as e:
        print(e)
    finally:
        if f_a is not None:
            f_a.close()
        if f_b is not None:
            f_b.close()
        sys.exit()