├── data └── .gitkeep ├── etl ├── __init__.py ├── job │ ├── __init__.py │ └── etl_job.py ├── loader │ ├── __init__.py │ ├── db │ │ ├── __init__.py │ │ ├── base.py │ │ └── flight.py │ ├── chunk_generator.py │ └── azair_content_loader.py ├── extractor │ ├── __init__.py │ └── azair_content_parser.py ├── transformer │ ├── __init__.py │ └── azair_content_transformer.py ├── __main__.py └── config.py ├── tests ├── __init__.py ├── test_azair_content_loader.py ├── test_chunk_generator.py ├── test_azair_content_parser.py ├── fixtures.py └── test_azair_content_transformer.py ├── .flake8 ├── images └── etl.png ├── .gitignore ├── .dockerignore ├── requirements.txt ├── run.sh ├── Dockerfile ├── README.md ├── .github └── workflows │ └── etljob-ci.yml └── LICENSE /data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl/job/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl/loader/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl/extractor/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude: 3 | __pycache__ 4 | tests/* 5 | __init__.py -------------------------------------------------------------------------------- /images/etl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/damklis/etljob/HEAD/images/etl.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | __pycache__ 3 | .idea 4 | .pytest_cache 5 | *.pyc 6 | *.sqlite* -------------------------------------------------------------------------------- /etl/__main__.py: -------------------------------------------------------------------------------- 1 | from etl.job.etl_job import run_etl_job 2 | 3 | if __name__ == "__main__": 4 | run_etl_job() 5 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | /images 2 | README.md 3 | LICENSE 4 | *.sqlite* 5 | *.sh 6 | .vscode 7 | __pycache__ 8 | .idea 9 | .pytest_cache -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | SQLAlchemy==1.3.15 2 | requests==2.20.0 3 | beautifulsoup4==4.6.0 4 | lxml==4.6.3 5 | pytest==5.4.1 6 | flake8==3.9.2 -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -xe 4 | 5 | IMAGE_TAG=${1:-v0.1} 6 | 7 | docker run --rm \ 8 | --volume data:/usr/src/etljob/data \ 9 | etljob:$IMAGE_TAG 
def generate_chunk(content, chunk_size):
    """Yield successive lists of at most ``chunk_size`` items from ``content``.

    Accepts any iterable, including one-shot generators; the final chunk
    may be shorter than ``chunk_size``.
    """
    stream = iter(content)
    # islice drains up to chunk_size items per pass; an empty batch
    # signals that the stream is exhausted.
    while chunk := list(islice(stream, chunk_size)):
        yield chunk
class AZAirContentLoader:
    """Persists transformed flight rows into the database in fixed-size batches."""

    def __init__(self, chunk_size=10):
        # chunk_size controls how many rows are written per bulk insert/commit.
        self.chunk_size = chunk_size
        self.session = Session()

    def load_content(self, content):
        """Bulk-save every row from ``content`` into the flights table.

        ``content`` is an iterable of FlightRow dataclasses; each chunk of
        ``self.chunk_size`` rows is inserted and committed as one batch.
        The session is always closed afterwards, making the loader single-use.
        """
        try:
            for chunk in generate_chunk(content, self.chunk_size):
                # asdict() converts each frozen dataclass into the keyword
                # arguments expected by Flight's constructor.
                objects = [Flight(**asdict(row)) for row in chunk]
                self.session.bulk_save_objects(objects)
                self.session.commit()
        finally:
            # Bug fix: the original closed the session only on the success
            # path, leaking the connection when save/commit raised.
            self.session.close()
This example ETL job scrapes data from `azair.com`, formulates records and saves them into an SQLite database.
/.github/workflows/etljob-ci.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Build ETL 5 | 6 | on: 7 | pull_request: 8 | branches: [ master ] 9 | 10 | jobs: 11 | build: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 3.8 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: 3.8 21 | - name: Install dependencies 22 | run: | 23 | pip install -r requirements.txt 24 | - name: Lint with flake8 25 | run: | 26 | flake8 . --count --exit-zero --max-complexity=10 --statistics 27 | - name: Test with pytest 28 | run: | 29 | pytest --show-capture=no 30 | -------------------------------------------------------------------------------- /tests/fixtures.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pkg_resources import resource_string 3 | from etl.transformer.azair_content_transformer import AZAirContentTransformer 4 | from etl.extractor.azair_content_parser import AZAirContentParser 5 | 6 | 7 | def generate_content_stream(): 8 | 9 | parser = "lxml" 10 | transformer = AZAirContentTransformer(parser) 11 | raw_content = resource_string( 12 | __name__, "test_data/raw_content.txt" 13 | ) 14 | for flight in transformer.transform_raw_content(raw_content): 15 | yield flight 16 | 17 | 18 | @pytest.fixture() 19 | def azair_parser(): 20 | url = "http://www.azair.com/azfin.php" 21 | params = { 22 | "tp" : 0, 23 | "searchtype" : "flexi" 24 | } 25 | yield AZAirContentParser( 26 | url=url, 27 | params=params 28 | ) 29 | 30 | 31 | @pytest.fixture() 32 | def transformer(): 33 | parser = "lxml" 34 | yield AZAirContentTransformer(parser) 35 | 36 | 37 | @pytest.fixture() 38 | def 
class ETLConfig:
    """Central configuration for the azair.com ETL job."""

    # Repository root: one level above the etl/ package directory.
    PROJECT_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))

    # Where the SQLite database file is created (mounted as a volume in Docker).
    DATA_DIR = os.path.join(PROJECT_DIR, "data")

    # HTML parser backend handed to BeautifulSoup.
    PARSER = "lxml"

    # Width of the search window, in days.
    TIMEDELTA = 30

    # Departure today, return window closing TIMEDELTA days out,
    # formatted the way azair.com expects (DD.MM.YYYY).
    _departure = date.today().strftime("%d.%m.%Y")
    _arrival = (date.today() + timedelta(days=TIMEDELTA)).strftime("%d.%m.%Y")

    URL = "http://www.azair.com/azfin.php"

    # Query-string parameters for the flight-search request.
    PARAMS = {
        "tp": 0,
        "searchtype": "flexi",
        "srcAirport": "Krakow+[KRK]+(%2BKTW)",
        "srcTypedText": "krak",
        "srcFreeTypedText": "",
        "srcMC": "",
        "srcap0": "KTW",
        "srcFreeAirport": "",
        "dstAirport": "Anywhere+[XXX]",
        "dstTypedText": "any",
        "dstFreeTypedText": "",
        "dstMC": "",
        "adults": 1,
        "depdate": _departure,
        "arrdate": _arrival,
        "minDaysStay": 3,
        "maxDaysStay": 7,
        "currency": "PLN",
        "maxChng": 1,
        "isOneway": "return"
    }

    # Rows per bulk insert performed by the loader.
    CHUNK_SIZE = 20

    # Base name of the SQLite file created under DATA_DIR.
    DB_NAME = "flights_db"
class Flight(Base):
    """SQLAlchemy ORM model for one scraped flight offer.

    Columns mirror the fields of the transformer's FlightRow dataclass;
    the loader instantiates rows via ``Flight(**asdict(row))``, so all
    constructor arguments are supplied by keyword.
    """

    __tablename__ = "flights"

    # Surrogate primary key (SQLite autoincrements integer PKs).
    id = Column(Integer, primary_key=True)
    # uuid is shared by the legs belonging to one search result.
    uuid = Column(String)
    direction = Column(String)
    day = Column(String)
    flight_date = Column(String)
    start = Column(String)
    departure = Column(String)
    target = Column(String)
    arrival = Column(String)
    duration = Column(String)
    # NOTE(review): declared Integer, but the transformer produces this
    # value as a string (e.g. "0"); SQLite's type affinity masks the
    # mismatch — confirm the intended column type.
    change = Column(Integer)
    price = Column(String)
    created_at = Column(String)

    def __init__(
        self, uuid, direction, day, flight_date, start,
        departure, target, arrival, duration, change, price, created_at):
        # Explicit constructor matching FlightRow's field names so that
        # asdict(row) expansion maps 1:1 onto column attributes.
        self.uuid = uuid
        self.direction = direction
        self.flight_date = flight_date
        self.day = day
        self.start = start
        self.departure = departure
        self.target = target
        self.arrival = arrival
        self.duration = duration
        self.change = change
        self.price = price
        self.created_at = created_at

    def __str__(self):
        # NOTE(review): `direction` holds the route caption text from the
        # page, not a destination airport — message wording may be off.
        return f"Flight from {self.start} to {self.direction}."
class AZAirContentParser:
    """Fetches raw HTML search results from azair.com.

    ``url`` is the endpoint to GET; ``params`` is the query-string dict
    sent with the request.
    """

    def __init__(self, url, params):
        self.url = url
        self.params = params

    def extract_content(self):
        """
        Attempts to get the content at `url` by making an HTTP GET request.

        Returns the raw response body (bytes) when the response looks like
        HTML, otherwise None. Connection/HTTP errors are printed and
        swallowed, so callers also receive None in that case.
        """
        try:
            # closing() guarantees the streamed response is released.
            with closing(get(self.url, self.params, stream=True)) as resp:
                if self.is_good_response(resp):
                    return resp.content
                return None

        except ConnectionError as con_err:
            print(f"Connection error occurred. More info: {con_err}")

        except HTTPError as http_err:
            print(f"HTTP error occurred. More info: {http_err}")

    @staticmethod
    def is_good_response(response):
        """
        Returns True if the response seems to be HTML, False otherwise
        """
        # Bug fix: the original indexed headers['Content-Type'] directly,
        # raising KeyError for responses lacking the header, and then ran a
        # useless `is not None` check on the already-lowered string.
        content_type = response.headers.get("Content-Type", "").lower()
        return response.status_code == 200 and "html" in content_type

    def __str__(self):
        # Bug fix: `domain, _ = clean_link.split(".")` raised ValueError
        # whenever the URL path contained a dot (the project's own URL,
        # ".../azfin.php", does). Also escape the dot in "www\." so the
        # regex does not consume an arbitrary character after "www".
        clean_link = re.sub(r"(http[s]?://|www\.)", "", self.url)
        domain = clean_link.split(".", 1)[0]
        return f'WebContentParser of {domain.upper()}'
def time_func(function):
    """Decorator that prints the wall-clock execution time of ``function``.

    The wrapped callable's return value is passed through unchanged.
    """
    # Local import keeps this block self-contained within the module.
    from functools import wraps

    # Fix: without wraps() the decorated function lost its __name__ and
    # __doc__, which hurts logging and debugging.
    @wraps(function)
    def wrapper(*args, **kwargs):

        execution_start = time()
        result = function(*args, **kwargs)
        execution_end = time()

        execution_time = (execution_end - execution_start)
        print(f"Execution time: {execution_time:.2f} s")

        return result

    return wrapper
    class RowFromatter:
        # NOTE(review): class name misspells "RowFormatter"; left as-is
        # because it is referenced by name in the enclosing transformer.
        """Turns BeautifulSoup result nodes into FlightRow records."""

        # CSS caption classes marking the outbound ("tam") and
        # return ("sem") legs of a result.
        _routes = ["caption tam", "caption sem"]

        def generate_uuid(self):
            """
            Returns unique flight identification number
            """
            return str(uuid.uuid4())

        def extract_result_classes(self, bs_object):
            """
            Returns a generator over every element carrying class="result"
            (one element per round-trip search result).
            """
            return (
                result for result
                in bs_object.find_all(class_="result")
            )

        def map_row(self, ptag, _id):
            """
            Builds one FlightRow from a single leg's <p> tag.

            ``_id`` is the uuid shared by both legs of one result.
            """
            # "Sat 01.05." style text -> weekday + date parts.
            day, flight_date = ptag.find("span", "date").text.split(" ")
            _from = ptag.find("span", "from").text.strip()
            # Index of the first space splits time prefix from airport name.
            ffloor = _from.find(" ")
            departure, start = (_from[:ffloor], _from[ffloor:])
            duration, change = ptag.find("span", "durcha").text.split("/")
            _to = ptag.find("span", "to").text
            # NOTE(review): reuses ffloor (the split point computed for the
            # *from* field) to split the *to* field — only correct if both
            # fields share the same time-prefix length; confirm against the
            # real page markup.
            arrival, target = (_to[:ffloor], _to[ffloor:])
            direction = ptag.find("span", class_=self._routes).text
            price = ptag.find("span", "subPrice").text.split(" ")[0]

            return FlightRow(
                _id,
                direction,
                day,
                flight_date,
                start.strip(),
                departure,
                target.strip(),
                arrival,
                # duration trimmed to "hh:mm"; "no change" normalised to "0".
                duration[:5].strip(),
                change.split(" ")[1].replace("no", "0"),
                price
            )

        def extract_ptags(self, result):
            """
            Returns a generator over the <p> tags inside one result that
            carry a "caption " class (i.e. the individual flight legs).
            """
            return (
                tag for tag in result.find_all("p")
                if "caption " in str(tag)
            )

        def create_row(self, bs_object):
            """
            Yields one FlightRow per flight leg found in the parsed page.
            All legs of a single result share one generated uuid; each
            yielded row maps to one database row.
            """
            for result in self.extract_result_classes(bs_object):
                uuid = self.generate_uuid()
                for ptag in self.extract_ptags(result):
                    yield self.map_row(
                        ptag, uuid
                    )