├── exercises
│   ├── .gitkeep
│   ├── gtfs.sqlite
│   ├── trees.sqlite
│   ├── airports.sqlite
│   ├── country-stats.sqlite
│   ├── temperatures.sqlite
│   ├── exercise1.jv
│   ├── exercise2.jv
│   ├── exercise3.jv
│   ├── exercise4.jv
│   └── exercise5.jv
├── project
│   ├── .gitkeep
│   ├── tests.sh
│   ├── pipeline.sh
│   ├── data-report.pdf
│   ├── analysis-report.pdf
│   ├── project-plan.md
│   ├── pipeline.py
│   └── tests.py
├── .gitignore
├── examples
│   ├── data.sqlite
│   ├── project-plan-example.md
│   └── data-exploration-example.ipynb
├── .github
│   └── workflows
│       ├── test.yml
│       └── exercise-feedback.yml
└── README.md

--------------------------------------------------------------------------------
/exercises/.gitkeep:
--------------------------------------------------------------------------------
1 | 

--------------------------------------------------------------------------------
/project/.gitkeep:
--------------------------------------------------------------------------------
1 | 

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | /data/*
3 | !/data/.gitkeep

--------------------------------------------------------------------------------
/project/tests.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python3 ./project/tests.py

--------------------------------------------------------------------------------
/project/pipeline.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python3 ./project/pipeline.py

--------------------------------------------------------------------------------
/examples/data.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rashakil-ds/data-engineering-fau/main/examples/data.sqlite

--------------------------------------------------------------------------------
/exercises/gtfs.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rashakil-ds/data-engineering-fau/main/exercises/gtfs.sqlite

--------------------------------------------------------------------------------
/exercises/trees.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rashakil-ds/data-engineering-fau/main/exercises/trees.sqlite

--------------------------------------------------------------------------------
/exercises/airports.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rashakil-ds/data-engineering-fau/main/exercises/airports.sqlite

--------------------------------------------------------------------------------
/project/data-report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rashakil-ds/data-engineering-fau/main/project/data-report.pdf

--------------------------------------------------------------------------------
/project/analysis-report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rashakil-ds/data-engineering-fau/main/project/analysis-report.pdf

--------------------------------------------------------------------------------
/exercises/country-stats.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rashakil-ds/data-engineering-fau/main/exercises/country-stats.sqlite
--------------------------------------------------------------------------------
/exercises/temperatures.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rashakil-ds/data-engineering-fau/main/exercises/temperatures.sqlite

--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Test CI
2 | 
3 | on:
4 |   push:
5 |     branches:
6 |       - main
7 | 
8 | jobs:
9 |   run:
10 |     runs-on: ubuntu-latest
11 | 
12 |     steps:
13 |       - name: Check out repository code
14 |         uses: actions/checkout@v3
15 | 
16 |       # Install Python 3.9
17 |       - name: Set up Python 3.9
18 |         uses: actions/setup-python@v4
19 |         with:
20 |           python-version: 3.9
21 | 
22 |       - name: Install all dependencies
23 |         run: |
24 |           python -m pip install --upgrade pip
25 |           pip install pandas
26 |           pip install sqlalchemy
27 |           pip install requests
28 | 
29 |       - name: Run tests file
30 |         run: |
31 |           chmod +x ./project/tests.sh
32 |           ./project/tests.sh

--------------------------------------------------------------------------------
/examples/project-plan-example.md:
--------------------------------------------------------------------------------
1 | # Project Plan
2 | 
3 | ## Title
4 | 
5 | Awesome MADE project.
6 | 
7 | ## Main Question
8 | 
9 | 
10 | 1. Does writing an example question help students write better project plans?
11 | 
12 | ## Description
13 | 
14 | 
15 | XY is an important problem, because... This project analyzes XY, using method A. The results can give insights into...
16 | 
17 | ## Datasources
18 | 
19 | 
20 | 
21 | ### Datasource1: ExampleSource
22 | * Metadata URL: https://mobilithek.info/offers/-6901989592576801458
23 | * Data URL: https://raw.githubusercontent.com/od-ms/radverkehr-zaehlstellen/main/100035541/2019-01.csv
24 | * Data Type: CSV
25 | 
26 | Short description of the DataSource.
27 | 
28 | ## Work Packages
29 | 
30 | 
31 | 
32 | 1. Example Issue [#1][i1]
33 | 2. ...
34 | 35 | [i1]: https://github.com/jvalue/made-template/issues/1 36 | -------------------------------------------------------------------------------- /.github/workflows/exercise-feedback.yml: -------------------------------------------------------------------------------- 1 | name: Exercise Feedback 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - exercises/** 9 | 10 | jobs: 11 | exercise-feedback: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout 15 | uses: actions/checkout@v4 16 | with: 17 | path: main 18 | 19 | - name: Checkout exercise feedback 20 | uses: actions/checkout@v4 21 | with: 22 | repository: jvalue/made-exercise-feedback 23 | path: grading 24 | 25 | - name: Set up Node 26 | uses: actions/setup-node@v4 27 | with: 28 | node-version: "lts/*" 29 | 30 | # Install jayvee 31 | - name: Set up Jayvee 32 | run: | 33 | ./grading/ci/setup_jayvee.sh 34 | 35 | # Install python 36 | - name: Set up Python 3.11 37 | uses: actions/setup-python@v5 38 | with: 39 | python-version: 3.11 40 | 41 | # Run grading feedback 42 | - name: Exercise feedback 43 | id: ex_feedback 44 | run: | 45 | ./grading/ci/run_grading.sh 46 | 47 | # Upload feedback 48 | - name: Upload feedback 49 | uses: actions/upload-artifact@v4 50 | with: 51 | name: exercise-feedback 52 | path: ./grading/feedback-ex?.txt 53 | -------------------------------------------------------------------------------- /exercises/exercise1.jv: -------------------------------------------------------------------------------- 1 | 2 | // Name: KM Rashedul Alam 3 | 4 | pipeline AirportsPipeline { 5 | 6 | AirportsExtractor 7 | -> AirportsTextFileInterpreter 8 | -> AirportsCSVInterpreter 9 | -> AirportsTableInterpreter 10 | -> AirportsLoader; 11 | 12 | block AirportsExtractor oftype HttpExtractor { 13 | 14 | // provided data source 15 | 16 | url: "https://opendata.rhein-kreis-neuss.de/api/explore/v2.1/catalog/datasets/rhein-kreis-neuss-flughafen-weltweit/exports/csv?lang=en&timezone=Europe%2FBerlin&use_labels=true&delimiter=%3B"; 17 | } 18 | 19 | 20 | block AirportsTextFileInterpreter oftype TextFileInterpreter { } 21 | 22 | // correct table 23 | 24 | block AirportsCSVInterpreter oftype CSVInterpreter { 25 | delimiter: ";"; 26 | } 27 | 28 | 29 | block AirportsTableInterpreter oftype TableInterpreter { 30 | header: true; 31 | 32 | columns: [ 33 | "Lfd. Nummer" oftype integer, 34 | "Name des Flughafens" oftype text, 35 | "Ort" oftype text, 36 | "Land" oftype text, 37 | "IATA" oftype text, 38 | "ICAO" oftype text, 39 | "Latitude" oftype decimal, 40 | "Longitude" oftype decimal, 41 | "Altitude" oftype integer 42 | 43 | 44 | ]; 45 | } 46 | 47 | block AirportsLoader oftype SQLiteLoader { 48 | table: "airports"; 49 | file: "./airports.sqlite"; 50 | } 51 | 52 | 53 | } -------------------------------------------------------------------------------- /project/project-plan.md: -------------------------------------------------------------------------------- 1 | # Project Plan 2 | 3 | ## Title 4 | 5 | Correlation Between Education Spending and GDP Growth in North America 6 | 7 | ## Main Question 8 | 9 | 10 | 1. How does government expenditure on education as a percentage of GDP correlate with GDP growth in North American countries from 2016 to 2023? 11 | 12 | ## Description 13 | 14 | 15 | The project will investigate the potential relationship between public investment in education and economic growth, providing insights into how education funding impacts economic performance. 
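As a first, minimal sketch of the planned analysis step (it assumes the `gdp_data` and `education_data` tables with the `Country Name`/`Country Code`/`Year`/`Value` columns that `pipeline.py` writes to `./data/data_cleaned.db`), the correlation could be computed with pandas:

```python
import sqlite3

import pandas as pd

# Load the two indicator tables produced by pipeline.py.
with sqlite3.connect("./data/data_cleaned.db") as conn:
    gdp = pd.read_sql("SELECT * FROM gdp_data;", conn)
    edu = pd.read_sql("SELECT * FROM education_data;", conn)

# Align the indicators per country and year, then correlate the values.
merged = gdp.merge(edu, on=["Country Name", "Country Code", "Year"],
                   suffixes=("_gdp", "_edu"))
print(merged["Value_gdp"].corr(merged["Value_edu"]))  # Pearson by default
```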
16 | 
17 | ## Datasources
18 | 
19 | 
20 | 
21 | ### Datasource1: Worldbank (Government expenditure on education, total (% of GDP))
22 | * Metadata URL: https://data.worldbank.org/indicator/SE.XPD.TOTL.GD.ZS
23 | * Data URL: https://api.worldbank.org/v2/en/indicator/SE.XPD.TOTL.GD.ZS?downloadformat=csv
24 | * Data Type: Zip->CSV
25 | 
26 | This dataset contains data on government expenditure on education as a percentage of GDP, covering countries worldwide. For this project, only data from North American countries for the years 2016–2023 were extracted.
27 | 
28 | ### Datasource2: Worldbank (GDP growth (annual %))
29 | * Metadata URL: https://data.worldbank.org/indicator/NY.GDP.MKTP.KD.ZG
30 | * Data URL: https://api.worldbank.org/v2/en/indicator/NY.GDP.MKTP.KD.ZG?downloadformat=csv
31 | * Data Type: Zip->CSV
32 | 
33 | This dataset contains GDP growth rates for countries worldwide. We focused on North American countries for the years 2016–2023.
34 | 
35 | ## Work Packages
36 | 
37 | 
38 | 
39 | 1. Dataset selection
40 | 2. Building an automated data pipeline
41 | 3. Exploratory Data Analysis (EDA)
42 | 4. Reporting on findings
43 | 

--------------------------------------------------------------------------------
/exercises/exercise2.jv:
--------------------------------------------------------------------------------
1 | // Exercise 02 on Jayvee pipeline to process tree planting dataset
2 | // KM Rashedul Alam
3 | pipeline TreePlantingDatasetPipeline {
4 | 
5 |     // Workflow Pipeline
6 |     TreeDatasetExtractor
7 |         -> TxtFileInterpreter
8 |         -> CsvDataInterpreter
9 |         -> TableInterpreter
10 |         -> SQLiteFinalDataLoader;
11 | 
12 |     // Extract CSV data from the given URL
13 |     block TreeDatasetExtractor oftype HttpExtractor {
14 |         url: "https://opendata.rhein-kreis-neuss.de/api/v2/catalog/datasets/stadt-neuss-herbstpflanzung-2023/exports/csv";
15 |     }
16 | 
17 |     // Interpret the extracted file as plain text to prepare for CSV parsing
18 |     block TxtFileInterpreter oftype TextFileInterpreter { }
19 | 
20 |     // Interpret the text data as CSV
21 |     block CsvDataInterpreter oftype CSVInterpreter {
22 |         delimiter: ";";
23 |     }
24 | 
25 |     // Data validation constraints
26 |     constraint DistrictNameConstraint oftype RegexConstraint {
27 |         regex: /^Vogelsang/;
28 |     }
29 | 
30 |     constraint GeoPointsConstraint oftype RegexConstraint {
31 |         regex: /^\d{1,3}\.\d+,\s*\d{1,3}\.\d+$/;
32 |     }
33 | 
34 |     valuetype DistrictType oftype text {
35 |         constraints: [ DistrictNameConstraint ];
36 |     }
37 | 
38 |     // Custom type for the 'id' column with the geopoint format constraint
39 |     valuetype GeoPointType oftype text {
40 |         constraints: [ GeoPointsConstraint ];
41 |     }
42 | 
43 |     // Interpret the CSV data as a typed table
44 |     block TableInterpreter oftype TableInterpreter {
45 |         header: true;
46 | 
47 |         // Validate columns; 'baumart_deutsch' is dropped by omission
48 |         columns: [
49 |             "lfd_nr" oftype integer,
50 |             "stadtteil" oftype DistrictType,
51 |             "standort" oftype text,
52 |             "baumart_botanisch" oftype text,
53 |             "id" oftype GeoPointType,
54 |             "baumfamilie" oftype text
55 |         ];
56 |     }
57 | 
58 |     // Load the validated data into a SQLite DB
59 |     block SQLiteFinalDataLoader oftype SQLiteLoader {
60 |         file: "./trees.sqlite"; // Target file
61 |         table: "trees"; // Target table name
62 |     }
63 | }

--------------------------------------------------------------------------------
/exercises/exercise5.jv:
--------------------------------------------------------------------------------
1 | pipeline GTFSProcessor {
2 | 
3 |     // Download GTFS data from the specified URL
4 |     GTFSDownloader
5 |         -> ArchiveHandler
6 |         -> 
FilePickerStops 7 | -> FileDecoderStops 8 | -> CSVParserStops 9 | -> TableHandlerStops 10 | -> DatabaseWriter; 11 | 12 | // Block to download the GTFS zip file 13 | block GTFSDownloader oftype HttpExtractor { 14 | url: "https://gtfs.rhoenenergie-bus.de/GTFS.zip"; 15 | } 16 | 17 | // Block to interpret the downloaded zip archive 18 | block ArchiveHandler oftype ArchiveInterpreter { 19 | archiveType: "zip"; 20 | } 21 | 22 | // Block to select the stops.txt file from the zip archive 23 | block FilePickerStops oftype FilePicker { 24 | path: "./stops.txt"; 25 | } 26 | 27 | // Block to decode the stops.txt file as UTF-8 text 28 | block FileDecoderStops oftype TextFileInterpreter { 29 | encoding: "utf8"; 30 | } 31 | 32 | // Block to interpret the stops.txt file as CSV 33 | block CSVParserStops oftype CSVInterpreter { 34 | delimiter: ","; 35 | enclosing: '"'; 36 | } 37 | 38 | // Custom constraints for validation 39 | constraint LatitudeLongitudeRange oftype RangeConstraint { 40 | lowerBound: -90; 41 | upperBound: 90; 42 | lowerBoundInclusive: true; 43 | upperBoundInclusive: true; 44 | } 45 | 46 | valuetype Coordinates oftype decimal { 47 | constraints: [LatitudeLongitudeRange]; 48 | } 49 | 50 | constraint FixedZoneID oftype RangeConstraint { 51 | lowerBound: 1925; 52 | upperBound: 1925; 53 | lowerBoundInclusive: true; 54 | upperBoundInclusive: true; 55 | } 56 | 57 | valuetype ZoneIdentifier oftype integer { 58 | constraints: [FixedZoneID]; 59 | } 60 | 61 | // Block to define the table structure and filter data 62 | block TableHandlerStops oftype TableInterpreter { 63 | header: true; 64 | columns: [ 65 | "stop_id" oftype integer, 66 | "stop_name" oftype text, 67 | "stop_lat" oftype Coordinates, 68 | "stop_lon" oftype Coordinates, 69 | "zone_id" oftype ZoneIdentifier 70 | ]; 71 | } 72 | 73 | // Block to write the processed data into a SQLite database 74 | block DatabaseWriter oftype SQLiteLoader { 75 | table: "stops"; 76 | file: "gtfs.sqlite"; 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Methods of Advanced Data Engineering Template Project 2 | 3 | This template project provides some structure for your open data project in the MADE module at FAU. 4 | This repository contains (a) a data science project that is developed by the student over the course of the semester, and (b) the exercises that are submitted over the course of the semester. 5 | 6 | To get started, please follow these steps: 7 | 1. Create your own fork of this repository. Feel free to rename the repository right after creation, before you let the teaching instructors know your repository URL. **Do not rename the repository during the semester**. 8 | 9 | ## Project Work 10 | Your data engineering project will run alongside lectures during the semester. We will ask you to regularly submit project work as milestones, so you can reasonably pace your work. All project work submissions **must** be placed in the `project` folder. 11 | 12 | ### Exporting a Jupyter Notebook 13 | Jupyter Notebooks can be exported using `nbconvert` (`pip install nbconvert`). For example, to export the example notebook to HTML: `jupyter nbconvert --to html examples/final-report-example.ipynb --embed-images --output final-report.html` 14 | 15 | 16 | ## Exercises 17 | During the semester you will need to complete exercises using [Jayvee](https://github.com/jvalue/jayvee). 
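For local testing, you can run a single model with the Jayvee interpreter, for example `jv exercises/exercise1.jv` (this assumes the interpreter is installed, e.g. via `npm install -g @jvalue/jayvee-interpreter`; see the Jayvee repository for current installation instructions).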
You **must** place your submissions in the `exercises` folder in your repository and name them according to their number, from one to five: `exercise<number>.jv`.
18 | 
19 | In regular intervals, exercises will be given as homework to complete during the semester. Details and deadlines will be discussed in the lecture, also see the [course schedule](https://made.uni1.de/).
20 | 
21 | ### Exercise Feedback
22 | We provide automated exercise feedback using a GitHub action (that is defined in `.github/workflows/exercise-feedback.yml`).
23 | 
24 | To view your exercise feedback, navigate to Actions → Exercise Feedback in your repository.
25 | 
26 | The exercise feedback is executed whenever you make a change in files in the `exercises` folder and push your local changes to the repository on GitHub. To see the feedback, open the latest GitHub Action run, open the `exercise-feedback` job and the `Exercise Feedback` step. You should see command line output like this:
27 | 
28 | ```sh
29 | Found exercises/exercise1.jv, executing model...
30 | Found output file airports.sqlite, grading...
31 | Grading Exercise 1
32 | Overall points 17 of 17
33 | ---
34 | By category:
35 | Shape: 4 of 4
36 | Types: 13 of 13
37 | ```

--------------------------------------------------------------------------------
/exercises/exercise3.jv:
--------------------------------------------------------------------------------
1 | pipeline EconomicDataPipeline {
2 | 
3 |     // Step 1: Download the Excel file directly from the given URL
4 |     DataFetcher
5 |         -> FileInterpreter
6 |         -> SheetExtractor
7 |         -> DataSelector
8 |         -> BondDataCleaner
9 |         -> BondDataParser
10 |         -> BondDataSaver;
11 | 
12 |     // Step 2: Extract GDP per Capita data separately
13 |     DataSelector
14 |         -> GdpDataFilter
15 |         -> GdpDataParser
16 |         -> GdpDataSaver;
17 | 
18 |     // Block to download the Excel file from the provided link
19 |     block DataFetcher oftype HttpExtractor {
20 |         url: "https://thedocs.worldbank.org/en/doc/7d852628d96b9411d43e5d36d5dff941-0050062022/original/Graphs-Chapter-5-02082022.xlsx";
21 |     }
22 | 
23 |     // Interpret the Excel workbook
24 |     block FileInterpreter oftype XLSXInterpreter { }
25 | 
26 |     // Select the specific sheet named "Figure S5.1.2"
27 |     block SheetExtractor oftype SheetPicker {
28 |         sheetName: "Figure S5.1.2";
29 |     }
30 | 
31 |     // Extract the data range from P2 to S45
32 |     block DataSelector oftype CellRangeSelector {
33 |         select: range P2:S45;
34 |     }
35 | 
36 |     // Filter out unnecessary columns for bond data
37 |     block BondDataCleaner oftype ColumnDeleter {
38 |         delete: [column B, column C];
39 |     }
40 | 
41 |     // Filter columns for GDP data extraction
42 |     block GdpDataFilter oftype ColumnDeleter {
43 |         delete: [column B, column D];
44 |     }
45 | 
46 |     // Parse Bond Issuance data
47 |     block BondDataParser oftype TableInterpreter {
48 |         header: false;
49 |         columns: [
50 |             "Country Code" oftype CountryCodeAlpha3,
51 |             "Bond Issuance Share" oftype BondPercentage
52 |         ];
53 |     }
54 | 
55 |     // Parse GDP per Capita data
56 |     block GdpDataParser oftype TableInterpreter {
57 |         header: false;
58 |         columns: [
59 |             "Country Code" oftype CountryCodeAlpha3,
60 |             "GDP per Capita" oftype GdpAmount
61 |         ];
62 |     }
63 | 
64 |     // Save Bond Issuance data into the SQLite database
65 |     block BondDataSaver oftype SQLiteLoader {
66 |         table: "bondIssuance";
67 |         file: "./country-stats.sqlite";
68 |     }
69 | 
70 |     // Save GDP per Capita data into the SQLite database
71 |     block GdpDataSaver oftype SQLiteLoader {
72 |         table: "gdpPerCapita";
73 |         file: "./country-stats.sqlite";
74 |     }
75 | 
76 |     // Define custom value types with constraints
77 |     valuetype GdpAmount oftype decimal {
78 |         constraints: [PositiveGdpCheck];
79 |     }
80 | 
81 |     constraint PositiveGdpCheck on decimal: value > 0;
82 | 
83 |     valuetype BondPercentage oftype decimal {
84 |         constraints: [BondShareRange];
85 |     }
86 | 
87 |     constraint BondShareRange oftype RangeConstraint {
88 |         lowerBound: 0;
89 |         lowerBoundInclusive: true;
90 |         upperBound: 1;
91 |         upperBoundInclusive: true;
92 |     }
93 | }

--------------------------------------------------------------------------------
/exercises/exercise4.jv:
--------------------------------------------------------------------------------
1 | pipeline TempDataPipeline {
2 | 
3 |     ZipFileFetch
4 |         -> ArchiveExtract
5 |         -> SelectorCSVFile
6 |         -> TextLoader
7 |         -> InterpretorCSV
8 |         -> UpdateBatteryCol
9 |         -> HeadersUpdate
10 |         -> IntegrityCheck
11 |         -> TransformTempData
12 |         -> BatteryTempTransform
13 |         -> ProcessedDataToSQLite;
14 | 
15 |     // Download the ZIP archive
16 |     block ZipFileFetch oftype HttpExtractor {
17 |         url: "https://www.mowesta.com/data/measure/mowesta-dataset-20221107.zip";
18 |     }
19 | 
20 |     // Extract the archive
21 |     block ArchiveExtract oftype ArchiveInterpreter {
22 |         archiveType: "zip";
23 |     }
24 | 
25 |     // Pick the data file from the extracted archive
26 |     block SelectorCSVFile oftype FilePicker {
27 |         path: "/data.csv";
28 |     }
29 | 
30 |     block TextLoader oftype TextFileInterpreter {}
31 | 
32 |     block InterpretorCSV oftype CSVInterpreter {
33 |         delimiter: ";"; // Use semicolon as delimiter
34 |     }
35 | 
36 |     block UpdateBatteryCol oftype CellWriter {
37 |         at: cell J1;
38 |         write: ["battery_temperature"];
39 |     }
40 | 
41 |     block HeadersUpdate oftype CellWriter {
42 |         at: range A1:E1;
43 |         write: ["id", "producer", "model", "month", "temperature"];
44 |     }
45 | 
46 |     block IntegrityCheck oftype TableInterpreter {
47 |         header: true;
48 |         columns: [
49 |             "id" oftype integer,
50 |             "producer" oftype text,
51 |             "model" oftype text,
52 |             "month" oftype month,
53 |             "temperature" oftype decimal,
54 |             "battery_temperature" oftype decimal
55 |         ];
56 |     }
57 | 
58 |     transform ConvertCelsiusToFahrenheit {
59 |         from CelsiusValue oftype decimal;
60 |         to FahrenheitValue oftype decimal;
61 | 
62 |         FahrenheitValue: 32 + ((CelsiusValue * 9) / 5);
63 |     }
64 | 
65 |     block TransformTempData oftype TableTransformer {
66 |         inputColumns: ["temperature"];
67 |         outputColumn: "temperature";
68 |         uses: ConvertCelsiusToFahrenheit;
69 |     }
70 | 
71 |     block BatteryTempTransform oftype TableTransformer {
72 |         inputColumns: ["battery_temperature"];
73 |         outputColumn: "battery_temperature";
74 |         uses: ConvertCelsiusToFahrenheit;
75 |     }
76 | 
77 |     constraint MonthRange oftype RangeConstraint {
78 |         lowerBound: 1;
79 |         lowerBoundInclusive: true;
80 |         upperBound: 12;
81 |         upperBoundInclusive: true;
82 |     }
83 | 
84 |     valuetype month oftype integer {
85 |         constraints: [MonthRange];
86 |     }
87 | 
88 |     block ProcessedDataToSQLite oftype SQLiteLoader {
89 |         table: "temperatures";
90 |         file: "./temperatures.sqlite";
91 |     }
92 | }

--------------------------------------------------------------------------------
/project/pipeline.py:
--------------------------------------------------------------------------------
1 | import os
2 | import zipfile
3 | import requests
4 | import pandas as pd
5 | import sqlite3
6 | from io import BytesIO
7 | 
8 | # Define the data directory (the repo-level /data folder, kept out of git via .gitignore)
9 | data_directory = "./data"
10 | if not os.path.exists(data_directory):
11 |     os.makedirs(data_directory)
12 | 
13 | # North American countries list
14 | north_american_countries = [
15 |     "Canada", "United States", "Mexico", "Bermuda", "Bahamas, The",
16 |     "Barbados", "Cuba", "Haiti", "Dominican Republic", "Jamaica",
17 |     "Trinidad and Tobago", "Saint Kitts and Nevis", "Antigua and Barbuda",
18 |     "Saint Lucia", "Saint Vincent and the Grenadines", "Grenada", "Belize",
19 |     "Panama", "Costa Rica", "El Salvador", "Honduras", "Nicaragua",
20 |     "Guatemala"
21 | ]
22 | 
23 | # URLs for GDP and education expenditure data from the World Bank
24 | url_gdp_zip = "https://api.worldbank.org/v2/en/indicator/NY.GDP.MKTP.KD.ZG?downloadformat=csv"
25 | url_edu_zip = "https://api.worldbank.org/v2/en/indicator/SE.XPD.TOTL.GD.ZS?downloadformat=csv"
26 | 
27 | # Download a ZIP export and extract the contained data CSVs (metadata files are skipped)
28 | def download_and_extract_zip(url, output_dir):
29 |     response = requests.get(url)
30 |     if response.status_code == 200:
31 |         with zipfile.ZipFile(BytesIO(response.content)) as z:
32 |             z.extractall(output_dir)
33 |             print(f"Extracted files to {output_dir}")
34 |             csv_files = [f for f in z.namelist() if f.endswith(".csv") and "Metadata" not in f]
35 |             return [os.path.join(output_dir, f) for f in csv_files]
36 |     else:
37 |         print(f"Failed to download data from {url}.")
38 |         return []
39 | 
40 | # Process a CSV file by filtering countries/years and reshaping it
41 | def clean_and_reshape_data(file_path, countries, years):
42 |     df = pd.read_csv(file_path, skiprows=4)
43 |     # Keep only the requested years that actually exist as columns in the download
44 |     years = [year for year in years if year in df.columns]
45 |     # Filter countries and select years
46 |     df_filtered = df[df["Country Name"].isin(countries)][["Country Name", "Country Code"] + years]
47 |     # Reshape from wide to long format
48 |     df_long = df_filtered.melt(
49 |         id_vars=["Country Name", "Country Code"],
50 |         var_name="Year",
51 |         value_name="Value"
52 |     )
53 |     return df_long
54 | 
55 | # Save cleaned data to SQLite
56 | def export_to_sqlite(df, table_name, db_path):
57 |     with sqlite3.connect(db_path) as conn:
58 |         df.to_sql(table_name, conn, if_exists="replace", index=False)
59 |     print(f"Saved table '{table_name}' to SQLite database at {db_path}.")
60 | 
61 | # Download and extract the GDP and education expenditure data
62 | gdp_files = download_and_extract_zip(url_gdp_zip, data_directory)
63 | edu_files = download_and_extract_zip(url_edu_zip, data_directory)
64 | 
65 | # Years of interest: 2016 to 2023 (the range end is exclusive)
66 | years = [str(year) for year in range(2016, 2024)]
67 | 
68 | # Clean, reshape, and save the data
69 | if gdp_files:
70 |     gdp_cleaned = clean_and_reshape_data(gdp_files[0], north_american_countries, years)
71 |     gdp_cleaned.to_csv(os.path.join(data_directory, "gdp_cleaned.csv"), index=False)
72 |     print("Cleaned GDP data saved as CSV.")
73 |     export_to_sqlite(gdp_cleaned, "gdp_data", os.path.join(data_directory, "data_cleaned.db"))
74 | 
75 | if edu_files:
76 |     edu_cleaned = clean_and_reshape_data(edu_files[0], north_american_countries, years)
77 |     edu_cleaned.to_csv(os.path.join(data_directory, "edu_cleaned.csv"), index=False)
78 |     print("Cleaned Education Expenditure data saved as CSV.")
79 |     export_to_sqlite(edu_cleaned, "education_data", os.path.join(data_directory, "data_cleaned.db"))

--------------------------------------------------------------------------------
/project/tests.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sqlite3
3 | import pandas as pd
4 | import subprocess
5 | 
6 | # Define paths and constants (the pipeline writes into the repo-level /data folder)
7 | DATA_DIR = "./data"
8 | GDP_CSV = os.path.join(DATA_DIR, "gdp_cleaned.csv")
9 | EDU_CSV = os.path.join(DATA_DIR, "edu_cleaned.csv")
10 | SQLITE_DB = os.path.join(DATA_DIR, "data_cleaned.db")
11 | PIPELINE_SCRIPT = 
"./project/pipeline.py" 12 | 13 | 14 | def test_pipeline_execution(): 15 | """ 16 | Test if the pipeline script executes successfully. 17 | """ 18 | print("Testing pipeline execution...") 19 | result = subprocess.run(["python", PIPELINE_SCRIPT], capture_output=True, text=True) 20 | assert result.returncode == 0, f"Pipeline script failed: {result.stderr}" 21 | print("Pipeline executed successfully.") 22 | 23 | 24 | def test_gdp_csv_exists(): 25 | """ 26 | Test if the GDP cleaned CSV file is created. 27 | """ 28 | print("Testing if GDP cleaned CSV exists...") 29 | assert os.path.exists(GDP_CSV), f"GDP cleaned CSV file not found: {GDP_CSV}" 30 | print("GDP cleaned CSV exists.") 31 | 32 | 33 | def test_edu_csv_exists(): 34 | """ 35 | Test if the Education cleaned CSV file is created. 36 | """ 37 | print("Testing if Education cleaned CSV exists...") 38 | assert os.path.exists(EDU_CSV), f"Education cleaned CSV file not found: {EDU_CSV}" 39 | print("Education cleaned CSV exists.") 40 | 41 | 42 | def test_gdp_csv_content(): 43 | """ 44 | Test the content of the GDP cleaned CSV file. 45 | """ 46 | print("Testing GDP cleaned CSV content") 47 | df = pd.read_csv(GDP_CSV) 48 | assert not df.empty, "GDP cleaned CSV file is empty." 49 | assert "Country Name" in df.columns, "Expected column 'Country Name' not found in GDP CSV." 50 | assert "Year" in df.columns, "Expected column 'Year' not found in GDP CSV." 51 | print("GDP cleaned CSV content is valid.") 52 | 53 | 54 | def test_edu_csv_content(): 55 | """ 56 | Test the content of the Education cleaned CSV file. 57 | """ 58 | print("Testing Education cleaned CSV content.") 59 | df = pd.read_csv(EDU_CSV) 60 | assert not df.empty, "Education cleaned CSV file is empty." 61 | assert "Country Name" in df.columns, "Expected column 'Country Name' not found in Education CSV." 62 | assert "Year" in df.columns, "Expected column 'Year' not found in Education CSV." 63 | print("Education cleaned CSV content is valid.") 64 | 65 | 66 | def test_sqlite_db_exists(): 67 | """ 68 | Test if the SQLite database file is created. 69 | """ 70 | print("Testing if SQLite database exists..") 71 | assert os.path.exists(SQLITE_DB), f"SQLite database not found: {SQLITE_DB}" 72 | print("SQLite database exists.") 73 | 74 | 75 | def test_sqlite_tables(): 76 | """ 77 | Test if the expected tables exist in the SQLite database. 78 | """ 79 | print("Testing SQLite database tables.") 80 | with sqlite3.connect(SQLITE_DB) as conn: 81 | cursor = conn.cursor() 82 | cursor.execute("SELECT name FROM sqlite_master WHERE type='table';") 83 | tables = [row[0] for row in cursor.fetchall()] 84 | assert "gdp_data" in tables, "Table 'gdp_data' not found in SQLite database." 85 | assert "education_data" in tables, "Table 'education_data' not found in SQLite database." 86 | print("Expected tables found in SQLite database.") 87 | 88 | 89 | def test_sqlite_table_content(): 90 | """ 91 | Test the content of the tables in the SQLite database. 92 | """ 93 | print("Testing SQLite database table content...") 94 | with sqlite3.connect(SQLITE_DB) as conn: 95 | gdp_df = pd.read_sql("SELECT * FROM gdp_data;", conn) 96 | edu_df = pd.read_sql("SELECT * FROM education_data;", conn) 97 | assert not gdp_df.empty, "Table 'gdp_data' in SQLite database is empty." 98 | assert not edu_df.empty, "Table 'education_data' in SQLite database is empty." 
99 |     print("SQLite database tables contain valid data.")
100 | 
101 | 
102 | if __name__ == "__main__":
103 |     # Run all tests in order
104 |     test_pipeline_execution()
105 |     test_gdp_csv_exists()
106 |     test_edu_csv_exists()
107 |     test_gdp_csv_content()
108 |     test_edu_csv_content()
109 |     test_sqlite_db_exists()
110 |     test_sqlite_tables()
111 |     test_sqlite_table_content()
112 | 
113 |     print("All tests passed successfully.")
114 | 

--------------------------------------------------------------------------------
/examples/data-exploration-example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "attachments": {},
5 |    "cell_type": "markdown",
6 |    "metadata": {},
7 |    "source": [
8 |     "# Data Exploration\n",
9 |     "\n",
10 |     "In this notebook, describe your data exploration steps."
11 |    ]
12 |   },
13 |   {
14 |    "attachments": {},
15 |    "cell_type": "markdown",
16 |    "metadata": {},
17 |    "source": [
18 |     "## Install dependencies"
19 |    ]
20 |   },
21 |   {
22 |    "cell_type": "code",
23 |    "execution_count": 1,
24 |    "metadata": {},
25 |    "outputs": [
26 |     {
27 |      "name": "stdout",
28 |      "output_type": "stream",
29 |      "text": [
30 |       "Requirement already satisfied: pandas in /usr/local/lib/python3.11/site-packages (1.5.3)\n",
31 |       "Requirement already satisfied: python-dateutil>=2.8.1 in /Users/pheltweg/Library/Python/3.11/lib/python/site-packages (from pandas) (2.8.2)\n",
32 |       "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/site-packages (from pandas) (2022.7.1)\n",
33 |       "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.11/site-packages (from pandas) (1.24.2)\n",
34 |       "Requirement already satisfied: six>=1.5 in /Users/pheltweg/Library/Python/3.11/lib/python/site-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)\n",
35 |       "\n",
36 |       "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n",
37 |       "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n",
38 |       "Note: you may need to restart the kernel to use updated packages.\n",
39 |       "Requirement already satisfied: SQLAlchemy==1.4.46 in /usr/local/lib/python3.11/site-packages (1.4.46)\n",
40 |       "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.11/site-packages (from SQLAlchemy==1.4.46) (2.0.2)\n",
41 |       "\n",
42 |       "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n",
43 |       "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n",
44 |       "Note: you may need to restart the kernel to use updated packages.\n"
45 |      ]
46 |     }
47 |    ],
48 |    "source": [
49 |     "%pip install pandas\n",
50 |     "%pip install 'SQLAlchemy==1.4.46'"
51 |    ]
52 |   },
53 |   {
54 |    "attachments": {},
55 |    "cell_type": "markdown",
56 |    "metadata": {},
57 |    "source": [
58 |     "## Load data"
59 |    ]
60 |   },
61 |   {
62 |    "cell_type": "code",
63 |    "execution_count": 2,
64 |    "metadata": {},
65 |    "outputs": [],
66 |    "source": [
67 |     "import pandas as pd\n",
68 |     "\n",
69 |     "df = pd.read_sql_table('trainstops', 'sqlite:///data.sqlite')"
70 | 
] 71 | }, 72 | { 73 | "attachments": {}, 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "### Look at the first rows" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 3, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/html": [ 88 | "
\n", 89 | "\n", 102 | "\n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | "
EVA_NRDS100IFOPTNAMEVerkehrLaengeBreiteBetreiber_NameBetreiber_NrStatus
08002551AELBde:02000:11943Hamburg ElbbrückenRV10.02450053.534500DB Station und Service AGNaNneu
18001944TETNNoneEutingen NordRV8.75310048.484700DB Station und Service AGNaNneu
28003074MIANoneIngolstadt AudiRV11.40745648.790496DB Station und Service AGNaNneu
38001723HEBANoneEinbeck Otto-Hahn-StraßeRV9.89291051.814478Ilmebahn GmbHNaNneu
48004371KRONoneNörvenich-Rommelsheimnur DPN6.54758650.782539Rurtalbahn GmbHNaNneu
58010340DSTRNoneStraßgräbchen-Bernsdorfnur DPN14.05204751.361469NoneNaNneu
68001510TDSAde:08237:8009:2Dornstetten-AachRV8.48291048.473300DB Station und Service AGNaNneu
78001966MFOLde:09187:90183Feldollingnur DPN11.85224447.895336DB Station und Service AGNaNneu
88002060FFGGde:06412:11500Frankfurt(Main)-Gateway GardensRV8.59449550.056574DB Station und Service AGNaNneu
98002535EOBGde:05962:3517Halver-OberbrüggeRV7.57404251.191867DB Station und Service AGNaNneu
\n", 251 | "
" 252 | ], 253 | "text/plain": [ 254 | " EVA_NR DS100 IFOPT NAME Verkehr \\\n", 255 | "0 8002551 AELB de:02000:11943 Hamburg Elbbrücken RV \n", 256 | "1 8001944 TETN None Eutingen Nord RV \n", 257 | "2 8003074 MIA None Ingolstadt Audi RV \n", 258 | "3 8001723 HEBA None Einbeck Otto-Hahn-Straße RV \n", 259 | "4 8004371 KRO None Nörvenich-Rommelsheim nur DPN \n", 260 | "5 8010340 DSTR None Straßgräbchen-Bernsdorf nur DPN \n", 261 | "6 8001510 TDSA de:08237:8009:2 Dornstetten-Aach RV \n", 262 | "7 8001966 MFOL de:09187:90183 Feldolling nur DPN \n", 263 | "8 8002060 FFGG de:06412:11500 Frankfurt(Main)-Gateway Gardens RV \n", 264 | "9 8002535 EOBG de:05962:3517 Halver-Oberbrügge RV \n", 265 | "\n", 266 | " Laenge Breite Betreiber_Name Betreiber_Nr Status \n", 267 | "0 10.024500 53.534500 DB Station und Service AG NaN neu \n", 268 | "1 8.753100 48.484700 DB Station und Service AG NaN neu \n", 269 | "2 11.407456 48.790496 DB Station und Service AG NaN neu \n", 270 | "3 9.892910 51.814478 Ilmebahn GmbH NaN neu \n", 271 | "4 6.547586 50.782539 Rurtalbahn GmbH NaN neu \n", 272 | "5 14.052047 51.361469 None NaN neu \n", 273 | "6 8.482910 48.473300 DB Station und Service AG NaN neu \n", 274 | "7 11.852244 47.895336 DB Station und Service AG NaN neu \n", 275 | "8 8.594495 50.056574 DB Station und Service AG NaN neu \n", 276 | "9 7.574042 51.191867 DB Station und Service AG NaN neu " 277 | ] 278 | }, 279 | "execution_count": 3, 280 | "metadata": {}, 281 | "output_type": "execute_result" 282 | } 283 | ], 284 | "source": [ 285 | "df.head(10)" 286 | ] 287 | }, 288 | { 289 | "attachments": {}, 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "### Data exploration\n", 294 | "Print some basic information about the data. Your data exploration would continue here." 
295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 6, 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "\n", 307 | "RangeIndex: 6519 entries, 0 to 6518\n", 308 | "Data columns (total 10 columns):\n", 309 | " # Column Non-Null Count Dtype \n", 310 | "--- ------ -------------- ----- \n", 311 | " 0 EVA_NR 6519 non-null int64 \n", 312 | " 1 DS100 6519 non-null object \n", 313 | " 2 IFOPT 6512 non-null object \n", 314 | " 3 NAME 6519 non-null object \n", 315 | " 4 Verkehr 6519 non-null object \n", 316 | " 5 Laenge 6519 non-null float64\n", 317 | " 6 Breite 6519 non-null float64\n", 318 | " 7 Betreiber_Name 6517 non-null object \n", 319 | " 8 Betreiber_Nr 5395 non-null float64\n", 320 | " 9 Status 24 non-null object \n", 321 | "dtypes: float64(3), int64(1), object(6)\n", 322 | "memory usage: 509.4+ KB\n" 323 | ] 324 | }, 325 | { 326 | "data": { 327 | "text/plain": [ 328 | "array(['neu', None], dtype=object)" 329 | ] 330 | }, 331 | "execution_count": 6, 332 | "metadata": {}, 333 | "output_type": "execute_result" 334 | } 335 | ], 336 | "source": [ 337 | "df.info()\n", 338 | "\n", 339 | "df['Status'].unique()" 340 | ] 341 | } 342 | ], 343 | "metadata": { 344 | "kernelspec": { 345 | "display_name": "Python 3", 346 | "language": "python", 347 | "name": "python3" 348 | }, 349 | "language_info": { 350 | "codemirror_mode": { 351 | "name": "ipython", 352 | "version": 3 353 | }, 354 | "file_extension": ".py", 355 | "mimetype": "text/x-python", 356 | "name": "python", 357 | "nbconvert_exporter": "python", 358 | "pygments_lexer": "ipython3", 359 | "version": "3.11.2" 360 | }, 361 | "orig_nbformat": 4 362 | }, 363 | "nbformat": 4, 364 | "nbformat_minor": 2 365 | } 366 | --------------------------------------------------------------------------------