├── exercises
│   ├── .gitkeep
│   ├── gtfs.sqlite
│   ├── trees.sqlite
│   ├── airports.sqlite
│   ├── country-stats.sqlite
│   ├── temperatures.sqlite
│   ├── exercise1.jv
│   ├── exercise2.jv
│   ├── exercise3.jv
│   ├── exercise4.jv
│   └── exercise5.jv
├── project
│   ├── .gitkeep
│   ├── tests.sh
│   ├── pipeline.sh
│   ├── data-report.pdf
│   ├── analysis-report.pdf
│   ├── project-plan.md
│   ├── pipeline.py
│   └── tests.py
├── .gitignore
├── examples
│   ├── data.sqlite
│   ├── project-plan-example.md
│   └── data-exploration-example.ipynb
├── .github
│   └── workflows
│       ├── test.yml
│       └── exercise-feedback.yml
└── README.md

--------------------------------------------------------------------------------
/exercises/.gitkeep:
--------------------------------------------------------------------------------
1 | 

--------------------------------------------------------------------------------
/project/.gitkeep:
--------------------------------------------------------------------------------
1 | 

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | /data/*
3 | !/data/.gitkeep

--------------------------------------------------------------------------------
/project/tests.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python3 ./project/tests.py

--------------------------------------------------------------------------------
/project/pipeline.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python3 ./project/pipeline.py

--------------------------------------------------------------------------------
/examples/data.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rashakil-ds/data-engineering-fau/main/examples/data.sqlite

--------------------------------------------------------------------------------
/exercises/gtfs.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rashakil-ds/data-engineering-fau/main/exercises/gtfs.sqlite

--------------------------------------------------------------------------------
/exercises/trees.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rashakil-ds/data-engineering-fau/main/exercises/trees.sqlite

--------------------------------------------------------------------------------
/exercises/airports.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rashakil-ds/data-engineering-fau/main/exercises/airports.sqlite

--------------------------------------------------------------------------------
/project/data-report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rashakil-ds/data-engineering-fau/main/project/data-report.pdf

--------------------------------------------------------------------------------
/project/analysis-report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rashakil-ds/data-engineering-fau/main/project/analysis-report.pdf

--------------------------------------------------------------------------------
/exercises/country-stats.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rashakil-ds/data-engineering-fau/main/exercises/country-stats.sqlite
--------------------------------------------------------------------------------
/exercises/temperatures.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rashakil-ds/data-engineering-fau/main/exercises/temperatures.sqlite

--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Test CI
2 | 
3 | on:
4 |   push:
5 |     branches:
6 |       - main
7 | 
8 | jobs:
9 |   run:
10 |     runs-on: ubuntu-latest
11 | 
12 |     steps:
13 |       - name: Check out repository code
14 |         uses: actions/checkout@v3
15 | 
16 |       # Install Python 3.9
17 |       - name: Set up Python 3.9
18 |         uses: actions/setup-python@v4
19 |         with:
20 |           python-version: 3.9
21 | 
22 |       - name: Install all dependencies
23 |         run: |
24 |           python -m pip install --upgrade pip
25 |           pip install pandas
26 |           pip install sqlalchemy
27 |           pip install requests
28 | 
29 |       - name: Run tests file
30 |         run: |
31 |           chmod +x ./project/tests.sh
32 |           ./project/tests.sh

--------------------------------------------------------------------------------
/examples/project-plan-example.md:
--------------------------------------------------------------------------------
1 | # Project Plan
2 | 
3 | ## Title
4 | 
5 | Awesome MADE project.
6 | 
7 | ## Main Question
8 | 
9 | 
10 | 1. Does writing an example question help students write better project plans?
11 | 
12 | ## Description
13 | 
14 | 
15 | XY is an important problem, because... This project analyzes XY, using method A. The results can give insights into...
16 | 
17 | ## Datasources
18 | 
19 | 
20 | 
21 | ### Datasource1: ExampleSource
22 | * Metadata URL: https://mobilithek.info/offers/-6901989592576801458
23 | * Data URL: https://raw.githubusercontent.com/od-ms/radverkehr-zaehlstellen/main/100035541/2019-01.csv
24 | * Data Type: CSV
25 | 
26 | Short description of the DataSource.
27 | 
28 | ## Work Packages
29 | 
30 | 
31 | 
32 | 1. Example Issue [#1][i1]
33 | 2. ...
34 | 35 | [i1]: https://github.com/jvalue/made-template/issues/1 36 | -------------------------------------------------------------------------------- /.github/workflows/exercise-feedback.yml: -------------------------------------------------------------------------------- 1 | name: Exercise Feedback 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - exercises/** 9 | 10 | jobs: 11 | exercise-feedback: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout 15 | uses: actions/checkout@v4 16 | with: 17 | path: main 18 | 19 | - name: Checkout exercise feedback 20 | uses: actions/checkout@v4 21 | with: 22 | repository: jvalue/made-exercise-feedback 23 | path: grading 24 | 25 | - name: Set up Node 26 | uses: actions/setup-node@v4 27 | with: 28 | node-version: "lts/*" 29 | 30 | # Install jayvee 31 | - name: Set up Jayvee 32 | run: | 33 | ./grading/ci/setup_jayvee.sh 34 | 35 | # Install python 36 | - name: Set up Python 3.11 37 | uses: actions/setup-python@v5 38 | with: 39 | python-version: 3.11 40 | 41 | # Run grading feedback 42 | - name: Exercise feedback 43 | id: ex_feedback 44 | run: | 45 | ./grading/ci/run_grading.sh 46 | 47 | # Upload feedback 48 | - name: Upload feedback 49 | uses: actions/upload-artifact@v4 50 | with: 51 | name: exercise-feedback 52 | path: ./grading/feedback-ex?.txt 53 | -------------------------------------------------------------------------------- /exercises/exercise1.jv: -------------------------------------------------------------------------------- 1 | 2 | // Name: KM Rashedul Alam 3 | 4 | pipeline AirportsPipeline { 5 | 6 | AirportsExtractor 7 | -> AirportsTextFileInterpreter 8 | -> AirportsCSVInterpreter 9 | -> AirportsTableInterpreter 10 | -> AirportsLoader; 11 | 12 | block AirportsExtractor oftype HttpExtractor { 13 | 14 | // provided data source 15 | 16 | url: "https://opendata.rhein-kreis-neuss.de/api/explore/v2.1/catalog/datasets/rhein-kreis-neuss-flughafen-weltweit/exports/csv?lang=en&timezone=Europe%2FBerlin&use_labels=true&delimiter=%3B"; 17 | } 18 | 19 | 20 | block AirportsTextFileInterpreter oftype TextFileInterpreter { } 21 | 22 | // correct table 23 | 24 | block AirportsCSVInterpreter oftype CSVInterpreter { 25 | delimiter: ";"; 26 | } 27 | 28 | 29 | block AirportsTableInterpreter oftype TableInterpreter { 30 | header: true; 31 | 32 | columns: [ 33 | "Lfd. Nummer" oftype integer, 34 | "Name des Flughafens" oftype text, 35 | "Ort" oftype text, 36 | "Land" oftype text, 37 | "IATA" oftype text, 38 | "ICAO" oftype text, 39 | "Latitude" oftype decimal, 40 | "Longitude" oftype decimal, 41 | "Altitude" oftype integer 42 | 43 | 44 | ]; 45 | } 46 | 47 | block AirportsLoader oftype SQLiteLoader { 48 | table: "airports"; 49 | file: "./airports.sqlite"; 50 | } 51 | 52 | 53 | } -------------------------------------------------------------------------------- /project/project-plan.md: -------------------------------------------------------------------------------- 1 | # Project Plan 2 | 3 | ## Title 4 | 5 | Correlation Between Education Spending and GDP Growth in North America 6 | 7 | ## Main Question 8 | 9 | 10 | 1. How does government expenditure on education as a percentage of GDP correlate with GDP growth in North American countries from 2016 to 2023? 11 | 12 | ## Description 13 | 14 | 15 | The project will investigate the potential relationship between public investment in education and economic growth, providing insights into how education funding impacts economic performance. 
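As a first, minimal sketch of the planned analysis step (it assumes the `gdp_data` and `education_data` tables with the `Country Name`/`Country Code`/`Year`/`Value` columns that `pipeline.py` writes to `./data/data_cleaned.db`), the correlation could be computed with pandas:

```python
import sqlite3

import pandas as pd

# Load the two indicator tables produced by pipeline.py.
with sqlite3.connect("./data/data_cleaned.db") as conn:
    gdp = pd.read_sql("SELECT * FROM gdp_data;", conn)
    edu = pd.read_sql("SELECT * FROM education_data;", conn)

# Align the indicators per country and year, then correlate the values.
merged = gdp.merge(edu, on=["Country Name", "Country Code", "Year"],
                   suffixes=("_gdp", "_edu"))
print(merged["Value_gdp"].corr(merged["Value_edu"]))  # Pearson by default
```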
16 | 
17 | ## Datasources
18 | 
19 | 
20 | 
21 | ### Datasource1: Worldbank (Government expenditure on education, total (% of GDP))
22 | * Metadata URL: https://data.worldbank.org/indicator/SE.XPD.TOTL.GD.ZS
23 | * Data URL: https://api.worldbank.org/v2/en/indicator/SE.XPD.TOTL.GD.ZS?downloadformat=csv
24 | * Data Type: Zip->CSV
25 | 
26 | This dataset contains data on government expenditure on education as a percentage of GDP, covering countries worldwide. For this project, only data from North American countries for the years 2016–2023 were extracted.
27 | 
28 | ### Datasource2: Worldbank (GDP growth (annual %))
29 | * Metadata URL: https://data.worldbank.org/indicator/NY.GDP.MKTP.KD.ZG
30 | * Data URL: https://api.worldbank.org/v2/en/indicator/NY.GDP.MKTP.KD.ZG?downloadformat=csv
31 | * Data Type: Zip->CSV
32 | 
33 | This dataset contains GDP growth rates for countries worldwide. We focused on North American countries for the years 2016–2023.
34 | 
35 | ## Work Packages
36 | 
37 | 
38 | 
39 | 1. Dataset selection
40 | 2. Building an automated data pipeline
41 | 3. Exploratory Data Analysis (EDA)
42 | 4. Reporting on findings
43 | 

--------------------------------------------------------------------------------
/exercises/exercise2.jv:
--------------------------------------------------------------------------------
1 | // Exercise 02 on Jayvee pipeline to process tree planting dataset
2 | // KM Rashedul Alam
3 | pipeline TreePlantingDatasetPipeline {
4 | 
5 |     // Workflow Pipeline
6 |     TreeDatasetExtractor
7 |         -> TxtFileInterpreter
8 |         -> CsvDataInterpreter
9 |         -> TableInterpreter
10 |         -> SQLiteFinalDataLoader;
11 | 
12 |     // Extract CSV data from the given URL
13 |     block TreeDatasetExtractor oftype HttpExtractor {
14 |         url: "https://opendata.rhein-kreis-neuss.de/api/v2/catalog/datasets/stadt-neuss-herbstpflanzung-2023/exports/csv";
15 |     }
16 | 
17 |     // Interpret the extracted file as plain text to prepare for CSV parsing
18 |     block TxtFileInterpreter oftype TextFileInterpreter { }
19 | 
20 |     // Interpret the text data as CSV
21 |     block CsvDataInterpreter oftype CSVInterpreter {
22 |         delimiter: ";";
23 |     }
24 | 
25 |     // Data validation constraints
26 |     constraint DistrictNameConstraint oftype RegexConstraint {
27 |         regex: /^Vogelsang/;
28 |     }
29 | 
30 |     constraint GeoPointsConstraint oftype RegexConstraint {
31 |         regex: /^\d{1,3}\.\d+,\s*\d{1,3}\.\d+$/;
32 |     }
33 | 
34 |     valuetype DistrictType oftype text {
35 |         constraints: [ DistrictNameConstraint ];
36 |     }
37 | 
38 |     // Custom type for the 'id' column with the geopoint format constraint
39 |     valuetype GeoPointType oftype text {
40 |         constraints: [ GeoPointsConstraint ];
41 |     }
42 | 
43 |     // Interpret the CSV data as a typed table
44 |     block TableInterpreter oftype TableInterpreter {
45 |         header: true;
46 | 
47 |         // Validate columns; 'baumart_deutsch' is dropped by omission
48 |         columns: [
49 |             "lfd_nr" oftype integer,
50 |             "stadtteil" oftype DistrictType,
51 |             "standort" oftype text,
52 |             "baumart_botanisch" oftype text,
53 |             "id" oftype GeoPointType,
54 |             "baumfamilie" oftype text
55 |         ];
56 |     }
57 | 
58 |     // Load the validated data into a SQLite DB
59 |     block SQLiteFinalDataLoader oftype SQLiteLoader {
60 |         file: "./trees.sqlite"; // Target file
61 |         table: "trees"; // Target table name
62 |     }
63 | }

--------------------------------------------------------------------------------
/exercises/exercise5.jv:
--------------------------------------------------------------------------------
1 | pipeline GTFSProcessor {
2 | 
3 |     // Download GTFS data from the specified URL
4 |     GTFSDownloader
5 |         -> ArchiveHandler
6 |         -> 
FilePickerStops 7 | -> FileDecoderStops 8 | -> CSVParserStops 9 | -> TableHandlerStops 10 | -> DatabaseWriter; 11 | 12 | // Block to download the GTFS zip file 13 | block GTFSDownloader oftype HttpExtractor { 14 | url: "https://gtfs.rhoenenergie-bus.de/GTFS.zip"; 15 | } 16 | 17 | // Block to interpret the downloaded zip archive 18 | block ArchiveHandler oftype ArchiveInterpreter { 19 | archiveType: "zip"; 20 | } 21 | 22 | // Block to select the stops.txt file from the zip archive 23 | block FilePickerStops oftype FilePicker { 24 | path: "./stops.txt"; 25 | } 26 | 27 | // Block to decode the stops.txt file as UTF-8 text 28 | block FileDecoderStops oftype TextFileInterpreter { 29 | encoding: "utf8"; 30 | } 31 | 32 | // Block to interpret the stops.txt file as CSV 33 | block CSVParserStops oftype CSVInterpreter { 34 | delimiter: ","; 35 | enclosing: '"'; 36 | } 37 | 38 | // Custom constraints for validation 39 | constraint LatitudeLongitudeRange oftype RangeConstraint { 40 | lowerBound: -90; 41 | upperBound: 90; 42 | lowerBoundInclusive: true; 43 | upperBoundInclusive: true; 44 | } 45 | 46 | valuetype Coordinates oftype decimal { 47 | constraints: [LatitudeLongitudeRange]; 48 | } 49 | 50 | constraint FixedZoneID oftype RangeConstraint { 51 | lowerBound: 1925; 52 | upperBound: 1925; 53 | lowerBoundInclusive: true; 54 | upperBoundInclusive: true; 55 | } 56 | 57 | valuetype ZoneIdentifier oftype integer { 58 | constraints: [FixedZoneID]; 59 | } 60 | 61 | // Block to define the table structure and filter data 62 | block TableHandlerStops oftype TableInterpreter { 63 | header: true; 64 | columns: [ 65 | "stop_id" oftype integer, 66 | "stop_name" oftype text, 67 | "stop_lat" oftype Coordinates, 68 | "stop_lon" oftype Coordinates, 69 | "zone_id" oftype ZoneIdentifier 70 | ]; 71 | } 72 | 73 | // Block to write the processed data into a SQLite database 74 | block DatabaseWriter oftype SQLiteLoader { 75 | table: "stops"; 76 | file: "gtfs.sqlite"; 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Methods of Advanced Data Engineering Template Project 2 | 3 | This template project provides some structure for your open data project in the MADE module at FAU. 4 | This repository contains (a) a data science project that is developed by the student over the course of the semester, and (b) the exercises that are submitted over the course of the semester. 5 | 6 | To get started, please follow these steps: 7 | 1. Create your own fork of this repository. Feel free to rename the repository right after creation, before you let the teaching instructors know your repository URL. **Do not rename the repository during the semester**. 8 | 9 | ## Project Work 10 | Your data engineering project will run alongside lectures during the semester. We will ask you to regularly submit project work as milestones, so you can reasonably pace your work. All project work submissions **must** be placed in the `project` folder. 11 | 12 | ### Exporting a Jupyter Notebook 13 | Jupyter Notebooks can be exported using `nbconvert` (`pip install nbconvert`). For example, to export the example notebook to HTML: `jupyter nbconvert --to html examples/final-report-example.ipynb --embed-images --output final-report.html` 14 | 15 | 16 | ## Exercises 17 | During the semester you will need to complete exercises using [Jayvee](https://github.com/jvalue/jayvee). 
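For local testing, you can run a single model with the Jayvee interpreter, for example `jv exercises/exercise1.jv` (this assumes the interpreter is installed, e.g. via `npm install -g @jvalue/jayvee-interpreter`; see the Jayvee repository for current installation instructions).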
You **must** place your submissions in the `exercises` folder in your repository and name them according to their number, from one to five: `exercise<number>.jv`.
18 | 
19 | In regular intervals, exercises will be given as homework to complete during the semester. Details and deadlines will be discussed in the lecture, also see the [course schedule](https://made.uni1.de/).
20 | 
21 | ### Exercise Feedback
22 | We provide automated exercise feedback using a GitHub action (that is defined in `.github/workflows/exercise-feedback.yml`).
23 | 
24 | To view your exercise feedback, navigate to Actions → Exercise Feedback in your repository.
25 | 
26 | The exercise feedback is executed whenever you make a change in files in the `exercises` folder and push your local changes to the repository on GitHub. To see the feedback, open the latest GitHub Action run, open the `exercise-feedback` job and the `Exercise Feedback` step. You should see command line output like this:
27 | 
28 | ```sh
29 | Found exercises/exercise1.jv, executing model...
30 | Found output file airports.sqlite, grading...
31 | Grading Exercise 1
32 | Overall points 17 of 17
33 | ---
34 | By category:
35 | Shape: 4 of 4
36 | Types: 13 of 13
37 | ```

--------------------------------------------------------------------------------
/exercises/exercise3.jv:
--------------------------------------------------------------------------------
1 | pipeline EconomicDataPipeline {
2 | 
3 |     // Step 1: Download the Excel file directly from the given URL
4 |     DataFetcher
5 |         -> FileInterpreter
6 |         -> SheetExtractor
7 |         -> DataSelector
8 |         -> BondDataCleaner
9 |         -> BondDataParser
10 |         -> BondDataSaver;
11 | 
12 |     // Step 2: Extract GDP per Capita data separately
13 |     DataSelector
14 |         -> GdpDataFilter
15 |         -> GdpDataParser
16 |         -> GdpDataSaver;
17 | 
18 |     // Block to download the Excel file from the provided link
19 |     block DataFetcher oftype HttpExtractor {
20 |         url: "https://thedocs.worldbank.org/en/doc/7d852628d96b9411d43e5d36d5dff941-0050062022/original/Graphs-Chapter-5-02082022.xlsx";
21 |     }
22 | 
23 |     // Interpret the Excel workbook
24 |     block FileInterpreter oftype XLSXInterpreter { }
25 | 
26 |     // Select the specific sheet named "Figure S5.1.2"
27 |     block SheetExtractor oftype SheetPicker {
28 |         sheetName: "Figure S5.1.2";
29 |     }
30 | 
31 |     // Extract the data range from P2 to S45
32 |     block DataSelector oftype CellRangeSelector {
33 |         select: range P2:S45;
34 |     }
35 | 
36 |     // Filter out unnecessary columns for bond data
37 |     block BondDataCleaner oftype ColumnDeleter {
38 |         delete: [column B, column C];
39 |     }
40 | 
41 |     // Filter columns for GDP data extraction
42 |     block GdpDataFilter oftype ColumnDeleter {
43 |         delete: [column B, column D];
44 |     }
45 | 
46 |     // Parse Bond Issuance data
47 |     block BondDataParser oftype TableInterpreter {
48 |         header: false;
49 |         columns: [
50 |             "Country Code" oftype CountryCodeAlpha3,
51 |             "Bond Issuance Share" oftype BondPercentage
52 |         ];
53 |     }
54 | 
55 |     // Parse GDP per Capita data
56 |     block GdpDataParser oftype TableInterpreter {
57 |         header: false;
58 |         columns: [
59 |             "Country Code" oftype CountryCodeAlpha3,
60 |             "GDP per Capita" oftype GdpAmount
61 |         ];
62 |     }
63 | 
64 |     // Save Bond Issuance data into the SQLite database
65 |     block BondDataSaver oftype SQLiteLoader {
66 |         table: "bondIssuance";
67 |         file: "./country-stats.sqlite";
68 |     }
69 | 
70 |     // Save GDP per Capita data into the SQLite database
71 |     block GdpDataSaver oftype SQLiteLoader {
72 |         table: "gdpPerCapita";
73 |         file: "./country-stats.sqlite";
74 |     }
75 | 
76 |     // Define custom value types with constraints
77 |     valuetype GdpAmount oftype decimal {
78 |         constraints: [PositiveGdpCheck];
79 |     }
80 | 
81 |     constraint PositiveGdpCheck on decimal: value > 0;
82 | 
83 |     valuetype BondPercentage oftype decimal {
84 |         constraints: [BondShareRange];
85 |     }
86 | 
87 |     constraint BondShareRange oftype RangeConstraint {
88 |         lowerBound: 0;
89 |         lowerBoundInclusive: true;
90 |         upperBound: 1;
91 |         upperBoundInclusive: true;
92 |     }
93 | }

--------------------------------------------------------------------------------
/exercises/exercise4.jv:
--------------------------------------------------------------------------------
1 | pipeline TempDataPipeline {
2 | 
3 |     ZipFileFetch
4 |         -> ArchiveExtract
5 |         -> SelectorCSVFile
6 |         -> TextLoader
7 |         -> InterpretorCSV
8 |         -> UpdateBatteryCol
9 |         -> HeadersUpdate
10 |         -> IntegrityCheck
11 |         -> TransformTempData
12 |         -> BatteryTempTransform
13 |         -> ProcessedDataToSQLite;
14 | 
15 |     // Download the ZIP archive
16 |     block ZipFileFetch oftype HttpExtractor {
17 |         url: "https://www.mowesta.com/data/measure/mowesta-dataset-20221107.zip";
18 |     }
19 | 
20 |     // Extract the archive
21 |     block ArchiveExtract oftype ArchiveInterpreter {
22 |         archiveType: "zip";
23 |     }
24 | 
25 |     // Pick the data file from the extracted archive
26 |     block SelectorCSVFile oftype FilePicker {
27 |         path: "/data.csv";
28 |     }
29 | 
30 |     block TextLoader oftype TextFileInterpreter {}
31 | 
32 |     block InterpretorCSV oftype CSVInterpreter {
33 |         delimiter: ";"; // Use semicolon as delimiter
34 |     }
35 | 
36 |     block UpdateBatteryCol oftype CellWriter {
37 |         at: cell J1;
38 |         write: ["battery_temperature"];
39 |     }
40 | 
41 |     block HeadersUpdate oftype CellWriter {
42 |         at: range A1:E1;
43 |         write: ["id", "producer", "model", "month", "temperature"];
44 |     }
45 | 
46 |     block IntegrityCheck oftype TableInterpreter {
47 |         header: true;
48 |         columns: [
49 |             "id" oftype integer,
50 |             "producer" oftype text,
51 |             "model" oftype text,
52 |             "month" oftype month,
53 |             "temperature" oftype decimal,
54 |             "battery_temperature" oftype decimal
55 |         ];
56 |     }
57 | 
58 |     transform ConvertCelsiusToFahrenheit {
59 |         from CelsiusValue oftype decimal;
60 |         to FahrenheitValue oftype decimal;
61 | 
62 |         FahrenheitValue: 32 + ((CelsiusValue * 9) / 5);
63 |     }
64 | 
65 |     block TransformTempData oftype TableTransformer {
66 |         inputColumns: ["temperature"];
67 |         outputColumn: "temperature";
68 |         uses: ConvertCelsiusToFahrenheit;
69 |     }
70 | 
71 |     block BatteryTempTransform oftype TableTransformer {
72 |         inputColumns: ["battery_temperature"];
73 |         outputColumn: "battery_temperature";
74 |         uses: ConvertCelsiusToFahrenheit;
75 |     }
76 | 
77 |     constraint MonthRange oftype RangeConstraint {
78 |         lowerBound: 1;
79 |         lowerBoundInclusive: true;
80 |         upperBound: 12;
81 |         upperBoundInclusive: true;
82 |     }
83 | 
84 |     valuetype month oftype integer {
85 |         constraints: [MonthRange];
86 |     }
87 | 
88 |     block ProcessedDataToSQLite oftype SQLiteLoader {
89 |         table: "temperatures";
90 |         file: "./temperatures.sqlite";
91 |     }
92 | }

--------------------------------------------------------------------------------
/project/pipeline.py:
--------------------------------------------------------------------------------
1 | import os
2 | import zipfile
3 | import requests
4 | import pandas as pd
5 | import sqlite3
6 | from io import BytesIO
7 | 
8 | # Define the data directory (the repo-level /data folder, kept out of git via .gitignore)
9 | data_directory = "./data"
10 | if not os.path.exists(data_directory):
11 |     os.makedirs(data_directory)
12 | 
13 | # North American countries list
14 | north_american_countries = [
15 |     "Canada", "United States", "Mexico", "Bermuda", "Bahamas, The",
16 |     "Barbados", "Cuba", "Haiti", "Dominican Republic", "Jamaica",
17 |     "Trinidad and Tobago", "Saint Kitts and Nevis", "Antigua and Barbuda",
18 |     "Saint Lucia", "Saint Vincent and the Grenadines", "Grenada", "Belize",
19 |     "Panama", "Costa Rica", "El Salvador", "Honduras", "Nicaragua",
20 |     "Guatemala"
21 | ]
22 | 
23 | # URLs for GDP and education expenditure data from the World Bank
24 | url_gdp_zip = "https://api.worldbank.org/v2/en/indicator/NY.GDP.MKTP.KD.ZG?downloadformat=csv"
25 | url_edu_zip = "https://api.worldbank.org/v2/en/indicator/SE.XPD.TOTL.GD.ZS?downloadformat=csv"
26 | 
27 | # Download a ZIP export and extract the contained data CSVs (metadata files are skipped)
28 | def download_and_extract_zip(url, output_dir):
29 |     response = requests.get(url)
30 |     if response.status_code == 200:
31 |         with zipfile.ZipFile(BytesIO(response.content)) as z:
32 |             z.extractall(output_dir)
33 |             print(f"Extracted files to {output_dir}")
34 |             csv_files = [f for f in z.namelist() if f.endswith(".csv") and "Metadata" not in f]
35 |             return [os.path.join(output_dir, f) for f in csv_files]
36 |     else:
37 |         print(f"Failed to download data from {url}.")
38 |         return []
39 | 
40 | # Process a CSV file by filtering countries/years and reshaping it
41 | def clean_and_reshape_data(file_path, countries, years):
42 |     df = pd.read_csv(file_path, skiprows=4)
43 |     # Keep only the requested years that actually exist as columns in the download
44 |     years = [year for year in years if year in df.columns]
45 |     # Filter countries and select years
46 |     df_filtered = df[df["Country Name"].isin(countries)][["Country Name", "Country Code"] + years]
47 |     # Reshape from wide to long format
48 |     df_long = df_filtered.melt(
49 |         id_vars=["Country Name", "Country Code"],
50 |         var_name="Year",
51 |         value_name="Value"
52 |     )
53 |     return df_long
54 | 
55 | # Save cleaned data to SQLite
56 | def export_to_sqlite(df, table_name, db_path):
57 |     with sqlite3.connect(db_path) as conn:
58 |         df.to_sql(table_name, conn, if_exists="replace", index=False)
59 |     print(f"Saved table '{table_name}' to SQLite database at {db_path}.")
60 | 
61 | # Download and extract the GDP and education expenditure data
62 | gdp_files = download_and_extract_zip(url_gdp_zip, data_directory)
63 | edu_files = download_and_extract_zip(url_edu_zip, data_directory)
64 | 
65 | # Years of interest: 2016 to 2023 (the range end is exclusive)
66 | years = [str(year) for year in range(2016, 2024)]
67 | 
68 | # Clean, reshape, and save the data
69 | if gdp_files:
70 |     gdp_cleaned = clean_and_reshape_data(gdp_files[0], north_american_countries, years)
71 |     gdp_cleaned.to_csv(os.path.join(data_directory, "gdp_cleaned.csv"), index=False)
72 |     print("Cleaned GDP data saved as CSV.")
73 |     export_to_sqlite(gdp_cleaned, "gdp_data", os.path.join(data_directory, "data_cleaned.db"))
74 | 
75 | if edu_files:
76 |     edu_cleaned = clean_and_reshape_data(edu_files[0], north_american_countries, years)
77 |     edu_cleaned.to_csv(os.path.join(data_directory, "edu_cleaned.csv"), index=False)
78 |     print("Cleaned Education Expenditure data saved as CSV.")
79 |     export_to_sqlite(edu_cleaned, "education_data", os.path.join(data_directory, "data_cleaned.db"))

--------------------------------------------------------------------------------
/project/tests.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sqlite3
3 | import pandas as pd
4 | import subprocess
5 | 
6 | # Define paths and constants (the pipeline writes into the repo-level /data folder)
7 | DATA_DIR = "./data"
8 | GDP_CSV = os.path.join(DATA_DIR, "gdp_cleaned.csv")
9 | EDU_CSV = os.path.join(DATA_DIR, "edu_cleaned.csv")
10 | SQLITE_DB = os.path.join(DATA_DIR, "data_cleaned.db")
11 | PIPELINE_SCRIPT = 
"./project/pipeline.py" 12 | 13 | 14 | def test_pipeline_execution(): 15 | """ 16 | Test if the pipeline script executes successfully. 17 | """ 18 | print("Testing pipeline execution...") 19 | result = subprocess.run(["python", PIPELINE_SCRIPT], capture_output=True, text=True) 20 | assert result.returncode == 0, f"Pipeline script failed: {result.stderr}" 21 | print("Pipeline executed successfully.") 22 | 23 | 24 | def test_gdp_csv_exists(): 25 | """ 26 | Test if the GDP cleaned CSV file is created. 27 | """ 28 | print("Testing if GDP cleaned CSV exists...") 29 | assert os.path.exists(GDP_CSV), f"GDP cleaned CSV file not found: {GDP_CSV}" 30 | print("GDP cleaned CSV exists.") 31 | 32 | 33 | def test_edu_csv_exists(): 34 | """ 35 | Test if the Education cleaned CSV file is created. 36 | """ 37 | print("Testing if Education cleaned CSV exists...") 38 | assert os.path.exists(EDU_CSV), f"Education cleaned CSV file not found: {EDU_CSV}" 39 | print("Education cleaned CSV exists.") 40 | 41 | 42 | def test_gdp_csv_content(): 43 | """ 44 | Test the content of the GDP cleaned CSV file. 45 | """ 46 | print("Testing GDP cleaned CSV content") 47 | df = pd.read_csv(GDP_CSV) 48 | assert not df.empty, "GDP cleaned CSV file is empty." 49 | assert "Country Name" in df.columns, "Expected column 'Country Name' not found in GDP CSV." 50 | assert "Year" in df.columns, "Expected column 'Year' not found in GDP CSV." 51 | print("GDP cleaned CSV content is valid.") 52 | 53 | 54 | def test_edu_csv_content(): 55 | """ 56 | Test the content of the Education cleaned CSV file. 57 | """ 58 | print("Testing Education cleaned CSV content.") 59 | df = pd.read_csv(EDU_CSV) 60 | assert not df.empty, "Education cleaned CSV file is empty." 61 | assert "Country Name" in df.columns, "Expected column 'Country Name' not found in Education CSV." 62 | assert "Year" in df.columns, "Expected column 'Year' not found in Education CSV." 63 | print("Education cleaned CSV content is valid.") 64 | 65 | 66 | def test_sqlite_db_exists(): 67 | """ 68 | Test if the SQLite database file is created. 69 | """ 70 | print("Testing if SQLite database exists..") 71 | assert os.path.exists(SQLITE_DB), f"SQLite database not found: {SQLITE_DB}" 72 | print("SQLite database exists.") 73 | 74 | 75 | def test_sqlite_tables(): 76 | """ 77 | Test if the expected tables exist in the SQLite database. 78 | """ 79 | print("Testing SQLite database tables.") 80 | with sqlite3.connect(SQLITE_DB) as conn: 81 | cursor = conn.cursor() 82 | cursor.execute("SELECT name FROM sqlite_master WHERE type='table';") 83 | tables = [row[0] for row in cursor.fetchall()] 84 | assert "gdp_data" in tables, "Table 'gdp_data' not found in SQLite database." 85 | assert "education_data" in tables, "Table 'education_data' not found in SQLite database." 86 | print("Expected tables found in SQLite database.") 87 | 88 | 89 | def test_sqlite_table_content(): 90 | """ 91 | Test the content of the tables in the SQLite database. 92 | """ 93 | print("Testing SQLite database table content...") 94 | with sqlite3.connect(SQLITE_DB) as conn: 95 | gdp_df = pd.read_sql("SELECT * FROM gdp_data;", conn) 96 | edu_df = pd.read_sql("SELECT * FROM education_data;", conn) 97 | assert not gdp_df.empty, "Table 'gdp_data' in SQLite database is empty." 98 | assert not edu_df.empty, "Table 'education_data' in SQLite database is empty." 
99 |     print("SQLite database tables contain valid data.")
100 | 
101 | 
102 | if __name__ == "__main__":
103 |     # Run all tests in order
104 |     test_pipeline_execution()
105 |     test_gdp_csv_exists()
106 |     test_edu_csv_exists()
107 |     test_gdp_csv_content()
108 |     test_edu_csv_content()
109 |     test_sqlite_db_exists()
110 |     test_sqlite_tables()
111 |     test_sqlite_table_content()
112 | 
113 |     print("All tests passed successfully.")
114 | 

--------------------------------------------------------------------------------
/examples/data-exploration-example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "attachments": {},
5 |    "cell_type": "markdown",
6 |    "metadata": {},
7 |    "source": [
8 |     "# Data Exploration\n",
9 |     "\n",
10 |     "In this notebook, describe your data exploration steps."
11 |    ]
12 |   },
13 |   {
14 |    "attachments": {},
15 |    "cell_type": "markdown",
16 |    "metadata": {},
17 |    "source": [
18 |     "## Install dependencies"
19 |    ]
20 |   },
21 |   {
22 |    "cell_type": "code",
23 |    "execution_count": 1,
24 |    "metadata": {},
25 |    "outputs": [
26 |     {
27 |      "name": "stdout",
28 |      "output_type": "stream",
29 |      "text": [
30 |       "Requirement already satisfied: pandas in /usr/local/lib/python3.11/site-packages (1.5.3)\n",
31 |       "Requirement already satisfied: python-dateutil>=2.8.1 in /Users/pheltweg/Library/Python/3.11/lib/python/site-packages (from pandas) (2.8.2)\n",
32 |       "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/site-packages (from pandas) (2022.7.1)\n",
33 |       "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.11/site-packages (from pandas) (1.24.2)\n",
34 |       "Requirement already satisfied: six>=1.5 in /Users/pheltweg/Library/Python/3.11/lib/python/site-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)\n",
35 |       "\n",
36 |       "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n",
37 |       "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n",
38 |       "Note: you may need to restart the kernel to use updated packages.\n",
39 |       "Requirement already satisfied: SQLAlchemy==1.4.46 in /usr/local/lib/python3.11/site-packages (1.4.46)\n",
40 |       "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.11/site-packages (from SQLAlchemy==1.4.46) (2.0.2)\n",
41 |       "\n",
42 |       "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n",
43 |       "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n",
44 |       "Note: you may need to restart the kernel to use updated packages.\n"
45 |      ]
46 |     }
47 |    ],
48 |    "source": [
49 |     "%pip install pandas\n",
50 |     "%pip install 'SQLAlchemy==1.4.46'"
51 |    ]
52 |   },
53 |   {
54 |    "attachments": {},
55 |    "cell_type": "markdown",
56 |    "metadata": {},
57 |    "source": [
58 |     "## Load data"
59 |    ]
60 |   },
61 |   {
62 |    "cell_type": "code",
63 |    "execution_count": 2,
64 |    "metadata": {},
65 |    "outputs": [],
66 |    "source": [
67 |     "import pandas as pd\n",
68 |     "\n",
69 |     "df = pd.read_sql_table('trainstops', 'sqlite:///data.sqlite')"
70 | 
] 71 | }, 72 | { 73 | "attachments": {}, 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "### Look at the first rows" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 3, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/html": [ 88 | "
\n", 89 | "\n", 102 | "\n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | "
EVA_NRDS100IFOPTNAMEVerkehrLaengeBreiteBetreiber_NameBetreiber_NrStatus
08002551AELBde:02000:11943Hamburg ElbbrückenRV10.02450053.534500DB Station und Service AGNaNneu
18001944TETNNoneEutingen NordRV8.75310048.484700DB Station und Service AGNaNneu
28003074MIANoneIngolstadt AudiRV11.40745648.790496DB Station und Service AGNaNneu
38001723HEBANoneEinbeck Otto-Hahn-StraßeRV9.89291051.814478Ilmebahn GmbHNaNneu
48004371KRONoneNörvenich-Rommelsheimnur DPN6.54758650.782539Rurtalbahn GmbHNaNneu
58010340DSTRNoneStraßgräbchen-Bernsdorfnur DPN14.05204751.361469NoneNaNneu
68001510TDSAde:08237:8009:2Dornstetten-AachRV8.48291048.473300DB Station und Service AGNaNneu
78001966MFOLde:09187:90183Feldollingnur DPN11.85224447.895336DB Station und Service AGNaNneu
88002060FFGGde:06412:11500Frankfurt(Main)-Gateway GardensRV8.59449550.056574DB Station und Service AGNaNneu
98002535EOBGde:05962:3517Halver-OberbrüggeRV7.57404251.191867DB Station und Service AGNaNneu
\n", 251 | "
" 252 | ], 253 | "text/plain": [ 254 | " EVA_NR DS100 IFOPT NAME Verkehr \\\n", 255 | "0 8002551 AELB de:02000:11943 Hamburg Elbbrücken RV \n", 256 | "1 8001944 TETN None Eutingen Nord RV \n", 257 | "2 8003074 MIA None Ingolstadt Audi RV \n", 258 | "3 8001723 HEBA None Einbeck Otto-Hahn-Straße RV \n", 259 | "4 8004371 KRO None Nörvenich-Rommelsheim nur DPN \n", 260 | "5 8010340 DSTR None Straßgräbchen-Bernsdorf nur DPN \n", 261 | "6 8001510 TDSA de:08237:8009:2 Dornstetten-Aach RV \n", 262 | "7 8001966 MFOL de:09187:90183 Feldolling nur DPN \n", 263 | "8 8002060 FFGG de:06412:11500 Frankfurt(Main)-Gateway Gardens RV \n", 264 | "9 8002535 EOBG de:05962:3517 Halver-Oberbrügge RV \n", 265 | "\n", 266 | " Laenge Breite Betreiber_Name Betreiber_Nr Status \n", 267 | "0 10.024500 53.534500 DB Station und Service AG NaN neu \n", 268 | "1 8.753100 48.484700 DB Station und Service AG NaN neu \n", 269 | "2 11.407456 48.790496 DB Station und Service AG NaN neu \n", 270 | "3 9.892910 51.814478 Ilmebahn GmbH NaN neu \n", 271 | "4 6.547586 50.782539 Rurtalbahn GmbH NaN neu \n", 272 | "5 14.052047 51.361469 None NaN neu \n", 273 | "6 8.482910 48.473300 DB Station und Service AG NaN neu \n", 274 | "7 11.852244 47.895336 DB Station und Service AG NaN neu \n", 275 | "8 8.594495 50.056574 DB Station und Service AG NaN neu \n", 276 | "9 7.574042 51.191867 DB Station und Service AG NaN neu " 277 | ] 278 | }, 279 | "execution_count": 3, 280 | "metadata": {}, 281 | "output_type": "execute_result" 282 | } 283 | ], 284 | "source": [ 285 | "df.head(10)" 286 | ] 287 | }, 288 | { 289 | "attachments": {}, 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "### Data exploration\n", 294 | "Print some basic information about the data. Your data exploration would continue here." 
295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 6, 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "\n", 307 | "RangeIndex: 6519 entries, 0 to 6518\n", 308 | "Data columns (total 10 columns):\n", 309 | " # Column Non-Null Count Dtype \n", 310 | "--- ------ -------------- ----- \n", 311 | " 0 EVA_NR 6519 non-null int64 \n", 312 | " 1 DS100 6519 non-null object \n", 313 | " 2 IFOPT 6512 non-null object \n", 314 | " 3 NAME 6519 non-null object \n", 315 | " 4 Verkehr 6519 non-null object \n", 316 | " 5 Laenge 6519 non-null float64\n", 317 | " 6 Breite 6519 non-null float64\n", 318 | " 7 Betreiber_Name 6517 non-null object \n", 319 | " 8 Betreiber_Nr 5395 non-null float64\n", 320 | " 9 Status 24 non-null object \n", 321 | "dtypes: float64(3), int64(1), object(6)\n", 322 | "memory usage: 509.4+ KB\n" 323 | ] 324 | }, 325 | { 326 | "data": { 327 | "text/plain": [ 328 | "array(['neu', None], dtype=object)" 329 | ] 330 | }, 331 | "execution_count": 6, 332 | "metadata": {}, 333 | "output_type": "execute_result" 334 | } 335 | ], 336 | "source": [ 337 | "df.info()\n", 338 | "\n", 339 | "df['Status'].unique()" 340 | ] 341 | } 342 | ], 343 | "metadata": { 344 | "kernelspec": { 345 | "display_name": "Python 3", 346 | "language": "python", 347 | "name": "python3" 348 | }, 349 | "language_info": { 350 | "codemirror_mode": { 351 | "name": "ipython", 352 | "version": 3 353 | }, 354 | "file_extension": ".py", 355 | "mimetype": "text/x-python", 356 | "name": "python", 357 | "nbconvert_exporter": "python", 358 | "pygments_lexer": "ipython3", 359 | "version": "3.11.2" 360 | }, 361 | "orig_nbformat": 4 362 | }, 363 | "nbformat": 4, 364 | "nbformat_minor": 2 365 | } 366 | --------------------------------------------------------------------------------