├── .gitignore
├── Dockerfile
├── README.md
├── adventure_works.ipynb
├── air_pollution.ipynb
├── assets
    ├── forecast_prophet.csv
    ├── getml_logo.png
    ├── getml_logo_dark.png
    ├── loans-schema.png
    ├── pred_prophet_30d.csv
    └── zuordnung.json
├── atherosclerosis.ipynb
├── baseball.ipynb
├── consumer_expenditures.ipynb
├── cora.ipynb
├── cora_sota.ipynb
├── docker-compose.yml
├── dodgers.ipynb
├── fastprop_benchmark
    ├── air_pollution_prop.ipynb
    ├── comparisons
    │   ├── air_pollution.csv
    │   ├── auc-rsquared_fps.png
    │   ├── comparisons.py
    │   ├── dodgers.csv
    │   ├── fps.png
    │   ├── fps_performance.png
    │   ├── interstate94.csv
    │   ├── nrpf.png
    │   ├── nrpf_performance.png
    │   ├── occupancy.csv
    │   ├── performance.png
    │   └── robot.csv
    ├── dodgers_prop.ipynb
    ├── interstate94_prop.ipynb
    ├── occupancy_prop.ipynb
    └── robot_prop.ipynb
├── formula1.ipynb
├── imdb.ipynb
├── interstate94.ipynb
├── kaggle_notebooks
    ├── cora_getml_vs_gnn.ipynb
    ├── data
    │   └── Epileptic Seizure Recognition.csv
    ├── epilepsy_recognition.ipynb
    ├── getml-and-gnns-a-natural-symbiosis.ipynb
    └── resources
    │   ├── Cover.png
    │   ├── CoverWithTitle.png
    │   ├── Examples2.png
    │   ├── FeatureEngineeringExample2.png
    │   ├── Linkedin_Optimized_Cover.png
    │   └── ManualFeatureEngineering2.png
├── loans.ipynb
├── movie_lens.ipynb
├── occupancy.ipynb
├── online_retail.ipynb
├── requirements.txt
├── robot.ipynb
├── seznam.ipynb
├── sfscores.ipynb
├── stats.ipynb
└── utils
    ├── __init__.py
    ├── add_original_columns.py
    ├── benchmark.py
    ├── ft_time_series_builder.py
    ├── load.py
    ├── print_time_taken.py
    ├── remove_target_column.py
    ├── ts_fresh_builder.py
    └── zuordnung.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | **/.DS_Store
  2 | 
  3 | # Temp
  4 | preparation/
  5 | 
  6 | # Binder
  7 | .bash_logout
  8 | .bashrc
  9 | .cache/
 10 | .config/
 11 | .getML/
 12 | .local/
 13 | .npm/
 14 | .yarn/
 15 | .jupyter/
 16 | .profile
 17 | .gitconfig
 18 | binder/getml/
 19 | .binder/
 20 | 
 21 | # Byte-compiled / optimized / DLL files
 22 | __pycache__/
 23 | *.py[cod]
 24 | *$py.class
 25 | 
 26 | # C extensions
 27 | *.so
 28 | 
 29 | # Distribution / packaging
 30 | .Python
 31 | build/
 32 | develop-eggs/
 33 | dist/
 34 | downloads/
 35 | eggs/
 36 | .eggs/
 37 | lib/
 38 | lib64/
 39 | parts/
 40 | sdist/
 41 | var/
 42 | wheels/
 43 | pip-wheel-metadata/
 44 | share/python-wheels/
 45 | *.egg-info/
 46 | .installed.cfg
 47 | *.egg
 48 | MANIFEST
 49 | 
 50 | # PyInstaller
 51 | #  Usually these files are written by a python script from a template
 52 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 53 | *.manifest
 54 | *.spec
 55 | 
 56 | # Installer logs
 57 | pip-log.txt
 58 | pip-delete-this-directory.txt
 59 | 
 60 | # Unit test / coverage reports
 61 | htmlcov/
 62 | .tox/
 63 | .nox/
 64 | .coverage
 65 | .coverage.*
 66 | .cache
 67 | nosetests.xml
 68 | coverage.xml
 69 | *.cover
 70 | *.py,cover
 71 | .hypothesis/
 72 | .pytest_cache/
 73 | 
 74 | # Translations
 75 | *.mo
 76 | *.pot
 77 | 
 78 | # Django stuff:
 79 | *.log
 80 | local_settings.py
 81 | db.sqlite3
 82 | db.sqlite3-journal
 83 | 
 84 | # Flask stuff:
 85 | instance/
 86 | .webassets-cache
 87 | 
 88 | # Scrapy stuff:
 89 | .scrapy
 90 | 
 91 | # Sphinx documentation
 92 | docs/_build/
 93 | 
 94 | # PyBuilder
 95 | target/
 96 | 
 97 | # Jupyter Notebook
 98 | .ipynb_checkpoints
 99 | 
100 | # IPython
101 | profile_default/
102 | ipython_config.py
103 | 
104 | # pyenv
105 | .python-version
106 | 
107 | # pipenv
108 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
109 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
110 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
111 | #   install all needed dependencies.
112 | #Pipfile.lock
113 | 
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
115 | __pypackages__/
116 | 
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 | 
121 | # SageMath parsed files
122 | *.sage.py
123 | 
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 | 
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 | 
137 | # Rope project settings
138 | .ropeproject
139 | 
140 | # mkdocs documentation
141 | /site
142 | 
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 | 
148 | # Pyre type checker
149 | .pyre/
150 | 
151 | # generated
152 | *_spark/
153 | *_pipeline/
154 | .vscode/
155 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # getml-base
 3 | FROM python:3.11.8 AS getml-base
 4 | 
 5 | RUN useradd getml
 6 | USER getml
 7 | WORKDIR /home/getml
 8 | 
 9 | COPY --chown=getml:getml --chmod=0777 ./requirements.txt /home/getml/requirements.txt
10 | 
11 | ENV PATH="/home/getml/.local/bin:$PATH"
12 | 
13 | RUN python3.11 \
14 |     -mpip install \
15 |     -r /home/getml/requirements.txt
16 | 
17 | RUN mkdir /home/getml/.getML
18 | 
19 | 
20 | ################################################################################
21 | # getml-demo
22 | FROM getml-base AS getml-demo
23 | 
24 | ARG TARGETARCH
25 | ARG TARGETOS
26 | RUN \
27 |     if [ "${TARGETARCH}" = "amd64" ]; then \
28 |         export GETML_ARCH="x64" ;\
29 |     else \
30 |         export GETML_ARCH="${TARGETARCH}" ;\
31 |     fi; \
32 |     export GETML_VERSION=$(grep -oP '(?<=getml==)\d+\.\d+\.\d+' requirements.txt) ;\
33 |     export GETML_BUCKET="https://go.getml.com/static/demo/download" ;\
34 |     export GETML_ENGINE_FILE="getml-${GETML_VERSION}-${GETML_ARCH}-${TARGETOS}.tar.gz" ;\
35 |     export GETML_ENGINE_URL="${GETML_BUCKET}/${GETML_VERSION}/${GETML_ENGINE_FILE}" ;\
36 |     echo "Downloading getML engine from ${GETML_ENGINE_URL}" ;\
37 |     curl ${GETML_ENGINE_URL} | tar -C /home/getml/.getML -xvzf -
38 | 
39 | COPY --chown=getml:getml . /home/getml/demo/
40 | 
41 | EXPOSE 1709 8888
42 | CMD [ "/home/getml/.local/bin/jupyter", "lab", "--ip='*'", "--port=8888", "--notebook-dir='/home/getml/demo'" ]
43 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <p align="center" style="text-align: center;">
  2 |     <img width="400" style="width: 50% !important; max-width: 400px;" src="assets/getml_logo_dark.png#gh-dark-mode-only" />
  3 |     <img width="400" style="width: 50% !important; max-width: 400px;" src="assets/getml_logo.png#gh-light-mode-only" />
  4 | </p>
  5 | 
  6 | <p align="center" style="text-align: center;">
  7 |     <i>getML combines feature learning with AutoML to build end-to-end prediction pipelines</i>
  8 | </p>
  9 | 
 10 | <p align="center" style="text-align: center;">
 11 |         <a href="https://getml.com/latest/contact" target="_blank">
 12 |         <img src="https://img.shields.io/badge/schedule-a_meeting-blueviolet.svg" /></a>
 13 |         <a href="mailto:hello@getml.com" target="_blank">
 14 |         <img src="https://img.shields.io/badge/contact-us_by_mail-orange.svg" /></a>
 15 | </p>
 16 | 
 17 | <br>
 18 | <span style="display: block; border-bottom: 1px solid #eaecef;"></span>
 19 | <br>
 20 | 
 21 | # Introduction
 22 | 
 23 | This repository contains different [Jupyter Notebooks](https://jupyter.org) to demonstrate the capabilities of [getML](https://www.getml.com) in the realm of machine learning on relational data-sets in various domains. getML and its feature engineering algorithms ([FastProp](https://getml.com/latest/user_guide/concepts/feature_engineering/#feature-engineering-algorithms-fastprop), [Multirel](https://getml.com/latest/user_guide/concepts/feature_engineering/#feature-engineering-algorithms-multirel), [Relboost](https://getml.com/latest/user_guide/concepts/feature_engineering/#feature-engineering-algorithms-relboost), [RelMT](https://getml.com/latest/user_guide/concepts/feature_engineering/#feature-engineering-algorithms-relmt)), its [predictors](https://getml.com/latest/user_guide/concepts/predicting#using-getml) (LinearRegression, LogisticRegression, XGBoostClassifier, XGBoostRegressor) and its [hyperparameter optimizer](https://getml.com/latest/user_guide/concepts/hyperopt#hyperparameter-optimization) (RandomSearch, LatinHypercubeSearch, GaussianHyperparameterSearch), are benchmarked against competing tools in similar categories, like [featuretools](https://www.featuretools.com/), [tsfresh](https://tsfresh.com/), and [prophet](https://facebook.github.io/prophet/). While [FastProp](https://getml.com/latest/user_guide/concepts/feature_engineering/#feature-engineering-algorithms-fastprop) usually outperforms the competition in terms of runtime and resource requirements, the more sophisticated algorithms ([Multirel](https://getml.com/latest/user_guide/concepts/feature_engineering/#feature-engineering-algorithms-multirel), [Relboost](https://getml.com/latest/user_guide/concepts/feature_engineering/#feature-engineering-algorithms-relboost), [RelMT](https://getml.com/latest/user_guide/concepts/feature_engineering/#feature-engineering-algorithms-relmt)), which are part of the [Enterprise edition](https://getml.com/latest/enterprise), often lead to even higher accuracy while maintaining low resource requirements. The demonstrations are done on publicly available data-sets, which are standardly used for such comparisons.
 24 | 
 25 | # Table of Contents
 26 | 
 27 | - [Introduction](#introduction)
 28 | - [Table of Contents](#table-of-contents)
 29 | - [Usage](#usage)
 30 |   - [Reading Online](#reading-online)
 31 |   - [Experimenting Locally](#experimenting-locally)
 32 |     - [Using Docker](#using-docker)
 33 |     - [On the Machine (Linux/x64 \& arm64)](#on-the-machine-linuxx64--arm64)
 34 | - [Notebooks](#notebooks)
 35 |   - [Overview](#overview)
 36 |   - [Descriptions](#descriptions)
 37 |   - [Quick access by grouping by](#quick-access-by-grouping-by)
 38 |   - [Benchmarks](#benchmarks)
 39 |     - [FastProp Benchmarks](#fastprop-benchmarks)
 40 |     - [Further Benchmarks in the Relational Dataset Repository](#further-benchmarks-in-the-relational-dataset-repository)
 41 | 
 42 | # Usage
 43 | 
 44 | The provided notebooks can be checked and used in different ways. 
 45 | 
 46 | ## Reading Online 
 47 | 
 48 | As github renders the notebooks, they can each be viewed by just opening and scrolling through them. For convenience, the output of each cells execution is included.
 49 | 
 50 | ## Experimenting Locally
 51 | 
 52 | To experiment with the notebooks, such as playing with different pipelines and predictors, it is best to run them on a local machine. Linux users with an x64 architecture can choose from one of the options provided below. Soon, we will offer a simple, container-based solution compatible with all major systems (Windows, Mac) and will also support ARM-based architectures.
 53 | 
 54 | ### Using Docker
 55 | 
 56 | There are a `docker-compose.yml` and a `Dockerfile` for easy usage provided.
 57 | 
 58 | Simply clone this repository and run the docker command to start the `notebooks` service. The image it depends on will be build if it is not already available.
 59 | 
 60 | ```
 61 | $ git clone https://github.com/getml/getml-demo.git  
 62 | $ docker compose up notebooks  
 63 | ```
 64 | 
 65 | To open Jupyter Lab in the browser, look for the following lines in the output and copy-paste them in your browser:
 66 | 
 67 | ```
 68 | Or copy and paste one of these URLs:
 69 | 
 70 | http://localhost:8888/lab?token=<generated_token>
 71 | ```
 72 | 
 73 | After the first `getml.engine.launch(...)` is executed and the Engine is started, the corresponding Monitor can be opened in the browser under
 74 | 
 75 | ```
 76 | http://localhost:1709/#/token/token
 77 | ```
 78 | 
 79 | > [!NOTE]  
 80 | > Using alternatives to [Docker Desktop](https://www.docker.com/products/docker-desktop) like  
 81 | > * [Podman](https://podman.io),  
 82 | > * [Podman Desktop](https://podman-desktop.io) or  
 83 | > * [Rancher Desktop](https://rancherdesktop.io) with a container engine like dockerd(moby) or containerd(nerdctl)
 84 | > 
 85 | > allows bind-mounting the notebooks in a user-writeable way (this might need to be included: `userns_mode: keep-id`) instead of having to `COPY` them in. In combination with volume-binding `/home/user/.getML/logs` and `/home/user/.getML/projects`, runs and changes can be persisted across containers.
 86 | 
 87 | ### On the Machine (Linux/x64 & arm64)
 88 | 
 89 | Alternatively, getML and the notebooks can be run natively on the local Linux machine by having certain software installed, like Python and some Python libraries, Jupyter-Lab and the getML Engine. The [getML Python library](https://github.com/getml/getml-community) provides an Engine version without [Enterprise features](https://getml.com/latest/enterprise). In order to replicate Enterprise functionalities in the notebooks, you may obtain an [Enterprise trial version](https://getml.com/latest/enterprise/request-trial).
 90 | 
 91 | The following commands will set up a Python environment with necessary Python libraries and the getML Enterprise trial version, and Jupyter-Lab
 92 | 
 93 | ```
 94 | $ git clone https://github.com/getml/getml-demo.git  
 95 | $ cd getml-demo  
 96 | $ pipx install hatch
 97 | $ hatch env create
 98 | $ hatch shell
 99 | $ pip install -r requirements.txt
100 | $ jupyter-lab
101 | ```
102 | 
103 | > [!TIP]  
104 | > Install the [Enterprise trial version](https://getml.com/latest/enterprise/request-trial) via the  [Install getML on Linux guide](https://getml.com/latest/install/packages/linux#install-getml-on-linux) to try the Enterprise features.
105 | 
106 | With the last command, Jupyter-Lab should automatically open in the browser. If not, look for the following lines in the output and copy-paste it in your browser:
107 | 
108 | ```
109 | Or copy and paste one of these URLs:
110 | 
111 | http://localhost:8888/lab?token=<generated_token>
112 | ```
113 | 
114 | After the first `getml.engine.launch(...)` is executed and the Engine is started, the corresponding Monitor can be opened in the browser under
115 | 
116 | ```
117 | http://localhost:1709/#/token/token
118 | ```
119 | 
120 | # Notebooks
121 | 
122 | This repository contains various demonstrational projects to help getting started with relational learning and getML. They cover different aspects of the software, and can serve as documentation or as blueprints for own projects.
123 | 
124 | Each project solves a typical data science problem in a specific domain. You
125 | can either choose a project by domain or by the underlying machine learning
126 | problem, e.g. binary classification on a time series or regression using a
127 | relational data scheme involving many tables.
128 | 
129 | ## Overview
130 | 
131 | |                                                               | Task           | Data                     | Size               | Domain         |
132 | | ------------------------------------------------------------- | -------------- | ------------------------ | ------------------ | -------------- |
133 | | [AdventureWorks: Predicting customer churn][adventureworksnb] | Classification | Relational               | 71 tables, 233 MB  | Commerce       |
134 | | [Air pollution prediction][airpollutionnb]                    | Regression     | Multivariate time series | 1 table, 41k rows  | Environment    |
135 | | [Disease lethality prediction][atherosclerosisnb]             | Classification | Relational               | 3 tables, 22 MB    | Health         |
136 | | [Baseball (Lahman): Predicting salaries][baseballnb]          | Regression     | Relational               | 25 tables, 74 MB   | Sports         |
137 | | [Expenditure categorization][consumerexpendituresnb]          | Classification | Relational               | 3 tables, 150 MB   | E-commerce     |
138 | | [CORA: Categorizing academic studies][coranb]                 | Classification | Relational               | 3 tables, 4.6 MB   | Academia       |
139 | | [Traffic volume prediction (LA)][dodgersnb]                   | Regression     | Multivariate time series | 1 table, 47k rows  | Transportation |
140 | | [Formula 1 (ErgastF1): Predicting the winner][formula1nb]     | Classification | Relational               | 13 tables, 56 MB   | Sports         |
141 | | [IMDb: Predicting actors' gender][imdbnb]                     | Classification | Relational with text     | 7 tables, 477.1 MB | Entertainment  |
142 | | [Traffic volume prediction (I94)][interstate94nb]             | Regression     | Multivariate time series | 1 table, 24k rows  | Transportation |
143 | | [Financial: Loan default prediction][loansnb]                 | Classification | Relational               | 8 tables, 60 MB    | Financial      |
144 | | [MovieLens: Predicting users' gender][movielensnb]            | Classification | Relational               | 7 tables, 20 MB    | Entertainment  |
145 | | [Occupancy detection][occupancynb]                            | Classification | Multivariate time series | 1 table, 32k rows  | Energy         |
146 | | [Order cancellation][onlineretailnb]                          | Classification | Relational               | 1 table, 398k rows | E-commerce     |
147 | | [Predicting a force vector from sensor data][robotnb]         | Regression     | Multivariate time series | 1 table, 15k rows  | Robotics       |
148 | | [Seznam: Predicting the transaction volume][seznamnb]         | Regression     | Relational               | 4 tables, 147 MB   | E-commerce     |
149 | | [SFScores: Predicting health check scores][sfscoresnb]        | Regression     | Relational               | 3 tables, 9 MB     | Restaurants    |
150 | | [Stats: Predicting users' reputation][statsnb]                | Regression     | Relational               | 8 tables, 658 MB   | Internet       |
151 | 
152 | ## Descriptions
153 | 
154 | <details>
155 |   <summary>Adventure Works - Predicting customer churn</summary>
156 |   
157 |   In the notebook, we demonstrate how getML can be used for a customer churn project using a synthetic dataset of a fictional company. We also benchmark getML against featuretools.
158 |   
159 |   AdventureWorks is a fictional company, that sells bicycles. It is used by Microsoft to showcase how its MS SQL Server can be used to manage business data. Since the dataset resembles a real-world customer database and it is open-source, we use it to showcase, how getML can be used for a classic customer churn project (real customer databases are not easily available for the purposes of showcasing and benchmarking, for reasons of data privacy).
160 |   
161 |   * Prediction type: Classification model
162 |   * Domain: Customer loyalty
163 |   * Prediction target: churn
164 |   * Population size: 19704
165 |   
166 |   [> Open Notebook <](adventure_works.ipynb)
167 | </details>
168 | 
169 | <details>
170 |   <summary>Air Pollution - Why feature learning is better than simple propositionalization</summary>
171 |   
172 |   In the notebook we compare getML to featuretools and tsfresh, both of which are open-source libraries for feature engineering using propositionalization approaches. We showcase, that advanced algorithms featured in getML yield significantly better predictions on the dataset.
173 |   
174 |   The propositionalization methods usually work as follows:
175 |   
176 |   * Generate a large number of hard-coded features,
177 |   * Use feature selection to pick a percentage of these features.
178 |   
179 |   By contrast, getML contains approaches for feature learning, which adapts machine learning approaches, such as decision trees or gradient boosting, to the problem of extracting features from relational data and time series.
180 |   
181 |   * Prediction type: Regression model
182 |   * Domain: Air pollution
183 |   * Prediction target: pm 2.5 concentration
184 |   * Source data: Multivariate time series
185 |   * Population size: 41757
186 |   
187 |   [> Open Notebook <](air_pollution.ipynb)
188 |   
189 |   [> Open FastProp Benchmark Notebook <](fastprop_benchmark/air_pollution_prop.ipynb)
190 | </details>
191 | 
192 | <details>
193 |   <summary>Atherosclerosis - Disease lethality prediction</summary>
194 |   
195 |   With the notebook we give a brief introduction to feature engineering on relational data with many columns. We discuss why feature engineering on such data is particularly challenging and what we can do to overcome these problems.
196 |   
197 |   Every column, that we have, can either be aggregated or it can be used for our conditions. That means, if we have n columns to aggregate, we can potentially build conditions for n other columns. In other words, the computational complexity is n\*n in the number of columns.
198 |   
199 |   Note, that this problem occurs regardless of whether you automate feature engineering or you do it by hand. The size of the search space is in the number of columns in either case, unless you can rule something out a-priori.
200 |   
201 |   An algorithm, that generates specific features, can only use columns for conditions, it is not allowed to aggregate columns – and it doesn't need to do so. That means, the computational complexity is linear instead of quadratic. For data sets with a large number of columns this can make all the difference in the world. For instance, if you have 100 columns, the size of the search space of the second approach is only 1% of the size of the search space of the first one.
202 |   
203 |   To illustrate the problem of dimensionality in predictive analytics on relational data, we use a longitudinal study of atherosclerosis patients. One of its defining features is, that it contains many columns, which makes it a good candidate to illustrate the problem discussed in the notebook.
204 |   
205 |   The way, these studies handle the large number of columns in the data set, is to divide the columns into subgroups and then handling each subgroup separately. Even though this is one way to overcome the curse of dimensionality, it is not a very satisfying approach. We would like to be able to handle a large number of columns at once.
206 |   
207 |   * Prediction type: Binary classification
208 |   * Domain: Health
209 |   * Prediction target: Mortality within one year
210 |   * Source data: 146 columns in 2 tables, 22 MB
211 |   * Population size: 28433
212 |   
213 |   [> Open Notebook <](atherosclerosis.ipynb)
214 | </details>
215 | 
216 | <details>
217 |   <summary>Baseball - Predicting players' salary</summary>
218 |   
219 |   In the notebook, we benchmark several of getML's feature learning algorithms against featuretools using a dataset related to baseball players' salary.
220 |   
221 |   In the late 1990s, the Oakland Athletics began focusing on the idea of sabermetrics, using statistical methods to identify undervalued baseball players. This was done to compensate for the fact, that the team had a significantly smaller budget than most other teams in its league. Under its general manager Billy Beane, the Oakland Athletics became the first team in over 100 years to win 20 consecutive games in a row, despite still being significantly disadvantaged in terms of its budget. After this remarkable success, the use of sabermetrics quickly became the norm in baseball. These events have been documented in a bestselling book and a movie, both called Moneyball.
222 |   
223 |   In the notebook we demonstrate, that relational learning can be used for sabermetrics. Specifically, we develop a model to predict players' salary using getML's statistical relational learning algorithms. Such predictions can be used to identify undervalued players.
224 |   
225 |   * Prediction type: Regression model
226 |   * Domain: Sports
227 |   * Prediction target: Salaries
228 |   * Population size: 23111
229 |   
230 |   [> Open Notebook <](baseball.ipynb)
231 | </details>
232 | 
233 | <details>
234 |   <summary>Consumer Expenditures - Consumer expenditure categorization</summary>
235 |   
236 |   The notebook demonstrates how powerful a real relational learning algorithm can be. Based on a public-domain dataset on consumer behavior, we use a propostionalization algorithm to predict, whether purchases were made as a gift. We show, that with relational learning, we can get an AUC of over 90%. The generated features would have been impossible to build by hand or by using brute-force approaches.
237 |   
238 |   There are many subdomains of relational learning, but the most important one is extracting features from relational data: Most business data is relational, meaning that it is spread out over several relational tables. However, most machine learning algorithms require, that the data be presented in the form of a single flat table. So we need to extract features from our relational data. Some people also call this data wrangling.
239 |   
240 |   The Consumer Expenditure Data Set is a public domain data set provided by the American Bureau of Labor Statistics. It includes the diary entries, where American consumers are asked to keep diaries of the products they have purchased each month. These consumer goods are categorized using a six-digit classification system: the UCC. This system is hierarchical, meaning that every digit represents an increasingly granular category. The diaries also contain a flag that indicates whether the product was purchased as a gift. The challenge is to predict that flag using other information in the diary entries.
241 |   
242 |   * Prediction type: Classification model
243 |   * Domain: Retail
244 |   * Prediction target: If a purchase is a gift
245 |   * Source data: Relational data set, 4 tables
246 |   * Population size: 2.020.634
247 |   
248 |   [> Open Notebook <](consumer_expenditures.ipynb)
249 | </details>
250 | 
251 | <details>
252 |   <summary>CORA - Categorizing academic publications</summary>
253 |   
254 |   In the notebook, we compare getML against extant approaches in the relational learning literature on the CORA data set, which is often used for benchmarking. We demonstrate, how getML performs against the state of the art in the relational learning literature on this data set. Beyond the benchmarking aspects, this notebooks showcases getML's excellent capabilities in dealing with categorical data.
255 |   
256 |   CORA is a well-known benchmarking dataset in the academic literature on relational learning. The dataset contains 2708 scientific publications on machine learning. The papers are divided into 7 categories. The challenge is to predict the category of a paper based on the papers it cites, the papers it is cited by and keywords contained in the paper.
257 |   
258 |   * Prediction type: Classification model
259 |   * Domain: Academia
260 |   * Prediction target: The category of a paper
261 |   * Population size: 2708
262 |   
263 |   [> Open Notebook <](cora.ipynb)
264 | </details>
265 | 
266 | <details>
267 |   <summary>Dodgers - Traffic volume prediction on LA's 101 North freeway</summary>
268 |   
269 |   In the notebook we demonstrate a time series application of getML. We benchmark our results against Facebook's Prophet and tsfresh.
270 |   
271 |   The data set features some particularly interesting characteristics common for time series, which classical models may struggle to deal with. Such characteristics are:
272 |   
273 |   * High frequency (every five minutes)
274 |   * Dependence on irregular events (holidays, Dodgers games)
275 |   * Strong and overlapping cycles (daily, weekly)
276 |   * Anomalies
277 |   * Multiple seasonalities
278 |   
279 |   To quote the maintainers of the data set:
280 |   
281 |   > This loop sensor data was collected for the Glendale on ramp for the 101 North freeway in Los Angeles. It is close enough to the stadium to see unusual traffic after a Dodgers game, but not so close and heavily used by game traffic so that the signal for the extra traffic is overly obvious.
282 |   
283 |   * Prediction type: Regression model
284 |   * Domain: Transportation
285 |   * Prediction target: traffic volume
286 |   * Source data: Univariate time series
287 |   * Population size: 47497
288 |   
289 |   [> Open Notebook <](dodgers.ipynb)
290 |   
291 |   [> Open FastProp Benchmark Notebook <](fastprop_benchmark/dodgers_prop.ipynb)
292 | </details>
293 | 
294 | <details>
295 |   <summary>Formula 1 - Predicting the winner of a race</summary>
296 |   
297 |   In the notebook we benchmark getML against featuretools to predict the winner of a Formula 1 race.
298 |   
299 |   We develop a prediction model for Formula 1 races, that allows us to predict the winner of a race before the race has started.
300 |   
301 |   We use a dataset of all Formula 1 races from 1950 to 2017. The dataset includes information such as the time taken in each lap, the time taken for pit stops, the performance in the qualifying rounds etc.
302 |   
303 |   * Prediction type: Classification model
304 |   * Domain: Sports
305 |   * Prediction target: Win
306 |   * Population size: 31578
307 |   
308 |   [> Open Notebook <](formula1.ipynb)
309 | </details>
310 | 
311 | <details>
312 |   <summary>IMDB - Predicting actors' gender</summary>
313 |   
314 |   In the notebook, we demonstrate how getML can be applied to text fields. In relational databases, text fields are less structured and less standardized than categorical data, making it more difficult to extract useful information from them. When using a relational learning tool such as getML, we can easily generate simple features from text fields and leverage the information contained therein.
315 |   
316 |   As an example data set, we use the Internet Movie Database, which has been used by previous studies in the relational learning literature. This allows us to benchmark our approach to state-of-the-art algorithms in the relational learning literature.
317 |   
318 |   The data set contains about 800,000 actors. The goal is to predict the gender of said actors based on other information we have about them, such as the movies they have participated in and the roles they have played in these movies.
319 |   
320 |   * Prediction type: Classification model
321 |   * Domain: Entertainment
322 |   * Prediction target: The gender of an actor
323 |   * Population size: 817718
324 |   
325 |   [> Open Notebook <](imdb.ipynb)
326 | </details>
327 | 
328 | <details>
329 |   <summary>Interstate 94 - Hourly traffic volume prediction on Interstate 94</summary>
330 |   
331 |   In the notebook, we demonstrate a time series application of getML. We predict the hourly traffic volume on I-94 westbound from Minneapolis-St Paul. We benchmark our results against Facebook's Prophet.
332 |   
333 |   The dataset features some particularly interesting characteristics common for time series, which classical models may struggle to deal with appropriately. Such characteristics are:
334 |   
335 |   * High frequency (hourly)
336 |   * Dependence on irregular events (holidays)
337 |   * Strong and overlapping cycles (daily, weekly)
338 |   * Anomalies
339 |   * Multiple seasonalities
340 |   
341 |   <br>
342 |   
343 |   * Prediction type: Regression model
344 |   * Domain: Transportation
345 |   * Prediction target: Hourly traffic volume
346 |   * Source data: Multivariate time series, 5 components
347 |   * Population size: 24096
348 |   
349 |   [> Open Notebook <](interstate94.ipynb)
350 |   
351 |   [> Open FastProp Benchmark Notebook <](fastprop_benchmark/interstate94_prop.ipynb)
352 | </details>
353 | 
354 | <details>
355 |   <summary>Loans - Predicting the loan default risk of Czech bank customers</summary>
356 |   
357 |   The notebook demonstrates the application of our relational learning algorithm to predict if a customer of a bank will default on his loan. We train the predictor on customer metadata, transaction history, as well as other successful and unsuccessful loans.
358 |   
359 |   The notebook features a textbook example of predictive analytics applied to the financial sector. A loan is the lending of money to companies or individuals. Banks grant loans in exchange for the promise of repayment. Loan default is defined as the failure to meet this legal obligation, for example, when a home buyer fails to make a mortgage payment. A bank needs to estimate the risk it carries when granting loans to potentially non-performing customers.
360 |   
361 |   * Prediction type: Binary classification
362 |   * Domain: Finance
363 |   * Prediction target: Loan default
364 |   * Source data: 8 tables, 78.8 MB
365 |   * Population size: 682
366 |   
367 |   [> Open Notebook <](loans.ipynb)
368 | </details>
369 | 
370 | <details>
371 |   <summary>Movie Lens - Predicting a user's gender based on the movies they have watched</summary>
372 |   
373 |   In the notebook we apply getML to a dataset, that is often used for benchmarking in the relational learning literature: The MovieLens dataset.
374 |   
375 |   The MovieLens dataset is often used in the relational learning literature as a benchmark for newly developed algorithms. Following the tradition, we benchmark getML's own algorithms on this dataset as well. The task is to predict a user's gender based on the movies they have watched.
376 |   
377 |   * Prediction type: Classification model
378 |   * Domain: Entertainment
379 |   * Prediction target: The gender of a user
380 |   * Population size: 6039
381 |   
382 |   [> Open Notebook <](movie_lens.ipynb)
383 | </details>
384 | 
385 | <details>
386 |   <summary>Occupancy - Occupancy detection</summary>
387 |   
388 |   The notebook demonstrates, how to apply getML to multivariate time series and how to use getML's high-level interface for hyperparameter tuning.
389 |   
390 |   Our use case is a public domain data set for predicting room occupancy from sensor data. Note, that this is not only a neat use case for machine learning algorithms, but a real-world application with tangible consequences: If room occupancy is known with sufficient certainty, it can be applied to the control systems of a building. Such as system can reduce the energy consumption by up to 50 %.
391 |   
392 |   Instead of creating features by merging and aggregating peripheral tables in a relational data model, for a (multivariate) time-series, we perform the same operations on the population table itself. This results in features like these:
393 |   
394 |   * Aggregations over time, such as the average value of some column for the last 3 days.
395 |   * Seasonal effects, such as today is a Wednesday, so let's get the average value for the last four Wednesdays.
396 |   * Lag variables, such as get the value of some column from two hours ago.
397 |   
398 |   Using getML's algorithms for relational learning, we extract all of these features automatically. Having created a flat table of such features, we then apply state-of-the-art machine learning algorithms, like xgboost.
399 |   
400 |   The present analysis is based on a public domain time series dataset. It is available in the UC Irvine Machine Learning Repository. The challenge is straightforward: We want to predict whether an office room is occupied at a given moment in time using sensor data. The data is measured about once a minute. Ground-truth occupancy was obtained from time-stamped pictures. The available columns are
401 |   
402 |   * Date, year-month-day hour:minute:second
403 |   * Temperature, in Celsius
404 |   * Relative Humidity, %
405 |   * Light, in Lux
406 |   * CO2, in ppm
407 |   * Humidity Ratio, Derived quantity from temperature and relative humidity, in kgwater-vapor/kg-air
408 |   * Occupancy, 0 or 1, 0 for not occupied, 1 for occupied status
409 |   
410 |   <br>
411 |   
412 |   * Prediction type: Binary classification
413 |   * Domain: Energy
414 |   * Prediction target: Room occupancy
415 |   * Source data: 1 table, 32k rows
416 |   * Population size: 32k
417 |   
418 |   [> Open Notebook <](occupancy.ipynb)
419 |   
420 |   [> Open FastProp Benchmark Notebook <](fastprop_benchmark/occupancy_prop.ipynb)
421 | </details>
422 | 
423 | <details>
424 |   <summary>Online Retail - Predicting order cancellations</summary>
425 |   
426 |   The notebook demonstrate, how getML can be applied in an e-commerce context. We also show, that we can significantly improve our results by using getML's built-in hyperparameter tuning routines.
427 |   
428 |   The data set contains about 400,000 orders from a British online retailer. Each order consists of a product that has been ordered and a corresponding quantity. Several orders can be summarized onto a single invoice. The goal is to predict whether an order will be cancelled.
429 |   
430 |   Because the company mainly sells to other businesses, the cancellation rate is relatively low, namely 1.83%.
431 |   
432 |   * Prediction type: Classification model
433 |   * Domain: E-commerce
434 |   * Prediction target: Whether an order will be cancelled
435 |   * Population size: 397925
436 |   
437 |   [> Open Notebook <](online_retail.ipynb)
438 | </details>
439 | 
440 | <details>
441 |   <summary>Robot - Feature engineering on sensor data - how to overcome feature explosion</summary>
442 |   
443 |   The purpose of this notebook is to illustrate, how we can overcome the feature explosion problem based on an example dataset involving sensor data.
444 |   
445 |   Every column that we have, can either be aggregated or it can be used for our conditions. That means, if we have n columns to aggregate, we can potentially build conditions for n other columns. In other words, the computational complexity is n\*n in the number of columns. Note, that this problem occurs regardless of whether you automate feature engineering or you do it by hand. The size of the search space is n\*n in the number of columns in either case, unless you can rule something out a-priori. This problem is known as feature explosion.
446 |   
447 |   An algorithm, that generates specific different features can only use columns for conditions, it is not allowed to aggregate columns – and it doesn't need to do so. That means, the computational complexity is linear instead of quadratic. For data sets with a large number of columns, this can make all the difference in the world. For instance, if you have 100 columns the size of the search space of the second approach is only 1% of the size of the search space of the first one.
448 |   
449 |   getML features an algorithm called Relboost, which generates features according to this principle and is therefore very suitable for data sets with many columns.
450 |   
451 |   To illustrate the problem, we use a data set related to robotics. When robots interact with humans, the most important thing is, that they don't hurt people. In order to prevent such accidents, the force vector on the robot's arm is measured. However, measuring the force vector is expensive. Therefore, we want consider an alternative approach, where we would like to predict the force vector based on other sensor data that are less costly to measure. To do so, we use machine learning. However, the data set contains measurements from almost 100 different sensors and we do not know which and how many sensors are relevant for predicting the force vector.
452 |   
453 |   * Prediction type: Regression
454 |   * Domain: Robotics
455 |   * Prediction target: The force vector on the robot's arm
456 |   * Population size: 15001
457 |   
458 |   [> Open Notebook <](robot.ipynb)
459 |   
460 |   [> Open FastProp Benchmark Notebook <](fastprop_benchmark/robot_prop.ipynb)
461 | </details>
462 | 
463 | <details>
464 |   <summary>Seznam - Predicting the transaction volume </summary>
465 |   
466 |   Seznam is a Czech company with a scope similar to Google. The purpose of the notebook is to analyze data from Seznam's wallet, predicting the transaction volume.
467 |   
468 |   * Prediction type: Regression model
469 |   * Domain: E-commerce
470 |   * Prediction target: Transaction volume
471 |   * Population size: 1,462,078
472 |   
473 |   [> Open Notebook <](seznam.ipynb)
474 | </details>
475 | 
476 | <details>
477 |   <summary>SFScores - Predicting the Results of Health Inspections of Restaurants</summary>
478 |   
479 |   In the notebook, we benchmark several of getML's feature learning algorithms against featuretools using the San Francisco Dept. of Public Health's database of eateries in San Francisco. These eateries are regularly inspected. The inspections often result in a score. The challenge is to predict the score resulting from an inspection.
480 |   
481 |   * Prediction type: Regression model
482 |   * Domain: Health
483 |   * Prediction target: Sales
484 |   * Population size: 12887
485 |   
486 |   [> Open Notebook <](sfscores.ipynb)
487 | </details>
488 | 
489 | <details>
490 |   <summary>Stats - Predicting Users' Reputations</summary>
491 |   
492 |   In the notebook we use relational learning techniques to predict users' reputation on StackExchange. StatsExchange is a website similar to StackOverflow, but based on statistics and machine learning. Much like StackOverflow, it has a complicated system of calculating users' reputation.
493 |   
494 |   * Prediction type: Regression model
495 |   * Domain: Internet
496 |   * Prediction target: Reputation
497 |   * Population size: 41793
498 |   
499 |   [> Open Notebook <](stats.ipynb)
500 | </details>
501 | 
502 | ## Quick access by grouping by
503 | 
504 | <details>
505 |   <summary>Task</summary>
506 | 
507 |   - <details>
508 |     <summary>Classification</summary>
509 | 
510 |     * [Adventure Works][adventureworksnb]                      
511 |     * [Atherosclerosis][atherosclerosisnb]                     
512 |     * [Consumer Expenditures][consumerexpendituresnb]          
513 |     * [CORA][coranb]                                           
514 |     * [Formula 1][formula1nb]                                  
515 |     * [IMDB][imdbnb]                                           
516 |     * [Loans][loansnb]                                         
517 |     * [MovieLens][movielensnb]                                 
518 |     * [Occupancy][occupancynb]                                 
519 |     * [Online Retail][onlineretailnb]                          
520 | 
521 |     </details>
522 |   - <details>
523 |     <summary>Regression</summary>
524 | 
525 |     * [Air Pollution][airpollutionnb]                          
526 |     * [Baseball][baseballnb]                                   
527 |     * [Dodgers][dodgersnb]                                     
528 |     * [Interstate 94][interstate94nb]                          
529 |     * [Robot][robotnb]                                         
530 |     * [Seznam][seznamnb]                                       
531 |     * [SFScores][sfscoresnb]                                   
532 |     * [Stats][statsnb]                                         
533 |     </details>
534 | </details>
535 | 
536 | <details>
537 |   <summary>Data</summary>
538 | 
539 |   - <details>
540 |     <summary>Relational</summary>
541 |   
542 |     * [Adventure Works][adventureworksnb]                      
543 |     * [Atherosclerosis][atherosclerosisnb]                     
544 |     * [Baseball][baseballnb]                                   
545 |     * [Consumer Expenditures][consumerexpendituresnb]          
546 |     * [CORA][coranb]                                           
547 |     * [Formula 1][formula1nb]                                  
548 |     * [Loans][loansnb]                                         
549 |     * [MovieLens][movielensnb]                                 
550 |     * [Online Retail][onlineretailnb]                          
551 |     * [Seznam][seznamnb]                                       
552 |     * [SFScores][sfscoresnb]                                   
553 |     * [Stats][statsnb]                                         
554 |     </details>
555 |   - <details>
556 |     <summary>Relational With Text</summary>
557 |   
558 |     * [IMDB][imdbnb]                                           
559 |     </details>
560 |   - <details>
561 |     <summary>Multivariate Time Series</summary>
562 |   
563 |     * [Air Pollution][airpollutionnb]                          
564 |     * [Dodgers][dodgersnb]                                     
565 |     * [Interstate 94][interstate94nb]                          
566 |     * [Occupancy][occupancynb]                                 
567 |     * [Robot][robotnb]                                         
568 |     </details>
569 | </details>
570 | 
571 | <details>
572 |   <summary>Domain</summary>
573 |   
574 |   - <details>
575 |     <summary>Academia</summary>
576 |   
577 |     * [CORA][coranb]                                           
578 |     </details>
579 |   - <details>
580 |     <summary>Commerce</summary>
581 |   
582 |     * [Adventure Works][adventureworksnb]                      
583 |     </details>
584 |   - <details>
585 |     <summary>E-Commerce</summary>
586 |   
587 |     * [Consumer Expenditures][consumerexpendituresnb]          
588 |     * [Online Retail][onlineretailnb]                          
589 |     * [Seznam][seznamnb]                                       
590 |     </details>
591 |   - <details>
592 |     <summary>Energy</summary>
593 |   
594 |     * [Occupancy][occupancynb]                                 
595 |     </details>
596 |   - <details>
597 |     <summary>Entertainment</summary>
598 |   
599 |     * [IMDB][imdbnb]                                           
600 |     * [MovieLens][movielensnb]                                 
601 |     </details>
602 |   - <details>
603 |     <summary>Environment</summary>
604 |   
605 |     * [Air Pollution][airpollutionnb]                          
606 |     </details>
607 |   - <details>
608 |     <summary>Financial</summary>
609 |   
610 |     * [Loans][loansnb]                                         
611 |     </details>
612 |   - <details>
613 |      <summary>Health</summary>
614 |   
615 |      * [Atherosclerosis][atherosclerosisnb]                     
616 |     </details>
617 |   - <details>
618 |      <summary>Internet</summary>
619 |   
620 |      * [Stats][statsnb]                                         
621 |     </details>
622 |   - <details>
623 |      <summary>Restaurants</summary>
624 |   
625 |      * [SFScores][sfscoresnb]                                   
626 |     </details>
627 |   - <details>
628 |      <summary>Robotics</summary>
629 |   
630 |      * [Robot][robotnb]                                         
631 |     </details>
632 |   - <details>
633 |     <summary>Sports</summary>
634 |   
635 |     * [Baseball][baseballnb]                                   
636 |     * [Formula 1][formula1nb]                                  
637 |     </details>
638 |   - <details>
639 |     <summary>Transportation</summary>
640 |   
641 |     * [Dodgers][dodgersnb]                                     
642 |     * [Interstate 94][interstate94nb]                          
643 |     </details>
644 | 
645 | </details>
646 | 
647 | ## Benchmarks
648 | 
649 | The following notebooks specifically show getML's performance compared to other approaches:
650 | 
651 | > [!IMPORTANT]  
652 | > The results are hardware, software and version-dependent and therefore may be different from your own experience.  
653 | > However, usually getML's _FastProp_ is significantly faster than _featuretools_ and _tsfresh_ while consuming considerably less memory.  
654 | > If this is not the case for you, or you see flaws or room for improvements, then please let us know!
655 | 
656 | |                                                               | Benchmarks                                       | Results   | getML  | other              |
657 | | ------------------------------------------------------------- | ------------------------------------------------ | --------- | ------ | ------------------ |
658 | | [AdventureWorks: Predicting customer churn][adventureworksnb] | featuretools                                     | AUC       | 97.8%  | featuretools 96.8% |
659 | | [Air pollution prediction][airpollutionnb]                    | featuretools, tsfresh                            | R-squared | 61.0%  | next best 53.7%    |
660 | | [Baseball (Lahman): Predicting salaries][baseballnb]          | featuretools                                     | R-squared | 83.7%  | featuretools 78.0% |
661 | | [CORA: Categorizing academic studies][coranb]                 | Academic literature: RelF, LBP, EPRN, PRN, ACORA | Accuracy  | 89.9%  | next best 85.7%    |
662 | | [Traffic volume prediction (LA)][dodgersnb]                   | Prophet (fbprophet), tsfresh                     | R-squared | 76%    | next best 67%      |
663 | | [Formula 1 (ErgastF1): Predicting the winner][formula1nb]     | featuretools                                     | AUC       | 92.6%  | featuretools 92.0% |
664 | | [IMDb: Predicting actors' gender][imdbnb]                     | Academic literature: RDN, Wordification, RPT     | AUC       | 91.34% | next best 86%      |
665 | | [Traffic volume prediction (I94)][interstate94nb]             | Prophet (fbprophet)                              | R-squared | 98.1%  | prophet 83.3%      |
666 | | [MovieLens: Predicting users' gender][movielensnb]            | Academic literature: PRM, MBN                    | Accuracy  | 81.6%  | next best 69%      |
667 | | [Occupancy detection][occupancynb]                            | Academic literature: Neural networks             | AUC       | 99.8%  | next best 99.6%    |
668 | | [Seznam: Predicting the transaction volume][seznamnb]         | featuretools                                     | R-squared | 78.2%  | featuretools 63.2% |
669 | | [SFScores: Predicting health check scores][sfscoresnb]        | featuretools                                     | R-squared | 29.1%  | featuretools 26.5% |
670 | | [Stats: Predicting users' reputation][statsnb]                | featuretools                                     | R-squared | 98.1%  | featuretools 96.6% |
671 | 
672 | ### FastProp Benchmarks
673 | 
674 | The following notebooks specifically compare different implementations of propositionalization algorithms against getML's _FastProp_ (short for fast propositionalization).
675 | 
676 | <p align="center" style="text-align: center;">
677 |     <img src="fastprop_benchmark/comparisons/nrpf_performance.png" />
678 | </p>
679 | 
680 | |                                      | Faster vs. featuretools | Faster vs. tsfresh | Remarks                                                                                                                                                                               |
681 | | ------------------------------------ | ----------------------- | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
682 | | [Air pollution][airpollutionnb_prop] | ~65x                    | ~33x               | The predictive accuracy can be significantly improved by using RelMT instead of propositionalization approaches, please refer to [this notebook][airpollutionnb].                     |
683 | | [Dodgers][dodgersnb_prop]            | ~42x                    | ~75x               | The predictive accuracy can be significantly improved by using the mapping preprocessor and/or more advanced feature learning algorithms, please refer to [this notebook][dodgersnb]. |
684 | | [Interstate94][interstate94nb_prop]  | ~55x                    |                    |                                                                                                                                                                                       |                                                                                                                                                                                       |
685 | | [Occupancy][occupancynb_prop]        | ~87x                    | ~41x               |                                                                                                                                                                                       |
686 | | [Robot][robotnb_prop]                | ~162x                   | ~77x               |                                                                                                                                                                                       |
687 | 
688 | 
689 | ### Further Benchmarks in the Relational Dataset Repository
690 | 
691 | Further benchmarks are also featured on the [Relational Dataset Repository](https://relational-data.org/):
692 | 
693 | |                                                                  | Official page                                                           |
694 | | ---------------------------------------------------------------- | ----------------------------------------------------------------------- |
695 | | [AdventureWorks: Predicting customer churn][adventureworksnb]    | [AdventureWorks](https://relational-data.org/dataset/AdventureWorks)    |
696 | | [Baseball (Lahman): Predicting salaries][baseballnb]             | [Lahman](https://relational-data.org/dataset/Lahman)                    |
697 | | [CORA: Categorizing academic studies][coranb]                    | [CORA](https://relational-data.org/dataset/CORA)                        |
698 | | [Financial: Loan default prediction][loansnb]                    | [Financial](https://relational-data.org/dataset/Financial)              |
699 | | [Formula 1 (ErgastF1): Predicting the winner][formula1nb]        | [ErgastF1](https://relational-data.org/dataset/ErgastF1)                |
700 | | [IMDb: Predicting actors' gender][imdbnb]                        | [IMDb](https://relational-data.org/dataset/IMDb)                        |
701 | | [MovieLens: Predicting users' gender][movielensnb]               | [MovieLens](https://relational-data.org/dataset/MovieLens)              |
702 | | [Seznam: Predicting the transaction volume][seznamnb]            | [Seznam](https://relational-data.org/dataset/Seznam)                    |
703 | | [SFScores: Predicting health check scores][sfscoresnb]           | [SFScores](https://relational-data.org/dataset/SFScores)                |
704 | | [Stats: Predicting users' reputation][statsnb]                   | [Stats](https://relational-data.org/dataset/Stats)                      |
705 | 
706 | <!-- Link Mapping for Notebooks -->
707 | 
708 | [loansnb]: loans.ipynb
709 | [occupancynb]: occupancy.ipynb
710 | [consumerexpendituresnb]: consumer_expenditures.ipynb
711 | [atherosclerosisnb]: atherosclerosis.ipynb
712 | [imdbnb]: imdb.ipynb
713 | [movielensnb]: movie_lens.ipynb
714 | [coranb]: cora.ipynb
715 | [onlineretailnb]: online_retail.ipynb
716 | [interstate94nb]: interstate94.ipynb
717 | [airpollutionnb]: air_pollution.ipynb
718 | [dodgersnb]: dodgers.ipynb
719 | [robotnb]: robot.ipynb
720 | [adventureworksnb]: adventure_works.ipynb
721 | [baseballnb]: baseball.ipynb
722 | [formula1nb]: formula1.ipynb
723 | [seznamnb]: seznam.ipynb
724 | [sfscoresnb]: sfscores.ipynb
725 | [statsnb]: stats.ipynb
726 | 
727 | <!-- Link Mapping for FastProp Benchmarks -->
728 | 
729 | [airpollutionnb_prop]: fastprop_benchmark/air_pollution_prop.ipynb
730 | [dodgersnb_prop]: fastprop_benchmark/dodgers_prop.ipynb
731 | [interstate94nb_prop]: fastprop_benchmark/interstate94_prop.ipynb
732 | [occupancynb_prop]: fastprop_benchmark/occupancy_prop.ipynb
733 | [robotnb_prop]: fastprop_benchmark/robot_prop.ipynb


--------------------------------------------------------------------------------
/assets/getml_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getml/getml-demo/73e37a8d7e76ca26f45b37484705f32c9d2343ca/assets/getml_logo.png


--------------------------------------------------------------------------------
/assets/getml_logo_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getml/getml-demo/73e37a8d7e76ca26f45b37484705f32c9d2343ca/assets/getml_logo_dark.png


--------------------------------------------------------------------------------
/assets/loans-schema.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getml/getml-demo/73e37a8d7e76ca26f45b37484705f32c9d2343ca/assets/loans-schema.png


--------------------------------------------------------------------------------
/assets/zuordnung.json:
--------------------------------------------------------------------------------
1 | [[35, 1358], [40, 1687], [114, 1623], [117, 1624], [128, 139], [130, 1773], [164, 142], [288, 1609], [424, 465], [434, 581], [463, 913], [504, 798], [506, 1794], [887, 1072], [906, 962], [910, 1810], [936, 1824], [940, 1827], [941, 1409], [943, 1408], [1026, 889], [1033, 797], [1034, 800], [1035, 1834], [1213, 898], [1237, 1837], [1246, 1204], [1272, 1013], [1365, 1701], [1385, 2078], [1481, 661], [1688, 1103], [1694, 1887], [1717, 1889], [1786, 1215], [1817, 1895], [1919, 1061], [1949, 541], [1951, 646], [1952, 647], [1953, 1896], [1955, 1898], [1956, 645], [1959, 1902], [1997, 210], [1999, 211], [2354, 1296], [2440, 1914], [2653, 1416], [2654, 1927], [2658, 1413], [2663, 1937], [2665, 1542], [2695, 1944], [2696, 310], [2698, 1945], [2702, 622], [3084, 1470], [3085, 1954], [3095, 119], [3097, 1360], [3101, 904], [3112, 1022], [3187, 1964], [3191, 1703], [3192, 1702], [3217, 1652], [3218, 1973], [3220, 436], [3222, 438], [3223, 437], [3229, 1986], [3231, 88], [3232, 2018], [3233, 55], [3235, 325], [3236, 323], [3237, 162], [3240, 651], [3243, 2025], [3828, 1826], [3932, 2029], [4274, 903], [4329, 1336], [4330, 2034], [4335, 429], [4553, 31], [4584, 2045], [4637, 2049], [4649, 1616], [4660, 1926], [4804, 1628], [4878, 97], [4983, 635], [5038, 2052], [5055, 2059], [5062, 1958], [5064, 735], [5069, 739], [5075, 1543], [5086, 1966], [5194, 327], [5348, 1950], [5454, 2065], [5462, 549], [5600, 458], [5869, 1331], [5959, 1377], [5966, 2071], [6125, 1602], [6130, 2075], [6151, 236], [6152, 2079], [6155, 153], [6163, 487], [6169, 484], [6170, 384], [6184, 519], [6196, 133], [6209, 505], [6210, 308], [6213, 306], [6214, 109], [6215, 1805], [6216, 2096], [6217, 1142], [6220, 1782], [6224, 426], [6238, 2097], [6311, 1526], [6318, 2100], [6334, 1309], [6343, 514], [6344, 516], [6346, 517], [6378, 2094], [6385, 36], [6539, 693], [6639, 1975], [6741, 711], [6767, 490], [6771, 1909], [6775, 386], [6782, 2116], [6784, 2117], [6786, 275], [6814, 33], [6818, 2122], [6898, 2124], [6910, 1119], [6913, 2130], [6917, 1849], [6923, 1464], [6925, 1463], [6935, 1351], [6939, 2135], [6941, 1683], [7022, 2005], [7032, 1671], [7041, 1024], [7047, 548], [7272, 2139], [7276, 1636], [7296, 2142], [7297, 24], [7419, 1367], [7430, 137], [7432, 1144], [7532, 2146], [7537, 374], [7867, 2058], [8079, 897], [8213, 1560], [8224, 1224], [8581, 1577], [8591, 2157], [8594, 2160], [8617, 2162], [8619, 697], [8687, 87], [8696, 161], [8699, 2016], [8703, 118], [8766, 1836], [8821, 2017], [8832, 1982], [8865, 957], [8872, 2168], [8874, 650], [8875, 2083], [8961, 2170], [9513, 52], [9515, 2172], [9559, 2173], [9581, 1063], [9586, 2175], [9708, 644], [9716, 763], [10169, 415], [10174, 2179], [10177, 525], [10183, 441], [10186, 440], [10430, 1537], [10435, 1538], [10531, 756], [10793, 975], [10796, 973], [10798, 454], [10981, 387], [11093, 49], [11148, 2192], [11325, 993], [11326, 2193], [11335, 702], [11337, 98], [11339, 2194], [11342, 586], [12155, 1631], [12158, 2197], [12165, 901], [12169, 80], [12182, 1042], [12194, 1391], [12195, 2199], [12197, 2180], [12198, 2181], [12199, 1388], [12210, 2054], [12211, 2073], [12238, 1279], [12247, 964], [12275, 639], [12330, 1952], [12337, 979], [12347, 746], [12350, 95], [12359, 2163], [12439, 1121], [12558, 1694], [12576, 1483], [12631, 2126], [12638, 1273], [12946, 224], [12960, 2266], [13024, 2209], [13136, 1819], [13193, 302], [13195, 303], [13205, 2210], [13208, 104], [13212, 1065], [13213, 54], [13269, 330], [13652, 710], [13654, 51], [13656, 2212], [13658, 2216], [13686, 836], [13717, 2217], [13885, 603], [13917, 1248], [13960, 1045], [13966, 1571], [13972, 1459], [13982, 475], [14062, 1317], [14083, 432], [14090, 320], [14428, 75], [14429, 2224], [14430, 284], [14431, 2225], [14529, 578], [14531, 2227], [14545, 2229], [14549, 809], [14807, 1083], [15076, 2230], [15429, 2182], [15431, 2234], [15670, 333], [15889, 1955], [15892, 2235], [15984, 2236], [15987, 1240], [16008, 2238], [16437, 2241], [16451, 1303], [16461, 1997], [16470, 2207], [16471, 2243], [16474, 1503], [16476, 1502], [16485, 281], [16819, 2248], [16843, 2250], [17201, 121], [17208, 1158], [17242, 1264], [17363, 2253], [17476, 1505], [17477, 1506], [17488, 2254], [17798, 737], [17811, 1668], [17821, 225], [18251, 2256], [18313, 2257], [18532, 2258], [18536, 1213], [18582, 1550], [18615, 69], [18619, 2189], [18770, 1419], [18773, 2261], [18774, 2262], [18777, 1015], [18781, 1104], [18785, 2264], [18811, 984], [18812, 1322], [18815, 1324], [18832, 1089], [18833, 1088], [18834, 741], [19045, 1262], [19231, 1395], [19621, 598], [19697, 1160], [20178, 1415], [20179, 1414], [20180, 1553], [20193, 1441], [20526, 103], [20528, 191], [20534, 2080], [20584, 868], [20592, 2276], [20593, 1844], [20601, 734], [20602, 736], [20821, 1163], [20833, 2101], [20850, 851], [20857, 850], [20920, 2265], [20923, 857], [20924, 854], [20942, 2019], [20972, 1800], [22229, 565], [22241, 1528], [22386, 472], [22431, 1370], [22563, 1131], [22564, 1133], [22566, 2185], [22835, 633], [22869, 2284], [22874, 2286], [22875, 451], [22876, 2285], [22883, 989], [22886, 2287], [23069, 582], [23070, 444], [23116, 2291], [23258, 102], [23448, 143], [23502, 232], [23507, 231], [23545, 1352], [23546, 1918], [23738, 2155], [23774, 1245], [24043, 2296], [24476, 2297], [24530, 2300], [24966, 733], [24974, 2305], [25181, 745], [25184, 743], [25413, 495], [25702, 482], [25772, 2309], [25791, 1289], [25794, 2310], [25805, 2056], [26850, 1863], [27174, 160], [27199, 1693], [27203, 839], [27230, 506], [27241, 2312], [27243, 1275], [27246, 1277], [27249, 2295], [27250, 1152], [27510, 873], [27514, 2314], [27530, 1344], [27531, 25], [27535, 1343], [27543, 2317], [27606, 347], [27612, 2074], [27623, 1681], [27627, 1682], [27631, 1998], [27632, 2319], [27895, 1203], [28026, 1237], [28202, 1705], [28227, 1775], [28230, 2323], [28249, 1335], [28254, 695], [28265, 1828], [28267, 940], [28278, 1787], [28287, 1776], [28290, 1735], [28336, 1254], [28350, 350], [28359, 1140], [28385, 2325], [28387, 1777], [28389, 685], [28412, 2326], [28447, 610], [28456, 1801], [28471, 887], [28473, 2106], [28485, 1251], [28487, 18], [28489, 1253], [28491, 2327], [28504, 2329], [28542, 1564], [28632, 1125], [28640, 373], [28641, 6], [28649, 2333], [28674, 707], [28851, 174], [28957, 2335], [28964, 2259], [29492, 1558], [29708, 879], [29723, 665], [29738, 780], [30817, 899], [30895, 1583], [30901, 1120], [30934, 2267], [30973, 590], [31043, 795], [31055, 1447], [31083, 1259], [31097, 115], [31105, 2343], [31336, 1472], [31349, 240], [31353, 1692], [31479, 1115], [31483, 1178], [31489, 577], [31769, 681], [31863, 2355], [31927, 2349], [31932, 1498], [32083, 1171], [32260, 2332], [32276, 606], [32688, 2039], [32698, 38], [32872, 589], [33013, 1788], [33231, 442], [33301, 2363], [33303, 1555], [33325, 2364], [33412, 1999], [33818, 2176], [33823, 1016], [33895, 2365], [33904, 1738], [33907, 1739], [34082, 84], [34257, 2367], [34263, 1093], [34266, 1090], [34315, 453], [34355, 1809], [34708, 827], [34961, 1645], [34979, 2219], [35061, 687], [35070, 822], [35335, 267], [35343, 2000], [35490, 20], [35778, 2376], [35797, 1894], [35852, 372], [35854, 1515], [35863, 476], [35905, 576], [35922, 2001], [36131, 2382], [36140, 1655], [36145, 1080], [36162, 2385], [36167, 1791], [36620, 885], [36802, 638], [37483, 2388], [37541, 304], [37879, 1018], [37884, 2040], [37888, 893], [37998, 96], [38000, 2164], [38205, 1740], [38480, 1779], [38537, 1634], [38722, 544], [38771, 662], [38829, 392], [38839, 844], [38845, 1183], [38846, 1182], [39124, 1494], [39126, 1908], [39127, 2394], [39130, 2024], [39131, 2178], [39165, 1529], [39199, 1666], [39210, 1662], [39403, 1780], [39474, 1888], [39890, 963], [39904, 1866], [40124, 2268], [40125, 2306], [40131, 781], [40135, 2403], [40151, 1912], [40583, 906], [40605, 2351], [40886, 2405], [40922, 668], [41216, 2406], [41417, 643], [41666, 2233], [41714, 73], [41732, 2377], [42156, 311], [42207, 751], [42209, 2408], [42221, 434], [42847, 1821], [42848, 2409], [43165, 2410], [43186, 2412], [43639, 1942], [43698, 172], [44017, 2195], [44121, 538], [44368, 796], [44455, 1214], [44514, 2413], [45052, 2416], [45061, 214], [45188, 2418], [45189, 65], [45212, 1622], [45533, 640], [45599, 613], [45603, 683], [45605, 2359], [46079, 757], [46431, 1741], [46452, 2422], [46468, 86], [46470, 497], [46476, 2397], [46491, 279], [46500, 11], [46501, 2424], [46536, 630], [46547, 270], [46887, 2425], [47570, 129], [47682, 2427], [47683, 2428], [47684, 349], [47839, 2429], [48066, 2342], [48075, 663], [48550, 1095], [48555, 298], [48764, 131], [48766, 1546], [48768, 1547], [48781, 202], [49482, 201], [49660, 1174], [49720, 2431], [49753, 2432], [49811, 1079], [49843, 779], [49844, 1919], [49847, 778], [49895, 414], [50336, 91], [50337, 2123], [50354, 1450], [50381, 1892], [50807, 2433], [50838, 21], [50980, 587], [51045, 375], [51049, 2434], [51052, 2435], [51180, 1580], [51831, 500], [51834, 502], [51866, 1507], [51879, 2436], [51909, 2112], [51934, 601], [52000, 168], [52003, 167], [52007, 2438], [52515, 2], [52784, 738], [52835, 471], [52847, 2137], [53942, 753], [54129, 1229], [54131, 1742], [54132, 2446], [54550, 193], [54844, 1523], [55403, 1291], [55770, 113], [55801, 282], [55968, 1781], [56112, 704], [56115, 2450], [56119, 1743], [56167, 1269], [56708, 1283], [56709, 1282], [57119, 2451], [57764, 1839], [57773, 122], [57922, 1695], [57932, 555], [57948, 2133], [58268, 1647], [58436, 2456], [58453, 221], [58454, 220], [58540, 452], [58552, 568], [58758, 998], [59045, 1468], [59244, 1029], [59626, 315], [59715, 1427], [59772, 2373], [59798, 553], [60159, 1053], [60169, 1405], [60170, 2118], [60560, 1600], [60682, 634], [61069, 1035], [61073, 931], [61312, 824], [61417, 2457], [62274, 1669], [62329, 760], [62333, 1603], [62347, 968], [62389, 1222], [62417, 523], [62607, 2461], [62634, 1216], [62676, 1442], [62718, 1077], [63477, 300], [63486, 1527], [63549, 1192], [63812, 1190], [63832, 1412], [63835, 1512], [63915, 2282], [63931, 175], [64271, 2269], [64319, 527], [64484, 1783], [64519, 945], [65057, 584], [65074, 2464], [65212, 377], [65650, 74], [65653, 1848], [66556, 580], [66563, 1758], [66564, 464], [66594, 815], [66596, 2467], [66751, 1475], [66782, 1976], [66794, 1519], [66805, 1759], [66809, 1244], [66982, 2381], [66986, 2398], [66990, 48], [67245, 1445], [67246, 1446], [67292, 1822], [67415, 456], [67584, 217], [67633, 67], [68115, 419], [68224, 1098], [68463, 2015], [68495, 2183], [68505, 2057], [69198, 411], [69284, 1284], [69296, 1604], [69392, 158], [69397, 1981], [69418, 2042], [70281, 2475], [70441, 1312], [70442, 1311], [70444, 1313], [70520, 1197], [70970, 2476], [71336, 1066], [71736, 1559], [71904, 1590], [72056, 886], [72101, 2006], [72406, 2478], [72805, 1236], [72908, 1359], [73119, 583], [73146, 1143], [73162, 89], [73323, 2481], [73327, 1903], [73712, 2482], [73972, 1032], [74427, 1147], [74698, 2289], [74700, 140], [74749, 1913], [74821, 78], [74920, 369], [74921, 2483], [74937, 2484], [74975, 1579], [75121, 2485], [75318, 2486], [75674, 409], [75691, 1055], [75693, 2487], [75694, 2488], [75695, 1054], [75969, 1501], [75972, 1396], [75983, 396], [77108, 365], [77112, 260], [77438, 2383], [77515, 1139], [77758, 1869], [77826, 1276], [77829, 1963], [78508, 1458], [78511, 1761], [78549, 595], [78552, 4], [78555, 1091], [78557, 1256], [78994, 1608], [79809, 94], [79817, 2263], [80491, 509], [80515, 2148], [80656, 1392], [81350, 399], [81714, 628], [81722, 1764], [82087, 391], [82090, 493], [82098, 68], [82664, 1904], [82666, 2491], [82920, 1169], [83449, 1901], [83461, 331], [83725, 2113], [83746, 2213], [83826, 1870], [83847, 2493], [84020, 749], [84021, 748], [84459, 79], [84695, 2361], [85299, 1871], [85324, 2494], [85352, 1765], [85449, 518], [85452, 1850], [85688, 1851], [86258, 62], [86359, 831], [86840, 725], [86923, 1290], [87363, 189], [87417, 817], [87482, 1598], [87915, 2290], [88356, 1474], [89308, 2497], [89335, 2166], [89416, 1795], [89547, 239], [90470, 2498], [90655, 15], [90888, 1873], [91038, 1454], [91581, 2357], [91852, 370], [91853, 371], [91975, 910], [92065, 2008], [92589, 1266], [93273, 2499], [93318, 1411], [93320, 176], [93555, 1874], [93755, 2502], [93923, 181], [94229, 2283], [94416, 2458], [94639, 2503], [94641, 1067], [94713, 1481], [94953, 351], [95188, 2270], [95198, 400], [95225, 2202], [95435, 2252], [95579, 406], [95586, 2145], [95588, 134], [95589, 2095], [95594, 2504], [95597, 2505], [95642, 114], [95718, 863], [95719, 862], [96335, 2009], [96845, 860], [96847, 859], [96851, 762], [97377, 2156], [97390, 2221], [97645, 1766], [97892, 1661], [98693, 1148], [98698, 1145], [99023, 877], [99025, 875], [99030, 2508], [100197, 1987], [100701, 163], [100935, 2507], [100961, 1508], [101143, 1899], [101145, 388], [101261, 2511], [101263, 2203], [101660, 2134], [101662, 2293], [101811, 1535], [102061, 2512], [102406, 1921], [102879, 1467], [102884, 2053], [102938, 183], [102939, 182], [103430, 1684], [103482, 1708], [103515, 1154], [103528, 703], [103529, 1961], [103531, 2516], [103537, 1385], [103543, 148], [104840, 2010], [105057, 1410], [105856, 2247], [105865, 1365], [105899, 1012], [106590, 1484], [107177, 1922], [107251, 2280], [107252, 242], [107569, 2204], [108047, 657], [108962, 1428], [108963, 1429], [108974, 1534], [108983, 1202], [109323, 1010], [110041, 2525], [110162, 1959], [110163, 379], [110164, 132], [111676, 1578], [111770, 585], [111866, 81], [112099, 2527], [112378, 1842], [112787, 2528], [112813, 2271], [114189, 593], [114308, 342], [114966, 2529], [115188, 520], [116021, 2321], [116081, 2396], [116084, 1421], [116087, 1420], [116512, 1371], [116528, 1268], [116545, 2530], [116552, 246], [116553, 936], [116790, 1618], [117315, 1308], [117316, 1627], [117328, 128], [118079, 2402], [118259, 1180], [118260, 1304], [118424, 916], [118435, 2531], [118436, 1026], [118558, 508], [118559, 460], [118682, 2350], [118873, 1201], [119686, 2215], [119712, 1489], [119761, 722], [119956, 1497], [120013, 1270], [120039, 1039], [120084, 729], [120817, 2535], [121792, 462], [123556, 321], [123825, 316], [124064, 1772], [124224, 1611], [124296, 285], [124734, 408], [124828, 1294], [124952, 1307], [126128, 2589], [126793, 1138], [126867, 259], [126868, 2537], [126909, 602], [126912, 714], [126920, 381], [126926, 378], [126927, 2518], [127033, 389], [127940, 2539], [128202, 1188], [128203, 1187], [128383, 2407], [128540, 1499], [129042, 107], [129045, 1650], [129287, 2541], [129558, 360], [129896, 463], [129897, 1906], [131042, 1818], [131117, 2339], [131122, 2542], [131315, 397], [131317, 1633], [131318, 401], [132806, 341], [132821, 1556], [133550, 1996], [133553, 2543], [133563, 1325], [133566, 2063], [133567, 1326], [133615, 2154], [133628, 2544], [134060, 10], [134128, 1798], [134199, 1629], [134219, 2546], [134307, 623], [134314, 2547], [134315, 2548], [134316, 515], [134320, 2549], [135130, 1620], [135464, 1221], [135765, 412], [135766, 56], [135798, 528], [136665, 1561], [136766, 596], [136767, 992], [136768, 41], [137130, 828], [137359, 93], [137380, 2423], [137790, 895], [137849, 443], [137868, 39], [137873, 1968], [137956, 108], [139547, 1605], [139738, 2550], [139865, 1862], [140005, 1137], [140569, 2064], [141160, 269], [141171, 2147], [141324, 2444], [141342, 1721], [141347, 1292], [141596, 2552], [141868, 120], [142268, 1102], [143323, 2553], [143476, 810], [143676, 997], [143801, 2500], [144212, 1136], [144330, 2274], [144408, 792], [144679, 376], [144701, 1610], [145134, 2303], [145176, 383], [145215, 816], [145315, 1476], [145384, 135], [147870, 2308], [148170, 1722], [148341, 2560], [148399, 965], [149139, 292], [149669, 1785], [151430, 1916], [151708, 2038], [152219, 1191], [152226, 2068], [152227, 861], [152483, 1520], [152731, 197], [153063, 1050], [153598, 1490], [154023, 2565], [154047, 953], [154134, 1911], [154982, 1907], [155158, 42], [155277, 2341], [155736, 1047], [155738, 1588], [156794, 943], [156977, 788], [157401, 1023], [157761, 2055], [157805, 334], [158098, 2237], [158172, 588], [158614, 966], [158812, 799], [159084, 380], [159085, 2569], [159897, 2336], [160705, 1581], [160732, 116], [161221, 1799], [162075, 1386], [162080, 2570], [162664, 1417], [163235, 2574], [164885, 1659], [166420, 2575], [166825, 410], [166989, 2576], [167205, 2158], [167656, 718], [167670, 498], [168332, 136], [168410, 1448], [168958, 670], [169279, 1462], [169280, 2577], [170338, 1699], [170798, 1478], [171225, 1440], [171954, 2472], [173863, 1425], [173884, 255], [174418, 2360], [174425, 2578], [175256, 328], [175291, 30], [175548, 972], [175576, 858], [175909, 1226], [177115, 393], [177993, 1280], [177998, 2581], [178209, 2109], [178718, 46], [178727, 686], [179180, 2582], [179702, 1664], [179706, 658], [180187, 1698], [180301, 2583], [180373, 277], [180399, 696], [181782, 1258], [182093, 2584], [182094, 558], [184157, 2251], [184918, 869], [187260, 2586], [187354, 1482], [188318, 1479], [188471, 1337], [189566, 271], [189571, 2240], [189572, 152], [189574, 2294], [189577, 151], [189620, 2386], [189623, 2587], [189655, 1369], [189708, 317], [189721, 922], [189774, 1401], [189856, 2387], [190697, 1723], [190698, 768], [190706, 1189], [191216, 288], [191222, 618], [191404, 1774], [192734, 1293], [192850, 1156], [192870, 2370], [193347, 1593], [193352, 145], [193354, 144], [193742, 126], [193918, 2588], [193931, 2509], [193932, 2519], [194223, 2379], [194609, 1185], [194617, 1469], [194645, 234], [195150, 1085], [195361, 2088], [195792, 567], [197054, 876], [197452, 2304], [197783, 1924], [198443, 1724], [198653, 1070], [198866, 2592], [199571, 2593], [200480, 1882], [200630, 649], [202520, 652], [202522, 654], [202639, 218], [203646, 1424], [205192, 1941], [205196, 833], [206259, 1786], [206371, 902], [206524, 594], [207395, 448], [208345, 747], [210309, 2231], [210871, 1725], [210872, 2597], [211432, 324], [211875, 818], [211906, 769], [212097, 1112], [212107, 677], [212777, 1917], [212930, 843], [213246, 599], [213279, 942], [214472, 85], [215912, 2590], [216877, 77], [216878, 354], [217115, 2232], [217139, 1368], [217852, 551], [217984, 570], [218410, 329], [218666, 896], [218682, 1549], [219218, 422], [219239, 2198], [219446, 2555], [219976, 1404], [220420, 1612], [221302, 1956], [226698, 445], [227178, 1199], [227286, 230], [228990, 2600], [228992, 832], [229635, 1726], [230300, 937], [230879, 507], [230884, 402], [231198, 1488], [231249, 1727], [232605, 1328], [232606, 1327], [232860, 2602], [233106, 900], [235670, 2455], [235678, 123], [235679, 99], [235683, 26], [235776, 2099], [236759, 1933], [237376, 1220], [237489, 1607], [237521, 216], [238099, 1972], [238401, 1696], [239800, 794], [239810, 890], [239829, 1105], [240321, 180], [240791, 480], [241133, 2089], [241821, 477], [242637, 1348], [242663, 2401], [243274, 2540], [243483, 526], [245288, 829], [245955, 2255], [246618, 2515], [248119, 1524], [248395, 1223], [248425, 1041], [248431, 1150], [248823, 2591], [249421, 1728], [249858, 423], [250566, 272], [251756, 1513], [252715, 2190], [252725, 939], [253762, 715], [253971, 2414], [254923, 1729], [255233, 1953], [255628, 1159], [256106, 1272], [258259, 2606], [259126, 2014], [259701, 1730], [259702, 1731], [259772, 2184], [260121, 1615], [260979, 1172], [261040, 479], [262108, 2607], [262121, 363], [262178, 1373], [263069, 1114], [263279, 1732], [263482, 2608], [263486, 891], [263498, 173], [263553, 2129], [264347, 1232], [264556, 669], [265203, 1733], [267003, 1792], [267824, 2609], [270085, 632], [270456, 1621], [270600, 398], [272345, 2062], [272720, 771], [273152, 1734], [273949, 2246], [277263, 504], [278394, 866], [278403, 2610], [280876, 248], [282700, 1934], [284023, 2501], [284025, 1567], [284414, 1845], [285675, 1239], [285687, 2611], [286500, 101], [286513, 2612], [286562, 1514], [287787, 154], [288107, 1036], [289085, 1086], [289088, 1225], [289779, 1736], [289780, 155], [289781, 156], [289885, 1097], [289945, 2613], [292277, 494], [293271, 294], [293285, 2047], [293974, 1051], [294030, 203], [294126, 404], [294145, 106], [294239, 421], [299195, 1802], [299197, 417], [300071, 2120], [300806, 332], [302545, 947], [307015, 1737], [307336, 2027], [307656, 1430], [308003, 262], [308232, 2165], [308529, 2324], [308920, 1969], [309476, 1517], [310530, 264], [310653, 2524], [310742, 1970], [312409, 2462], [314459, 1101], [315266, 286], [315789, 1378], [318071, 2617], [318187, 491], [321004, 2618], [321861, 1925], [323128, 2136], [325314, 19], [325497, 2322], [328370, 841], [330148, 1864], [330208, 914], [334153, 1803], [335042, 1124], [335733, 1209], [337766, 2621], [340075, 1820], [340078, 835], [340299, 1865], [341188, 1778], [342802, 1946], [345340, 1789], [346243, 1332], [346292, 1092], [348305, 2622], [348437, 496], [350319, 2115], [350362, 572], [350373, 2384], [353541, 554], [354004, 946], [358866, 934], [358884, 2211], [358887, 675], [358894, 2356], [359067, 1314], [360028, 1058], [362926, 2468], [365294, 2331], [367312, 44], [368431, 1110], [368605, 2371], [368657, 2625], [370366, 579], [372862, 2132], [375605, 2627], [375825, 560], [376704, 1181], [377303, 631], [379288, 2628], [380341, 1059], [384428, 692], [385067, 701], [385251, 100], [385572, 1570], [387795, 337], [389715, 838], [390693, 987], [390889, 2041], [390894, 2380], [390896, 2121], [390922, 2002], [395075, 1557], [395540, 872], [395547, 2033], [395553, 1002], [395725, 1957], [396412, 2003], [397488, 1173], [397590, 1372], [399173, 1883], [399339, 1346], [399370, 229], [400356, 2206], [400455, 2626], [400473, 2340], [408885, 2631], [409255, 273], [409725, 92], [411005, 1096], [411092, 539], [415693, 1568], [416455, 905], [416867, 1302], [416964, 2632], [417017, 252], [421481, 774], [423463, 1971], [423816, 1345], [424540, 607], [427606, 985], [428610, 791], [429781, 2051], [429805, 483], [430329, 2242], [430574, 2633], [430711, 2634], [431206, 2127], [436796, 13], [440815, 1569], [444191, 1697], [444240, 1625], [445938, 698], [446271, 1], [446610, 2637], [447224, 5], [447250, 2004], [449841, 447], [458439, 2639], [459206, 425], [459213, 533], [459214, 1007], [459216, 2279], [463825, 1218], [466170, 2640], [467383, 2641], [469504, 150], [470511, 1962], [481073, 2545], [486840, 626], [502574, 1690], [503871, 2372], [503877, 16], [503883, 364], [503893, 466], [509233, 1374], [509315, 569], [509379, 60], [510715, 63], [510718, 1452], [513189, 111], [519318, 813], [519353, 2463], [520471, 908], [521183, 249], [521207, 1884], [521251, 621], [521252, 478], [521269, 1885], [521855, 1157], [522338, 245], [523010, 605], [523394, 706], [523574, 2645], [529165, 1691], [531348, 2260], [531351, 529], [545647, 2069], [552469, 1644], [559804, 1094], [560936, 1846], [561238, 1744], [561364, 1234], [561568, 1230], [561581, 194], [561582, 1451], [561593, 1081], [561595, 2564], [561610, 1380], [561611, 1379], [561613, 1342], [561674, 473], [561789, 2647], [561809, 2648], [562067, 243], [562123, 1867], [562940, 2649], [563613, 1460], [566488, 265], [566653, 1128], [566664, 2275], [567005, 960], [567018, 2599], [568045, 2110], [568857, 1745], [573535, 2470], [573553, 591], [573964, 758], [573978, 853], [574009, 346], [574264, 1746], [574462, 1492], [574710, 359], [575077, 59], [575292, 228], [575331, 1747], [575402, 223], [575795, 1249], [576257, 481], [576362, 1589], [576691, 2492], [576725, 1748], [576795, 1281], [576973, 1431], [577086, 825], [577227, 1471], [577331, 1606], [578306, 1177], [578309, 1176], [578337, 1300], [578347, 1935], [578365, 295], [578645, 2442], [578646, 867], [578649, 871], [578650, 2652], [578669, 629], [578780, 1749], [578845, 2614], [578898, 2523], [579008, 609], [579108, 58], [582139, 1477], [582343, 1920], [582349, 865], [582511, 2070], [583318, 83], [589923, 808], [590022, 924], [591016, 1675], [591017, 1676], [592826, 724], [592830, 2228], [592973, 489], [592975, 2390], [592986, 2391], [592993, 933], [592996, 227], [593022, 1155], [593060, 2392], [593068, 1596], [593091, 1750], [593104, 1037], [593105, 1038], [593155, 2460], [593201, 1164], [593209, 1161], [593210, 110], [593240, 1599], [593248, 1601], [593260, 1389], [593328, 1509], [593329, 564], [593544, 615], [593559, 723], [593560, 9], [593813, 616], [593859, 2445], [593921, 2447], [593942, 403], [594011, 1383], [594025, 1030], [594039, 2655], [594047, 1751], [594119, 1034], [594387, 2448], [594483, 1263], [594511, 2656], [594543, 1384], [594649, 1287], [594900, 999], [595056, 1804], [595063, 2558], [595157, 2334], [595193, 427], [596075, 2510], [601462, 1642], [601561, 2022], [601567, 682], [604073, 1005], [606479, 2415], [606647, 268], [608190, 2651], [608191, 105], [608292, 1685], [608326, 552], [610529, 222], [612306, 1165], [613409, 420], [616336, 127], [617378, 1496], [617575, 2659], [621555, 1673], [626530, 12], [626531, 1318], [626574, 2662], [626999, 2663], [627024, 1049], [628458, 2664], [628459, 1438], [628500, 563], [628667, 656], [628668, 2090], [628751, 2495], [628764, 1641], [628766, 2595], [628815, 2048], [628888, 637], [630817, 1943], [630890, 811], [631015, 185], [631052, 297], [632796, 2352], [632874, 2353], [632935, 2354], [633030, 2665], [633031, 2666], [633081, 1129], [633585, 394], [633721, 849], [634902, 326], [634904, 1752], [634938, 1753], [634975, 1754], [636098, 1847], [636500, 531], [636511, 27], [640617, 1755], [641956, 994], [641976, 45], [642593, 592], [642621, 837], [642641, 1491], [642681, 368], [642798, 1688], [642827, 22], [642847, 2393], [642894, 2671], [642920, 2676], [642930, 1439], [643003, 1255], [643069, 348], [643199, 64], [643221, 1257], [643239, 1135], [643485, 2673], [643597, 2594], [643695, 2680], [643734, 1457], [643735, 1456], [643777, 1019], [644093, 996], [644334, 1366], [644361, 1449], [644363, 361], [644427, 956], [644441, 1194], [644448, 2679], [644470, 1639], [644494, 177], [644577, 492], [644843, 1390], [645016, 2681], [645046, 1109], [645084, 1320], [645088, 2684], [645452, 209], [645571, 450], [645870, 726], [645897, 2538], [646195, 1461], [646286, 2249], [646289, 1261], [646334, 2674], [646357, 2675], [646412, 187], [646440, 2646], [646809, 1756], [646836, 1306], [646837, 1305], [646900, 2685], [646913, 732], [647315, 358], [647408, 1757], [647413, 263], [647447, 819], [648106, 1657], [648112, 1554], [648121, 2688], [648232, 784], [648369, 1231], [649730, 1297], [649731, 1868], [649739, 845], [649944, 1170], [650807, 2689], [650814, 1706], [650834, 1426], [653441, 258], [653628, 611], [654177, 1980], [654326, 71], [654339, 2691], [654519, 1210], [656048, 2517], [656231, 694], [662250, 1310], [662279, 2489], [662416, 1111], [662572, 546], [671052, 2636], [671269, 2077], [671293, 435], [672064, 2091], [672070, 2667], [672071, 2668], [675649, 790], [675756, 1548], [675847, 775], [682508, 2694], [682666, 1552], [682815, 290], [683294, 2695], [683355, 1046], [683360, 1162], [683404, 1510], [684372, 1823], [684531, 1126], [684972, 740], [684986, 141], [686015, 407], [686030, 2320], [686061, 948], [686532, 512], [686559, 291], [687401, 1595], [688361, 1295], [688824, 1613], [688849, 2696], [689152, 2579], [689439, 2604], [693143, 614], [694759, 1760], [695284, 888], [696342, 253], [696343, 254], [696345, 251], [696346, 812], [703953, 912], [708945, 938], [709113, 2159], [709518, 7], [711527, 2453], [711598, 204], [711994, 2697], [714208, 761], [714256, 43], [714260, 1653], [714289, 1443], [714748, 1530], [714879, 1585], [714975, 805], [733167, 1893], [733534, 2698], [733576, 944], [734406, 2699], [735303, 1205], [735311, 319], [737204, 2571], [738941, 1151], [739280, 2701], [739707, 2573], [739816, 620], [746058, 2624], [751408, 1936], [752684, 1108], [753047, 186], [753070, 2580], [753264, 1228], [753265, 1319], [754594, 1298], [755082, 705], [755217, 719], [756061, 1679], [762980, 927], [763009, 1301], [763010, 17], [763181, 2141], [767763, 1536], [779960, 1044], [782486, 699], [785678, 2167], [787016, 1762], [801170, 1763], [814836, 991], [815073, 345], [815096, 1678], [817774, 2704], [820661, 641], [820662, 642], [824245, 287], [851968, 40], [853114, 165], [853115, 2707], [853116, 1473], [853118, 169], [853150, 1872], [853155, 2706], [854434, 2028], [884094, 1064], [892139, 2107], [899085, 2479], [899119, 1504], [907845, 1591], [911198, 1432], [917493, 2551], [919885, 2007], [928873, 1134], [943087, 636], [948147, 1573], [948299, 1875], [948846, 226], [949217, 1654], [949318, 1323], [949511, 1876], [950052, 1100], [950305, 1877], [950986, 1130], [954315, 2480], [964248, 1667], [975567, 1433], [976284, 2021], [976334, 1784], [987188, 457], [987197, 1397], [989397, 1117], [990075, 2506], [1000012, 604], [1022969, 125], [1031453, 1852], [1050679, 1149], [1059953, 1025], [1061127, 1485], [1063773, 2345], [1071981, 2208], [1095507, 2520], [1102364, 1381], [1102400, 1330], [1102407, 1853], [1102442, 164], [1102548, 1594], [1102550, 1830], [1102567, 1353], [1102625, 1840], [1102646, 2108], [1102751, 842], [1102761, 130], [1102794, 2174], [1102850, 256], [1102873, 2191], [1103016, 1597], [1103031, 2222], [1103038, 266], [1103162, 1146], [1103315, 608], [1103383, 1398], [1103394, 1500], [1103499, 2298], [1103610, 1357], [1103676, 382], [1103737, 1983], [1103960, 1709], [1103969, 2223], [1103979, 1806], [1103985, 1123], [1104007, 1838], [1104031, 2374], [1104055, 149], [1104182, 2318], [1104191, 932], [1104258, 1278], [1104261, 2404], [1104300, 2043], [1104379, 716], [1104435, 1286], [1104449, 2035], [1104495, 2420], [1104647, 2169], [1104749, 1233], [1104769, 1021], [1104787, 2244], [1104809, 2437], [1104851, 405], [1104946, 2449], [1104999, 969], [1105011, 983], [1105033, 773], [1105062, 1854], [1105116, 2378], [1105148, 864], [1105221, 742], [1105231, 1831], [1105344, 1062], [1105360, 1387], [1105394, 1767], [1105428, 2030], [1105433, 1265], [1105450, 2419], [1105505, 147], [1105530, 1811], [1105531, 627], [1105574, 918], [1105603, 1347], [1105622, 1563], [1105672, 2344], [1105698, 362], [1105718, 1574], [1105764, 124], [1105810, 840], [1105877, 385], [1105887, 2050], [1105932, 1399], [1106052, 1878], [1106103, 2105], [1106112, 1768], [1106172, 1769], [1106236, 1960], [1106287, 157], [1106298, 274], [1106330, 76], [1106370, 2011], [1106388, 1592], [1106401, 543], [1106406, 542], [1106418, 485], [1106492, 1333], [1106546, 1796], [1106547, 138], [1106568, 1988], [1106630, 1928], [1106671, 2301], [1106764, 499], [1106771, 395], [1106789, 1967], [1106849, 988], [1106854, 1084], [1106966, 655], [1107010, 1825], [1107041, 47], [1107062, 1833], [1107067, 1677], [1107095, 921], [1107136, 355], [1107140, 1910], [1107171, 801], [1107215, 468], [1107312, 2143], [1107319, 2200], [1107325, 925], [1107355, 1572], [1107367, 2288], [1107385, 2521], [1107418, 513], [1107455, 1770], [1107558, 949], [1107567, 2186], [1107572, 2072], [1107674, 562], [1107728, 184], [1107808, 1393], [1107861, 1435], [1108050, 1241], [1108167, 1841], [1108169, 352], [1108175, 597], [1108209, 1989], [1108258, 117], [1108267, 344], [1108329, 573], [1108363, 1113], [1108389, 783], [1108551, 1905], [1108570, 3], [1108597, 241], [1108656, 803], [1108728, 2151], [1108834, 1812], [1108841, 926], [1109017, 28], [1109185, 2559], [1109199, 289], [1109208, 2561], [1109392, 179], [1109439, 1614], [1109542, 1626], [1109566, 2533], [1109581, 1196], [1109830, 727], [1109873, 1186], [1109891, 8], [1109957, 2084], [1110000, 1965], [1110024, 1179], [1110028, 1680], [1110209, 1207], [1110256, 667], [1110390, 755], [1110426, 1525], [1110438, 1929], [1110494, 664], [1110515, 449], [1110520, 1576], [1110531, 2023], [1110546, 2366], [1110563, 0], [1110579, 1354], [1110628, 247], [1110768, 2032], [1110947, 671], [1110950, 1375], [1110998, 261], [1111052, 112], [1111186, 2368], [1111230, 1551], [1111240, 2114], [1111265, 313], [1111304, 1797], [1111614, 1028], [1111733, 575], [1111788, 1341], [1111899, 1267], [1111978, 2292], [1112026, 2316], [1112071, 954], [1112075, 952], [1112099, 731], [1112106, 2566], [1112194, 1056], [1112319, 1376], [1112369, 2430], [1112417, 545], [1112426, 2338], [1112574, 1540], [1112650, 1793], [1112665, 2104], [1112686, 1068], [1112723, 309], [1112767, 1434], [1112911, 244], [1112929, 2036], [1113035, 990], [1113084, 2603], [1113182, 1656], [1113438, 1710], [1113459, 2085], [1113534, 2149], [1113541, 2605], [1113551, 1167], [1113614, 958], [1113739, 2026], [1113742, 2514], [1113828, 1116], [1113831, 1243], [1113852, 35], [1113926, 2012], [1113934, 935], [1113995, 1855], [1114118, 1004], [1114125, 660], [1114153, 547], [1114184, 212], [1114192, 1947], [1114222, 787], [1114239, 672], [1114331, 1711], [1114336, 70], [1114352, 1665], [1114364, 237], [1114388, 467], [1114398, 2369], [1114442, 1003], [1114502, 1566], [1114512, 1674], [1114526, 2131], [1114605, 532], [1114629, 2081], [1114664, 2031], [1114777, 540], [1114838, 730], [1114864, 1879], [1114992, 2245], [1115166, 233], [1115291, 1531], [1115375, 338], [1115456, 894], [1115471, 2013], [1115670, 907], [1115677, 1977], [1115701, 470], [1115790, 1217], [1115886, 2092], [1115959, 2214], [1116044, 488], [1116146, 612], [1116181, 852], [1116268, 61], [1116328, 1274], [1116336, 1890], [1116347, 1856], [1116397, 720], [1116410, 2375], [1116530, 1246], [1116569, 1813], [1116594, 343], [1116629, 770], [1116835, 1057], [1116839, 1060], [1116842, 305], [1116922, 190], [1116974, 765], [1117049, 2619], [1117089, 2620], [1117184, 1880], [1117219, 2128], [1117249, 1198], [1117348, 2119], [1117476, 1712], [1117501, 2471], [1117618, 659], [1117653, 1075], [1117760, 1584], [1117786, 955], [1117833, 1122], [1117920, 2465], [1117942, 1771], [1118017, 2568], [1118083, 1948], [1118092, 1790], [1118120, 690], [1118209, 2277], [1118245, 980], [1118286, 550], [1118302, 1900], [1118332, 314], [1118347, 2477], [1118388, 676], [1118546, 2623], [1118658, 276], [1118764, 1118], [1118823, 1195], [1118848, 1814], [1119004, 561], [1119078, 446], [1119140, 1857], [1119178, 1227], [1119180, 50], [1119211, 1582], [1119216, 2226], [1119295, 830], [1119471, 257], [1119505, 929], [1119623, 2526], [1119654, 474], [1119671, 1938], [1119708, 689], [1119742, 2536], [1119751, 1008], [1119987, 1974], [1120019, 2629], [1120020, 1651], [1120049, 1000], [1120059, 1099], [1120084, 571], [1120138, 982], [1120169, 666], [1120170, 556], [1120197, 336], [1120211, 1858], [1120252, 1418], [1120431, 708], [1120444, 826], [1120563, 1881], [1120643, 1250], [1120650, 1362], [1120713, 1455], [1120731, 1670], [1120777, 1951], [1120786, 293], [1120858, 1815], [1120866, 1052], [1120880, 625], [1120962, 510], [1121057, 1184], [1121063, 2311], [1121176, 2060], [1121254, 196], [1121313, 971], [1121398, 1466], [1121459, 2281], [1121537, 1006], [1121569, 455], [1121603, 1242], [1121659, 1649], [1121739, 2532], [1121867, 146], [1122304, 909], [1122425, 2086], [1122460, 1816], [1122574, 1014], [1122580, 461], [1122642, 1271], [1122704, 1939], [1123068, 356], [1123087, 1382], [1123093, 2638], [1123188, 2125], [1123215, 2562], [1123239, 892], [1123493, 917], [1123530, 2466], [1123553, 1009], [1123576, 2302], [1123689, 1141], [1123756, 1444], [1123867, 1843], [1123926, 1930], [1123991, 522], [1124837, 250], [1124844, 2346], [1125082, 1990], [1125092, 834], [1125258, 959], [1125386, 1713], [1125393, 37], [1125402, 2337], [1125467, 2020], [1125469, 911], [1125492, 188], [1125597, 2643], [1125895, 1991], [1125906, 339], [1125909, 340], [1125944, 1436], [1125953, 772], [1125992, 1978], [1125993, 2644], [1126011, 804], [1126012, 802], [1126029, 459], [1126037, 1166], [1126044, 1107], [1126050, 2218], [1126315, 700], [1126350, 2196], [1126503, 1043], [1127430, 1562], [1127530, 789], [1127541, 1575], [1127551, 1686], [1127558, 2473], [1127566, 2474], [1127619, 1915], [1127657, 777], [1127810, 2650], [1127812, 961], [1127851, 1402], [1127863, 2278], [1127913, 1714], [1128151, 557], [1128198, 301], [1128201, 977], [1128204, 978], [1128208, 976], [1128227, 764], [1128256, 1521], [1128267, 1193], [1128291, 2439], [1128314, 1715], [1128319, 2440], [1128369, 1361], [1128407, 559], [1128425, 2066], [1128430, 208], [1128437, 2653], [1128453, 90], [1128531, 1660], [1128536, 1658], [1128542, 2315], [1128839, 215], [1128846, 521], [1128853, 1076], [1128856, 1074], [1128868, 1992], [1128881, 1027], [1128927, 600], [1128935, 2395], [1128943, 883], [1128945, 882], [1128946, 881], [1128959, 684], [1128974, 536], [1128975, 2556], [1128977, 535], [1128978, 534], [1128982, 2443], [1128985, 1716], [1128990, 2313], [1128997, 312], [1129015, 2654], [1129018, 170], [1129021, 721], [1129027, 501], [1129040, 1407], [1129096, 2087], [1129106, 2061], [1129111, 511], [1129208, 530], [1129243, 1640], [1129367, 439], [1129368, 2657], [1129369, 2658], [1129442, 2187], [1129443, 1212], [1129494, 1541], [1129518, 1406], [1129570, 752], [1129572, 2358], [1129573, 754], [1129608, 1423], [1129610, 915], [1129621, 1200], [1129629, 2596], [1129683, 524], [1129778, 744], [1129798, 1832], [1129835, 213], [1129907, 199], [1129994, 299], [1130069, 974], [1130080, 967], [1130243, 2660], [1130356, 2098], [1130454, 2661], [1130539, 728], [1130567, 2046], [1130568, 178], [1130586, 207], [1130600, 318], [1130634, 433], [1130637, 2111], [1130653, 648], [1130657, 2272], [1130676, 1340], [1130678, 855], [1130680, 2067], [1130780, 1518], [1130808, 219], [1130847, 1339], [1130856, 1238], [1130915, 1993], [1130927, 566], [1130929, 57], [1130931, 673], [1130934, 674], [1131116, 1717], [1131137, 1984], [1131149, 1329], [1131150, 1219], [1131163, 2669], [1131164, 1422], [1131165, 2239], [1131167, 1545], [1131172, 1437], [1131180, 2672], [1131184, 200], [1131189, 198], [1131192, 1643], [1131195, 2411], [1131198, 2616], [1131223, 2563], [1131230, 430], [1131236, 428], [1131257, 2677], [1131258, 2670], [1131266, 2417], [1131267, 759], [1131270, 82], [1131274, 856], [1131277, 1349], [1131300, 1532], [1131301, 1533], [1131305, 688], [1131312, 1071], [1131314, 2682], [1131330, 1208], [1131334, 2678], [1131335, 1206], [1131345, 418], [1131348, 413], [1131359, 296], [1131360, 1718], [1131374, 1632], [1131414, 653], [1131420, 2687], [1131421, 2686], [1131464, 766], [1131466, 2490], [1131471, 1394], [1131549, 2554], [1131550, 2585], [1131557, 1516], [1131565, 2459], [1131607, 776], [1131611, 2201], [1131634, 1073], [1131639, 1175], [1131647, 335], [1131719, 537], [1131728, 1979], [1131734, 322], [1131741, 206], [1131745, 205], [1131748, 1859], [1131752, 1646], [1131754, 1648], [1131828, 941], [1132073, 2692], [1132083, 1663], [1132157, 1985], [1132285, 680], [1132385, 619], [1132406, 2693], [1132416, 2635], [1132418, 1704], [1132434, 159], [1132443, 486], [1132459, 2037], [1132461, 14], [1132486, 1940], [1132505, 171], [1132706, 424], [1132731, 2452], [1132809, 2161], [1132815, 2152], [1132857, 431], [1132864, 782], [1132887, 2362], [1132922, 874], [1132948, 1931], [1132968, 1334], [1133004, 1949], [1133008, 1356], [1133010, 2615], [1133028, 717], [1133047, 1617], [1133196, 951], [1133338, 1480], [1133390, 2171], [1133417, 1082], [1133428, 1211], [1133469, 1638], [1133846, 2567], [1133930, 2690], [1134022, 1886], [1134031, 23], [1134056, 208], [1134197, 986], [1134320, 1087], [1134346, 1153], [1134348, 884], [1134865, 353], [1135082, 2138], [1135108, 1891], [1135115, 2630], [1135122, 1400], [1135125, 1403], [1135137, 1299], [1135345, 2572], [1135358, 880], [1135368, 1860], [1135455, 2700], [1135589, 981], [1135746, 2347], [1135750, 1017], [1135894, 469], [1135899, 1932], [1135955, 390], [1136040, 2703], [1136110, 1247], [1136310, 1493], [1136342, 2557], [1136393, 617], [1136397, 1807], [1136422, 416], [1136442, 1861], [1136446, 2140], [1136447, 1316], [1136449, 1315], [1136631, 2702], [1136634, 1672], [1136791, 2441], [1136814, 53], [1137140, 713], [1137466, 919], [1138027, 2150], [1138043, 2469], [1138091, 750], [1138619, 307], [1138755, 192], [1138968, 848], [1138970, 1078], [1139009, 2705], [1139195, 1544], [1139928, 2330], [1140040, 1808], [1140230, 1363], [1140231, 1364], [1140289, 1994], [1140543, 2534], [1140547, 2044], [1140548, 923], [1152075, 1923], [1152143, 767], [1152150, 870], [1152162, 503], [1152179, 1132], [1152194, 1817], [1152244, 1619], [1152259, 283], [1152272, 503], [1152277, 366], [1152290, 32], [1152307, 995], [1152308, 1630], [1152358, 2220], [1152379, 1069], [1152394, 1048], [1152421, 1040], [1152436, 238], [1152448, 995], [1152490, 1031], [1152508, 34], [1152564, 1168], [1152569, 2421], [1152633, 814], [1152663, 2598], [1152673, 366], [1152676, 367], [1152711, 930], [1152714, 928], [1152740, 2093], [1152761, 1033], [1152821, 235], [1152858, 786], [1152859, 785], [1152896, 1829], [1152904, 503], [1152910, 2328], [1152917, 691], [1152944, 1011], [1152958, 1106], [1152959, 1321], [1152975, 995], [1152991, 283], [1153003, 2177], [1153014, 166], [1153024, 2299], [1153031, 1511], [1153056, 2082], [1153064, 280], [1153065, 72], [1153091, 847], [1153097, 846], [1153101, 950], [1153106, 2426], [1153148, 2153], [1153150, 950], [1153160, 1285], [1153166, 709], [1153169, 1288], [1153183, 624], [1153195, 2601], [1153254, 1539], [1153262, 1635], [1153264, 1637], [1153275, 195], [1153280, 1040], [1153287, 137], [1153577, 970], [1153703, 995], [1153724, 709], [1153728, 793], [1153736, 679], [1153784, 1689], [1153786, 2496], [1153811, 1487], [1153816, 1486], [1153853, 357], [1153860, 1033], [1153861, 1031], [1153866, 1587], [1153877, 1707], [1153879, 1700], [1153889, 2273], [1153891, 807], [1153896, 806], [1153897, 2307], [1153899, 2522], [1153900, 1252], [1153922, 1465], [1153933, 1355], [1153942, 2389], [1153943, 823], [1153945, 820], [1153946, 821], [1154012, 1001], [1154042, 678], [1154068, 1522], [1154071, 2683], [1154074, 2513], [1154076, 2642], [1154103, 1235], [1154123, 1020], [1154124, 1020], [1154169, 1453], [1154173, 66], [1154176, 1565], [1154229, 29], [1154230, 1350], [1154232, 2399], [1154233, 2400], [1154251, 366], [1154276, 920], [1154459, 1040], [1154500, 2188], [1154520, 278], [1154524, 2454], [1154525, 1835], [1155073, 712]]


--------------------------------------------------------------------------------
/cora_sota.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "# CORA: getML performance breaks record\n",
   8 |     "\n",
   9 |     "Graph Neural Networks (GNNs) are renowned for their outstanding performance on graph-structured data, excelling in tasks like node classification and link prediction. However, deploying GNNs is often complex. Tasks such as graph preprocessing, optimizing architectures, tuning hyperparameters, and ensuring convergence are non-trivial challenges when working with neural network based approaches, requiring considerable time investment.\n",
  10 |     "\n",
  11 |     "**getML** offers a faster and more user-friendly alternative. Leveraging **getML FastProp**, the fastest open-source tool for propositionalization-based automation of feature engineering on relational data and time series, FastProp transforms relational data into a single feature table suitable for standard machine learning models by efficiently computing a wide range of statistical and temporal aggregates. When combined with models like **XGBoost**, getML delivers a straightforward yet highly performant approach to predictive modeling. This method eliminates the need for complex GNN-based approaches while ensuring coding efficiency, computational speed, and high model accuracy.\n",
  12 |     "\n",
  13 |     "This notebook demonstrates how **getML** surpasses the previous record on the CORA dataset—set by the GNN-based approach of [Izadi et al. (2020)](https://paperswithcode.com/sota/node-classification-on-cora)—with minimal code and configuration.\n",
  14 |     "\n",
  15 |     "Summary:\n",
  16 |     "\n",
  17 |     "- Prediction type: __Classification model__\n",
  18 |     "- Domain: __Academia__\n",
  19 |     "- Prediction target: __The category of a paper__ \n",
  20 |     "- Source data: __Relational data set, 3 tables__\n",
  21 |     "- Population size: __2,708__"
  22 |    ]
  23 |   },
  24 |   {
  25 |    "cell_type": "markdown",
  26 |    "metadata": {},
  27 |    "source": [
  28 |     "First let some boilerplate code run."
  29 |    ]
  30 |   },
  31 |   {
  32 |    "cell_type": "code",
  33 |    "execution_count": 1,
  34 |    "metadata": {},
  35 |    "outputs": [
  36 |     {
  37 |      "name": "stdout",
  38 |      "output_type": "stream",
  39 |      "text": [
  40 |       "Note: you may need to restart the kernel to use updated packages.\n"
  41 |      ]
  42 |     }
  43 |    ],
  44 |    "source": [
  45 |     "%pip install -q \"getml==1.5.0\" \"ipywidgets==8.1.5\""
  46 |    ]
  47 |   },
  48 |   {
  49 |    "cell_type": "code",
  50 |    "execution_count": null,
  51 |    "metadata": {},
  52 |    "outputs": [
  53 |     {
  54 |      "name": "stdout",
  55 |      "output_type": "stream",
  56 |      "text": [
  57 |       "getML API version: 1.5.0\n",
  58 |       "\n"
  59 |      ]
  60 |     }
  61 |    ],
  62 |    "source": [
  63 |     "import os\n",
  64 |     "\n",
  65 |     "import json\n",
  66 |     "import numpy as np\n",
  67 |     "import pandas as pd\n",
  68 |     "\n",
  69 |     "import getml\n",
  70 |     "\n",
  71 |     "print(f\"getML API version: {getml.__version__}\\n\")"
  72 |    ]
  73 |   },
  74 |   {
  75 |    "cell_type": "code",
  76 |    "execution_count": 3,
  77 |    "metadata": {},
  78 |    "outputs": [
  79 |     {
  80 |      "name": "stdout",
  81 |      "output_type": "stream",
  82 |      "text": [
  83 |       "Launching ./getML --allow-push-notifications=true --allow-remote-ips=false --home-directory=/home/user/.getML --in-memory=true --install=false --launch-browser=true --log=false --project-directory=/home/user/.getML/projects in /home/user/.getML/getml-enterprise-1.5.0-amd64-linux...\n",
  84 |       "Launched the getML Engine. The log output will be stored in /home/user/.getML/logs/getml_20241119160445.log\n",
  85 |       "\u001b[2K  Loading pipelines... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00\n",
  86 |       "\u001b[?25h"
  87 |      ]
  88 |     },
  89 |     {
  90 |      "data": {
  91 |       "text/html": [
  92 |        "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Connected to project <span style=\"color: #008000; text-decoration-color: #008000\">'cora_sota'</span>.\n",
  93 |        "</pre>\n"
  94 |       ],
  95 |       "text/plain": [
  96 |        "Connected to project \u001b[32m'cora_sota'\u001b[0m.\n"
  97 |       ]
  98 |      },
  99 |      "metadata": {},
 100 |      "output_type": "display_data"
 101 |     }
 102 |    ],
 103 |    "source": [
 104 |     "getml.engine.launch()\n",
 105 |     "getml.engine.set_project(\"cora_sota\")"
 106 |    ]
 107 |   },
 108 |   {
 109 |    "cell_type": "markdown",
 110 |    "metadata": {},
 111 |    "source": [
 112 |     "### 1. Loading data"
 113 |    ]
 114 |   },
 115 |   {
 116 |    "cell_type": "markdown",
 117 |    "metadata": {},
 118 |    "source": [
 119 |     "#### 1.1 Download from source\n",
 120 |     "\n",
 121 |     "We begin by downloading the data from the source file:"
 122 |    ]
 123 |   },
 124 |   {
 125 |    "cell_type": "code",
 126 |    "execution_count": 4,
 127 |    "metadata": {},
 128 |    "outputs": [
 129 |     {
 130 |      "data": {
 131 |       "text/plain": [
 132 |        "Connection(dbname='CORA', dialect='mysql', host='relational.fel.cvut.cz', port=3306)"
 133 |       ]
 134 |      },
 135 |      "execution_count": 4,
 136 |      "metadata": {},
 137 |      "output_type": "execute_result"
 138 |     }
 139 |    ],
 140 |    "source": [
 141 |     "conn = getml.database.connect_mysql(\n",
 142 |     "    host=\"relational.fel.cvut.cz\",\n",
 143 |     "    dbname=\"CORA\",\n",
 144 |     "    port=3306,\n",
 145 |     "    user=\"guest\",\n",
 146 |     "    password=\"ctu-relational\",\n",
 147 |     ")\n",
 148 |     "\n",
 149 |     "conn"
 150 |    ]
 151 |   },
 152 |   {
 153 |    "cell_type": "code",
 154 |    "execution_count": 5,
 155 |    "metadata": {},
 156 |    "outputs": [],
 157 |    "source": [
 158 |     "def load_if_needed(name):\n",
 159 |     "    \"\"\"\n",
 160 |     "    Loads the data from the relational learning\n",
 161 |     "    repository, if the data frame has not already\n",
 162 |     "    been loaded.\n",
 163 |     "    \"\"\"\n",
 164 |     "    if not getml.data.exists(name):\n",
 165 |     "        data_frame = getml.data.DataFrame.from_db(name=name, table_name=name, conn=conn)\n",
 166 |     "        data_frame.save()\n",
 167 |     "    else:\n",
 168 |     "        data_frame = getml.data.load_data_frame(name)\n",
 169 |     "    return data_frame"
 170 |    ]
 171 |   },
 172 |   {
 173 |    "cell_type": "code",
 174 |    "execution_count": 6,
 175 |    "metadata": {},
 176 |    "outputs": [],
 177 |    "source": [
 178 |     "paper = load_if_needed(\"paper\")\n",
 179 |     "cites = load_if_needed(\"cites\")\n",
 180 |     "content = load_if_needed(\"content\")"
 181 |    ]
 182 |   },
 183 |   {
 184 |    "cell_type": "markdown",
 185 |    "metadata": {},
 186 |    "source": [
 187 |     "Here we deviate from the regular procedure by introducing the exact same train test split as the [current top seed](https://paperswithcode.com/paper/optimization-of-graph-neural-networks-with). While we contend, that testing on a single split is not sufficient to demonstrate performance of an algorithm on a specific data set, we proceed as such in order to maximize comparability with the current incumbent of the Leader Board. For a more extensive investigation of the getML performance on the CORA dataset, checkout [our other notebooks](https://getml.com/latest/examples/enterprise-notebooks/kaggle_notebooks/). "
 188 |    ]
 189 |   },
 190 |   {
 191 |    "cell_type": "markdown",
 192 |    "metadata": {},
 193 |    "source": [
 194 |     "To achieve the identical split we first need to match papers and their associated word matrix across data sources. "
 195 |    ]
 196 |   },
 197 |   {
 198 |    "cell_type": "code",
 199 |    "execution_count": null,
 200 |    "metadata": {},
 201 |    "outputs": [],
 202 |    "source": [
 203 |     "if not os.path.exists(\"assets/zuordnung.json\"):\n",
 204 |     "    !pip install torch\n",
 205 |     "    !pip install -q git+https://github.com/pyg-team/pytorch_geometric.git\n",
 206 |     "    from utils.zuordnung import run_zuordnung\n",
 207 |     "\n",
 208 |     "    # may take 90 minutes or longer to run\n",
 209 |     "    run_zuordnung(content)"
 210 |    ]
 211 |   },
 212 |   {
 213 |    "cell_type": "code",
 214 |    "execution_count": null,
 215 |    "metadata": {},
 216 |    "outputs": [],
 217 |    "source": [
 218 |     "with open(\"assets/zuordnung.json\", \"r\") as f:\n",
 219 |     "    zuordnung = json.load(f)\n",
 220 |     "\n",
 221 |     "paper_df = paper.to_pandas()\n",
 222 |     "paper_df[\"paper_id\"] = paper_df[\"paper_id\"].astype(int)\n",
 223 |     "zuo_df = pd.DataFrame(zuordnung)\n",
 224 |     "zuo_df[0] = zuo_df[0].astype(int)\n",
 225 |     "paper_df = paper_df.merge(zuo_df, left_on=\"paper_id\", right_on=0).sort_values(by=1)\n",
 226 |     "paper_df = paper_df[[\"class_label\", \"paper_id\"]]"
 227 |    ]
 228 |   },
 229 |   {
 230 |    "cell_type": "markdown",
 231 |    "metadata": {},
 232 |    "source": [
 233 |     "We split the sorted data set according to the instructions in the Izadi et al. paper (see:  IV. Experiments, A. Datasets, third split)"
 234 |    ]
 235 |   },
 236 |   {
 237 |    "cell_type": "code",
 238 |    "execution_count": 9,
 239 |    "metadata": {},
 240 |    "outputs": [],
 241 |    "source": [
 242 |     "paper_train = getml.data.DataFrame.from_pandas(paper_df[:1707], name=\"train\")\n",
 243 |     "paper_val = getml.data.DataFrame.from_pandas(\n",
 244 |     "    paper_df[1707 : 1707 + 500], name=\"validation\"\n",
 245 |     ")\n",
 246 |     "paper_test = getml.data.DataFrame.from_pandas(paper_df[1707 + 500 :], name=\"test\")\n",
 247 |     "\n",
 248 |     "paper, split = getml.data.split.concat(\n",
 249 |     "    \"population\", train=paper_train, validation=paper_val, test=paper_test\n",
 250 |     ")"
 251 |    ]
 252 |   },
 253 |   {
 254 |    "cell_type": "markdown",
 255 |    "metadata": {},
 256 |    "source": [
 257 |     "#### 1.2 Prepare data for getML"
 258 |    ]
 259 |   },
 260 |   {
 261 |    "cell_type": "markdown",
 262 |    "metadata": {},
 263 |    "source": [
 264 |     "getML requires that we define *roles* for each of the columns."
 265 |    ]
 266 |   },
 267 |   {
 268 |    "cell_type": "code",
 269 |    "execution_count": 10,
 270 |    "metadata": {},
 271 |    "outputs": [],
 272 |    "source": [
 273 |     "paper.set_role(\"paper_id\", getml.data.roles.join_key)\n",
 274 |     "paper.set_role(\"class_label\", getml.data.roles.categorical)\n",
 275 |     "cites.set_role([\"cited_paper_id\", \"citing_paper_id\"], getml.data.roles.join_key)\n",
 276 |     "content.set_role(\"paper_id\", getml.data.roles.join_key)\n",
 277 |     "content.set_role(\"word_cited_id\", getml.data.roles.categorical)"
 278 |    ]
 279 |   },
 280 |   {
 281 |    "cell_type": "markdown",
 282 |    "metadata": {},
 283 |    "source": [
 284 |     "The goal is to predict seven different labels. We generate a target column for each of those labels."
 285 |    ]
 286 |   },
 287 |   {
 288 |    "cell_type": "code",
 289 |    "execution_count": 11,
 290 |    "metadata": {},
 291 |    "outputs": [],
 292 |    "source": [
 293 |     "data_full = getml.data.make_target_columns(paper, \"class_label\")"
 294 |    ]
 295 |   },
 296 |   {
 297 |    "cell_type": "code",
 298 |    "execution_count": 12,
 299 |    "metadata": {},
 300 |    "outputs": [
 301 |     {
 302 |      "data": {
 303 |       "text/html": [
 304 |        "<div style='margin-top: 15px;'>\n",
 305 |        "<div style='float: left; margin-right: 50px;'>\n",
 306 |        "<div style='margin-bottom: 10px; font-size: 1rem;'>population</div>\n",
 307 |        "    <style>\n",
 308 |        "  th {\n",
 309 |        "    text-align: left !important;\n",
 310 |        "  }\n",
 311 |        "  td {\n",
 312 |        "    text-align: left !important;\n",
 313 |        "  }\n",
 314 |        "  th:nth-child(1) {\n",
 315 |        "    text-align: right;\n",
 316 |        "    border-right: 1px solid LightGray;\n",
 317 |        "  }\n",
 318 |        "  th.float {\n",
 319 |        "    text-align: right !important;\n",
 320 |        "  }\n",
 321 |        "  td.float {\n",
 322 |        "    text-align: right !important;\n",
 323 |        "  }\n",
 324 |        "  th.int {\n",
 325 |        "    text-align: right !important;\n",
 326 |        "  }\n",
 327 |        "  td.int {\n",
 328 |        "    text-align: right !important;\n",
 329 |        "  }\n",
 330 |        "</style>\n",
 331 |        "\n",
 332 |        "<table class=\"dataframe\">\n",
 333 |        "  <thead>\n",
 334 |        "    <tr>\n",
 335 |        "      \n",
 336 |        "        \n",
 337 |        "          <th class=\"int\"> </th>\n",
 338 |        "        \n",
 339 |        "      \n",
 340 |        "        \n",
 341 |        "          <th class=\"str\">subset    </th>\n",
 342 |        "        \n",
 343 |        "      \n",
 344 |        "        \n",
 345 |        "          <th class=\"str\">name      </th>\n",
 346 |        "        \n",
 347 |        "      \n",
 348 |        "        \n",
 349 |        "          <th class=\"int\">rows</th>\n",
 350 |        "        \n",
 351 |        "      \n",
 352 |        "        \n",
 353 |        "          <th class=\"str\">type</th>\n",
 354 |        "        \n",
 355 |        "      \n",
 356 |        "    </tr>\n",
 357 |        "    \n",
 358 |        "  </thead>\n",
 359 |        "  <tbody>\n",
 360 |        "    \n",
 361 |        "      <tr>\n",
 362 |        "        <th>0</th>\n",
 363 |        "          \n",
 364 |        "            \n",
 365 |        "              <td class=\"str\">test</td>\n",
 366 |        "            \n",
 367 |        "          \n",
 368 |        "            \n",
 369 |        "              <td class=\"str\">population</td>\n",
 370 |        "            \n",
 371 |        "          \n",
 372 |        "            \n",
 373 |        "              <td class=\"int\">500</td>\n",
 374 |        "            \n",
 375 |        "          \n",
 376 |        "            \n",
 377 |        "              <td class=\"str\">View</td>\n",
 378 |        "            \n",
 379 |        "          \n",
 380 |        "      </tr>\n",
 381 |        "    \n",
 382 |        "      <tr>\n",
 383 |        "        <th>1</th>\n",
 384 |        "          \n",
 385 |        "            \n",
 386 |        "              <td class=\"str\">train</td>\n",
 387 |        "            \n",
 388 |        "          \n",
 389 |        "            \n",
 390 |        "              <td class=\"str\">population</td>\n",
 391 |        "            \n",
 392 |        "          \n",
 393 |        "            \n",
 394 |        "              <td class=\"int\">1708</td>\n",
 395 |        "            \n",
 396 |        "          \n",
 397 |        "            \n",
 398 |        "              <td class=\"str\">View</td>\n",
 399 |        "            \n",
 400 |        "          \n",
 401 |        "      </tr>\n",
 402 |        "    \n",
 403 |        "      <tr>\n",
 404 |        "        <th>2</th>\n",
 405 |        "          \n",
 406 |        "            \n",
 407 |        "              <td class=\"str\">validation</td>\n",
 408 |        "            \n",
 409 |        "          \n",
 410 |        "            \n",
 411 |        "              <td class=\"str\">population</td>\n",
 412 |        "            \n",
 413 |        "          \n",
 414 |        "            \n",
 415 |        "              <td class=\"int\">500</td>\n",
 416 |        "            \n",
 417 |        "          \n",
 418 |        "            \n",
 419 |        "              <td class=\"str\">View</td>\n",
 420 |        "            \n",
 421 |        "          \n",
 422 |        "      </tr>\n",
 423 |        "    \n",
 424 |        "  </tbody>\n",
 425 |        "</table>\n",
 426 |        "</div>\n",
 427 |        "<div style='float: left;'>\n",
 428 |        "<div style='margin-bottom: 10px; font-size: 1rem;'>peripheral</div>\n",
 429 |        "    <style>\n",
 430 |        "  th {\n",
 431 |        "    text-align: left !important;\n",
 432 |        "  }\n",
 433 |        "  td {\n",
 434 |        "    text-align: left !important;\n",
 435 |        "  }\n",
 436 |        "  th:nth-child(1) {\n",
 437 |        "    text-align: right;\n",
 438 |        "    border-right: 1px solid LightGray;\n",
 439 |        "  }\n",
 440 |        "  th.float {\n",
 441 |        "    text-align: right !important;\n",
 442 |        "  }\n",
 443 |        "  td.float {\n",
 444 |        "    text-align: right !important;\n",
 445 |        "  }\n",
 446 |        "  th.int {\n",
 447 |        "    text-align: right !important;\n",
 448 |        "  }\n",
 449 |        "  td.int {\n",
 450 |        "    text-align: right !important;\n",
 451 |        "  }\n",
 452 |        "</style>\n",
 453 |        "\n",
 454 |        "<table class=\"dataframe\">\n",
 455 |        "  <thead>\n",
 456 |        "    <tr>\n",
 457 |        "      \n",
 458 |        "        \n",
 459 |        "          <th class=\"int\"> </th>\n",
 460 |        "        \n",
 461 |        "      \n",
 462 |        "        \n",
 463 |        "          <th class=\"str\">alias  </th>\n",
 464 |        "        \n",
 465 |        "      \n",
 466 |        "        \n",
 467 |        "          <th class=\"str\">name      </th>\n",
 468 |        "        \n",
 469 |        "      \n",
 470 |        "        \n",
 471 |        "          <th class=\"int\"> rows</th>\n",
 472 |        "        \n",
 473 |        "      \n",
 474 |        "        \n",
 475 |        "          <th class=\"str\">type     </th>\n",
 476 |        "        \n",
 477 |        "      \n",
 478 |        "    </tr>\n",
 479 |        "    \n",
 480 |        "  </thead>\n",
 481 |        "  <tbody>\n",
 482 |        "    \n",
 483 |        "      <tr>\n",
 484 |        "        <th>0</th>\n",
 485 |        "          \n",
 486 |        "            \n",
 487 |        "              <td class=\"str\">cites</td>\n",
 488 |        "            \n",
 489 |        "          \n",
 490 |        "            \n",
 491 |        "              <td class=\"str\">cites</td>\n",
 492 |        "            \n",
 493 |        "          \n",
 494 |        "            \n",
 495 |        "              <td class=\"int\">5429</td>\n",
 496 |        "            \n",
 497 |        "          \n",
 498 |        "            \n",
 499 |        "              <td class=\"str\">DataFrame</td>\n",
 500 |        "            \n",
 501 |        "          \n",
 502 |        "      </tr>\n",
 503 |        "    \n",
 504 |        "      <tr>\n",
 505 |        "        <th>1</th>\n",
 506 |        "          \n",
 507 |        "            \n",
 508 |        "              <td class=\"str\">content</td>\n",
 509 |        "            \n",
 510 |        "          \n",
 511 |        "            \n",
 512 |        "              <td class=\"str\">content</td>\n",
 513 |        "            \n",
 514 |        "          \n",
 515 |        "            \n",
 516 |        "              <td class=\"int\">49216</td>\n",
 517 |        "            \n",
 518 |        "          \n",
 519 |        "            \n",
 520 |        "              <td class=\"str\">DataFrame</td>\n",
 521 |        "            \n",
 522 |        "          \n",
 523 |        "      </tr>\n",
 524 |        "    \n",
 525 |        "      <tr>\n",
 526 |        "        <th>2</th>\n",
 527 |        "          \n",
 528 |        "            \n",
 529 |        "              <td class=\"str\">paper</td>\n",
 530 |        "            \n",
 531 |        "          \n",
 532 |        "            \n",
 533 |        "              <td class=\"str\">population</td>\n",
 534 |        "            \n",
 535 |        "          \n",
 536 |        "            \n",
 537 |        "              <td class=\"int\">2708</td>\n",
 538 |        "            \n",
 539 |        "          \n",
 540 |        "            \n",
 541 |        "              <td class=\"str\">DataFrame</td>\n",
 542 |        "            \n",
 543 |        "          \n",
 544 |        "      </tr>\n",
 545 |        "    \n",
 546 |        "  </tbody>\n",
 547 |        "</table>\n",
 548 |        "</div>\n",
 549 |        "</div>"
 550 |       ],
 551 |       "text/plain": [
 552 |        "population\n",
 553 |        "    subset       name         rows   type\n",
 554 |        "0   test         population    500   View\n",
 555 |        "1   train        population   1708   View\n",
 556 |        "2   validation   population    500   View\n",
 557 |        "\n",
 558 |        "peripheral\n",
 559 |        "    alias     name          rows   type     \n",
 560 |        "0   cites     cites         5429   DataFrame\n",
 561 |        "1   content   content      49216   DataFrame\n",
 562 |        "2   paper     population    2708   DataFrame"
 563 |       ]
 564 |      },
 565 |      "execution_count": 12,
 566 |      "metadata": {},
 567 |      "output_type": "execute_result"
 568 |     }
 569 |    ],
 570 |    "source": [
 571 |     "container = getml.data.Container(population=data_full, split=split)\n",
 572 |     "container.add(cites=cites, content=content, paper=paper)\n",
 573 |     "container.freeze()\n",
 574 |     "container"
 575 |    ]
 576 |   },
 577 |   {
 578 |    "cell_type": "markdown",
 579 |    "metadata": {},
 580 |    "source": [
 581 |     "### 2. Predictive modeling\n",
 582 |     "\n",
 583 |     "We loaded the data and defined the roles and units. Next, we create a getML pipeline for relational learning."
 584 |    ]
 585 |   },
 586 |   {
 587 |    "cell_type": "markdown",
 588 |    "metadata": {},
 589 |    "source": [
 590 |     "#### 2.1 Define relational model\n",
 591 |     "\n",
 592 |     "To get started with relational learning, we need to specify the data model. Even though the data set itself is quite simple with only three tables and six columns in total, the resulting data model is actually quite complicated.\n",
 593 |     "\n",
 594 |     "That is because the class label can be predicting using three different pieces of information:\n",
 595 |     "\n",
 596 |     "- The keywords used by the paper\n",
 597 |     "- The keywords used by papers it cites and by papers that cite the paper\n",
 598 |     "- The class label of papers it cites and by papers that cite the paper\n",
 599 |     "\n",
 600 |     "The main challenge here is that `cites` is used twice, once to connect the _cited_ papers and then to connect the _citing_ papers. To resolve this, we need two placeholders on `cites`."
 601 |    ]
 602 |   },
 603 |   {
 604 |    "cell_type": "code",
 605 |    "execution_count": 13,
 606 |    "metadata": {},
 607 |    "outputs": [
 608 |     {
 609 |      "data": {
 610 |       "text/html": [
 611 |        "            <div style='margin-top: 15px; margin-bottom: 5px;'>\n",
 612 |        "            <div style='margin-bottom: 10px; font-size: 1rem;'>diagram</div>\n",
 613 |        "            <div style=\"height:540px;width:1160px;position:relative;\"><svg height=\"530\" width=\"1150\"><rect y=\"0\" x=\"0\" rx=\"10\" ry=\"10\" width=\"150\" height=\"90\" style=\"fill:#6829c2;stroke-width:0;\" /><text y=\"73.8\" x=\"75.0\" dominant-baseline=\"middle\" text-anchor=\"middle\" fill=\"white\">content</text><rect x=\"51\" y=\"10\" rx=\"4\" ry=\"4\" width=\"48\" height=\"48\" style=\" fill:#6829c2;stroke:#ffffff;stroke-width:3;\" /><line x1=\"67.0\" y1=\"10\" x2=\"67.0\" y2=\"58\" style=\"stroke:white;stroke-width:3\" /><line x1=\"83.0\" y1=\"10\" x2=\"83.0\" y2=\"58\" style=\"stroke:white;stroke-width:3\" /><line x1=\"51\" y1=\"26.0\" x2=\"99\" y2=\"26.0\" style=\"stroke:white;stroke-width:3\" /><line x1=\"51\" y1=\"42.0\" x2=\"99\" y2=\"42.0\" style=\"stroke:white;stroke-width:3\" /><rect y=\"110\" x=\"0\" rx=\"10\" ry=\"10\" width=\"150\" height=\"90\" style=\"fill:#6829c2;stroke-width:0;\" /><text y=\"183.8\" x=\"75.0\" dominant-baseline=\"middle\" text-anchor=\"middle\" fill=\"white\">paper</text><rect x=\"51\" y=\"120\" rx=\"4\" ry=\"4\" width=\"48\" height=\"48\" style=\" fill:#6829c2;stroke:#ffffff;stroke-width:3;\" /><line x1=\"67.0\" y1=\"120\" x2=\"67.0\" y2=\"168\" style=\"stroke:white;stroke-width:3\" /><line x1=\"83.0\" y1=\"120\" x2=\"83.0\" y2=\"168\" style=\"stroke:white;stroke-width:3\" /><line x1=\"51\" y1=\"136.0\" x2=\"99\" y2=\"136.0\" style=\"stroke:white;stroke-width:3\" /><line x1=\"51\" y1=\"152.0\" x2=\"99\" y2=\"152.0\" style=\"stroke:white;stroke-width:3\" /><rect y=\"110\" x=\"500\" rx=\"10\" ry=\"10\" width=\"150\" height=\"90\" style=\"fill:#6829c2;stroke-width:0;\" /><text y=\"183.8\" x=\"575.0\" dominant-baseline=\"middle\" text-anchor=\"middle\" fill=\"white\">cites</text><rect x=\"551\" y=\"120\" rx=\"4\" ry=\"4\" width=\"48\" height=\"48\" style=\" fill:#6829c2;stroke:#ffffff;stroke-width:3;\" /><line x1=\"567.0\" y1=\"120\" x2=\"567.0\" y2=\"168\" style=\"stroke:white;stroke-width:3\" /><line x1=\"583.0\" y1=\"120\" x2=\"583.0\" y2=\"168\" style=\"stroke:white;stroke-width:3\" /><line x1=\"551\" y1=\"136.0\" x2=\"599\" y2=\"136.0\" style=\"stroke:white;stroke-width:3\" /><line x1=\"551\" y1=\"152.0\" x2=\"599\" y2=\"152.0\" style=\"stroke:white;stroke-width:3\" /><rect y=\"220\" x=\"0\" rx=\"10\" ry=\"10\" width=\"150\" height=\"90\" style=\"fill:#6829c2;stroke-width:0;\" /><text y=\"293.8\" x=\"75.0\" dominant-baseline=\"middle\" text-anchor=\"middle\" fill=\"white\">content</text><rect x=\"51\" y=\"230\" rx=\"4\" ry=\"4\" width=\"48\" height=\"48\" style=\" fill:#6829c2;stroke:#ffffff;stroke-width:3;\" /><line x1=\"67.0\" y1=\"230\" x2=\"67.0\" y2=\"278\" style=\"stroke:white;stroke-width:3\" /><line x1=\"83.0\" y1=\"230\" x2=\"83.0\" y2=\"278\" style=\"stroke:white;stroke-width:3\" /><line x1=\"51\" y1=\"246.0\" x2=\"99\" y2=\"246.0\" style=\"stroke:white;stroke-width:3\" /><line x1=\"51\" y1=\"262.0\" x2=\"99\" y2=\"262.0\" style=\"stroke:white;stroke-width:3\" /><rect y=\"330\" x=\"0\" rx=\"10\" ry=\"10\" width=\"150\" height=\"90\" style=\"fill:#6829c2;stroke-width:0;\" /><text y=\"403.8\" x=\"75.0\" dominant-baseline=\"middle\" text-anchor=\"middle\" fill=\"white\">paper</text><rect x=\"51\" y=\"340\" rx=\"4\" ry=\"4\" width=\"48\" height=\"48\" style=\" fill:#6829c2;stroke:#ffffff;stroke-width:3;\" /><line x1=\"67.0\" y1=\"340\" x2=\"67.0\" y2=\"388\" style=\"stroke:white;stroke-width:3\" /><line x1=\"83.0\" y1=\"340\" x2=\"83.0\" y2=\"388\" style=\"stroke:white;stroke-width:3\" /><line x1=\"51\" y1=\"356.0\" x2=\"99\" y2=\"356.0\" style=\"stroke:white;stroke-width:3\" /><line x1=\"51\" y1=\"372.0\" x2=\"99\" y2=\"372.0\" style=\"stroke:white;stroke-width:3\" /><rect y=\"330\" x=\"500\" rx=\"10\" ry=\"10\" width=\"150\" height=\"90\" style=\"fill:#6829c2;stroke-width:0;\" /><text y=\"403.8\" x=\"575.0\" dominant-baseline=\"middle\" text-anchor=\"middle\" fill=\"white\">cites</text><rect x=\"551\" y=\"340\" rx=\"4\" ry=\"4\" width=\"48\" height=\"48\" style=\" fill:#6829c2;stroke:#ffffff;stroke-width:3;\" /><line x1=\"567.0\" y1=\"340\" x2=\"567.0\" y2=\"388\" style=\"stroke:white;stroke-width:3\" /><line x1=\"583.0\" y1=\"340\" x2=\"583.0\" y2=\"388\" style=\"stroke:white;stroke-width:3\" /><line x1=\"551\" y1=\"356.0\" x2=\"599\" y2=\"356.0\" style=\"stroke:white;stroke-width:3\" /><line x1=\"551\" y1=\"372.0\" x2=\"599\" y2=\"372.0\" style=\"stroke:white;stroke-width:3\" /><rect y=\"440\" x=\"500\" rx=\"10\" ry=\"10\" width=\"150\" height=\"90\" style=\"fill:#6829c2;stroke-width:0;\" /><text y=\"513.8\" x=\"575.0\" dominant-baseline=\"middle\" text-anchor=\"middle\" fill=\"white\">content</text><rect x=\"551\" y=\"450\" rx=\"4\" ry=\"4\" width=\"48\" height=\"48\" style=\" fill:#6829c2;stroke:#ffffff;stroke-width:3;\" /><line x1=\"567.0\" y1=\"450\" x2=\"567.0\" y2=\"498\" style=\"stroke:white;stroke-width:3\" /><line x1=\"583.0\" y1=\"450\" x2=\"583.0\" y2=\"498\" style=\"stroke:white;stroke-width:3\" /><line x1=\"551\" y1=\"466.0\" x2=\"599\" y2=\"466.0\" style=\"stroke:white;stroke-width:3\" /><line x1=\"551\" y1=\"482.0\" x2=\"599\" y2=\"482.0\" style=\"stroke:white;stroke-width:3\" /><rect y=\"440\" x=\"1000\" rx=\"10\" ry=\"10\" width=\"150\" height=\"90\" style=\"fill:#6829c2;stroke-width:0;\" /><text y=\"513.8\" x=\"1075.0\" dominant-baseline=\"middle\" text-anchor=\"middle\" fill=\"white\">population</text><rect x=\"1051\" y=\"450\" rx=\"4\" ry=\"4\" width=\"48\" height=\"48\" style=\" fill:#6829c2;stroke:#ffffff;stroke-width:3;\" /><line x1=\"1067.0\" y1=\"450\" x2=\"1067.0\" y2=\"498\" style=\"stroke:white;stroke-width:3\" /><line x1=\"1083.0\" y1=\"450\" x2=\"1083.0\" y2=\"498\" style=\"stroke:white;stroke-width:3\" /><line x1=\"1051\" y1=\"466.0\" x2=\"1099\" y2=\"466.0\" style=\"stroke:white;stroke-width:3\" /><line x1=\"1051\" y1=\"482.0\" x2=\"1099\" y2=\"482.0\" style=\"stroke:white;stroke-width:3\" /><line x1=\"150\" y1=\"43.0\" x2=\"573.0\" y2=\"43.0\" style=\"stroke:#808080;;stroke-width:4\" /><line x1=\"573.0\" y1=\"41.0\" x2=\"573.0\" y2=\"100\" style=\"stroke:#808080;;stroke-width:4\" /><polygon points=\"573.0, 110 567.0, 100 579.0, 100 \" style=\"fill:#808080;;stroke-width:0;\" /><rect y=\"10.0\" x=\"249.0\" rx=\"10\" ry=\"10\" width=\"150\" height=\"70\" style=\"fill:#6829c2;stroke-width:0;\" /><text dominant-baseline=\"middle\" text-anchor=\"middle\" fill=\"white\"><tspan y=\"45.0\" x=\"324.0\" font-size=\"7pt\" >paper_id = citing_paper_id</tspan></text><line x1=\"150\" y1=\"153.0\" x2=\"490\" y2=\"153.0\" style=\"stroke:#808080;;stroke-width:4\" /><polygon points=\"500, 153.0 490, 147.0 490, 159.0 \" style=\"fill:#808080;;stroke-width:0;\" /><rect y=\"120.0\" x=\"249.0\" rx=\"10\" ry=\"10\" width=\"150\" height=\"70\" style=\"fill:#6829c2;stroke-width:0;\" /><text dominant-baseline=\"middle\" text-anchor=\"middle\" fill=\"white\"><tspan y=\"150.0\" x=\"324.0\" font-size=\"7pt\" >paper_id = citing_paper_id</tspan><tspan y=\"160.0\" x=\"324.0\" font-size=\"7pt\" >Relationship: many-to-one</tspan></text><line x1=\"150\" y1=\"263.0\" x2=\"573.0\" y2=\"263.0\" style=\"stroke:#808080;;stroke-width:4\" /><line x1=\"573.0\" y1=\"261.0\" x2=\"573.0\" y2=\"320\" style=\"stroke:#808080;;stroke-width:4\" /><polygon points=\"573.0, 330 567.0, 320 579.0, 320 \" style=\"fill:#808080;;stroke-width:0;\" /><rect y=\"230.0\" x=\"249.0\" rx=\"10\" ry=\"10\" width=\"150\" height=\"70\" style=\"fill:#6829c2;stroke-width:0;\" /><text dominant-baseline=\"middle\" text-anchor=\"middle\" fill=\"white\"><tspan y=\"265.0\" x=\"324.0\" font-size=\"7pt\" >paper_id = cited_paper_id</tspan></text><line x1=\"150\" y1=\"373.0\" x2=\"490\" y2=\"373.0\" style=\"stroke:#808080;;stroke-width:4\" /><polygon points=\"500, 373.0 490, 367.0 490, 379.0 \" style=\"fill:#808080;;stroke-width:0;\" /><rect y=\"340.0\" x=\"249.0\" rx=\"10\" ry=\"10\" width=\"150\" height=\"70\" style=\"fill:#6829c2;stroke-width:0;\" /><text dominant-baseline=\"middle\" text-anchor=\"middle\" fill=\"white\"><tspan y=\"370.0\" x=\"324.0\" font-size=\"7pt\" >paper_id = cited_paper_id</tspan><tspan y=\"380.0\" x=\"324.0\" font-size=\"7pt\" >Relationship: many-to-one</tspan></text><line x1=\"650\" y1=\"153.0\" x2=\"1073.0\" y2=\"153.0\" style=\"stroke:#808080;;stroke-width:4\" /><line x1=\"1073.0\" y1=\"151.0\" x2=\"1073.0\" y2=\"430\" style=\"stroke:#808080;;stroke-width:4\" /><polygon points=\"1073.0, 440 1067.0, 430 1079.0, 430 \" style=\"fill:#808080;;stroke-width:0;\" /><rect y=\"120.0\" x=\"749.0\" rx=\"10\" ry=\"10\" width=\"150\" height=\"70\" style=\"fill:#6829c2;stroke-width:0;\" /><text dominant-baseline=\"middle\" text-anchor=\"middle\" fill=\"white\"><tspan y=\"155.0\" x=\"824.0\" font-size=\"7pt\" >cited_paper_id = paper_id</tspan></text><line x1=\"650\" y1=\"373.0\" x2=\"1073.0\" y2=\"373.0\" style=\"stroke:#808080;;stroke-width:4\" /><line x1=\"1073.0\" y1=\"371.0\" x2=\"1073.0\" y2=\"430\" style=\"stroke:#808080;;stroke-width:4\" /><polygon points=\"1073.0, 440 1067.0, 430 1079.0, 430 \" style=\"fill:#808080;;stroke-width:0;\" /><rect y=\"340.0\" x=\"749.0\" rx=\"10\" ry=\"10\" width=\"150\" height=\"70\" style=\"fill:#6829c2;stroke-width:0;\" /><text dominant-baseline=\"middle\" text-anchor=\"middle\" fill=\"white\"><tspan y=\"375.0\" x=\"824.0\" font-size=\"7pt\" >citing_paper_id = paper_id</tspan></text><line x1=\"650\" y1=\"483.0\" x2=\"990\" y2=\"483.0\" style=\"stroke:#808080;;stroke-width:4\" /><polygon points=\"1000, 483.0 990, 477.0 990, 489.0 \" style=\"fill:#808080;;stroke-width:0;\" /><rect y=\"450.0\" x=\"749.0\" rx=\"10\" ry=\"10\" width=\"150\" height=\"70\" style=\"fill:#6829c2;stroke-width:0;\" /><text dominant-baseline=\"middle\" text-anchor=\"middle\" fill=\"white\"><tspan y=\"485.0\" x=\"824.0\" font-size=\"7pt\" >paper_id = paper_id</tspan></text></svg></div>\n",
 614 |        "            </div>\n",
 615 |        "\n",
 616 |        "            <div style='margin-top: 15px;'>\n",
 617 |        "            <div style='margin-bottom: 10px; font-size: 1rem;'>staging</div>\n",
 618 |        "            <style>\n",
 619 |        "  th {\n",
 620 |        "    text-align: left !important;\n",
 621 |        "  }\n",
 622 |        "  td {\n",
 623 |        "    text-align: left !important;\n",
 624 |        "  }\n",
 625 |        "  th:nth-child(1) {\n",
 626 |        "    text-align: right;\n",
 627 |        "    border-right: 1px solid LightGray;\n",
 628 |        "  }\n",
 629 |        "  th.float {\n",
 630 |        "    text-align: right !important;\n",
 631 |        "  }\n",
 632 |        "  td.float {\n",
 633 |        "    text-align: right !important;\n",
 634 |        "  }\n",
 635 |        "  th.int {\n",
 636 |        "    text-align: right !important;\n",
 637 |        "  }\n",
 638 |        "  td.int {\n",
 639 |        "    text-align: right !important;\n",
 640 |        "  }\n",
 641 |        "</style>\n",
 642 |        "\n",
 643 |        "<table class=\"dataframe\">\n",
 644 |        "  <thead>\n",
 645 |        "    <tr>\n",
 646 |        "      \n",
 647 |        "        \n",
 648 |        "          <th class=\"int\"> </th>\n",
 649 |        "        \n",
 650 |        "      \n",
 651 |        "        \n",
 652 |        "          <th class=\"str\">data frames </th>\n",
 653 |        "        \n",
 654 |        "      \n",
 655 |        "        \n",
 656 |        "          <th class=\"str\">staging table              </th>\n",
 657 |        "        \n",
 658 |        "      \n",
 659 |        "    </tr>\n",
 660 |        "    \n",
 661 |        "  </thead>\n",
 662 |        "  <tbody>\n",
 663 |        "    \n",
 664 |        "      <tr>\n",
 665 |        "        <th>0</th>\n",
 666 |        "          \n",
 667 |        "            \n",
 668 |        "              <td class=\"str\">population</td>\n",
 669 |        "            \n",
 670 |        "          \n",
 671 |        "            \n",
 672 |        "              <td class=\"str\">POPULATION__STAGING_TABLE_1</td>\n",
 673 |        "            \n",
 674 |        "          \n",
 675 |        "      </tr>\n",
 676 |        "    \n",
 677 |        "      <tr>\n",
 678 |        "        <th>1</th>\n",
 679 |        "          \n",
 680 |        "            \n",
 681 |        "              <td class=\"str\">cites, paper</td>\n",
 682 |        "            \n",
 683 |        "          \n",
 684 |        "            \n",
 685 |        "              <td class=\"str\">CITES__STAGING_TABLE_2</td>\n",
 686 |        "            \n",
 687 |        "          \n",
 688 |        "      </tr>\n",
 689 |        "    \n",
 690 |        "      <tr>\n",
 691 |        "        <th>2</th>\n",
 692 |        "          \n",
 693 |        "            \n",
 694 |        "              <td class=\"str\">cites, paper</td>\n",
 695 |        "            \n",
 696 |        "          \n",
 697 |        "            \n",
 698 |        "              <td class=\"str\">CITES__STAGING_TABLE_3</td>\n",
 699 |        "            \n",
 700 |        "          \n",
 701 |        "      </tr>\n",
 702 |        "    \n",
 703 |        "      <tr>\n",
 704 |        "        <th>3</th>\n",
 705 |        "          \n",
 706 |        "            \n",
 707 |        "              <td class=\"str\">content</td>\n",
 708 |        "            \n",
 709 |        "          \n",
 710 |        "            \n",
 711 |        "              <td class=\"str\">CONTENT__STAGING_TABLE_4</td>\n",
 712 |        "            \n",
 713 |        "          \n",
 714 |        "      </tr>\n",
 715 |        "    \n",
 716 |        "  </tbody>\n",
 717 |        "</table>\n",
 718 |        "            </div>\n",
 719 |        "            "
 720 |       ],
 721 |       "text/plain": [
 722 |        "population:\n",
 723 |        "  columns:\n",
 724 |        "  - class_label: categorical\n",
 725 |        "  - paper_id: join_key\n",
 726 |        "\n",
 727 |        "  joins:\n",
 728 |        "  - right: 'cites'\n",
 729 |        "    on: \n",
 730 |        "    - (population.paper_id, cites.cited_paper_id)\n",
 731 |        "    relationship: 'many-to-many'\n",
 732 |        "    lagged_targets: False\n",
 733 |        "  - right: 'cites'\n",
 734 |        "    on: \n",
 735 |        "    - (population.paper_id, cites.citing_paper_id)\n",
 736 |        "    relationship: 'many-to-many'\n",
 737 |        "    lagged_targets: False\n",
 738 |        "  - right: 'content'\n",
 739 |        "    on: \n",
 740 |        "    - (population.paper_id, content.paper_id)\n",
 741 |        "    relationship: 'many-to-many'\n",
 742 |        "    lagged_targets: False\n",
 743 |        "\n",
 744 |        "cites:\n",
 745 |        "  columns:\n",
 746 |        "  - cited_paper_id: join_key\n",
 747 |        "  - citing_paper_id: join_key\n",
 748 |        "\n",
 749 |        "  joins:\n",
 750 |        "  - right: 'content'\n",
 751 |        "    on: \n",
 752 |        "    - (cites.citing_paper_id, content.paper_id)\n",
 753 |        "    relationship: 'many-to-many'\n",
 754 |        "    lagged_targets: False\n",
 755 |        "  - right: 'paper'\n",
 756 |        "    on: \n",
 757 |        "    - (cites.citing_paper_id, paper.paper_id)\n",
 758 |        "    relationship: 'many-to-one'\n",
 759 |        "    lagged_targets: False\n",
 760 |        "\n",
 761 |        "content:\n",
 762 |        "  columns:\n",
 763 |        "  - word_cited_id: categorical\n",
 764 |        "  - paper_id: join_key\n",
 765 |        "\n",
 766 |        "paper:\n",
 767 |        "  columns:\n",
 768 |        "  - class_label: categorical\n",
 769 |        "  - paper_id: join_key\n",
 770 |        "\n",
 771 |        "cites:\n",
 772 |        "  columns:\n",
 773 |        "  - cited_paper_id: join_key\n",
 774 |        "  - citing_paper_id: join_key\n",
 775 |        "\n",
 776 |        "  joins:\n",
 777 |        "  - right: 'content'\n",
 778 |        "    on: \n",
 779 |        "    - (cites.cited_paper_id, content.paper_id)\n",
 780 |        "    relationship: 'many-to-many'\n",
 781 |        "    lagged_targets: False\n",
 782 |        "  - right: 'paper'\n",
 783 |        "    on: \n",
 784 |        "    - (cites.cited_paper_id, paper.paper_id)\n",
 785 |        "    relationship: 'many-to-one'\n",
 786 |        "    lagged_targets: False\n",
 787 |        "\n",
 788 |        "content:\n",
 789 |        "  columns:\n",
 790 |        "  - word_cited_id: categorical\n",
 791 |        "  - paper_id: join_key\n",
 792 |        "\n",
 793 |        "paper:\n",
 794 |        "  columns:\n",
 795 |        "  - class_label: categorical\n",
 796 |        "  - paper_id: join_key\n",
 797 |        "\n",
 798 |        "content:\n",
 799 |        "  columns:\n",
 800 |        "  - word_cited_id: categorical\n",
 801 |        "  - paper_id: join_key"
 802 |       ]
 803 |      },
 804 |      "execution_count": 13,
 805 |      "metadata": {},
 806 |      "output_type": "execute_result"
 807 |     }
 808 |    ],
 809 |    "source": [
 810 |     "dm = getml.data.DataModel(paper.to_placeholder(\"population\"))\n",
 811 |     "\n",
 812 |     "# We need two different placeholders for cites.\n",
 813 |     "dm.add(getml.data.to_placeholder(cites=[cites] * 2, content=content, paper=paper))\n",
 814 |     "\n",
 815 |     "dm.population.join(dm.cites[0], on=(\"paper_id\", \"cited_paper_id\"))\n",
 816 |     "\n",
 817 |     "dm.cites[0].join(dm.content, on=(\"citing_paper_id\", \"paper_id\"))\n",
 818 |     "\n",
 819 |     "dm.cites[0].join(\n",
 820 |     "    dm.paper,\n",
 821 |     "    on=(\"citing_paper_id\", \"paper_id\"),\n",
 822 |     "    relationship=getml.data.relationship.many_to_one,\n",
 823 |     ")\n",
 824 |     "\n",
 825 |     "dm.population.join(dm.cites[1], on=(\"paper_id\", \"citing_paper_id\"))\n",
 826 |     "\n",
 827 |     "dm.cites[1].join(dm.content, on=(\"cited_paper_id\", \"paper_id\"))\n",
 828 |     "\n",
 829 |     "dm.cites[1].join(\n",
 830 |     "    dm.paper,\n",
 831 |     "    on=(\"cited_paper_id\", \"paper_id\"),\n",
 832 |     "    relationship=getml.data.relationship.many_to_one,\n",
 833 |     ")\n",
 834 |     "\n",
 835 |     "dm.population.join(dm.content, on=\"paper_id\")\n",
 836 |     "\n",
 837 |     "dm"
 838 |    ]
 839 |   },
 840 |   {
 841 |    "cell_type": "markdown",
 842 |    "metadata": {},
 843 |    "source": [
 844 |     "## 2.2. Hyperparameter Search\n",
 845 |     "To mimic the approach of the GNN paper, we conduct a small Hyperparameter search, train on the train data, validate on the validate data and use the untouched test data as holdout set to get an unbiased estimate of the true performance.\n",
 846 |     "For expediency, we make a grit search along two dimensions and keep the number of levels deliberately small:\n",
 847 |     " \n",
 848 |     "    num_features: 250, 300, 350\n",
 849 |     "    built-in aggregation sets: minimal, default, all"
 850 |    ]
 851 |   },
 852 |   {
 853 |    "cell_type": "code",
 854 |    "execution_count": 14,
 855 |    "metadata": {},
 856 |    "outputs": [],
 857 |    "source": [
 858 |     "mapping = getml.preprocessors.Mapping()\n",
 859 |     "predictor = getml.predictors.XGBoostClassifier()\n",
 860 |     "\n",
 861 |     "actual_labels_val = paper[split == \"validation\"].class_label.to_numpy()\n",
 862 |     "actual_labels_test = paper[split == \"test\"].class_label.to_numpy()\n",
 863 |     "class_label = paper.class_label.unique()\n",
 864 |     "\n",
 865 |     "pipe1 = getml.pipeline.Pipeline(\n",
 866 |     "    data_model=dm, preprocessors=[mapping], predictors=[predictor]\n",
 867 |     ")"
 868 |    ]
 869 |   },
 870 |   {
 871 |    "cell_type": "code",
 872 |    "execution_count": 15,
 873 |    "metadata": {},
 874 |    "outputs": [],
 875 |    "source": [
 876 |     "def prob_to_acc(prob, actual_labels, class_label) -> float:\n",
 877 |     "    ix_max = np.argmax(prob, axis=1)\n",
 878 |     "    predicted_labels = np.asarray([class_label[ix] for ix in ix_max])\n",
 879 |     "    return (actual_labels == predicted_labels).sum() / len(actual_labels)"
 880 |    ]
 881 |   },
 882 |   {
 883 |    "cell_type": "code",
 884 |    "execution_count": 16,
 885 |    "metadata": {},
 886 |    "outputs": [],
 887 |    "source": [
 888 |     "%%capture\n",
 889 |     "parameter_sweep = {}\n",
 890 |     "i = 0\n",
 891 |     "for num_feat in [250, 300, 350]:\n",
 892 |     "    for aggregation_set in [\n",
 893 |     "        getml.feature_learning.aggregations.FASTPROP.Minimal,\n",
 894 |     "        getml.feature_learning.aggregations.FASTPROP.Default,\n",
 895 |     "        getml.feature_learning.aggregations.FASTPROP.All,\n",
 896 |     "    ]:\n",
 897 |     "        fast_prop = getml.feature_learning.FastProp(\n",
 898 |     "            loss_function=getml.feature_learning.loss_functions.CrossEntropyLoss,\n",
 899 |     "            aggregation=aggregation_set,\n",
 900 |     "            num_features=num_feat,\n",
 901 |     "        )\n",
 902 |     "\n",
 903 |     "        pipe1.feature_learners = [fast_prop]\n",
 904 |     "\n",
 905 |     "        pipe1.fit(container.train)\n",
 906 |     "\n",
 907 |     "        probs_val = pipe1.predict(container.validation)\n",
 908 |     "        val_acc = prob_to_acc(probs_val, actual_labels_val, class_label)\n",
 909 |     "\n",
 910 |     "        parameter_sweep[i] = {\n",
 911 |     "            \"num_feat\": num_feat,\n",
 912 |     "            \"agg_set\": aggregation_set,\n",
 913 |     "            \"val_acc\": val_acc,\n",
 914 |     "        }\n",
 915 |     "\n",
 916 |     "        i += 1"
 917 |    ]
 918 |   },
 919 |   {
 920 |    "cell_type": "code",
 921 |    "execution_count": 17,
 922 |    "metadata": {},
 923 |    "outputs": [],
 924 |    "source": [
 925 |     "best_val_acc_comb = list(\n",
 926 |     "    sorted(parameter_sweep.items(), key=lambda item: item[1][\"val_acc\"], reverse=True)\n",
 927 |     ")[0][1]"
 928 |    ]
 929 |   },
 930 |   {
 931 |    "cell_type": "code",
 932 |    "execution_count": 18,
 933 |    "metadata": {},
 934 |    "outputs": [
 935 |     {
 936 |      "name": "stdout",
 937 |      "output_type": "stream",
 938 |      "text": [
 939 |       "Accuracy on validation set: 0.876\n",
 940 |       "Number of features used: 300\n",
 941 |       "Aggregation set used: frozenset({'MAX', 'SUM', 'AVG', 'COUNT', 'MIN'})\n"
 942 |      ]
 943 |     }
 944 |    ],
 945 |    "source": [
 946 |     "print(f\"Accuracy on validation set: {best_val_acc_comb['val_acc']}\")\n",
 947 |     "print(f\"Number of features used: {best_val_acc_comb['num_feat']}\")\n",
 948 |     "print(f\"Aggregation set used: {best_val_acc_comb['agg_set']}\")"
 949 |    ]
 950 |   },
 951 |   {
 952 |    "cell_type": "markdown",
 953 |    "metadata": {},
 954 |    "source": [
 955 |     "Now as we identified the parameter combination that yields the highest accuracy on the validation set, let's use the same parameters on the hold out data to attain an unbiased estimate of the model's predictive performance."
 956 |    ]
 957 |   },
 958 |   {
 959 |    "cell_type": "code",
 960 |    "execution_count": 19,
 961 |    "metadata": {},
 962 |    "outputs": [
 963 |     {
 964 |      "data": {
 965 |       "text/html": [
 966 |        "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Checking data model<span style=\"color: #808000; text-decoration-color: #808000\">...</span>\n",
 967 |        "</pre>\n"
 968 |       ],
 969 |       "text/plain": [
 970 |        "Checking data model\u001b[33m...\u001b[0m\n"
 971 |       ]
 972 |      },
 973 |      "metadata": {},
 974 |      "output_type": "display_data"
 975 |     },
 976 |     {
 977 |      "name": "stdout",
 978 |      "output_type": "stream",
 979 |      "text": [
 980 |       "\u001b[2K  Staging... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00\n",
 981 |       "\u001b[2K  Preprocessing... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00\n",
 982 |       "\u001b[?25h"
 983 |      ]
 984 |     },
 985 |     {
 986 |      "data": {
 987 |       "text/html": [
 988 |        "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">The pipeline check generated <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span> issues labeled INFO and <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> issues labeled WARNING.\n",
 989 |        "</pre>\n"
 990 |       ],
 991 |       "text/plain": [
 992 |        "The pipeline check generated \u001b[1;36m3\u001b[0m issues labeled INFO and \u001b[1;36m0\u001b[0m issues labeled WARNING.\n"
 993 |       ]
 994 |      },
 995 |      "metadata": {},
 996 |      "output_type": "display_data"
 997 |     },
 998 |     {
 999 |      "data": {
1000 |       "text/html": [
1001 |        "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">To see the issues in full, run <span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">.check</span><span style=\"font-weight: bold\">()</span> on the pipeline.\n",
1002 |        "</pre>\n"
1003 |       ],
1004 |       "text/plain": [
1005 |        "To see the issues in full, run \u001b[1;35m.check\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m on the pipeline.\n"
1006 |       ]
1007 |      },
1008 |      "metadata": {},
1009 |      "output_type": "display_data"
1010 |     },
1011 |     {
1012 |      "name": "stdout",
1013 |      "output_type": "stream",
1014 |      "text": [
1015 |       "\u001b[2K  Staging... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00\n",
1016 |       "\u001b[2K  Preprocessing... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00\n",
1017 |       "\u001b[2K  Retrieving features from cache... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00\n",
1018 |       "\u001b[2K  FastProp: Building subfeatures... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00\n",
1019 |       "\u001b[2K  FastProp: Building subfeatures... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00\n",
1020 |       "\u001b[2K  FastProp: Building features... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00\n",
1021 |       "\u001b[?25h"
1022 |      ]
1023 |     },
1024 |     {
1025 |      "data": {
1026 |       "text/html": [
1027 |        "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Trained pipeline.\n",
1028 |        "</pre>\n"
1029 |       ],
1030 |       "text/plain": [
1031 |        "Trained pipeline.\n"
1032 |       ]
1033 |      },
1034 |      "metadata": {},
1035 |      "output_type": "display_data"
1036 |     },
1037 |     {
1038 |      "name": "stdout",
1039 |      "output_type": "stream",
1040 |      "text": [
1041 |       "Time taken: 0:00:00.518892.\n",
1042 |       "\n",
1043 |       "\u001b[2K  Staging... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00\n",
1044 |       "\u001b[2K  Preprocessing... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00\n",
1045 |       "\u001b[2K  FastProp: Building subfeatures... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00\n",
1046 |       "\u001b[2K  FastProp: Building subfeatures... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00\n",
1047 |       "\u001b[2K  FastProp: Building features... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 00:00\n",
1048 |       "\u001b[?25h"
1049 |      ]
1050 |     }
1051 |    ],
1052 |    "source": [
1053 |     "fast_prop = getml.feature_learning.FastProp(\n",
1054 |     "    loss_function=getml.feature_learning.loss_functions.CrossEntropyLoss,\n",
1055 |     "    aggregation=best_val_acc_comb[\"agg_set\"],\n",
1056 |     "    num_features=best_val_acc_comb[\"num_feat\"],\n",
1057 |     ")\n",
1058 |     "\n",
1059 |     "pipe1.feature_learners = [fast_prop]\n",
1060 |     "\n",
1061 |     "pipe1.fit(container.train)\n",
1062 |     "\n",
1063 |     "probs_test = pipe1.predict(container.test)\n",
1064 |     "test_acc = prob_to_acc(probs_test, actual_labels_test, class_label)"
1065 |    ]
1066 |   },
1067 |   {
1068 |    "cell_type": "code",
1069 |    "execution_count": 20,
1070 |    "metadata": {},
1071 |    "outputs": [
1072 |     {
1073 |      "name": "stdout",
1074 |      "output_type": "stream",
1075 |      "text": [
1076 |       "Accuracy on the test set: 0.906\n"
1077 |      ]
1078 |     }
1079 |    ],
1080 |    "source": [
1081 |     "print(f\"Accuracy on the test set: {test_acc}\")"
1082 |    ]
1083 |   },
1084 |   {
1085 |    "cell_type": "markdown",
1086 |    "metadata": {},
1087 |    "source": [
1088 |     "# Conclusion"
1089 |    ]
1090 |   },
1091 |   {
1092 |    "cell_type": "markdown",
1093 |    "metadata": {},
1094 |    "source": [
1095 |     "This notebook demonstrates how **getML**, powered by its **FastProp** feature engineering algorithm and **XGBoost**, surpasses the current state-of-the-art on the CORA dataset. By replicating the data split and hyperparameter optimization methods of Izadi et al., we achieve a record-breaking accuracy of **90.6%**, exceeding their previous benchmark of 90.16%.\n",
1096 |     "\n",
1097 |     "At the core of this success is **FastProp**, which automates feature creation for relational datasets by efficiently generating statistical and temporal aggregates.\n",
1098 |     "\n",
1099 |     "This example highlights how cutting-edge performance can be achieved without the need for manual feature engineering or complex GNN-based approaches, enabling faster iteration and greater model interpretability.\n",
1100 |     "\n",
1101 |     "By incorporating getML into their workflows, data scientists can achieve superior results with less effort, seamlessly combining efficiency with state-of-the-art performance."
1102 |    ]
1103 |   },
1104 |   {
1105 |    "cell_type": "markdown",
1106 |    "metadata": {},
1107 |    "source": [
1108 |     "# References\n",
1109 |     "\n",
1110 |     "Izadi, Fang, Stevenson, Lin (2020): Optimization of Graph Neural Networks with Natural Gradient Descent   \n",
1111 |     "https://arxiv.org/pdf/2008.09624v1"
1112 |    ]
1113 |   }
1114 |  ],
1115 |  "metadata": {
1116 |   "kernelspec": {
1117 |    "display_name": ".venv",
1118 |    "language": "python",
1119 |    "name": "python3"
1120 |   },
1121 |   "language_info": {
1122 |    "codemirror_mode": {
1123 |     "name": "ipython",
1124 |     "version": 3
1125 |    },
1126 |    "file_extension": ".py",
1127 |    "mimetype": "text/x-python",
1128 |    "name": "python",
1129 |    "nbconvert_exporter": "python",
1130 |    "pygments_lexer": "ipython3",
1131 |    "version": "3.11.4"
1132 |   }
1133 |  },
1134 |  "nbformat": 4,
1135 |  "nbformat_minor": 2
1136 | }
1137 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | services:
 2 |   notebooks:
 3 |     build:
 4 |       context: ./
 5 |       dockerfile: Dockerfile
 6 |       target: getml-demo
 7 |     networks:
 8 |       - notebooks_network
 9 |     ports:
10 |       - "1709:1709"
11 |       - "8888:8888"
12 | 
13 | networks:
14 |   notebooks_network:
15 | 


--------------------------------------------------------------------------------
/fastprop_benchmark/comparisons/air_pollution.csv:
--------------------------------------------------------------------------------
1 | ,runtime,num_features,features_per_second,normalized_runtime,normalized_runtime_per_feature,mae,rmse,rsquared
2 | getML: FastProp,0 days 00:00:13.433118,289,21.514167079021536,1.0,1.0,44.25020198393413,63.41678046742225,0.5461908346025002
3 | featuretools,0 days 00:35:42.350566,114,0.05321257909185177,159.4827474901955,404.30603902669907,43.91507134716814,62.56717509580391,0.5593690691075496
4 | tsfresh,0 days 00:01:21.479423,72,0.8836584165371371,6.065562961629608,24.34670080247843,47.11059400765295,66.59998183642728,0.5015240507138585
5 | 


--------------------------------------------------------------------------------
/fastprop_benchmark/comparisons/auc-rsquared_fps.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getml/getml-demo/73e37a8d7e76ca26f45b37484705f32c9d2343ca/fastprop_benchmark/comparisons/auc-rsquared_fps.png


--------------------------------------------------------------------------------
/fastprop_benchmark/comparisons/comparisons.py:
--------------------------------------------------------------------------------
  1 | import pathlib
  2 | 
  3 | import matplotlib.pyplot as plt
  4 | import numpy as np
  5 | import pandas as pd
  6 | 
  7 | files = pathlib.Path(".").glob("*.csv")
  8 | 
  9 | dfs = {}
 10 | 
 11 | for file in files:
 12 |     df = pd.read_csv(file, index_col=0)
 13 |     name = file.stem
 14 |     df.index = [[name] * df.shape[0], df.index]
 15 |     dfs[name] = df
 16 | 
 17 | comparisons = pd.concat(dfs.values())
 18 | 
 19 | colors = {
 20 |     "getML: FastProp": (0.25, 0.17, 0.51),
 21 |     "featuretools": (0.96, 0.60, 0.05),
 22 |     "tsfresh": (0.32, 0.71, 0.24),
 23 | }
 24 | 
 25 | # --------------------------------------------------------------------------
 26 | 
 27 | ax = (
 28 |     comparisons.speedup_per_feature.unstack()
 29 |     .iloc[:, [1, 0, 2]]
 30 |     .plot.bar(color=colors.values())
 31 | )
 32 | 
 33 | ax.set_ylabel("Normalized runtime/feature \n (getML=1)")
 34 | ax.set_title("Runtime per feature on different data sets (lower is better)")
 35 | 
 36 | plt.tight_layout()
 37 | plt.savefig("nrpf.png")
 38 | 
 39 | # --------------------------------------------------------------------------
 40 | 
 41 | fig, axes = plt.subplots(nrows=2)
 42 | 
 43 | ax2 = (
 44 |     comparisons.features_per_second.unstack()
 45 |     .iloc[:, [1, 0, 2]]
 46 |     .plot.bar(color=colors.values(), ax=axes[0])
 47 | )
 48 | 
 49 | # for container in ax2.containers:
 50 | #     ax2.bar_label(container, label_type="edge")
 51 | ax2.set_ylabel("Features created/second")
 52 | ax2.set_xticklabels([])
 53 | ax2.set_title("Features created per second (higher is better)")
 54 | 
 55 | sc_data = comparisons.copy()[["features_per_second", "rsquared"]]
 56 | sc_data.rename(columns={"rsquared": "auc/rsquared"}, inplace=True)
 57 | sc_data["auc/rsquared"]["occupancy"] = comparisons["auc"]["occupancy"].values
 58 | 
 59 | ax4 = (
 60 |     sc_data["auc/rsquared"]
 61 |     .unstack()
 62 |     .iloc[:, [1, 0, 2]]
 63 |     .plot.bar(color=colors.values(), ax=axes[1], legend=None)
 64 | )
 65 | 
 66 | ax4.set_ylabel("AUC/Rsquared")
 67 | ax4.set_title("Performance (higher is better)")
 68 | 
 69 | fig.tight_layout(pad=1)
 70 | 
 71 | plt.savefig("fps_performance.png")
 72 | 
 73 | # --------------------------------------------------------------------------
 74 | 
 75 | ax5 = (
 76 |     sc_data["auc/rsquared"].unstack().iloc[:, [1, 0, 2]].plot.bar(color=colors.values())
 77 | )
 78 | 
 79 | ax5.set_ylabel("AUC/Rsquared")
 80 | ax5.set_title("Performance (higher is better)")
 81 | 
 82 | fig.tight_layout()
 83 | 
 84 | plt.savefig("performance.png")
 85 | 
 86 | # --------------------------------------------------------------------------
 87 | 
 88 | col = [colors[tool] for tool in comparisons.index.get_level_values(1)]
 89 | 
 90 | ax3 = sc_data.plot.scatter(x="features_per_second", y="auc/rsquared", c=col)
 91 | 
 92 | # for i, dat in enumerate(sc_data.index.get_level_values(0)):
 93 | #     point = ax3.get_children()[0].get_offsets().data[i]
 94 | #     offset = (0.001, 0.01)
 95 | #     ax3.annotate(dat, (point[0] + offset[0], point[1] + offset[1]))
 96 | 
 97 | ax3.grid(True)
 98 | 
 99 | ax3.set_ylabel("AUC/Rsquared")
100 | ax3.set_xlabel("Features/second")
101 | ax3.set_title("Performance vs. speed")
102 | 
103 | plt.tight_layout()
104 | plt.savefig("auc-rsquared_fps.png")
105 | 
106 | # --------------------------------------------------------------------------
107 | 
108 | plt.style.use("seaborn")
109 | 
110 | fig, axes = plt.subplots(nrows=2)
111 | 
112 | ax = (
113 |     comparisons.speedup_per_feature.unstack()
114 |     .iloc[:, [1, 0, 2]]
115 |     .plot.bar(color=colors.values(), ax=axes[0])
116 | )
117 | 
118 | ax.set_ylabel("Normalized runtime/feature \n (getML=1)")
119 | ax.set_title("Runtime per feature on different data sets (lower is better)")
120 | ax.set_xticklabels([])
121 | 
122 | ax4 = (
123 |     sc_data["auc/rsquared"]
124 |     .unstack()
125 |     .iloc[:, [1, 0, 2]]
126 |     .plot.bar(color=colors.values(), ax=axes[1], legend=None)
127 | )
128 | 
129 | ax4.set_ylabel("AUC/Rsquared")
130 | ax4.set_title("Predictive performance (higher is better)")
131 | 
132 | fig.tight_layout(pad=1)
133 | 
134 | 
135 | plt.savefig("nrpf_performance.png")
136 | 
137 | # --------------------------------------------------------------------------
138 | 


--------------------------------------------------------------------------------
/fastprop_benchmark/comparisons/dodgers.csv:
--------------------------------------------------------------------------------
1 | ,runtime,num_features,features_per_second,normalized_runtime,normalized_runtime_per_feature,rsquared,rmse,mae
2 | getML: FastProp,0 days 00:00:12.106112,526,43.449923962633065,1.0,1.0,0.674740264167421,7.824265418275141,5.615102199203448
3 | featuretools,0 days 00:08:51.202688,59,0.11106871988302243,43.87888431892915,391.1985661525092,0.6497682308113026,8.500887091670293,6.08627656339764
4 | tsfresh,0 days 00:00:31.919755,12,0.37594267626072375,2.636664438591019,115.5759287421247,0.5778110797835911,8.913407825293008,6.788610043264059
5 | 


--------------------------------------------------------------------------------
/fastprop_benchmark/comparisons/fps.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getml/getml-demo/73e37a8d7e76ca26f45b37484705f32c9d2343ca/fastprop_benchmark/comparisons/fps.png


--------------------------------------------------------------------------------
/fastprop_benchmark/comparisons/fps_performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getml/getml-demo/73e37a8d7e76ca26f45b37484705f32c9d2343ca/fastprop_benchmark/comparisons/fps_performance.png


--------------------------------------------------------------------------------
/fastprop_benchmark/comparisons/interstate94.csv:
--------------------------------------------------------------------------------
1 | ,runtime,num_features,features_per_second,normalized_runtime,normalized_runtime_per_feature,rsquared,rmse,mae
2 | getML: FastProp,0 days 00:00:04.806504,461,95.9140610013428,1.0,1.0,0.9826778318692238,261.9388731680907,180.4867341518402
3 | featuretools,0 days 00:04:27.009351,59,0.22096605475273678,55.551675604555825,434.06694801457894,0.9745821357660296,317.51997565190663,210.1987933667501
4 | 


--------------------------------------------------------------------------------
/fastprop_benchmark/comparisons/nrpf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getml/getml-demo/73e37a8d7e76ca26f45b37484705f32c9d2343ca/fastprop_benchmark/comparisons/nrpf.png


--------------------------------------------------------------------------------
/fastprop_benchmark/comparisons/nrpf_performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getml/getml-demo/73e37a8d7e76ca26f45b37484705f32c9d2343ca/fastprop_benchmark/comparisons/nrpf_performance.png


--------------------------------------------------------------------------------
/fastprop_benchmark/comparisons/occupancy.csv:
--------------------------------------------------------------------------------
1 | ,runtime,num_features,features_per_second,normalized_runtime,normalized_runtime_per_feature,accuracy,auc,cross_entropy
2 | getML: FastProp,0 days 00:00:01.825967,289,158.27793605571384,1.0,1.0,0.9888233849177106,0.9981659495576908,0.044212831751311917
3 | featuretools,0 days 00:07:20.110459,103,0.234032161167652,241.0287036950832,676.3084836973726,0.988454925079833,0.9972073080154513,0.04923589235646241
4 | tsfresh,0 days 00:00:14.295312,60,4.197183689744182,7.828899426988549,37.7105096549541,0.9877180054040776,0.9978609436533588,0.04935860166671465
5 | 


--------------------------------------------------------------------------------
/fastprop_benchmark/comparisons/performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getml/getml-demo/73e37a8d7e76ca26f45b37484705f32c9d2343ca/fastprop_benchmark/comparisons/performance.png


--------------------------------------------------------------------------------
/fastprop_benchmark/comparisons/robot.csv:
--------------------------------------------------------------------------------
1 | ,runtime,num_features,features_per_second,normalized_runtime,normalized_runtime_per_feature,rsquared,rmse,mae
2 | getML: FastProp,0 days 00:00:00.398347,134,336.3605785401951,1.0,1.0,0.9950578985273825,0.7237156701589754,0.5516075682525006
3 | featuretools,0 days 00:14:21.611109,158,0.18337738975122106,2162.9662304473236,1834.2532795156408,0.9947842730879888,0.7594597450557347,0.5828310842859015
4 | tsfresh,0 days 00:00:26.638362,120,4.504788590271459,66.87225459210187,74.66733938782374,0.9938361598092297,0.7906022324089085,0.5986000367767544
5 | 


--------------------------------------------------------------------------------
/kaggle_notebooks/resources/Cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getml/getml-demo/73e37a8d7e76ca26f45b37484705f32c9d2343ca/kaggle_notebooks/resources/Cover.png


--------------------------------------------------------------------------------
/kaggle_notebooks/resources/CoverWithTitle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getml/getml-demo/73e37a8d7e76ca26f45b37484705f32c9d2343ca/kaggle_notebooks/resources/CoverWithTitle.png


--------------------------------------------------------------------------------
/kaggle_notebooks/resources/Examples2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getml/getml-demo/73e37a8d7e76ca26f45b37484705f32c9d2343ca/kaggle_notebooks/resources/Examples2.png


--------------------------------------------------------------------------------
/kaggle_notebooks/resources/FeatureEngineeringExample2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getml/getml-demo/73e37a8d7e76ca26f45b37484705f32c9d2343ca/kaggle_notebooks/resources/FeatureEngineeringExample2.png


--------------------------------------------------------------------------------
/kaggle_notebooks/resources/Linkedin_Optimized_Cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getml/getml-demo/73e37a8d7e76ca26f45b37484705f32c9d2343ca/kaggle_notebooks/resources/Linkedin_Optimized_Cover.png


--------------------------------------------------------------------------------
/kaggle_notebooks/resources/ManualFeatureEngineering2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getml/getml-demo/73e37a8d7e76ca26f45b37484705f32c9d2343ca/kaggle_notebooks/resources/ManualFeatureEngineering2.png


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | jupyterlab==4.1.1
 2 | getml==1.5.0
 3 | featuretools==1.31.0
 4 | tsfresh==0.20.3
 5 | pyspark==3.5.0
 6 | seaborn==0.13.2
 7 | ipywidgets==8.1.2
 8 | plotly==5.18.0
 9 | prophet==1.1.5
10 | matplotlib==3.8.2
11 | 


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .benchmark import Benchmark
2 | from .ft_time_series_builder import FTTimeSeriesBuilder
3 | from .load import load_or_query, load_or_retrieve
4 | from .ts_fresh_builder import TSFreshBuilder
5 | 


--------------------------------------------------------------------------------
/utils/add_original_columns.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | def _add_original_columns(original_df, df_selected):
5 |     for colname in original_df.columns:
6 |         df_selected[colname] = np.asarray(original_df[colname])
7 | 
8 |     return df_selected
9 | 


--------------------------------------------------------------------------------
/utils/benchmark.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | from contextlib import contextmanager
 3 | from datetime import timedelta
 4 | 
 5 | 
 6 | class Benchmark:
 7 |     def __init__(self):
 8 |         self._data = {}
 9 |         self._data["runtimes"] = {}
10 | 
11 |     @contextmanager
12 |     def __call__(self, name):
13 |         with self._benchmark_runtime(name):
14 |             yield
15 | 
16 |     @contextmanager
17 |     def _benchmark_runtime(self, name):
18 |         begin = time.time()
19 |         yield
20 |         end = time.time()
21 |         self._data["runtimes"][name] = timedelta(seconds=end - begin)
22 | 
23 |     @property
24 |     def runtimes(self):
25 |         return self._data["runtimes"]
26 | 
27 | 
28 | @contextmanager
29 | def benchmark(name, data):
30 |     begin = time.time()
31 |     yield
32 |     end = time.time()
33 |     data[name] = timedelta(end - begin)
34 | 


--------------------------------------------------------------------------------
/utils/ft_time_series_builder.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | import time
  3 | import warnings
  4 | 
  5 | import featuretools as ft
  6 | from featuretools.exceptions import UnusedPrimitiveWarning
  7 | import numpy as np
  8 | import pandas as pd
  9 | from pandas.api.types import is_numeric_dtype
 10 | from scipy.stats import pearsonr
 11 | 
 12 | from .add_original_columns import _add_original_columns
 13 | from .print_time_taken import _print_time_taken
 14 | from .remove_target_column import _remove_target_column
 15 | 
 16 | # ------------------------------------------------------------------
 17 | 
 18 | 
 19 | class _ChunkMaker:
 20 |     """
 21 |     Helpers class to create chunks of data frames.
 22 |     """
 23 | 
 24 |     def __init__(self, data_frame, id_col, time_col, horizon, memory):
 25 |         self.data_frame = data_frame
 26 |         self.id_col = id_col
 27 |         self.time_col = time_col
 28 |         self.horizon = horizon
 29 |         self.memory = memory
 30 | 
 31 |     def make_chunk(self, current_id, now, index):
 32 |         """
 33 |         Generates a chunk of the data frame that
 34 |         contains all rows within horizon and memory.
 35 | 
 36 |         Used by roll_data_frame.
 37 |         """
 38 |         begin = now - self.horizon - self.memory
 39 |         end = now - self.horizon
 40 |         chunk = self.data_frame[
 41 |             (self.id_col == current_id)
 42 |             & (self.time_col > begin)
 43 |             & (self.time_col <= end)
 44 |         ]
 45 |         chunk["_featuretools_join_key"] = int(index)
 46 |         return chunk
 47 | 
 48 | 
 49 | # ------------------------------------------------------------------
 50 | 
 51 | 
 52 | def _make_entity_set(data_frame, rolled, time_stamp):
 53 |     relationships = [
 54 |         ("population", "_featuretools_index", "peripheral", "_featuretools_join_key")
 55 |     ]
 56 | 
 57 |     dataframes = {
 58 |         "population": (
 59 |             data_frame,
 60 |             "_featuretools_index",
 61 |             time_stamp,
 62 |         ),
 63 |         "peripheral": (
 64 |             rolled,
 65 |             "_featuretools_index",
 66 |             time_stamp,
 67 |         ),
 68 |     }
 69 | 
 70 |     return ft.EntitySet("self-join-entity-set", dataframes, relationships)
 71 | 
 72 | 
 73 | # ------------------------------------------------------------------
 74 | 
 75 | 
 76 | def _roll_data_frame(data_frame, column_id, time_stamp, horizon, memory):
 77 |     """
 78 |     Duplicates data so that it matches the format
 79 |     required by featuretools.
 80 |     """
 81 |     id_col = data_frame[column_id]
 82 |     time_col = pd.to_datetime(data_frame[time_stamp])
 83 |     chunk_maker = _ChunkMaker(data_frame, id_col, time_col, horizon, memory)
 84 |     chunks = [
 85 |         chunk_maker.make_chunk(row[column_id], pd.to_datetime(row[time_stamp]), index)
 86 |         for index, row in data_frame.iterrows()
 87 |     ]
 88 |     rolled = pd.concat(chunks, ignore_index=True).reset_index()
 89 |     rolled["_featuretools_index"] = np.arange(rolled.shape[0])
 90 |     return rolled
 91 | 
 92 | 
 93 | # ------------------------------------------------------------------
 94 | 
 95 | 
 96 | def _hide_warnings(func):
 97 |     def wrapper(*args, **kwargs):
 98 |         with warnings.catch_warnings():
 99 |             warnings.simplefilter(action="ignore", category=FutureWarning)
100 |             warnings.filterwarnings(
101 |                 'ignore',
102 |                 category=RuntimeWarning,
103 |                 message="Precision loss occurred in moment calculation due to catastrophic cancellation.")
104 |             warnings.filterwarnings(
105 |                 'ignore',
106 |                 category=UnusedPrimitiveWarning,
107 |                 message="Some specified primitives were not used during DFS")
108 |             return func(*args, **kwargs)
109 |     return wrapper
110 |     
111 | 
112 | class FTTimeSeriesBuilder:
113 |     """
114 |     Scikit-learn-style feature builder based on featuretools.
115 | 
116 |     Args:
117 | 
118 |         num_features: The (maximum) number of features to build.
119 | 
120 |         memory: How much back in time you want to go until the
121 |                 feature builder starts "forgetting" data.
122 | 
123 |         column_id: The name of the column containing the ids.
124 | 
125 |         time_stamp: The name of the column containing the time stamps.
126 | 
127 |         target: The name of the target column.
128 |     """
129 | 
130 |     all_primitives = ft.list_primitives()
131 |     breaking_agg_primitives = [
132 |         'date_first_event',
133 |         'kurtosis',
134 |     ]
135 |     agg_primitives = (all_primitives[
136 |         (all_primitives.type == "aggregation")
137 |         & (~all_primitives.name.isin(breaking_agg_primitives))]
138 |         .name
139 |         .to_list())
140 |     trans_primitives = all_primitives[all_primitives.type == "transform"].name.to_list()
141 | 
142 |     def __init__(
143 |         self,
144 |         num_features,
145 |         horizon,
146 |         memory,
147 |         column_id,
148 |         time_stamp,
149 |         target,
150 |         allow_lagged_targets=False,
151 |     ):
152 |         self.num_features = num_features
153 |         self.horizon = horizon
154 |         self.memory = memory
155 |         self.column_id = column_id
156 |         self.time_stamp = time_stamp
157 |         self.target = target
158 |         self.allow_lagged_targets = allow_lagged_targets
159 | 
160 |         self._runtime = None
161 |         self.fitted = False
162 |         self.max_depth = 2
163 | 
164 |         self.selected_features = []
165 | 
166 |     def _extract_features(self, data_frame):
167 |         data_frame = data_frame.reset_index()
168 |         del data_frame["index"]
169 |         rolled = _roll_data_frame(
170 |             data_frame, self.column_id, self.time_stamp, self.horizon, self.memory
171 |         )
172 | 
173 |         data_frame["_featuretools_index"] = np.arange(data_frame.shape[0])
174 | 
175 |         entityset = _make_entity_set(data_frame, rolled, self.time_stamp)
176 |         df_extracted, _ = ft.dfs(
177 |             entityset=entityset,
178 |             agg_primitives=self.agg_primitives,
179 |             target_dataframe_name="population",
180 |             max_depth=self.max_depth,
181 |             ignore_columns={
182 |                 "peripheral": [
183 |                     self.column_id,
184 |                     "index",
185 |                     "join_key",
186 |                     "_featuretools_join_key",
187 |                     "_featuretools_index",
188 |                 ]
189 |             },
190 |         )
191 | 
192 |         for col in df_extracted:
193 |             if is_numeric_dtype(df_extracted[col]):
194 |                 df_extracted[col][df_extracted[col].isna()] = 0
195 | 
196 |         return df_extracted
197 | 
198 |     def _select_features(self, data_frame, target):
199 |         colnames = np.asarray(data_frame.columns)
200 |         print("Selecting the best out of " + str(len(colnames)) + " features...")
201 |         colnames = np.asarray(
202 |             [
203 |                 col
204 |                 for col in colnames
205 |                 if is_numeric_dtype(data_frame[col])
206 |                 and np.var(np.asarray(data_frame[col])) > 0.0
207 |             ]
208 |         )
209 |         correlations = np.asarray(
210 |             [np.abs(pearsonr(target, data_frame[col]))[0] for col in colnames]
211 |         )
212 |         correlations[np.isnan(correlations) | np.isinf(correlations)] = 0.0
213 | 
214 |         self.selected_features = colnames[np.argsort(correlations)][::-1][
215 |             : self.num_features
216 |         ]
217 |         return data_frame[self.selected_features]
218 | 
219 |     @_hide_warnings
220 |     def fit(self, data_frame):
221 |         """
222 |         Fits the DFS on the data frame and returns
223 |         the features for the training set.
224 |         """
225 |         print("featuretools: Trying features...")
226 |         begin = time.time()
227 |         target = np.asarray(data_frame[self.target])
228 |         df_for_extraction = (
229 |             data_frame
230 |             if self.allow_lagged_targets
231 |             else _remove_target_column(data_frame, self.target)
232 |         )
233 |         df_extracted = self._extract_features(df_for_extraction)
234 |         df_selected = self._select_features(df_extracted, target)
235 |         df_selected = _add_original_columns(data_frame, df_selected)
236 |         end = time.time()
237 |         _print_time_taken(begin, end)
238 |         self.fitted = True
239 |         self._runtime = datetime.timedelta(seconds=end - begin)
240 |         return df_selected
241 | 
242 |     @property
243 |     def runtime(self):
244 |         if self.fitted:
245 |             return self._runtime
246 |     
247 |     @_hide_warnings
248 |     def transform(self, data_frame):
249 |         """
250 |         Fits the DFS on the data frame and returns
251 |         the features for the training set.
252 |         """
253 |         df_for_extraction = (
254 |             data_frame
255 |             if self.allow_lagged_targets
256 |             else _remove_target_column(data_frame, self.target)
257 |         )
258 |         df_extracted = self._extract_features(df_for_extraction)
259 |         df_selected = df_extracted[self.selected_features]
260 |         df_selected = _add_original_columns(data_frame, df_selected)
261 |         return df_selected
262 | 


--------------------------------------------------------------------------------
/utils/load.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | 
 3 | import getml
 4 | 
 5 | 
 6 | def load_or_query(conn, name, **kwargs):
 7 |     """
 8 |     Loads the data from disk (the project folder) if present, if not, queries it from
 9 |     the database associated with `conn`.
10 | 
11 |     `kwargs` are passed to `getml.data.DataFrame.from_db`.
12 |     """
13 | 
14 |     if not getml.data.exists(name):
15 |         print(f"Querying {name!r} from {conn.dbname!r}...")
16 |         df = getml.DataFrame.from_db(name=name, table_name=name, conn=conn, **kwargs)
17 |         df.save()
18 |     else:
19 |         print(f"Loading {name!r} from disk (project folder).")
20 |         df = getml.data.load_data_frame(name)
21 | 
22 |     print()
23 | 
24 |     return df
25 | 
26 | 
27 | def load_or_retrieve(csv_file, name=None, **kwargs):
28 |     """
29 |     Loads the data from disk (the project folder) if present, if not, retrieves the
30 |     and reads the `csv_file`.
31 | 
32 |     `kwargs` are passed to `getml.data.DataFrame.from_csv`.
33 | 
34 |     If no name is supplied, the df's name is inferred from the filename.
35 |     """
36 | 
37 |     if name is None:
38 |         name = Path(csv_file).stem
39 | 
40 |     if not getml.data.exists(name):
41 |         df = getml.DataFrame.from_csv(fnames=csv_file, name=name, **kwargs)
42 |         df.save()
43 |     else:
44 |         print(f"Loading {name!r} from disk (project folder).")
45 |         df = getml.data.load_data_frame(name)
46 | 
47 |     print()
48 | 
49 |     return df
50 | 


--------------------------------------------------------------------------------
/utils/print_time_taken.py:
--------------------------------------------------------------------------------
 1 | def _print_time_taken(begin, end):
 2 | 
 3 |     seconds = end - begin
 4 | 
 5 |     hours = int(seconds / 3600)
 6 |     seconds -= float(hours * 3600)
 7 | 
 8 |     minutes = int(seconds / 60)
 9 |     seconds -= float(minutes * 60)
10 | 
11 |     seconds = round(seconds, 6)
12 | 
13 |     print("Time taken: " + str(hours) + "h:" + str(minutes) + "m:" + str(seconds))
14 | 
15 |     print("")
16 | 


--------------------------------------------------------------------------------
/utils/remove_target_column.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | 
 4 | def _remove_target_column(data_frame, target):
 5 |     colnames = np.asarray(data_frame.columns)
 6 |     if target not in colnames:
 7 |         return data_frame
 8 |     colnames = colnames[colnames != target]
 9 |     return data_frame[colnames]
10 | 


--------------------------------------------------------------------------------
/utils/ts_fresh_builder.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Utility wrapper around tsfresh.
  3 | """
  4 | 
  5 | import datetime
  6 | import gc
  7 | import time
  8 | import warnings
  9 | 
 10 | import numpy as np
 11 | import pandas as pd
 12 | import tsfresh
 13 | from scipy.stats import pearsonr
 14 | from tsfresh.utilities.dataframe_functions import roll_time_series
 15 | 
 16 | from .add_original_columns import _add_original_columns
 17 | from .print_time_taken import _print_time_taken
 18 | 
 19 | 
 20 | def _hide_warnings(func):
 21 |     def wrapper(*args, **kwargs):
 22 |         with warnings.catch_warnings():
 23 |             warnings.filterwarnings(
 24 |                 'ignore',
 25 |                 category=UserWarning,
 26 |                 message="Your time stamps are not uniformly sampled, which makes rolling nonsensical in some domains.")
 27 |             return func(*args, **kwargs)
 28 |     return wrapper
 29 |     
 30 | 
 31 | class TSFreshBuilder:
 32 |     """
 33 |     Scikit-learn-style feature builder based on TSFresh.
 34 | 
 35 |     Args:
 36 | 
 37 |         num_features: The (maximum) number of features to build.
 38 | 
 39 |         memory: How much back in time you want to go until the
 40 |                 feature builder starts "forgetting" data.
 41 | 
 42 |         column_id: The name of the column containing the ids.
 43 | 
 44 |         time_stamp: The name of the column containing the time stamps.
 45 | 
 46 |         target: The name of the target column.
 47 |     """
 48 | 
 49 |     def __init__(
 50 |         self,
 51 |         num_features,
 52 |         memory,
 53 |         column_id,
 54 |         time_stamp,
 55 |         target,
 56 |         horizon=0,
 57 |         allow_lagged_targets=False,
 58 |     ):
 59 |         self.num_features = num_features
 60 |         self.memory = memory
 61 |         self.column_id = column_id
 62 |         self.time_stamp = time_stamp
 63 |         self.target = target
 64 |         self.horizon = horizon
 65 |         self.allow_lagged_targets = allow_lagged_targets
 66 | 
 67 |         self._runtime = None
 68 | 
 69 |         self.selected_features = []
 70 | 
 71 |     @_hide_warnings
 72 |     def _extract_features(self, data_frame):
 73 |         df_rolled = roll_time_series(
 74 |             data_frame,
 75 |             column_id=self.column_id,
 76 |             column_sort=self.time_stamp,
 77 |             max_timeshift=self.memory,
 78 |         )
 79 | 
 80 |         extracted_minimal = tsfresh.extract_features(
 81 |             df_rolled,
 82 |             column_id=self.column_id,
 83 |             column_sort=self.time_stamp,
 84 |             default_fc_parameters=tsfresh.feature_extraction.MinimalFCParameters(),
 85 |         )
 86 | 
 87 |         extracted_index_based = tsfresh.extract_features(
 88 |             df_rolled,
 89 |             column_id=self.column_id,
 90 |             column_sort=self.time_stamp,
 91 |             default_fc_parameters=tsfresh.feature_extraction.settings.IndexBasedFCParameters(),
 92 |         )
 93 | 
 94 |         extracted_features = pd.concat(
 95 |             [extracted_minimal, extracted_index_based], axis=1
 96 |         )
 97 |         del extracted_minimal
 98 |         del extracted_index_based
 99 | 
100 |         gc.collect()
101 | 
102 |         extracted_features[np.isnan(extracted_features)] = 0.0
103 | 
104 |         extracted_features[np.isinf(extracted_features)] = 0.0
105 | 
106 |         return extracted_features
107 | 
108 |     def _remove_target_column(self, data_frame):
109 |         colnames = np.asarray(data_frame.columns)
110 | 
111 |         if self.target not in colnames:
112 |             return data_frame
113 | 
114 |         colnames = colnames[colnames != self.target]
115 | 
116 |         return data_frame[colnames]
117 | 
118 |     def _select_features(self, data_frame, target):
119 |         print(
120 |             "Selecting the best out of " + str(len(data_frame.columns)) + " features..."
121 |         )
122 | 
123 |         df_selected = tsfresh.select_features(data_frame, target)
124 | 
125 |         colnames = np.asarray(df_selected.columns)
126 | 
127 |         correlations = np.asarray(
128 |             [np.abs(pearsonr(target, df_selected[col]))[0] for col in colnames]
129 |         )
130 | 
131 |         # [::-1] is somewhat unintuitive syntax,
132 |         # but it reverses the entire column.
133 |         self.selected_features = colnames[np.argsort(correlations)][::-1][
134 |             : self.num_features
135 |         ]
136 | 
137 |         return df_selected[self.selected_features]
138 | 
139 |     def fit(self, data_frame):
140 |         """
141 |         Fits the features.
142 |         """
143 |         begin = time.time()
144 | 
145 |         target = np.asarray(data_frame[self.target])
146 | 
147 |         df_for_extraction = (
148 |             data_frame
149 |             if self.allow_lagged_targets
150 |             else self._remove_target_column(data_frame)
151 |         )
152 | 
153 |         df_extracted = self._extract_features(df_for_extraction)
154 | 
155 |         df_selected = self._select_features(df_extracted, target)
156 | 
157 |         del df_extracted
158 |         gc.collect()
159 | 
160 |         df_selected = _add_original_columns(data_frame, df_selected)
161 | 
162 |         end = time.time()
163 | 
164 |         self._runtime = datetime.timedelta(seconds=end - begin)
165 | 
166 |         _print_time_taken(begin, end)
167 | 
168 |         return df_selected
169 | 
170 |     @property
171 |     def runtime(self):
172 |         return self._runtime
173 | 
174 |     def transform(self, data_frame):
175 |         """
176 |         Transforms the raw data into a set of features.
177 |         """
178 |         df_for_extraction = (
179 |             data_frame
180 |             if self.allow_lagged_targets
181 |             else self._remove_target_column(data_frame)
182 |         )
183 | 
184 |         df_extracted = self._extract_features(df_for_extraction)
185 | 
186 |         df_selected = df_extracted[self.selected_features]
187 | 
188 |         del df_extracted
189 |         gc.collect()
190 | 
191 |         df_selected = _add_original_columns(data_frame, df_selected)
192 | 
193 |         return df_selected
194 | 


--------------------------------------------------------------------------------
/utils/zuordnung.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import json
 3 | import numpy as np
 4 | from torch_geometric.datasets import Planetoid
 5 | 
 6 | 
 7 | def run_zuordnung(getml_word_data):
 8 |     """
 9 |     The matching process is based on the word matrix of the abstracts' content. That data is stored differently in the data of Izadi et al's GNN paper (hereinafter referred to as GNN paper or GNN data) and getML's data source. In the GNN's case, words are stored one-hot-encoded in a matrix (e.g. [0,0,1,0,1]), while getML data source simply lists the words and their associated index in the on-hot-encoded word matrix (e.g.: [word2, word4]). The following routine first retrieves the index of the GNN matrix. Due to different offsets, the word indices between both data source do not align. Therefore, we compute the difference between adjacent word indices and compare them across sources. If the patterns match, we have found a match between both sources and save their associated dataframe indices.
10 | 
11 |     It turns out there is a perfect match between both sources and every observation in one source finds its counterpart in the other source.
12 |     """
13 | 
14 |     getml_word_data = getml_word_data.to_pandas()
15 | 
16 |     gnn_word_data = Planetoid(name="Cora", root="")
17 | 
18 |     zuordnung = []
19 |     for getml_idx in getml_word_data["paper_id"].unique():
20 |         getml_positions = [
21 |             int(ele[4:])
22 |             for ele in getml_word_data[getml_word_data["paper_id"] == getml_idx][
23 |                 "word_cited_id"
24 |             ].values
25 |         ]
26 |         getml_positions = np.sort(getml_positions)
27 | 
28 |         getml_words_pattern = [
29 |             j - i for i, j in zip(getml_positions[:-1], getml_positions[1:])
30 |         ]
31 | 
32 |         for gnn_idx in range(len(gnn_word_data[0].x)):
33 |             gnn_positions = [
34 |                 i
35 |                 for i, x in enumerate(
36 |                     [int(x) for x in list(gnn_word_data[0].x[gnn_idx])]
37 |                 )
38 |                 if x == 1
39 |             ]
40 |             gnn_words_pattern = [
41 |                 j - i for i, j in zip(gnn_positions[:-1], gnn_positions[1:])
42 |             ]
43 | 
44 |             if gnn_words_pattern == getml_words_pattern:
45 |                 match = (int(getml_idx), gnn_idx)
46 |                 zuordnung.append(match)
47 |                 break
48 | 
49 | 
50 |     with open('assets/zuordnung.json', 'w') as file:
51 |         print("Writing to file")
52 |         json.dump(zuordnung, file)
53 |         print('Done')
54 | 
55 |     print(zuordnung)
56 | 
57 | 


--------------------------------------------------------------------------------