├── .dockerignore
├── .gitignore
├── Dockerfile
├── MANIFEST.in
├── README.md
├── app-fastapi
├── Procfile
├── app
│ ├── __init__.py
│ ├── api.py
│ ├── config.py
│ ├── main.py
│ ├── schemas
│ │ ├── __init__.py
│ │ ├── health.py
│ │ └── predict.py
│ └── tests
│ │ ├── __init__.py
│ │ ├── conftest.py
│ │ └── test_api.py
├── mypy.ini
├── requirements.txt
├── run.sh
├── runtime.txt
└── tox.ini
├── heroku.yml
├── mypy.ini
├── notebooks
├── 1. Data Analysis.ipynb
├── 2. Feature Engineering.ipynb
├── 3. Feature Engineering Pipeline.ipynb
├── 4. Machine Learning.ipynb
├── pipe.joblib
└── preprocess.py
├── pyproject.toml
├── requirements
├── deployment.txt
├── production.txt
├── requirements.txt
└── research-env.txt
├── setup.py
├── src
├── VERSION
├── __init__.py
├── config.yml
├── config
│ ├── __init__.py
│ └── core.py
├── data
│ ├── __init__.py
│ ├── test.csv
│ └── train.csv
├── pipeline.py
├── predict.py
├── processing
│ ├── __init__.py
│ ├── data_manager.py
│ └── features.py
├── train_pipeline.py
└── trained_models
│ ├── __init__.py
│ └── model_v0.0.7.pkl
├── tests
├── __init__.py
├── conftest.py
├── test_features.py
├── test_input_data.py
└── test_prediction.py
└── tox.ini
/.dockerignore:
--------------------------------------------------------------------------------
1 | notebooks*
2 | */env*
3 | */venv*
4 | .circleci*
5 | packages/src
6 | *.env
7 | *.log
8 | .git
9 | .gitignore
10 | .tox
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Python template
3 |
4 |
5 | # Folders
6 |
7 |
8 | # Jupyter notebook
9 | notebooks_research/
10 |
11 | # Byte-compiled / optimized / DLL files
12 | __pycache__/
13 | *.py[cod]
14 | *$py.class
15 |
16 | # C extensions
17 | *.so
18 |
19 | # Distribution / packaging
20 | .Python
21 | env/
22 | build/
23 | develop-eggs/
24 | dist/
25 | downloads/
26 | eggs/
27 | .eggs/
28 | lib/
29 | lib64/
30 | parts/
31 | sdist/
32 | var/
33 | *.egg-info/
34 | .installed.cfg
35 | *.egg
36 |
37 | # PyInstaller
38 | # Usually these files are written by a python script from a template
39 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
40 | *.manifest
41 | *.spec
42 |
43 | # Installer logs
44 | pip-log.txt
45 | pip-delete-this-directory.txt
46 |
47 | # Unit test / coverage reports
48 | htmlcov/
49 | .tox/
50 | .coverage
51 | .coverage.*
52 | .cache
53 | nosetests.xml
54 | coverage.xml
55 | *,cover
56 | .hypothesis/
57 |
58 | # Translations
59 | *.mo
60 | *.pot
61 |
62 | # Django stuff:
63 | *.log
64 | local_settings.py
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | target/
78 |
79 | # IPython Notebook
80 | .ipynb_checkpoints
81 |
82 | # pyenv
83 | .python-version
84 |
85 | # celery beat schedule file
86 | celerybeat-schedule
87 |
88 | # dotenv
89 | .env
90 |
91 | # virtualenv
92 | venv/
93 | ENV/
94 | .virtual_documents
95 |
96 | # Spyder project settings
97 | .spyderproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 | ### VirtualEnv template
102 | # Virtualenv
103 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
104 | .Python
105 | [Bb]in
106 | [Ii]nclude
107 | [Ll]ib
108 | [Ll]ib64
109 | [Ll]ocal
110 | [Ss]cripts
111 | pyvenv.cfg
112 | .venv
113 | pip-selfcheck.json
114 | ### JetBrains template
115 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
116 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
117 |
118 | # User-specific stuff:
119 | .idea/workspace.xml
120 | .idea/tasks.xml
121 | .idea/dictionaries
122 | .idea/vcs.xml
123 | .idea/jsLibraryMappings.xml
124 |
125 | # Sensitive or high-churn files:
126 | .idea/dataSources.ids
127 | .idea/dataSources.xml
128 | .idea/dataSources.local.xml
129 | .idea/sqlDataSources.xml
130 | .idea/dynamic.xml
131 | .idea/uiDesigner.xml
132 |
133 | # Gradle:
134 | .idea/gradle.xml
135 | .idea/libraries
136 |
137 | # Mongo Explorer plugin:
138 | .idea/mongoSettings.xml
139 |
140 | .idea/
141 |
142 | ## File-based project format:
143 | *.iws
144 |
145 | ## Plugin-specific files:
146 |
147 | # IntelliJ
148 | /out/
149 |
150 | # mpeltonen/sbt-idea plugin
151 | .idea_modules/
152 |
153 | # JIRA plugin
154 | atlassian-ide-plugin.xml
155 |
156 | # Crashlytics plugin (for Android Studio and IntelliJ)
157 | com_crashlytics_export_strings.xml
158 | crashlytics.properties
159 | crashlytics-build.properties
160 | fabric.properties
161 |
162 | # Darts
163 | .darts/
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # the base image that we inherit from
2 | FROM python:3.9.5-slim
3 |
4 | # best practice: create an unprivileged user to run the app
5 | RUN adduser --disabled-password --gecos '' ml-api-user
6 |
7 | WORKDIR /opt/app-fastapi
8 |
9 | # copy our project inside the container (COPY is preferred over ADD for plain local files)
10 | COPY ./app-fastapi /opt/app-fastapi/
11 | RUN pip install --no-cache-dir --upgrade pip
12 | RUN pip install --no-cache-dir -r /opt/app-fastapi/requirements.txt
13 |
14 | RUN chmod +x /opt/app-fastapi/run.sh
15 | RUN chown -R ml-api-user:ml-api-user ./
16 |
17 | USER ml-api-user
18 |
19 | EXPOSE 8001
20 |
21 | CMD ["bash", "./run.sh"]
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt
2 | include *.md
3 | include *.pkl
4 | recursive-include src *
5 |
6 | include src/data/train.csv
7 | include src/data/test.csv
8 | include src/trained_models/*.pkl
9 | include src/VERSION
10 | include src/config.yml
11 |
12 | include ./requirements/research-env.txt
13 | include ./requirements/production.txt
14 | include ./requirements/requirements.txt
15 | exclude *.log
16 | exclude *.cfg
17 |
18 | recursive-exclude * __pycache__
19 | recursive-exclude * *.py[co]
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # End to End Machine Learning Project
2 |
3 | This project aims to apply the best software engineering practices in a Machine Learning project in order to deploy the model.
4 |
5 | We are developing a model to predict whether a Data Scientist is willing to leave their current job.
6 | We are not interested in the accuracy of the model (which is 77%), but rather in the transition from the research environment to production code, packaging, and finally deployment of the model.
7 |
8 | [https://end-to-end-ml-project.herokuapp.com/](https://end-to-end-ml-project.herokuapp.com/)
9 |
10 |
11 |
Research Code ➙ Production Code ➙ Deployment
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 | ### Project Structure
27 |
28 | ```
29 | end-to-end-ML-project
30 | │ README.md
31 | │ MANIFEST.in
32 | │ mypy.ini
33 | │ pyproject.toml
34 | │ setup.py
35 | │ .gitignore
36 | │ tox.ini
37 | │ Dockerfile
38 | │
39 | └───notebooks
40 | │ │ 1. Data Analysis.ipynb
41 | │ │ 2. Feature Engineering.ipynb
42 | │ │ 3. Feature Engineering Pipeline.ipynb
43 | │ │ 4. Machine Learning.ipynb
44 | │ │ preprocess.py
45 | │
46 | └───requirements
47 | │ │ requirements.txt
48 | │ │ research-env.txt
49 | │ │ production.txt
50 | │ │ deployment.txt
51 | │
52 | └───src
53 | │ │ VERSION
54 | │ │ __init__.py
55 | │ │ config.yml
56 | │ │ pipeline.py
57 | │ │ train_pipeline.py
58 | │ │ predict.py
59 | │ │
60 | │ └───config
61 | │ │ │ __init__.py
62 | │ │ │ core.py
63 | │ │
64 | │ └───data
65 | │ │ │ __init__.py
66 | │ │ │ train.csv
67 | │ │ │ test.csv
68 | │ │
69 | │ └───processing
70 | │ │ │ __init__.py
71 | │ │ │ data_manager.py
72 | │ │ │ features.py
73 | │ │
74 | │ └───trained_models
75 | │ │ │ __init__.py
76 | │
77 | └───app-fastapi
78 | │ ...
79 | ```
80 |
81 | ### Steps in An End-to-end ML Project
82 |
83 | 1. Start with jupyter notebooks and finalize a model.
84 | 2. Transform research code to production code.
85 | 3. Make the project a package.
86 | 4. Serve it via a REST API.
87 | 5. Dockerize it and deploy it.
88 |
89 | ### 1. Start with jupyter notebooks and finalize a model
90 |
91 | The ```notebooks``` folder is the research which is often done by a Data Scientist.
92 |
93 | Usually a Data Analysis notebook for EDA and data understanding is the first step.
94 | Then, features are created in a pipeline. Here, scikit-learn and feature-engine were used.
95 | Finally, the ML model is placed at the end of the pipeline.
96 |
97 | Research can be very time-consuming. Here, a simple pipeline is created,
98 | because the creation of a 95% accuracy model is out of the scope of this work.
99 |
100 | ### 2. Transform research code to production code
101 |
102 | The ```src``` folder is the transformation of the jupyter notebooks to a python project.
103 |
104 | Some good practices:
105 | - Create a ```config.yml``` file that contains all the constants and configurations derived from the notebooks. Accompany it with a .py file to parse it (Here it is the ```src/config/core.py```).
106 | - Tidy all extra functions written and place them in a ```processing``` folder. For example, in ```src/processing/data_manager.py``` there are functions to read the data, save, read, and remove the pipeline.
107 | - Make separate files for ```train_pipeline.py``` and ```predict.py```.
108 | - Always write very small functions, so that they are easier to test and the code stays readable.
109 | - Create a ```trained_models``` folder to deposit the models.
110 | - Have a ```VERSION``` file, to track the version of the project, e.g. 0.0.4
111 | - Write ```tests```. Now write more tests.
112 | - Make a ```tox.ini``` file to make life easier, test code faster, get rid of styling, type checks, linting, and PEP8 concerns.
113 |
114 | Note: In order to import your python files as packages in other python files, we need to add the project's path to the ```PYTHONPATH``` environment variable.
115 |
116 | ### 3. Make the project a package
117 |
118 | We need 3 files in the root of the project:
119 |
120 | 1. ```MANIFEST.in```: Define which files to include and exclude from the package.
121 | 2. ```pyproject.toml```: Specify basic dependencies and configure tooling.
122 | 3. ```setup.py```: Package metadata, version, requirements, how to create the package.
123 |
124 | From the project directory: ```python -m build```
125 |
126 | Then, create an account on PyPI. Install twine: ```pip install twine```
127 |
128 | Upload: ```twine upload dist/end_to_end_ML_project-0.0.4-py3-none-any.whl```
129 |
130 | Now the package can be installed like any other package with ```pip install end-to-end-ML-project```
131 |
132 | It can be imported like: ```import src```
133 |
134 |
135 | ### 4. Serve it via a REST API
136 |
137 | The API should be a different repository or at least a different folder. Here it is located in the folder ```app-fastapi```.
138 |
139 | The first thing to note is the ```requirements.txt```, where we list the ```end-to-end-ML-project``` package,
140 | which we published earlier, as a dependency.
141 |
142 | Three key files of the api are:
143 |
144 | - ```config.py```: Specify metadata of the api, and logging settings.
145 | - ```main.py```: Define the main app and the index page router.
146 | - ```api.py```: Define a health and a predict endpoint.
147 |
148 | We define some ```schemas``` for automatic validation of variable types.
149 |
150 |
151 | We also define ```tests``` with predefined input data to predict.
152 |
153 | We also use ```logging``` and the package ```loguru```.
154 |
155 | The ```Procfile``` and ```runtime.txt``` are necessary files to deploy on Heroku.
156 |
157 |
158 | ### 5. Dockerize it and deploy it
159 |
160 | We create a ```Dockerfile``` and build the image:
161 |
162 | ```docker build -t end-to-end-ml-project:latest .```
163 |
164 | We run the image:
165 |
166 | ```docker run -p 8001:8001 -e PORT=8001 end-to-end-ml-project```
167 |
168 | We can see the output on localhost:8001/
169 |
170 | Now to deploy on Heroku, create a ```heroku.yml``` file.
171 |
172 | ```
173 | heroku login
174 | heroku container:login
175 | heroku container:push web --app end-to-end-ml-project
176 | heroku container:release web --app end-to-end-ml-project
177 | heroku open --app end-to-end-ml-project
178 | ```
179 |
--------------------------------------------------------------------------------
/app-fastapi/Procfile:
--------------------------------------------------------------------------------
1 | web: uvicorn app.main:app --host 0.0.0.0 --port $PORT
--------------------------------------------------------------------------------
/app-fastapi/app/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.0.2"
--------------------------------------------------------------------------------
/app-fastapi/app/api.py:
--------------------------------------------------------------------------------
1 | import json
2 | from typing import Any
3 |
4 | import numpy as np
5 | import pandas as pd
6 | from fastapi import APIRouter, HTTPException
7 | from fastapi.encoders import jsonable_encoder
8 | from loguru import logger
9 | from src import __version__ as model_version
10 | from src.predict import make_prediction
11 |
12 | from app import __version__, schemas
13 | from app.config import settings
14 |
15 | api_router = APIRouter()
16 |
17 |
18 | @api_router.get("/health", response_model=schemas.Health, status_code=200)
19 | def health() -> dict:
20 | """
21 | Health check: return the project name, the API version and the model version.
22 | """
23 | health = schemas.Health(
24 | name=settings.PROJECT_NAME, api_version=__version__, model_version=model_version
25 | )
26 |
27 | return health.dict()
28 |
29 |
30 | @api_router.post("/predict", response_model=schemas.PredictionResults, status_code=200)
31 | async def predict(input_data: schemas.MultipleDataInputs) -> Any:
32 | """
33 | Make predictions with the end-to-end-ML-project model
34 | """
35 | # load pydantic data
36 | input_df = pd.DataFrame(jsonable_encoder(input_data.inputs))
37 |
38 | # Advanced: You can improve performance of your API by rewriting the
39 | # `make prediction` function to be async and using await here.
40 | logger.info(f"Making prediction on inputs: {input_data.inputs}")
41 | results = make_prediction(input_data=input_df.replace({np.nan: None}))
42 |
43 | logger.info(f"Prediction results: {results.get('predictions')}")
44 |
45 | return results
46 |
--------------------------------------------------------------------------------
/app-fastapi/app/config.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sys
3 | from types import FrameType
4 | from typing import List, cast
5 |
6 | from loguru import logger
7 | from pydantic import AnyHttpUrl, BaseSettings
8 |
9 |
10 | class LoggingSettings(BaseSettings):
11 | LOGGING_LEVEL: int = logging.INFO # logging levels are type int
12 |
13 |
14 | class Settings(BaseSettings):
15 | API_V1_STR: str = "/api/v1"
16 |
17 | # Meta
18 | logging: LoggingSettings = LoggingSettings()
19 |
20 | # BACKEND_CORS_ORIGINS is a comma-separated list of origins
21 | # e.g: http://localhost,http://localhost:4200,http://localhost:3000
22 | BACKEND_CORS_ORIGINS: List[AnyHttpUrl] = [
23 | "http://localhost:3000", # type: ignore
24 | "http://localhost:8000", # type: ignore
25 | "https://localhost:3000", # type: ignore
26 | "https://localhost:8000", # type: ignore
27 | ]
28 |
29 | PROJECT_NAME: str = "End to End ML Project"
30 |
31 | class Config:
32 | case_sensitive = True
33 |
34 |
35 | # See: https://loguru.readthedocs.io/en/stable/overview.html#entirely-compatible-with-standard-logging # noqa
36 | class InterceptHandler(logging.Handler):
37 | def emit(self, record: logging.LogRecord) -> None: # pragma: no cover
38 | # Get corresponding Loguru level if it exists
39 | try:
40 | level = logger.level(record.levelname).name
41 | except ValueError:
42 | level = str(record.levelno)
43 |
44 | # Find caller from where originated the logged message
45 | frame, depth = logging.currentframe(), 2
46 | while frame.f_code.co_filename == logging.__file__: # noqa: WPS609
47 | frame = cast(FrameType, frame.f_back)
48 | depth += 1
49 |
50 | logger.opt(depth=depth, exception=record.exc_info).log(
51 | level,
52 | record.getMessage(),
53 | )
54 |
55 |
56 | def setup_app_logging(config: Settings) -> None:
57 | """Prepare custom logging for our application."""
58 |
59 | LOGGERS = ("uvicorn.asgi", "uvicorn.access")
60 | logging.getLogger().handlers = [InterceptHandler()]
61 | for logger_name in LOGGERS:
62 | logging_logger = logging.getLogger(logger_name)
63 | logging_logger.handlers = [InterceptHandler(level=config.logging.LOGGING_LEVEL)]
64 |
65 | logger.configure(
66 | handlers=[{"sink": sys.stderr, "level": config.logging.LOGGING_LEVEL}]
67 | )
68 |
69 |
70 | settings = Settings()
--------------------------------------------------------------------------------
/app-fastapi/app/main.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | from fastapi import APIRouter, FastAPI, Request
4 | from fastapi.middleware.cors import CORSMiddleware
5 | from fastapi.responses import HTMLResponse
6 | from loguru import logger
7 |
8 | from app.api import api_router
9 | from app.config import settings, setup_app_logging
10 |
11 | # setup logging as early as possible
12 | setup_app_logging(config=settings)
13 |
14 |
15 | app = FastAPI(
16 | title=settings.PROJECT_NAME, openapi_url=f"{settings.API_V1_STR}/openapi.json"
17 | )
18 |
19 | root_router = APIRouter()
20 |
21 |
22 | @root_router.get("/")
23 | def index(request: Request) -> Any:
24 | """Basic HTML response."""
25 | body = (
26 | ""
27 | ""
28 | "Welcome to the API "
29 | ""
30 | "Check the docs:
here "
31 | "
"
32 | ""
33 | ""
34 | )
35 |
36 | return HTMLResponse(content=body)
37 |
38 |
39 | app.include_router(api_router, prefix=settings.API_V1_STR)
40 | app.include_router(root_router)
41 |
42 | # Set all CORS enabled origins
43 | if settings.BACKEND_CORS_ORIGINS:
44 | app.add_middleware(
45 | CORSMiddleware,
46 | allow_origins=[str(origin) for origin in settings.BACKEND_CORS_ORIGINS],
47 | allow_credentials=True,
48 | allow_methods=["*"],
49 | allow_headers=["*"],
50 | )
51 |
52 |
53 | if __name__ == "__main__":
54 | # Use this for debugging purposes only
55 | logger.warning("Running in development mode. Do not run like this in production.")
56 | import uvicorn
57 |
58 | uvicorn.run(app, host="localhost", port=8001, log_level="debug")
--------------------------------------------------------------------------------
/app-fastapi/app/schemas/__init__.py:
--------------------------------------------------------------------------------
1 | from .health import Health
2 | from .predict import MultipleDataInputs, PredictionResults
--------------------------------------------------------------------------------
/app-fastapi/app/schemas/health.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 |
3 |
4 | class Health(BaseModel):
5 | name: str
6 | api_version: str
7 | model_version: str
8 |
--------------------------------------------------------------------------------
/app-fastapi/app/schemas/predict.py:
--------------------------------------------------------------------------------
1 | from typing import Any, List, Optional
2 |
3 | from pydantic import BaseModel
4 |
5 |
6 | class DataInputSchema(BaseModel):
7 | city: Optional[str]
8 | city_development_index: Optional[float]
9 | gender: Optional[str]
10 | relevent_experience: Optional[str]
11 | enrolled_university: Optional[str]
12 | education_level: Optional[str]
13 | major_discipline: Optional[str]
14 | experience: Optional[str]
15 | company_size: Optional[str]
16 | company_type: Optional[str]
17 | last_new_job: Optional[str]
18 | training_hours: Optional[int]
19 |
20 |
21 | class PredictionResults(BaseModel):
22 | errors: Optional[Any]
23 | version: str
24 | predictions: Optional[List[float]]
25 |
26 |
27 | class MultipleDataInputs(BaseModel):
28 | inputs: List[DataInputSchema]
29 |
30 | class Config:
31 | schema_extra = {
32 | "example": {
33 | "inputs": [
34 | {
35 | "city": "city_41",
36 | "city_development_index": 0.8270000000000001,
37 | "gender": "Male",
38 | "relevent_experience": "Has relevent experience",
39 | "enrolled_university": "Full time course",
40 | "education_level": "Graduate",
41 | "major_discipline": "STEM",
42 | "experience": "9",
43 | "company_size": "<10",
44 | "company_type": "Funded Startup",
45 | "last_new_job": "1",
46 | "training_hours": 21
47 | }
48 | ]
49 | }
50 | }
--------------------------------------------------------------------------------
/app-fastapi/app/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Deffro/end-to-end-ML-project/5cea993ce8e49f7244ad056de5803cf9b1dcbd46/app-fastapi/app/tests/__init__.py
--------------------------------------------------------------------------------
/app-fastapi/app/tests/conftest.py:
--------------------------------------------------------------------------------
1 | from typing import Generator
2 |
3 | import pandas as pd
4 | import pytest
5 | from fastapi.testclient import TestClient
6 | from src.config.core import config
7 | from src.processing.data_manager import load_dataset
8 |
9 | from app.main import app
10 |
11 |
12 | @pytest.fixture(scope="module")
13 | def test_data() -> pd.DataFrame:
14 | return load_dataset(file_name=config.app_config.test_data_file)
15 |
16 |
17 | @pytest.fixture()
18 | def client() -> Generator:
19 | with TestClient(app) as _client:
20 | yield _client
21 | app.dependency_overrides = {}
22 |
--------------------------------------------------------------------------------
/app-fastapi/app/tests/test_api.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import numpy as np
4 | import pandas as pd
5 | from fastapi.testclient import TestClient
6 |
7 |
8 | def test_make_prediction(client: TestClient, test_data: pd.DataFrame) -> None:
9 | # Given
10 | payload = {
11 | # ensure pydantic plays well with np.nan
12 | "inputs": test_data.replace({np.nan: None}).to_dict(orient="records")
13 | }
14 |
15 | # When
16 | response = client.post(
17 | "http://localhost:8001/api/v1/predict",
18 | json=payload,
19 | )
20 |
21 | # Then
22 | assert response.status_code == 200
23 | prediction_data = response.json()
24 | assert prediction_data["predictions"]
25 | assert prediction_data["errors"] is None
26 | # assert math.isclose(prediction_data["predictions"][0], 1500, rel_tol=100)
27 |
--------------------------------------------------------------------------------
/app-fastapi/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | plugins = pydantic.mypy
3 | ignore_missing_imports = True
4 | disallow_untyped_defs = True
--------------------------------------------------------------------------------
/app-fastapi/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi==0.75.0
2 | uvicorn==0.17.5
3 | python-multipart==0.0.5
4 | typing_extensions==3.10.0
5 | requests
6 | end-to-end-ML-project
7 |
--------------------------------------------------------------------------------
/app-fastapi/run.sh:
--------------------------------------------------------------------------------
1 | uvicorn app.main:app --host 0.0.0.0 --port "${PORT:-8001}"  # default to 8001 (the port the Dockerfile EXPOSEs) when PORT is unset
--------------------------------------------------------------------------------
/app-fastapi/runtime.txt:
--------------------------------------------------------------------------------
1 | python-3.9.5
--------------------------------------------------------------------------------
/app-fastapi/tox.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | log_cli_level=WARNING
3 |
4 | [tox]
5 | envlist = test_app, typechecks, stylechecks, lint
6 | skipsdist = True
7 |
8 | [testenv]
9 | install_command = pip install {opts} {packages}
10 |
11 | [testenv:test_app]
12 | deps =
13 | -rrequirements.txt
14 |
15 | setenv =
16 | PYTHONPATH=.
17 | PYTHONHASHSEED=0
18 |
19 | commands=
20 | pytest \
21 | -vv \
22 | {posargs:app/tests/}
23 |
24 | [testenv:run]
25 | envdir = {toxworkdir}/test_app
26 | deps =
27 | {[testenv:test_app]deps}
28 |
29 | setenv =
30 | {[testenv:test_app]setenv}
31 |
32 | commands=
33 | python app/main.py
34 |
35 |
36 | [testenv:typechecks]
37 | envdir = {toxworkdir}/test_app
38 |
39 | deps =
40 | {[testenv:test_app]deps}
41 |
42 | commands = {posargs:mypy app}
43 |
44 |
45 | [testenv:stylechecks]
46 | envdir = {toxworkdir}/test_app
47 |
48 | deps =
49 | {[testenv:test_app]deps}
50 |
51 | commands = {posargs:flake8 app}
52 |
53 |
54 | [testenv:lint]
55 | envdir = {toxworkdir}/test_app
56 |
57 | deps =
58 | {[testenv:test_app]deps}
59 |
60 | commands =
61 | isort app
62 | black app
63 | mypy app
64 | flake8 app
65 |
66 | [flake8]
67 | exclude = .git,__pycache__,__init__.py,.mypy_cache,.pytest_cache,.venv,alembic
68 | max-line-length = 120
--------------------------------------------------------------------------------
/heroku.yml:
--------------------------------------------------------------------------------
1 | build:
2 | docker:
3 | web: Dockerfile
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | # warn_unreachable = True
3 | warn_unused_ignores = True
4 | follow_imports = skip
5 | show_error_context = True
6 | warn_incomplete_stub = True
7 | ignore_missing_imports = True
8 | check_untyped_defs = True
9 | cache_dir = /dev/null
10 | # Cannot enable this one as we still allow defining functions without any types.
11 | # disallow_untyped_defs = True
12 | warn_redundant_casts = True
13 | warn_unused_configs = True
14 | strict_optional = True
--------------------------------------------------------------------------------
/notebooks/2. Feature Engineering.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "cdc43d05",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import pandas as pd\n",
11 | "import numpy as np\n",
12 | "import scipy.stats as stats\n",
13 | "\n",
14 | "from sklearn.model_selection import train_test_split\n",
15 | "from sklearn.preprocessing import MinMaxScaler\n",
16 | "\n",
17 | "from feature_engine.imputation import (\n",
18 | " CategoricalImputer,\n",
19 | ")\n",
20 | "\n",
21 | "from feature_engine.transformation import (\n",
22 | " YeoJohnsonTransformer,\n",
23 | ")\n",
24 | "\n",
25 | "from feature_engine.encoding import (\n",
26 | " RareLabelEncoder,\n",
27 | " OrdinalEncoder,\n",
28 | " OneHotEncoder,\n",
29 | " CountFrequencyEncoder\n",
30 | ")\n",
31 | "\n",
32 | "import joblib\n",
33 | "\n",
34 | "import matplotlib.pyplot as plt\n",
35 | "import seaborn as sns"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "id": "b7976053",
41 | "metadata": {},
42 | "source": [
43 | "## Read Data"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "id": "5bf686d7",
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "name": "stdout",
54 | "output_type": "stream",
55 | "text": [
56 | "(19158, 14)\n"
57 | ]
58 | },
59 | {
60 | "data": {
61 | "text/html": [
62 | "\n",
63 | "\n",
76 | "
\n",
77 | " \n",
78 | " \n",
79 | " \n",
80 | " enrollee_id \n",
81 | " city \n",
82 | " city_development_index \n",
83 | " gender \n",
84 | " relevent_experience \n",
85 | " enrolled_university \n",
86 | " education_level \n",
87 | " major_discipline \n",
88 | " experience \n",
89 | " company_size \n",
90 | " company_type \n",
91 | " last_new_job \n",
92 | " training_hours \n",
93 | " target \n",
94 | " \n",
95 | " \n",
96 | " \n",
97 | " \n",
98 | " 0 \n",
99 | " 8949 \n",
100 | " city_103 \n",
101 | " 0.920 \n",
102 | " Male \n",
103 | " Has relevent experience \n",
104 | " no_enrollment \n",
105 | " Graduate \n",
106 | " STEM \n",
107 | " >20 \n",
108 | " NaN \n",
109 | " NaN \n",
110 | " 1 \n",
111 | " 36 \n",
112 | " 1.0 \n",
113 | " \n",
114 | " \n",
115 | " 1 \n",
116 | " 29725 \n",
117 | " city_40 \n",
118 | " 0.776 \n",
119 | " Male \n",
120 | " No relevent experience \n",
121 | " no_enrollment \n",
122 | " Graduate \n",
123 | " STEM \n",
124 | " 15 \n",
125 | " 50-99 \n",
126 | " Pvt Ltd \n",
127 | " >4 \n",
128 | " 47 \n",
129 | " 0.0 \n",
130 | " \n",
131 | " \n",
132 | " 2 \n",
133 | " 11561 \n",
134 | " city_21 \n",
135 | " 0.624 \n",
136 | " NaN \n",
137 | " No relevent experience \n",
138 | " Full time course \n",
139 | " Graduate \n",
140 | " STEM \n",
141 | " 5 \n",
142 | " NaN \n",
143 | " NaN \n",
144 | " never \n",
145 | " 83 \n",
146 | " 0.0 \n",
147 | " \n",
148 | " \n",
149 | " 3 \n",
150 | " 33241 \n",
151 | " city_115 \n",
152 | " 0.789 \n",
153 | " NaN \n",
154 | " No relevent experience \n",
155 | " NaN \n",
156 | " Graduate \n",
157 | " Business Degree \n",
158 | " <1 \n",
159 | " NaN \n",
160 | " Pvt Ltd \n",
161 | " never \n",
162 | " 52 \n",
163 | " 1.0 \n",
164 | " \n",
165 | " \n",
166 | " 4 \n",
167 | " 666 \n",
168 | " city_162 \n",
169 | " 0.767 \n",
170 | " Male \n",
171 | " Has relevent experience \n",
172 | " no_enrollment \n",
173 | " Masters \n",
174 | " STEM \n",
175 | " >20 \n",
176 | " 50-99 \n",
177 | " Funded Startup \n",
178 | " 4 \n",
179 | " 8 \n",
180 | " 0.0 \n",
181 | " \n",
182 | " \n",
183 | "
\n",
184 | "
"
185 | ],
186 | "text/plain": [
187 | " enrollee_id city city_development_index gender \\\n",
188 | "0 8949 city_103 0.920 Male \n",
189 | "1 29725 city_40 0.776 Male \n",
190 | "2 11561 city_21 0.624 NaN \n",
191 | "3 33241 city_115 0.789 NaN \n",
192 | "4 666 city_162 0.767 Male \n",
193 | "\n",
194 | " relevent_experience enrolled_university education_level \\\n",
195 | "0 Has relevent experience no_enrollment Graduate \n",
196 | "1 No relevent experience no_enrollment Graduate \n",
197 | "2 No relevent experience Full time course Graduate \n",
198 | "3 No relevent experience NaN Graduate \n",
199 | "4 Has relevent experience no_enrollment Masters \n",
200 | "\n",
201 | " major_discipline experience company_size company_type last_new_job \\\n",
202 | "0 STEM >20 NaN NaN 1 \n",
203 | "1 STEM 15 50-99 Pvt Ltd >4 \n",
204 | "2 STEM 5 NaN NaN never \n",
205 | "3 Business Degree <1 NaN Pvt Ltd never \n",
206 | "4 STEM >20 50-99 Funded Startup 4 \n",
207 | "\n",
208 | " training_hours target \n",
209 | "0 36 1.0 \n",
210 | "1 47 0.0 \n",
211 | "2 83 0.0 \n",
212 | "3 52 1.0 \n",
213 | "4 8 0.0 "
214 | ]
215 | },
216 | "execution_count": 2,
217 | "metadata": {},
218 | "output_type": "execute_result"
219 | }
220 | ],
221 | "source": [
222 | "data = pd.read_csv('../src/data/train.csv')\n",
223 | "print(data.shape)\n",
224 | "data.head()"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "id": "d64ee1e2",
230 | "metadata": {},
231 | "source": [
232 | "## Train-Test Split"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 3,
238 | "id": "a39a4c06",
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "X_train, X_test, y_train, y_test = train_test_split(\n",
243 | " data.drop(['enrollee_id', 'target'], axis=1),\n",
244 | " data['target'],\n",
245 | " test_size=0.2,\n",
246 | " random_state=0,\n",
247 | ")"
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "id": "8356eff9",
253 | "metadata": {},
254 | "source": [
255 | "## Missing Values"
256 | ]
257 | },
258 | {
259 | "cell_type": "markdown",
260 | "id": "a9dc024a",
261 | "metadata": {},
262 | "source": [
263 | "### Categorical Variables"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 4,
269 | "id": "47dca2c6",
270 | "metadata": {},
271 | "outputs": [],
272 | "source": [
273 | "cat_vars = [var for var in data.columns if data[var].dtype == 'O']\n",
274 | "cat_vars_with_na = [var for var in cat_vars if X_train[var].isnull().sum() > 0]"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": 5,
280 | "id": "a3a026be",
281 | "metadata": {},
282 | "outputs": [
283 | {
284 | "data": {
285 | "text/plain": [
286 | "['city',\n",
287 | " 'gender',\n",
288 | " 'relevent_experience',\n",
289 | " 'enrolled_university',\n",
290 | " 'education_level',\n",
291 | " 'major_discipline',\n",
292 | " 'experience',\n",
293 | " 'company_size',\n",
294 | " 'company_type',\n",
295 | " 'last_new_job']"
296 | ]
297 | },
298 | "execution_count": 5,
299 | "metadata": {},
300 | "output_type": "execute_result"
301 | }
302 | ],
303 | "source": [
304 | "cat_vars"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": 6,
310 | "id": "104a1eb1",
311 | "metadata": {},
312 | "outputs": [
313 | {
314 | "data": {
315 | "text/plain": [
316 | "company_type 0.320493\n",
317 | "company_size 0.309949\n",
318 | "gender 0.235306\n",
319 | "major_discipline 0.146832\n",
320 | "education_level 0.024011\n",
321 | "last_new_job 0.022080\n",
322 | "enrolled_university 0.020148\n",
323 | "experience 0.003393\n",
324 | "dtype: float64"
325 | ]
326 | },
327 | "execution_count": 6,
328 | "metadata": {},
329 | "output_type": "execute_result"
330 | }
331 | ],
332 | "source": [
333 | "data[cat_vars_with_na].isnull().mean().sort_values(ascending=False)"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": 7,
339 | "id": "f912c97f",
340 | "metadata": {},
341 | "outputs": [],
342 | "source": [
343 | "cat_vars_replace_na_with_string_missing = [var for var in cat_vars_with_na if X_train[var].isnull().mean() > 0.1]\n",
344 | "cat_vars_replace_na_with_frequent = [var for var in cat_vars_with_na if X_train[var].isnull().mean() <= 0.1]"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": 8,
350 | "id": "5a00f5c2",
351 | "metadata": {},
352 | "outputs": [
353 | {
354 | "data": {
355 | "text/plain": [
356 | "['gender', 'major_discipline', 'company_size', 'company_type']"
357 | ]
358 | },
359 | "execution_count": 8,
360 | "metadata": {},
361 | "output_type": "execute_result"
362 | }
363 | ],
364 | "source": [
365 | "cat_vars_replace_na_with_string_missing"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": 9,
371 | "id": "c6a92e10",
372 | "metadata": {},
373 | "outputs": [
374 | {
375 | "data": {
376 | "text/plain": [
377 | "['enrolled_university', 'education_level', 'experience', 'last_new_job']"
378 | ]
379 | },
380 | "execution_count": 9,
381 | "metadata": {},
382 | "output_type": "execute_result"
383 | }
384 | ],
385 | "source": [
386 | "cat_vars_replace_na_with_frequent"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": 10,
392 | "id": "adbf6063",
393 | "metadata": {},
394 | "outputs": [
395 | {
396 | "data": {
397 | "text/plain": [
398 | "{'gender': 'Missing',\n",
399 | " 'major_discipline': 'Missing',\n",
400 | " 'company_size': 'Missing',\n",
401 | " 'company_type': 'Missing'}"
402 | ]
403 | },
404 | "execution_count": 10,
405 | "metadata": {},
406 | "output_type": "execute_result"
407 | }
408 | ],
409 | "source": [
410 | "cat_imputer_missing = CategoricalImputer(imputation_method='missing', variables=cat_vars_replace_na_with_string_missing)\n",
411 | "cat_imputer_missing.fit(X_train)\n",
412 | "cat_imputer_missing.imputer_dict_"
413 | ]
414 | },
415 | {
416 | "cell_type": "code",
417 | "execution_count": 11,
418 | "id": "6937ec37",
419 | "metadata": {},
420 | "outputs": [],
421 | "source": [
422 | "X_train = cat_imputer_missing.transform(X_train)\n",
423 | "X_test = cat_imputer_missing.transform(X_test)"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": 12,
429 | "id": "bd06a11c",
430 | "metadata": {},
431 | "outputs": [
432 | {
433 | "data": {
434 | "text/plain": [
435 | "{'enrolled_university': 'no_enrollment',\n",
436 | " 'education_level': 'Graduate',\n",
437 | " 'experience': '>20',\n",
438 | " 'last_new_job': '1'}"
439 | ]
440 | },
441 | "execution_count": 12,
442 | "metadata": {},
443 | "output_type": "execute_result"
444 | }
445 | ],
446 | "source": [
447 | "cat_imputer_frequent = CategoricalImputer(imputation_method='frequent', variables=cat_vars_replace_na_with_frequent)\n",
448 | "cat_imputer_frequent.fit(X_train)\n",
449 | "cat_imputer_frequent.imputer_dict_"
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "execution_count": 13,
455 | "id": "afa6f611",
456 | "metadata": {},
457 | "outputs": [],
458 | "source": [
459 | "X_train = cat_imputer_frequent.transform(X_train)\n",
460 | "X_test = cat_imputer_frequent.transform(X_test)"
461 | ]
462 | },
463 | {
464 | "cell_type": "code",
465 | "execution_count": 14,
466 | "id": "5e12a3e9",
467 | "metadata": {},
468 | "outputs": [
469 | {
470 | "data": {
471 | "text/plain": [
472 | "gender 0\n",
473 | "enrolled_university 0\n",
474 | "education_level 0\n",
475 | "major_discipline 0\n",
476 | "experience 0\n",
477 | "company_size 0\n",
478 | "company_type 0\n",
479 | "last_new_job 0\n",
480 | "dtype: int64"
481 | ]
482 | },
483 | "execution_count": 14,
484 | "metadata": {},
485 | "output_type": "execute_result"
486 | }
487 | ],
488 | "source": [
489 | "X_train[cat_vars_with_na].isnull().sum()"
490 | ]
491 | },
492 | {
493 | "cell_type": "code",
494 | "execution_count": 15,
495 | "id": "ab937bc7",
496 | "metadata": {},
497 | "outputs": [
498 | {
499 | "data": {
500 | "text/plain": [
501 | "gender 0\n",
502 | "enrolled_university 0\n",
503 | "education_level 0\n",
504 | "major_discipline 0\n",
505 | "experience 0\n",
506 | "company_size 0\n",
507 | "company_type 0\n",
508 | "last_new_job 0\n",
509 | "dtype: int64"
510 | ]
511 | },
512 | "execution_count": 15,
513 | "metadata": {},
514 | "output_type": "execute_result"
515 | }
516 | ],
517 | "source": [
518 | "X_test[cat_vars_with_na].isnull().sum()"
519 | ]
520 | },
521 | {
522 | "cell_type": "markdown",
523 | "id": "73552e80",
524 | "metadata": {},
525 | "source": [
526 | "### Numerical Variables"
527 | ]
528 | },
529 | {
530 | "cell_type": "code",
531 | "execution_count": 16,
532 | "id": "b108a026",
533 | "metadata": {},
534 | "outputs": [],
535 | "source": [
536 | "num_vars = [var for var in data.columns if var not in cat_vars + ['enrollee_id', 'target']]\n",
537 | "num_vars_with_na = [var for var in num_vars if X_train[var].isnull().sum() > 0]"
538 | ]
539 | },
540 | {
541 | "cell_type": "code",
542 | "execution_count": 17,
543 | "id": "5c17948b",
544 | "metadata": {},
545 | "outputs": [
546 | {
547 | "data": {
548 | "text/plain": [
549 | "['city_development_index', 'training_hours']"
550 | ]
551 | },
552 | "execution_count": 17,
553 | "metadata": {},
554 | "output_type": "execute_result"
555 | }
556 | ],
557 | "source": [
558 | "num_vars"
559 | ]
560 | },
561 | {
562 | "cell_type": "code",
563 | "execution_count": 18,
564 | "id": "a10fb9e6",
565 | "metadata": {},
566 | "outputs": [
567 | {
568 | "data": {
569 | "text/plain": [
570 | "[]"
571 | ]
572 | },
573 | "execution_count": 18,
574 | "metadata": {},
575 | "output_type": "execute_result"
576 | }
577 | ],
578 | "source": [
579 | "num_vars_with_na"
580 | ]
581 | },
582 | {
583 | "cell_type": "markdown",
584 | "id": "8d13b020",
585 | "metadata": {},
586 | "source": [
587 | "## Transformations"
588 | ]
589 | },
590 | {
591 | "cell_type": "markdown",
592 | "id": "ba920de7",
593 | "metadata": {},
594 | "source": [
595 | "### Numerical Variables"
596 | ]
597 | },
598 | {
599 | "cell_type": "code",
600 | "execution_count": 19,
601 | "id": "90d1a49d",
602 | "metadata": {},
603 | "outputs": [
604 | {
605 | "data": {
606 | "text/plain": [
607 | "{'training_hours': 0.14533500139326166}"
608 | ]
609 | },
610 | "execution_count": 19,
611 | "metadata": {},
612 | "output_type": "execute_result"
613 | }
614 | ],
615 | "source": [
616 | "num_vars_yeo_johnson = ['training_hours']\n",
617 | "\n",
618 | "yeo_transformer = YeoJohnsonTransformer(variables=num_vars_yeo_johnson)\n",
619 | "\n",
620 | "X_train = yeo_transformer.fit_transform(X_train)\n",
621 | "X_test = yeo_transformer.transform(X_test)\n",
622 | "\n",
623 | "yeo_transformer.lambda_dict_"
624 | ]
625 | },
626 | {
627 | "cell_type": "markdown",
628 | "id": "f5e4f137",
629 | "metadata": {},
630 | "source": [
631 | "### Categorical Variables"
632 | ]
633 | },
634 | {
635 | "cell_type": "code",
636 | "execution_count": 20,
637 | "id": "f0ca44ac",
638 | "metadata": {},
639 | "outputs": [
640 | {
641 | "data": {
642 | "text/html": [
643 | "\n",
644 | "\n",
657 | "
\n",
658 | " \n",
659 | " \n",
660 | " \n",
661 | " city \n",
662 | " gender \n",
663 | " relevent_experience \n",
664 | " enrolled_university \n",
665 | " education_level \n",
666 | " major_discipline \n",
667 | " experience \n",
668 | " company_size \n",
669 | " company_type \n",
670 | " last_new_job \n",
671 | " \n",
672 | " \n",
673 | " \n",
674 | " \n",
675 | " 19147 \n",
676 | " city_21 \n",
677 | " Male \n",
678 | " No relevent experience \n",
679 | " Full time course \n",
680 | " Graduate \n",
681 | " STEM \n",
682 | " 1 \n",
683 | " 100-500 \n",
684 | " Pvt Ltd \n",
685 | " 1 \n",
686 | " \n",
687 | " \n",
688 | " 8464 \n",
689 | " city_21 \n",
690 | " Missing \n",
691 | " Has relevent experience \n",
692 | " Full time course \n",
693 | " Graduate \n",
694 | " STEM \n",
695 | " <1 \n",
696 | " <10 \n",
697 | " Pvt Ltd \n",
698 | " 1 \n",
699 | " \n",
700 | " \n",
701 | " 8869 \n",
702 | " city_16 \n",
703 | " Male \n",
704 | " Has relevent experience \n",
705 | " no_enrollment \n",
706 | " Masters \n",
707 | " STEM \n",
708 | " 9 \n",
709 | " Missing \n",
710 | " Pvt Ltd \n",
711 | " 1 \n",
712 | " \n",
713 | " \n",
714 | " 11645 \n",
715 | " city_118 \n",
716 | " Missing \n",
717 | " Has relevent experience \n",
718 | " Part time course \n",
719 | " Masters \n",
720 | " STEM \n",
721 | " 10 \n",
722 | " 1000-4999 \n",
723 | " Pvt Ltd \n",
724 | " 3 \n",
725 | " \n",
726 | " \n",
727 | " 7743 \n",
728 | " city_103 \n",
729 | " Missing \n",
730 | " No relevent experience \n",
731 | " no_enrollment \n",
732 | " Primary School \n",
733 | " Missing \n",
734 | " 2 \n",
735 | " Missing \n",
736 | " Missing \n",
737 | " never \n",
738 | " \n",
739 | " \n",
740 | "
\n",
741 | "
"
742 | ],
743 | "text/plain": [
744 | " city gender relevent_experience enrolled_university \\\n",
745 | "19147 city_21 Male No relevent experience Full time course \n",
746 | "8464 city_21 Missing Has relevent experience Full time course \n",
747 | "8869 city_16 Male Has relevent experience no_enrollment \n",
748 | "11645 city_118 Missing Has relevent experience Part time course \n",
749 | "7743 city_103 Missing No relevent experience no_enrollment \n",
750 | "\n",
751 | " education_level major_discipline experience company_size company_type \\\n",
752 | "19147 Graduate STEM 1 100-500 Pvt Ltd \n",
753 | "8464 Graduate STEM <1 <10 Pvt Ltd \n",
754 | "8869 Masters STEM 9 Missing Pvt Ltd \n",
755 | "11645 Masters STEM 10 1000-4999 Pvt Ltd \n",
756 | "7743 Primary School Missing 2 Missing Missing \n",
757 | "\n",
758 | " last_new_job \n",
759 | "19147 1 \n",
760 | "8464 1 \n",
761 | "8869 1 \n",
762 | "11645 3 \n",
763 | "7743 never "
764 | ]
765 | },
766 | "execution_count": 20,
767 | "metadata": {},
768 | "output_type": "execute_result"
769 | }
770 | ],
771 | "source": [
772 | "X_train[cat_vars].head()"
773 | ]
774 | },
775 | {
776 | "cell_type": "code",
777 | "execution_count": 21,
778 | "id": "a9f8e55b",
779 | "metadata": {},
780 | "outputs": [],
781 | "source": [
782 | "experience_map = {\n",
783 | " '<1': 0,\n",
784 | " '1': 1, \n",
785 | " '2': 2, \n",
786 | " '3': 3, \n",
787 | " '4': 4, \n",
788 | " '5': 5,\n",
789 | " '6': 6,\n",
790 | " '7': 7,\n",
791 | " '8': 8, \n",
792 | " '9': 9, \n",
793 | " '10': 10, \n",
794 | " '11': 11,\n",
795 | " '12': 12,\n",
796 | " '13': 13, \n",
797 | " '14': 14, \n",
798 | " '15': 15, \n",
799 | " '16': 16,\n",
800 | " '17': 17,\n",
801 | " '18': 18,\n",
802 | " '19': 19, \n",
803 | " '20': 20, \n",
804 | " '>20': 21\n",
805 | "} \n",
806 | "\n",
807 | "last_new_job_map = {\n",
808 | " 'never': 0,\n",
809 | " '1': 1, \n",
810 | " '2': 2, \n",
811 | " '3': 3, \n",
812 | " '4': 4, \n",
813 | " '>4': 5\n",
814 | "}\n",
815 | "\n",
816 | "company_size_map = {\n",
817 | " 'Missing': 0,\n",
818 | " '<10': 1,\n",
819 | " '10/49': 2, \n",
820 | " '100-500': 3, \n",
821 | " '1000-4999': 4, \n",
822 | " '10000+': 5, \n",
823 | " '50-99': 6, \n",
824 | " '500-999': 7, \n",
825 | " '5000-9999': 8\n",
826 | "}\n",
827 | "\n",
828 | "cat_vars_ordinal = ['relevent_experience', 'enrolled_university', 'education_level', 'major_discipline']\n",
829 | "cat_vars_ordinal_arbitrary = ['city']\n",
830 | "cat_vars_onehot = ['gender']\n",
831 | "cat_vars_count_frequency = ['company_type']\n",
832 | "\n",
833 | "ordinal_encoder = OrdinalEncoder(encoding_method='ordered', variables=cat_vars_ordinal)\n",
834 | "ordinal_encoder.fit(X_train, y_train)\n",
835 | "\n",
836 | "ordinal_encoder_arbitrary = OrdinalEncoder(encoding_method='arbitrary', variables=cat_vars_ordinal_arbitrary)\n",
837 | "ordinal_encoder_arbitrary.fit(X_train, y_train)\n",
838 | "\n",
839 | "count_frequency_encoder = CountFrequencyEncoder(encoding_method='frequency', variables=cat_vars_count_frequency)\n",
840 | "count_frequency_encoder.fit(X_train)\n",
841 | "\n",
842 | "onehot_encoder = OneHotEncoder(variables=cat_vars_onehot)\n",
843 | "onehot_encoder.fit(X_train)\n",
844 | "\n",
845 | "X_train = ordinal_encoder.transform(X_train)\n",
846 | "X_test = ordinal_encoder.transform(X_test)\n",
847 | "\n",
848 | "X_train = ordinal_encoder_arbitrary.transform(X_train)\n",
849 | "X_test = ordinal_encoder_arbitrary.transform(X_test)\n",
850 | "\n",
851 | "X_train = count_frequency_encoder.transform(X_train)\n",
852 | "X_test = count_frequency_encoder.transform(X_test)\n",
853 | "\n",
854 | "X_train = onehot_encoder.transform(X_train)\n",
855 | "X_test = onehot_encoder.transform(X_test)\n",
856 | "\n",
857 | "var = 'experience'\n",
858 | "X_train[var] = X_train[var].map(experience_map)\n",
859 | "X_test[var] = X_test[var].map(experience_map)\n",
860 | "\n",
861 | "var = 'last_new_job'\n",
862 | "X_train[var] = X_train[var].map(last_new_job_map)\n",
863 | "X_test[var] = X_test[var].map(last_new_job_map)\n",
864 | "\n",
865 | "var = 'company_size'\n",
866 | "X_train[var] = X_train[var].map(company_size_map)\n",
867 | "X_test[var] = X_test[var].map(company_size_map)"
868 | ]
869 | },
870 | {
871 | "cell_type": "code",
872 | "execution_count": 22,
873 | "id": "2d28bbcd",
874 | "metadata": {},
875 | "outputs": [
876 | {
877 | "data": {
878 | "text/html": [
879 | "\n",
880 | "\n",
893 | "
\n",
894 | " \n",
895 | " \n",
896 | " \n",
897 | " city \n",
898 | " city_development_index \n",
899 | " relevent_experience \n",
900 | " enrolled_university \n",
901 | " education_level \n",
902 | " major_discipline \n",
903 | " experience \n",
904 | " company_size \n",
905 | " company_type \n",
906 | " last_new_job \n",
907 | " training_hours \n",
908 | " gender_Male \n",
909 | " gender_Missing \n",
910 | " gender_Other \n",
911 | " gender_Female \n",
912 | " \n",
913 | " \n",
914 | " \n",
915 | " \n",
916 | " 19147 \n",
917 | " 0 \n",
918 | " 0.624 \n",
919 | " 1 \n",
920 | " 2 \n",
921 | " 4 \n",
922 | " 4 \n",
923 | " 1 \n",
924 | " 3 \n",
925 | " 0.514746 \n",
926 | " 1 \n",
927 | " 5.371921 \n",
928 | " 1 \n",
929 | " 0 \n",
930 | " 0 \n",
931 | " 0 \n",
932 | " \n",
933 | " \n",
934 | " 8464 \n",
935 | " 0 \n",
936 | " 0.624 \n",
937 | " 0 \n",
938 | " 2 \n",
939 | " 4 \n",
940 | " 4 \n",
941 | " 0 \n",
942 | " 1 \n",
943 | " 0.514746 \n",
944 | " 1 \n",
945 | " 6.415291 \n",
946 | " 0 \n",
947 | " 1 \n",
948 | " 0 \n",
949 | " 0 \n",
950 | " \n",
951 | " \n",
952 | " 8869 \n",
953 | " 1 \n",
954 | " 0.910 \n",
955 | " 0 \n",
956 | " 0 \n",
957 | " 3 \n",
958 | " 4 \n",
959 | " 9 \n",
960 | " 0 \n",
961 | " 0.514746 \n",
962 | " 1 \n",
963 | " 4.748399 \n",
964 | " 1 \n",
965 | " 0 \n",
966 | " 0 \n",
967 | " 0 \n",
968 | " \n",
969 | " \n",
970 | " 11645 \n",
971 | " 2 \n",
972 | " 0.722 \n",
973 | " 0 \n",
974 | " 1 \n",
975 | " 3 \n",
976 | " 4 \n",
977 | " 10 \n",
978 | " 4 \n",
979 | " 0.514746 \n",
980 | " 3 \n",
981 | " 3.753794 \n",
982 | " 0 \n",
983 | " 1 \n",
984 | " 0 \n",
985 | " 0 \n",
986 | " \n",
987 | " \n",
988 | " 7743 \n",
989 | " 3 \n",
990 | " 0.920 \n",
991 | " 1 \n",
992 | " 0 \n",
993 | " 1 \n",
994 | " 0 \n",
995 | " 2 \n",
996 | " 0 \n",
997 | " 0.319522 \n",
998 | " 0 \n",
999 | " 5.877477 \n",
1000 | " 0 \n",
1001 | " 1 \n",
1002 | " 0 \n",
1003 | " 0 \n",
1004 | " \n",
1005 | " \n",
1006 | " ... \n",
1007 | " ... \n",
1008 | " ... \n",
1009 | " ... \n",
1010 | " ... \n",
1011 | " ... \n",
1012 | " ... \n",
1013 | " ... \n",
1014 | " ... \n",
1015 | " ... \n",
1016 | " ... \n",
1017 | " ... \n",
1018 | " ... \n",
1019 | " ... \n",
1020 | " ... \n",
1021 | " ... \n",
1022 | " \n",
1023 | " \n",
1024 | " 9225 \n",
1025 | " 31 \n",
1026 | " 0.479 \n",
1027 | " 1 \n",
1028 | " 2 \n",
1029 | " 4 \n",
1030 | " 4 \n",
1031 | " 10 \n",
1032 | " 0 \n",
1033 | " 0.319522 \n",
1034 | " 5 \n",
1035 | " 4.702184 \n",
1036 | " 1 \n",
1037 | " 0 \n",
1038 | " 0 \n",
1039 | " 0 \n",
1040 | " \n",
1041 | " \n",
1042 | " 13123 \n",
1043 | " 0 \n",
1044 | " 0.624 \n",
1045 | " 0 \n",
1046 | " 0 \n",
1047 | " 4 \n",
1048 | " 4 \n",
1049 | " 6 \n",
1050 | " 6 \n",
1051 | " 0.514746 \n",
1052 | " 1 \n",
1053 | " 3.829470 \n",
1054 | " 1 \n",
1055 | " 0 \n",
1056 | " 0 \n",
1057 | " 0 \n",
1058 | " \n",
1059 | " \n",
1060 | " 9845 \n",
1061 | " 3 \n",
1062 | " 0.920 \n",
1063 | " 0 \n",
1064 | " 0 \n",
1065 | " 4 \n",
1066 | " 4 \n",
1067 | " 6 \n",
1068 | " 6 \n",
1069 | " 0.514746 \n",
1070 | " 0 \n",
1071 | " 5.232979 \n",
1072 | " 0 \n",
1073 | " 0 \n",
1074 | " 1 \n",
1075 | " 0 \n",
1076 | " \n",
1077 | " \n",
1078 | " 10799 \n",
1079 | " 0 \n",
1080 | " 0.624 \n",
1081 | " 1 \n",
1082 | " 0 \n",
1083 | " 4 \n",
1084 | " 4 \n",
1085 | " 2 \n",
1086 | " 2 \n",
1087 | " 0.319522 \n",
1088 | " 1 \n",
1089 | " 6.030879 \n",
1090 | " 0 \n",
1091 | " 1 \n",
1092 | " 0 \n",
1093 | " 0 \n",
1094 | " \n",
1095 | " \n",
1096 | " 2732 \n",
1097 | " 1 \n",
1098 | " 0.910 \n",
1099 | " 0 \n",
1100 | " 0 \n",
1101 | " 4 \n",
1102 | " 4 \n",
1103 | " 12 \n",
1104 | " 0 \n",
1105 | " 0.319522 \n",
1106 | " 1 \n",
1107 | " 1.535819 \n",
1108 | " 1 \n",
1109 | " 0 \n",
1110 | " 0 \n",
1111 | " 0 \n",
1112 | " \n",
1113 | " \n",
1114 | "
\n",
1115 | "
15326 rows × 15 columns
\n",
1116 | "
"
1117 | ],
1118 | "text/plain": [
1119 | " city city_development_index relevent_experience enrolled_university \\\n",
1120 | "19147 0 0.624 1 2 \n",
1121 | "8464 0 0.624 0 2 \n",
1122 | "8869 1 0.910 0 0 \n",
1123 | "11645 2 0.722 0 1 \n",
1124 | "7743 3 0.920 1 0 \n",
1125 | "... ... ... ... ... \n",
1126 | "9225 31 0.479 1 2 \n",
1127 | "13123 0 0.624 0 0 \n",
1128 | "9845 3 0.920 0 0 \n",
1129 | "10799 0 0.624 1 0 \n",
1130 | "2732 1 0.910 0 0 \n",
1131 | "\n",
1132 | " education_level major_discipline experience company_size \\\n",
1133 | "19147 4 4 1 3 \n",
1134 | "8464 4 4 0 1 \n",
1135 | "8869 3 4 9 0 \n",
1136 | "11645 3 4 10 4 \n",
1137 | "7743 1 0 2 0 \n",
1138 | "... ... ... ... ... \n",
1139 | "9225 4 4 10 0 \n",
1140 | "13123 4 4 6 6 \n",
1141 | "9845 4 4 6 6 \n",
1142 | "10799 4 4 2 2 \n",
1143 | "2732 4 4 12 0 \n",
1144 | "\n",
1145 | " company_type last_new_job training_hours gender_Male \\\n",
1146 | "19147 0.514746 1 5.371921 1 \n",
1147 | "8464 0.514746 1 6.415291 0 \n",
1148 | "8869 0.514746 1 4.748399 1 \n",
1149 | "11645 0.514746 3 3.753794 0 \n",
1150 | "7743 0.319522 0 5.877477 0 \n",
1151 | "... ... ... ... ... \n",
1152 | "9225 0.319522 5 4.702184 1 \n",
1153 | "13123 0.514746 1 3.829470 1 \n",
1154 | "9845 0.514746 0 5.232979 0 \n",
1155 | "10799 0.319522 1 6.030879 0 \n",
1156 | "2732 0.319522 1 1.535819 1 \n",
1157 | "\n",
1158 | " gender_Missing gender_Other gender_Female \n",
1159 | "19147 0 0 0 \n",
1160 | "8464 1 0 0 \n",
1161 | "8869 0 0 0 \n",
1162 | "11645 1 0 0 \n",
1163 | "7743 1 0 0 \n",
1164 | "... ... ... ... \n",
1165 | "9225 0 0 0 \n",
1166 | "13123 0 0 0 \n",
1167 | "9845 0 1 0 \n",
1168 | "10799 1 0 0 \n",
1169 | "2732 0 0 0 \n",
1170 | "\n",
1171 | "[15326 rows x 15 columns]"
1172 | ]
1173 | },
1174 | "execution_count": 22,
1175 | "metadata": {},
1176 | "output_type": "execute_result"
1177 | }
1178 | ],
1179 | "source": [
1180 | "X_train"
1181 | ]
1182 | },
1183 | {
1184 | "cell_type": "code",
1185 | "execution_count": 23,
1186 | "id": "d86f015b",
1187 | "metadata": {
1188 | "scrolled": true,
1189 | "tags": []
1190 | },
1191 | "outputs": [
1192 | {
1193 | "data": {
1194 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAd0AAAEYCAYAAAAZGCxpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAUxElEQVR4nO3df/QldX3f8eeLhQVB5JdW6QIFLcWCIshGEK1RjAFt6tqICUoF0cqxaqLllBQaDx5s09MEYxNajcVIBJsIgqas1GBQMXqIID/k9w/dgMIiBhcFCZ6A4Lt/3Fn2282u3ztf7nzu93v3+Tjnnp2ZO3fu+86Z3dfOzGc+n1QVkiRpeFtNuwBJkrYUhq4kSY0YupIkNWLoSpLUiKErSVIjW0+7gLmOOuqouuSSS6ZdhiSpvUy7gBYW1ZnuunXrpl2CJEmDWVShK0nSLDN0JUlqxNCVJKkRQ1eSpEYMXUmSGjF0JUlqxNCVJKkRQ1eSpEYMXUmSGjF0JUlqJFU17RqesMOz9qnnvvn0aZchSRrTNWccN6lN2feyJEmaHENXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoZPHSTHJXk9iRrkpwy9PdJkrRYDRq6SZYBHwZeDewPvDHJ/kN+pyRJi9XQZ7ovAtZU1R1V9ShwHrBq4O+UJGlR2nrg7a8A7p4zvxY4dO4KSU4ETgRYsdM2/PmOZwxcksa112k3TrsESZopU29IVVVnVdXKqlq56w7Lpl2OJEmDGTp07wH2nDO/R7dMkqQtztChexWwb5J9kiwHjgFWD/ydkiQtSoPe062qx5K8G/gCsAw4u6puHvI7JUlarIZuSEVVfR74/NDfI0nSYjf1hlSSJG0pDF1JkhoxdCVJasTQlSSpEUNXkqRGeoduku2HKESSpFk3dugmOTzJLcBt3fwLknxksMokSZoxfc50/ztwJHA/QFVdD7xsiKIkSZpFvS4vV9XdGy16fIK1SJI00/r0SHV3ksOBSrIN8B7g1mHKkiRp9vQ5030H8C5GY+TeAxzUzUuSpDGMfaZbVeuAYwesRZKkmdan9fI5SXaeM79LkrMHqUqSpBnU5/LygVX1wPqZqvoRcPDEK5IkaUb1Cd2tkuyyfibJrjQYGlCSpFnRJzR/H/h6kguAAEcDvzNIVZIkzaA+DanOTXIN8Ipu0a9W1S3DlCVJ0uzpe3n4NuBH6z+XZK+qumviVUmSNIPGDt0kvwG8H/hbRj1RBSjgwGFKkyRptvQ5030PsF9V3T9UMZIkzbI+rZfvBh4cqhBJkmZdnzPdO4CvJPm/wCPrF1bVhyZelSRJM6hP6N7VvZZ3L0mS1EOfR4ZOB0iyfVX9ZLiSJEmaTX36Xn5xklsYPTZEkhck+chglUmSNGP6NKT6A+BI4H6AqroeeNkANUmSNJP6hC5VdfdGix6fYC2SJM20Pg2p7k5yOFBJtmH03O6tw5QlSdLs6XOm+w7gXcAK4B7goG5ekiSNoU/r5XXAsQPWIknSTJs3dJP8VlX9XpL/waiv5f9PVf3mIJVJkjRjxjnTXX/f9uohC5EkadbNG7pV9bkky4DnV9V/aFCTJEkzaayGVFX1OPCSgWuRJGmm9Xlk6Lokq4ELgIfXL6yqz068KkmSZlCf0N2OUW9UR8xZVoChK0nSGPo8MnTCkIVIkjTr+gx48M+SfCnJTd38gUneN1xpkiTNlj49Un0MOBX4KUB
V3QAcM0RRkiTNoj6hu31VfWOjZY9NshhJkmZZn9Bdl+Q5dL1SJTkauHeQqiRJmkF9Wi+/CzgLeG6Se4A7mXBfzMt3P4C9TrPjK0nSbOrTevkO4JeS7ABsVVUPDVeWJEmzp0/r5d2SnAl8DfhKkj9MsttwpUmSNFv63NM9D/gB8Hrg6G76/CGKkiRpFvW5p7t7Vf3nOfP/JcmvT7ogSZJmVZ8z3b9MckySrbrXrwFfGKowSZJmTar+wbj0m14xeQjYAXi8W7SMDQMfVFU97ckWs3Llyrr6alsvS9IWKNMuoIU+rZd3HLIQSZJmXZ/Wy2/baH5ZkvdPviRJkuaXZOck72zwPa9Lsv8kttXnnu4rk3w+ye5JngdcAXj2K0malp2BsUM3I31yb73XARMJ3T6Xl9/UtVa+kdG93DdV1eWTKEKSpAX4b8BzklwHXAYcCOwCbAO8r6ouSrI3o0a/VwKHAK9Jchzwbxg9+no3cE1VfbDr6vjDwDOAnwBvB3YFXgv8Yjey3uur6m8WWvDYoZtkX+A9wGeAfw68Ock3q+onC/1ySZKehFOA51XVQUm2ZjQwz4+TPB24Isnqbr19geOr6ookv8Cov4kXMArna4FruvXOAt5RVd9Ocijwkao6otvOxVV14ZMtuM9zup8D3l1VX0wS4CTgKuCAJ1uEJElPUoD/muRlwM+AFcAzu/e+W1VXdNMvAS6qqr8H/j7J5wCSPBU4HLhgFHEAbDvpIvuE7ouq6scwej4I+P31xUqSNGXHMrosfEhV/TTJd4Dtuvce3uynNtgKeKCqDhqmvA1fMq6nJPl4kksAupZc/2KYsiRJmtdDbGjQuxNwXxe4rwD+yWY+cznwr5Js153d/gpAd1J5Z5I3wBONrl6wie95UvqE7icY3YzevZv/FvDeSRQhSVJfVXU/cHmSm4CDgJVJbgSOA27bzGeuAlYDNwB/wahx8IPd28cCb0tyPXAzsKpbfh5wcpJvdo2tFqxPj1RXVdUvdI2nDu6WXTfJU3F7pJKkLVazHqmSPLWq/i7J9sBXgROr6toW393nnu7D3VB+BZDkMDb870CSpKXirO4W6XbAOa0CF/qF7kmMTsmfk+RyRjesjx6kKkmSBlJVb5rWd/fpHOPaJL8I7MfoMsDtVfXT9e8neVVVXTpAjZIkzYRe3WFV1WNVdXNV3TQ3cDu/O8G6JEmaOQvpg3JztohhmSRJWqhJhu54zaAlSdpCTTJ0JUmaCUmOSnJ7kjVJTtnE+9smOb97/8puYIV59RnwYNuqeuTnLPvOuNvanFvX3s8hJ5/7ZDejReyaM46bdgmSlpBDTj53oldRrznjuHlvhSZZxmi0oVcBa4GrkqyuqlvmrPY24EdV9U+THMOoXdOvz7ftPme6X/95y6rqV3tsS5KkxepFwJqquqOqHmXUI9WqjdZZBZzTTV/IaMz5eQN93jPdJM9iNFrDU5IczIYGU08Dth+vfkmSlowVjMbZXW8tcOjm1qmqx5I8COwGrPt5Gx7n8vKRwFuAPYAPzVn+EPCfxvi8JElijNCtqnOAc5K8vqo+06AmSZKm6R5gzznze3TLNrXO2iRbMxrl6P75NtynG8iLk7wJ2Hvu56rqAz22IUnSYncVsG+SfRiF6zHAxl1HrgaOZ9S26WjgyzXGCEJ9QvciRgMcXAM8Ms+6kiQtSd092nczGs52GXB2Vd2c5APA1VW1Gvg48Mkka4AfMgrmefUZ2u+mqnregn7BmHZ41j713DefPuRXaMp8ZEjSZmwRvRr2eWTor5M8f7BKJEmacX0uL78UeEuSOxldXg5QVXXgIJVJkjRj+oTuqwerQpKkLcDYl5er6ruMmkcf0U3/pM/nJUna0o0dmkneD/xH4NRu0TbA/x6iKEmSZlGfM9V/DbwWeBigqr4H7DhEUZIkzaI+ofto9+BvASTZYZiSJEmaniRnJ7kvyU2beT9JzuyG9bshyQvH3XafhlSfTvK/gJ2TvB14K/CxHp+
XJKmXuz7w/IkO7bfXaTeO8zzwJ4D/CWxurNlXA/t2r0OBP+IfDoiwSWOHblV9MMmrgB8D+wGnVdWl435ekqSloKq+Os+g9KuAc7urv1ck2TnJ7lV173zb7jOI/UnA+QatJGkLt6mh/1YA84Zun3u6OwJ/meRrSd6d5Jn9apQkacvW5znd06vqAOBdwO7AXyX54mCVSZK0OI0z9N8mLaRzi/uA7zMaN/AfLeDzkiQtZauB47pWzIcBD45zPxf63dN9J/BrwDOAC4C3V9UtC6lWkqTFKsmngJcDT0+yFng/ow6hqKqPAp8HXgOsYdQ74wnjbrvPI0N7Au+tqut6fEaSpAUb8xGfiaqqN87zfjG61dpbn3u6pwJPTXICQJJnJNlnIV8qSdKWyL6XJUlqxL6XJUlqxL6XJUlqpE/obtz38hex72VJksZm38uSJDXS55EhupA1aCVJWoB5QzfJQ3T3cTd+i9HjSk+beFWSJM2geUO3qmyhLEnSBPTqeznJS+d0jvF0O8eQJGl8T6ZzjOXYOYYkSWOzcwxJkhqxcwxJkhoZK3STBLjYzjEkSVq4sZ7TrapK8gbgJOwcQ5KkBenTOca1wANVdfJQxUiSNMv6hO6hwLFJvkvXmAqgqg6ceFWSJM2gPqF75GBVSJK0Begz4MF3hyxEkqRZ16tHKkmStHCGriRJjQwauknOTnJfkpuG/B5JkpaCoc90PwEcNfB3SJK0JAwaulX1VeCHQ36HJElLRZ9HhgaR5ETgRIAVO23Dn+94xpQram+v026cdgmSpAam3pCqqs6qqpVVtXLXHZZNuxxJkgYz9dCVJGlLYehKktTI0I8MfQr4OrBfkrVJ3jbk90mStJgN2pCqqt445PYlSVpKvLwsSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjW0+7gLmW734Ae5129bTLkCRpEJ7pSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY2kqqZdwxOSPATcPu06ZszTgXXTLmKGuD8nz306WUt1f66rqqOmXcTQFtWAB8DtVbVy2kXMkiRXu08nx/05ee7TyXJ/Lm5eXpYkqRFDV5KkRhZb6J417QJmkPt0styfk+c+nSz35yK2qBpSSZI0yxbbma4kSTPL0JUkqZFFE7pJjkpye5I1SU6Zdj2LVZI9k1yW5JYkNyd5T7d81ySXJvl29+cu3fIkObPbrzckeeGcbR3frf/tJMdP6zctBkmWJflmkou7+X2SXNntt/OTLO+Wb9vNr+ne33vONk7tlt+e5Mgp/ZRFIcnOSS5McluSW5O82GN04ZL8++7v+01JPpVkO4/RJaqqpv4ClgF/AzwbWA5cD+w/7boW4wvYHXhhN70j8C1gf+D3gFO65acAv9tNvwb4CyDAYcCV3fJdgTu6P3fppneZ9u+b4n49Cfgz4OJu/tPAMd30R4F/102/E/hoN30McH43vX933G4L7NMdz8um/bumuD/PAf5tN70c2NljdMH7cgVwJ/CUbv7TwFs8Rpfma7Gc6b4IWFNVd1TVo8B5wKop17QoVdW9VXVtN/0QcCujv5SrGP1DR/fn67rpVcC5NXIFsHOS3YEjgUur6odV9SPgUmDme4PZlCR7AP8S+ONuPsARwIXdKhvvz/X7+ULgld36q4DzquqRqroTWMPouN7iJNkJeBnwcYCqerSqHsBj9MnYGnhKkq2B7YF78Rh
dkhZL6K4A7p4zv7Zbpp+ju2x0MHAl8Myqurd76/vAM7vpze1b9/kGfwD8FvCzbn434IGqeqybn7tvnthv3fsPduu7PzfYB/gB8CfdJfs/TrIDHqMLUlX3AB8E7mIUtg8C1+AxuiQtltBVT0meCnwGeG9V/XjuezW6luSzYGNI8ivAfVV1zbRrmSFbAy8E/qiqDgYeZnQ5+Qkeo+Pr7n2vYvSfmX8M7MCWe8a/5C2W0L0H2HPO/B7dMm1Ckm0YBe6fVtVnu8V/212So/vzvm755vat+3zkJcBrk3yH0W2NI4A/ZHSJc33f5HP3zRP7rXt/J+B+3J9zrQXWVtWV3fyFjELYY3Rhfgm4s6p+UFU/BT7L6Lj1GF2CFkvoXgXs27XGW87o5v/qKde0KHX3Zj4O3FpVH5rz1mpgfevO44GL5iw/rmshehjwYHeJ7wvALyfZpfuf9C93y7YoVXVqVe1RVXszOu6+XFXHApcBR3erbbw/1+/no7v1q1t+TNdydB9gX+AbjX7GolJV3wfuTrJft+iVwC14jC7UXcBhSbbv/v6v358eo0vRtFtyrX8xasH4LUYt6n572vUs1hfwUkaX5W4Aruter2F0z+ZLwLeBLwK7dusH+HC3X28EVs7Z1lsZNaZYA5ww7d827Rfwcja0Xn42o3+Q1gAXANt2y7fr5td07z97zud/u9vPtwOvnvbvmfK+PAi4ujtO/w+j1sceowvfn6cDtwE3AZ9k1ALZY3QJvuwGUpKkRhbL5WVJkmaeoStJUiOGriRJjRi6kiQ1YuhKktSIoSstQUnem2T7adchqR8fGZKWoK4HrZVVtW7atUgan2e60kCSHNeND3t9kk8m2TvJl7tlX0qyV7feJ5IcPedzf9f9+fIkX5kzLu2fdr02/SajPngvS3LZdH6dpIXYev5VJPWV5ADgfcDhVbUuya6Mhls7p6rOSfJW4Ew2DMe2OQcDBwDfAy4HXlJVZyY5CXiFZ7rS0uKZrjSMI4AL1odiVf0QeDHwZ937n2TUped8vlFVa6vqZ4y6/Nx78qVKasXQlabvMbq/i0m2ApbPee+ROdOP49UpaUkzdKVhfBl4Q5LdALrLy3/NaCQjgGOBr3XT3wEO6aZfC2wzxvYfAnacVLGS2vB/zdIAqurmJL8D/FWSx4FvAr8B/EmSk4EfACd0q38MuCjJ9cAljAZ9n89ZwCVJvldVr5j8L5A0BB8ZkiSpES8vS5LUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY38P3MU1Oc0v5hnAAAAAElFTkSuQmCC\n",
1195 | "text/plain": [
1196 | ""
1197 | ]
1198 | },
1199 | "metadata": {
1200 | "needs_background": "light"
1201 | },
1202 | "output_type": "display_data"
1203 | },
1204 | {
1205 | "data": {
1206 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAd0AAAEYCAYAAAAZGCxpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAVfElEQVR4nO3de9RldX3f8feHuXAZyQCaKhkYGQgNC4zKpYpiE8E0IkWGpVghFlBbiY1GjK2WLF0oriQrCeiqVquLCAoWJIpYkBosBoKJKQQGBOQWJ0B0kIgQBaqGm9/+cfbg42SY52zm7N95OPN+rXXWOfv+PXvtmc+zf3uf305VIUmShrfVtAuQJGlLYehKktSIoStJUiOGriRJjRi6kiQ1snjaBcx16KGH1iWXXDLtMiRJ7WXaBbSwoM5077333mmXIEnSYBZU6EqSNMsMXUmSGjF0JUlqxNCVJKkRQ1eSpEYMXUmSGslCesrQsmetqr2OPWXaZUiSxrTm1OMmtSp/pytJkibH0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgYP3SSHJrktydokJw29PUmSFqpBQzfJIuCjwCuAvYFjkuw95DYlSVqohj7TfQGwtqpur6qHgfOA1QNvU5KkBWno0F0BfHvO8LpunCRJW5zF0y4gyQnACQArli/hC9ufOuWKBLDy5BunXYIkzZyhz3TvAnadM7xLN+5xVXV6VR1QVQfstGzRwOVIkjQ9Q4fu1cCeSVYlWQocDVw08DYlSVqQBm1erqpHk7wV+DKwCDizqm4acpuSJC1Ug1/TraovAV8aejuSJC109kglSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0snnYBcy3deR9WnnzNtMuQJGkQnulKktSIoStJUiNjh26SNUnekmTHIQuSJGlW9TnTfS3wC8DVSc5L8vIkGaguSZJmztihW1Vrq+rdwL8EzgXOBP4+ySlJdhqqQEmSZkWva7pJngt8ADgV+DzwGuAB4LLJlyZJ0mwZ+ydDSdYAPwDOAE6qqoe6SVclOWiA2iRJmil9fqf7mqq6fe6IJKuq6o6qetWE65Ikaeb0aV4+f8xxkiRpI+Y9002yF7APsDzJ3DPanwO2GaowSZJmzTjNy78EHA7sALxyzvgHgTcNUJMkSTNp3tCtqguBC5O8qKr+b4OaJEmaSeM0L7+rqv4Y+I0kx2w4vareNkhlkiTNmHGal2/p3n38jyRJm2Gc5uUvdu9nrR+XZCvgaVX1wIC1SZI0U1JV482YnAu8GXgMuJrR3csfqqpTJ1XMsmetqr2OPWVSqxvLmlOPa7o9SdJGbRF9+ff5ne7e3ZntkcCfAauAY4coSpKkWdQndJckWcIodC+qqkeA8U6TJUlSr9D9OHAnsAz4apJnM3rYgSRJGsNYfS93N059t6pWzBn3LeDgoQqTJGnWjHWmW1U/Ad61wbiqqkcHqUqSpBnUp3n5K0n+S5Jdk+y0/jVYZZIkzZg+j/Z7bff+ljnjCth9cuVIkjS7xg7dqlo1ZCGSJM26sZuXk2yX5D1JTu+G90xy+HClSZI0W/pc0/0k8DDw4m74LuD3Jl6RJEkzqk/o7tE9begRgKr6EVtIt12SJE1Cn9B9OMm2dL1QJdkDeGiQqiRJmkF97l5+H3AJsGuSc4CDgNcPUJMkSTOpz93L/yf
JGuBARs3KJ1bVvYNVJknSjBk7dJN8ETiX0cMOfjhcSZIkzaY+13RPA/41cHOS85MclWSbgeqSJGnm9GlevgK4Iski4BDgTcCZjB5mL0mS5tHnRiq6u5dfyahLyP2As4YoSpKkWdTnmu5ngRcwuoP5I8AV3dOHJEnSGPqc6Z4BHFNVjw1VjCRJs2ze0E1ySFVdBiwDVic/2wlVVV0wUG2SJM2Ucc50fxW4jNG13A0VYOhKkjSGeUO3qt7bvb9h+HIkSZpdfW6k2hp4NbDb3OWq6v2TL0uSpNnT50aqC4H7gTX4oANJknrrE7q7VNWhg1UiSdKM69MN5F8n+eXBKpEkqYckOyT5rQbbOTLJ3pNYV5/QfQmwJsltSW5IcmOSGza1QJIzk9yT5BubV6YkSf/MDsDYoZuRPrm33pHAREK3T/PyK57E+j/FqPeqs5/EspIkbcofAnsk+TpwOfBcYEdgCfCeqrowyW7Al4GrgP2Bw5IcB/x74HvAt4E1VXVakj2AjwI/D/yI0TMGdgKOAH41yXuAV1fV3z3ZgvuEbvVdeVV9tfvCkiRN2knAc6rq+UkWA9tV1QNJngFcmeSibr49geOr6sok/4rRL3Gexyicr2V0gzDA6cCbq+qbSV4I/I+qOqRbz8VVdf7mFtwndP83o+ANsA2wCrgN2GdzCkhyAnACwIrlS/jC9qfOu8zKk2/cnE1KkmZPgD9I8ivAT4AVwDO7aX9fVVd2nw8CLqyqfwL+qXtWPEmeBrwY+Nycnhe3nnSRfR7t9zM3USXZjx5t6ZtY7+mM/rrguSu27X02LUkS8DpGzcL7V9UjSe5kdIII8MMxlt8K+EFVPX+Y8n66kSelqq4FXjjBWiRJ6uNBYPvu83Lgni5wDwae/QTLfA14ZZJturPbwwGq6gHgjiSvgcdvunreRrazWfr0SPWOOYNbMXqe7ncmUYQkSX1V1X1Jvtb9QuZqYK8kNwLXALc+wTJXd9dobwC+C9zIqOMnGJ0tf6y7YWoJcB5wfff+J0neBhzV6kaquSn/KKNrvJ/f1AJJPgO8FHhGknXAe6vqjL5FSpK0MVX1G2PM9pwNhk+rqvcl2Q74Kt2NVFV1B/DPOoGqqq/R+idDVXXKpqYn+e9V9dsbLHPMky1MkqSBnN51drENcFZ3ubSJPme68zloguuSJGkQY54dD+JJ30glSZL6MXQlSWpkkqGb+WeRJGnLNcnQ/dAE1yVJ0syZ90aqrousJ+wpqqqO6N4/NbmyJEmaniSHMjqZXAR8oqr+cIPpWzN6mM/+wH3Aa6vqzvnWO87dy6d1768CngX8z274GEY/LJYkaRD7v/PsiXYPvObU4+a9FJpkEaOnDf0bYB1wdZKLqurmObP9B+D7VfWLSY4G/gh47Xzrnjd0q+qKrogPVNUBcyZ9Mck18y0vSdJTzAuAtVV1O0CS84DVwNzQXQ28r/t8PvCRJKmqTf6R0Oea7rIku68fSLIKWNZjeUmSngpWMHrO7nrrunEbnaeqHmXUleTT51txn84xfgf4iyS3M7pT+dnAb/ZYXpKkLVqfbiAvSbInsFc36taqemiYsiRJmpq7gF3nDO/SjdvYPOuSLGb0lKP75lvx2M3LXcfQ7wTeWlXXAyuTHD7u8pIkPUVcDeyZZFWSpcDRwEUbzHMRcHz3+Sjgsvmu50K/a7qfBB4GXtQN3wX8Xo/lJUla8LprtG8FvgzcAny2qm5K8v4kR3SznQE8Pcla4B3ASeOsO2ME82jG5JqqOiDJdVW1bzfu+qp63nzLjuu5K7ati3/zF+edb+XJN05qk5KkhWGL6NWwz5nuw0m2pesoI8kegNd0JUkaU5+7l98LXALsmuQcRo/ye/0QRUmSNIv63L18aZJrgQMZNQOcWFX3DlaZJEkzZpy+l/fbYNTd3fvKJCur6trJlyVJ0uwZ50z3A5uYVsAhE6pFkqSZNk7fywe3KESSpFk3TvPyqzY1vaoumFw5kiRNV5IzgcOBe6r
qORuZHkaP/TsM+BHw+nEvtY7TvPzKTUwrwNCVJA3iW+//5Yk+2m/lyTeO83vgTwEfYfS83I15BbBn93oh8LHufV7jNC+/YZwVSZI0C6rqq0l228Qsq4Gzu24fr0yyQ5Kdq+ruTSwD9Ot7eXmSDya5pnt9IMnycZeXJGlGjPPov43q0yPVmcCDwL/rXg8w6o9ZkiSNoU+PVHtU1avnDJ+S5OuTLGbpzvuw8uRrJrlKSZImbZxH/21UnzPdHyd5yfqBJAcBP+6xvCRJs+Ai4LiMHAjcP871XOh3pvtm4Ow513G/z0+fJShJ0kxI8hngpcAzkqxj9OyBJQBV9XHgS4x+LrSW0U+Gxr7heKzQTbIIOLaqnpfk57oNP9DjO0iS1NuYP/GZqKo6Zp7pBbzlyax7rNCtqsfWNy0btpIkPTl9mpevS3IR8Dngh+tH2iOVJEnj6RO62wD38bMPOLBHKkmSxtTnebr2TCVJ0mYYO3ST/DzwJmC3uctV1RsnX5YkSbOnT/PyhcBfAl8BHhumHEmSZlef0N2uqv7rYJVIkjTj+vRIdXGSwwarRJKkGdcndE8Evpjkx0keSPJgEn+zK0nSmPo0Ly8HXgesqqr3J1kJ7DxMWZIkzZ4+Z7ofBQ4E1neP9SDwkYlXJEnSjOpzpvvCqtovyXUAVfX9JEsHqkuSpJnTJ3Qf6R58UPD473Z/Mslibll3H/u/8+xJrnKmrTn1uGmXIEnqoU/z8oeBLwD/IsnvA38F/MEgVUmSNIP6dAN5TpI1wMuAAEdW1S2DVSZJ0ozp07xMVd0K3DpQLZIkzbQ+zcuSJGkzGLqSJDVi6EqS1IihK0lSI4auJEmNGLqSJDVi6EqS1IihK0lSI4auJEmNGLqSJDVi6EqS1IihK0lSI4auJEmNGLqSJDVi6EqS1IihK0lSI4auJEmNGLqSJDVi6EqS1IihK0lSI4auJEmNGLqSJDVi6EqS1IihK0lSI4OGbpJdk1ye5OYkNyU5ccjtSZK0kC0eeP2PAv+5qq5Nsj2wJsmlVXXzwNuVJGnBGfRMt6rurqpru88PArcAK4bcpiRJC1Wza7pJdgP2Ba5qtU1JkhaSoZuXAUjyNODzwNur6oENpp0AnACwYvkSvrD9qS1KGsTKk2+cdgmSpAVs8DPdJEsYBe45VXXBhtOr6vSqOqCqDthp2aKhy5EkaWqGvns5wBnALVX1wSG3JUnSQjf0me5BwLHAIUm+3r0OG3ibkiQtSINe062qvwIy5DYkSXqqsEcqSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWpk8bQLmGvpzvuw8uRrpl2GJEmD8ExXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJaiRVNe0aHpfkQeC2adcxA54B3DvtImaA+3Ey3I+bb0vYh/dW1aHTLmJoC6pzDOC2qjpg2kU81SW5xv24+dyPk+F+3Hzuw9lh87IkSY0YupIkNbLQQvf0aRcwI9yPk+F+nAz34+ZzH86IBXUjlSRJs2yhnelKkjSzDF1JkhpZMKGb5NAktyVZm+SkadezkCTZNcnlSW5OclOSE7vxOyW5NMk3u/cdu/FJ8uFuX96QZL856zq+m/+bSY6f1neapiSLklyX5OJueFWSq7r99adJlnbjt+6G13bTd5uzjt/txt+W5OVT+ipTk2SHJOcnuTXJLUle5PHYT5Lf6f49fyPJZ5Js47G4Baiqqb+ARcDfAbsDS4Hrgb2nXddCeQE7A/t1n7cH/hbYG/hj4KRu/EnAH3WfDwP+DAhwIHBVN34n4Pbufcfu847T/n5T2J/vAM4FLu6GPwsc3X3+OPCfus+/BXy8+3w08Kf
d5727Y3RrYFV37C6a9vdqvA/PAv5j93kpsIPHY6/9twK4A9h2zjH4eo/F2X8tlDPdFwBrq+r2qnoYOA9YPeWaFoyquruqru0+Pwjcwugf7WpG//nRvR/ZfV4NnF0jVwI7JNkZeDlwaVX9Y1V9H7gUmPkeYOZKsgvwb4FPdMMBDgHO72bZcD+u37/nAy/r5l8NnFdVD1XVHcBaRsfwFiHJcuBXgDMAqurhqvoBHo99LQa2TbIY2A64G4/FmbdQQncF8O05w+u6cdpA16y0L3AV8Myqurub9A/AM7vPT7Q/3c/w34B3AT/php8O/KCqHu2G5+6Tx/dXN/3+bv4tfT+uAr4HfLJrpv9EkmV4PI6tqu4CTgO+xShs7wfW4LE48xZK6GoMSZ4GfB54e1U9MHdaVRXg7782IcnhwD1VtWbatTzFLQb2Az5WVfsCP2TUnPw4j8dN6653r2b0B8wvAMvYss7yt1gLJXTvAnadM7xLN06dJEsYBe45VXVBN/q7XTMd3fs93fgn2p9b+n4+CDgiyZ2MLmEcAnyIUXPn+n7I5+6Tx/dXN305cB/ux3XAuqq6qhs+n1EIezyO79eAO6rqe1X1CHABo+PTY3HGLZTQvRrYs7tzbymjGwUumnJNC0Z37eYM4Jaq+uCcSRcB6+/4PB64cM7447q7Rg8E7u+a/b4M/HqSHbu/tH+9G7dFqKrfrapdqmo3RsfYZVX1OuBy4Khutg334/r9e1Q3f3Xjj+7uKF0F7An8TaOvMXVV9Q/At5P8UjfqZcDNeDz28S3gwCTbdf++1+9Dj8VZN+07uda/GN3h+LeM7r5797TrWUgv4CWMmupuAL7evQ5jdE3nz4FvAl8BdurmD/DRbl/eCBwwZ11vZHSzxVrgDdP+blPcpy/lp3cv787oP6q1wOeArbvx23TDa7vpu89Z/t3d/r0NeMW0v88U9t/zgWu6Y/J/Mbr72OOx3z48BbgV+AbwaUZ3IHsszvjLbiAlSWpkoTQvS5I08wxdSZIaMXQlSWrE0JUkqRFDV5KkRgxd6SkoyduTbDftOiT140+GpKegrletA6rq3mnXIml8nulKA0lyXPf82OuTfDrJbkku68b9eZKV3XyfSnLUnOX+X/f+0iR/Mee5ted0vTq9jVF/vZcnuXw6307Sk7F4/lkk9ZVkH+A9wIur6t4kOzF6NNtZVXVWkjcCH+anj257IvsC+wDfAb4GHFRVH07yDuBgz3SlpxbPdKVhHAJ8bn0oVtU/Ai8Czu2mf5pR957z+ZuqWldVP2HU/eduky9VUiuGrjR9j9L9W0yyFbB0zrSH5nx+DFunpKc0Q1caxmXAa5I8HaBrXv5rRk83Angd8Jfd5zuB/bvPRwBLxlj/g8D2kypWUhv+1SwNoKpuSvL7wBVJHgOuA34b+GSSdwLfA97Qzf4nwIVJrgcuYfRQ+PmcDlyS5DtVdfDkv4GkIfiTIUmSGrF5WZKkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrk/wMYpOnoPJkZCQAAAABJRU5ErkJggg==\n",
1207 | "text/plain": [
1208 | ""
1209 | ]
1210 | },
1211 | "metadata": {
1212 | "needs_background": "light"
1213 | },
1214 | "output_type": "display_data"
1215 | },
1216 | {
1217 | "data": {
1218 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAd0AAAEYCAYAAAAZGCxpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAWiklEQVR4nO3de7BlZX3m8e9D09xbGpAo04hAJFBgkEsLKhYqlopG0UpIBUaFQjM98RK0nCKFE6YzOhNL01MpxVEQr+gYjUoQQqnECEZFuXSD3CF0lBmaoAgGRS2NwG/+2Kvh0HZz9m7Oevelv5+qU2fd9lq/F3bXc9Za71pvqgpJktS/rcZdgCRJWwpDV5KkRgxdSZIaMXQlSWrE0JUkqZGtx13AXMcee2x95StfGXcZkqT2Mu4CWpioM9177rln3CVIktSbiQpdSZJmmaErSVIjhq4kSY1kkl4DueOT96kDXvuOzf78mlUnLWA1kqSG7EglSZIWjqErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY30HrpJjk1ya5K1SU7v+3iSJE2qXkM3ySLgA8BLgQOBE5Mc2OcxJUmaVH2f6R4BrK2q71XVvwOfBV7Z8zElSZpIW/e8/2XAHXPm1wFHzt0gyQpgBcCynRdz/pJVAOy18vqeS5Mkqa2xd6SqqnOqanlVLd91x0XjLkeSpN70Hbp3Ak+ZM79nt0ySpC1O36F7FbBfkn2SbAOcAFzY8zElSZpIvd7TraoHkrwZuBhYBHysqm7s85iSJE2qvjtSUVVfAr7U93EkSZp0Y+9IJUnSlsLQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJaqT3UYZGsc0eB7HXytXjLkOSpF54pitJUiOGriRJjRi6kiQ1YuhKktSIoStJUiOGriRJjRi6kiQ1YuhKktRIqmrcNTxsxyfvUwe89h1Dbbtm1Uk9VyNJaijjLqAFz3QlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRnoN3SQfS3J3khv6PI4kSdOg7zPdTwDH9nwMSZKmQq+hW1XfAH7c5zEkSZoWW4+7gCQrgBUAy3ZezPlLVrHXyuvHXJUkSQtv7B2pquqcqlpeVct33XHRuMuRJKk3Yw9dSZK2FIauJEmN9P3I0GeA7wD7J1mX5PV9Hk+SpEnWa0eqqjqxz/1LkjRNvLwsSVIjhq4kSY0YupIkNWLoSpLUiKErSVIj8/ZeTvJ+oDa1vqpOXdCKJEmaUcM8MrS69yokSdoCzBu6VXXu3PkkO1TVL/orSZKk2TT0Pd0kz05yE3BLN/+MJB/srTJJkmbMKB2p3gu8BLgXoKquBY7uoSZJkmbSSL2Xq+qODRY9uIC1SJI000Z59/IdSZ4DVJLFwFuAm/spS5Kk2TPKme6fAG8ClgF3Aod085IkaQip2uQjuI/eMNm9qn7UZzHLly+v1at9QkmStkAZdwEtjHKme1mSf0jy+iRL+ypIkqRZNXToVtXvAGcABwFXJ7koyWt6q0ySpBkzau/lK6vqbcARwI+Bc+f5iCRJ6ozycownJDk5yZeBbwN3MQhfSZI0hFEeGboW+CLwzqr6Tj/lSJI0u0YJ3X2rqpLs0Fs1kiTNsFHu6T7Ldy9LkrT5fPeyJEmNjHJ5maq6I3nU88sL+u7lm9fdy+GnfXIhdzn11qw6adwlSJIWiO9eliSpEd+9LElSI0Of6VbVPcCre6xFkqSZNm/oJnk/sMlREarq1AWtSJKkGTXMma7D/kiStADmDd2qGur9ykneX1V/+vhLkiRpNo004ME8jlrAfUm
SNHMWMnQlSdJjMHQlSWpkIUM3828iSdKWayFD930LuC9JkmbO0C/HSPI7wGnAU+d+rqqO6X5/YqGLkyRplozy7uXPA2cDH2aBBzqQJGlLMEroPlBVZ/VWiSRJM26Ue7p/n+SNSfZIsuv6n94qkyTpMSRZmuSNDY7zqiQHLsS+Rgndkxnc0/02sKb7ecxXRCZ5SpJLk9yU5MYkb9n8UiVJepSlwNChm4HN6UD8KmBBQneUUYb22Yz9PwD8l6q6OskSYE2Sr1bVTZuxL0mS5no38NtJvgtcChwM7AIsBs6oqguS7A1cDFwBHA68LMlJwGuAHwF3AGuq6n8l+W3gA8DuwC+A/wTsChwHPC/JGcAfVNW/bG7Bo/ReXgy8ATi6W/R14ENV9etNfaaq7gLu6qbvT3Izg/F4DV1J0uN1OvD0qjokydbADlX10yRPBC5PcmG33X7AyVV1eZJnAn8APINBOF/N4MotwDnAn1TVbUmOBD5YVcd0+7moqr7weAsepSPVWV2BH+zmX9st++NhPtz9tXEog7825i5fAawAWLbzYs5fsmqEkn7TXiuvf1yflyRNpQDvSnI08BCDE7wndev+b1Vd3k0fBVxQVb8Efpnk7wGS7AQ8B/h88vC7nrZd6CJHCd1nVtUz5sxfkuTaYT7YNeY84K1V9dO566rqHAZ/XXDwsu03OW6vJEmP4dUMLgsfXlW/TnI7sF237udDfH4r4L6qOqSf8h45yLAe7K53A5BkX4Z4Xre7LH0e8Omq+rvRS5QkaaPuB5Z00zsDd3eB+wIGL3LamMuAVyTZrjshfDlAd0L4/SR/CA93ulp/ojn3OI/LKGe6pwGXJvkeg9P4pwKnPNYHMjhH/yhwc1X99WZXKUnSBqrq3iSXJbkBuAo4IMn1DJ6suWUTn7mqu0d7HfBD4HrgJ93qVwNndR2mFgOfBa7tfn84yanA8Y+nI1Wqhr+im2RbYP9u9taq+tU82z8X+CaDRj3ULf6vVfWljW1/8LLt66L//LSh69kY7+lK0lRqNmhOkp2q6mdJdgC+AayoqqtbHHveM90kx1TVJUl+f4NVT0vCY10yrqpv4ehDkqTJck73sovtgHNbBS4Md3n5ecAlwCs2sq4A79NKkqZGVf3HcR173tCtqr/oJt9ZVd+fuy7J5rwwQ5KkLdIovZfP28iyx/2gsCRJW4ph7ukeABwE7LzBfd0n8MgzUJIkaR7D3NPdn8FzTEt59H3d+xm8l1KSJA1hmHu6FwAXJHl2VX2nQU2SJI1VkmOB9wGLgI9U1bs3WL8t8EkGgyjcC/xRVd0+335HeTnGNUnexOBS88OXlavqdSPsQ5KkoR1+2icX9PXAa1adNO9jrEkWMRht6EXAOuCqJBduMELe64F/q6qnJTkBeA/wR/Pte5SOVJ8Cngy8BPgnYE8Gl5glSZolRwBrq+p7VfXvDN5I9coNtnklcG43/QXghZkzUsKmjBK6T6uq/wb8vKrOBX4POHKEz0uSNA2WMRhnd7113bKNblNVDzB4leRu8+14lNBdP27ufUmezuDl0r81wuclSdqijRK65yTZBTgDuJDBQPR/1UtVkiSNz53AU+bM79kt2+g2SbZmcCJ673w7HrojVVV9pJv8BrDvsJ+TJGnKXAXs17118U7gBGDDV0deCJwMfAc4HrikhhhBaOgz3STvSrJ0zvwuSf7nsJ+XJGkadPdo3wxcDNwMfK6qbkzyziTHdZt9FNgtyVrgbcDpw+x76KH9klxTVYdusOzqqjpsyHbMa/ny5bV69eqF2p0kaXpsESPSjXJPd1H3MDAASbYHtn2M7SVJ0hyjvBzj08DXkny8mz+FR55RkiRJ8xilI9V7klwHvLBb9D+q6uJ+ypIkafaMcqZLVX0Z+HJPtUiSNNOGDt0k9wPre11tAyxm8HaqJ/RRmCRJs2aUy8tL1k9375d8JfCsPoqSJGkWjdJ7+WE18EUGgx9IkjQzknwsyd1JbtjE+iQ5M8naJNc
lGfrR2VEuL//+nNmtgOXAL4f9vCRJo/p/7/zdBR3ab6+V1w/zPPAngP/NYLzcjXkpsF/3cyRwFkMOADRKR6pXzJl+ALid3xzq6HG5ed29HH7aptooDWfNqpPGXYKkKVZV30iy92Ns8krgk91rHy9PsjTJHlV113z7HuWe7inDbitJ0gzb1NB/jz90k7yfR3ot/4aqOnWIAiVJ2uIN05FqNbAG2A44DLit+zmEwaNDkiRtSYYZ+m+j5j3TrapzAZK8AXhuN/oCSc4GvjlyqZIkTbcLgTcn+SyDDlQ/GeZ+LozWkWoX4AnAj7v5nbplkiTNjCSfAZ4PPDHJOuAvGLwQiqo6G/gS8DJgLfALBmMRDGWU0H03cHWSrzMYgulo4L+P8HlJkkYy5CM+C6qqTpxnfQFv2px9j/JyjE8AK4GDgfOA5zEY3FeSJA1hlDPdDwIPAdtX1YVJdmEQvs/spTJJkmbMKKF7ZFUdluQagKr6tyT2XpYkaUijXF7+dZJFdM/sJtmdwZmvJEkawiiheyZwPvBbSf4S+Bbwrl6qkiRpBo3yGshPJ1kDvJBB7+VXVZUdqSRJGtIo93SpqluAW3qqRZKkmbZZ4+lKkqTRGbqSJDXSa+gm2S7JlUmuTXJjknf0eTxJkibZSPd0N8OvgGOq6mdJFgPfSvLlqrq85+NKkjRxeg3d7v2UP+tmF3c/mxybV5KkWdb3mS7dCzXWAE8DPlBVV2ywfgWwAmDZzos5f8mqvkvabHutvH7cJUiSpljvHamq6sGqOoTBIL9HJHn6BuvPqarlVbV81x0X9V2OJElj06z3clXdB1wKHNvqmJIkTZK+ey/vnmRpN7098CJ8uYYkaQvV9z3dPYBzu/u6WwGfq6qLej6mJEkTqe/ey9cBh/Z5DEmSpoVvpJIkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqZG+h/YbyTZ7HMReK1ePuwxJknrhma4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjqapx1/CwHZ+8Tx3w2neMuwxJ0pDWrDppoXaVhdrRJPNMV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWqkSegmWZTkmiQXtTieJEmTqNWZ7luAmxsdS5KkidR76CbZE/g94CN9H0uSpEm2dYNjvBf4M2DJxlYmWQGsAFi282LOX7KqQUmaJXutvH7cJUjSUHo9003ycuDuqlqzqW2q6pyqWl5Vy3fdcVGf5UiSNFZ9X14+Cjguye3AZ4Fjkvyfno8pSdJE6jV0q+rtVbVnVe0NnABcUlWv6fOYkiRNKp/TlSSpkRYdqQCoqq8DX291PEmSJo1nupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNdJslKFhbLPHQey1cvW4y5AkqRee6UqS1IihK0lSI4auJEmNGLqSJDWSqhp3DQ9Lcj9w67jrWGBPBO4ZdxELbNbaNGvtAds0DWatPfD42nRPVR27kMVMoonqvQzcWlXLx13EQkqy2jZNtllrD9imaTBr7YHZbNNC8/KyJEmNGLqSJDUyaaF7zrgL6IFtmnyz1h6wTdNg1toDs9mmBTVRHakkSZplk3amK0nSzDJ0JUlqZGJCN8mxSW5NsjbJ6eOu57Ek+ViSu5PcMGfZrkm+muS27vcu3fIkObNr13VJDpvzmZO77W9LcvI42tLV8ZQklya5KcmNSd4yA23aLsmVSa7t2vSObvk+Sa7oav/bJNt0y7ft5td26/ees6+3d8tvTfKSMTVpfS2LklyT5KJuftrbc3uS65N8N8nqbtnUfu+6WpYm+UKSW5L
cnOTZ09qmJPt3/2/W//w0yVuntT0ToarG/gMsAv4F2BfYBrgWOHDcdT1GvUcDhwE3zFn2V8Dp3fTpwHu66ZcBXwYCPAu4olu+K/C97vcu3fQuY2rPHsBh3fQS4J+BA6e8TQF26qYXA1d0tX4OOKFbfjbwhm76jcDZ3fQJwN920wd238dtgX267+miMX733gb8DXBRNz/t7bkdeOIGy6b2e9fVcy7wx930NsDSaW9TV9Mi4AfAU2ehPWP77zjuArr/Ic8GLp4z/3bg7eOua56a9+bRoXsrsEc3vQeDF30AfAg4ccPtgBOBD81Z/qjtxty2C4AXzUqbgB2Aq4EjGbwtZ+sNv3fAxcCzu+mtu+2y4Xdx7nZjaMeewNeAY4CLuvqmtj3d8W/nN0N3ar93wM7A9+k6qc5Cm+bU8GLgsllpz7h+JuXy8jLgjjnz67pl0+RJVXVXN/0D4End9KbaNpFt7i5DHsrgzHCq29Rdiv0ucDfwVQZndfdV1QPdJnPre7j2bv1PgN2YrDa9F/gz4KFufjemuz0ABfxDkjVJVnTLpvl7tw/wI+Dj3W2AjyTZkelu03onAJ/ppmehPWMxKaE7U2rwp9zUPYuVZCfgPOCtVfXTueumsU1V9WBVHcLgDPEI4IDxVrT5krwcuLuq1oy7lgX23Ko6DHgp8KYkR89dOYXfu60Z3Ho6q6oOBX7O4PLrw6awTXR9BY4DPr/humlszzhNSujeCTxlzvye3bJp8sMkewB0v+/ulm+qbRPV5iSLGQTup6vq77rFU92m9arqPuBSBpdflyZZ/87xufU9XHu3fmfgXianTUcBxyW5Hfgsg0vM72N62wNAVd3Z/b4bOJ/BH0fT/L1bB6yrqiu6+S8wCOFpbhMM/ii6uqp+2M1Pe3vGZlJC9ypgv64n5jYMLmNcOOaaRnUhsL5H3skM7ouuX35S16vvWcBPussyFwMvTrJL1/Pvxd2y5pIE+Chwc1X99ZxV09ym3ZMs7aa3Z3CP+mYG4Xt8t9mGbVrf1uOBS7q/4C8ETuh6A+8D7Adc2aQRc1TV26tqz6ram8G/j0uq6tVMaXsAkuyYZMn6aQbflxuY4u9dVf0AuCPJ/t2iFwI3McVt6pzII5eWYfrbMz7jvqm8/odBr7d/ZnDf7c/HXc88tX4GuAv4NYO/bF/P4H7Z14DbgH8Edu22DfCBrl3XA8vn7Od1wNru55Qxtue5DC4PXQd8t/t52ZS36WDgmq5NNwAru+X7MgiZtQwulW3bLd+um1/brd93zr7+vGvrrcBLJ+D793we6b08te3par+2+7lx/b/7af7edbUcAqzuvntfZNBbd2rbBOzI4CrJznOWTW17xv3jayAlSWpkUi4vS5I08wxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdaQp1I73sMO46JI3GR4akKdS9mWp5Vd0z7lokDc8zXaknSU7qxhS9Nsmnkuyd5JJu2deS7NVt94kkx8/53M+6389P8vU8Mjbrp7s3/ZwK/Afg0iSXjqd1kjbH1vNvImlUSQ4CzgCeU1X3JNmVwTir51bVuUleB5wJvGqeXR0KHAT8K3AZcFRVnZnkbcALPNOVpotnulI/jgE+vz4Uq+rHDAZc+Jtu/acYvH5zPldW1bqqeojB6zn3XvhSJbVi6Erj9wDdv8UkWwHbzFn3qznTD+LVKWmqGbpSPy4B/jDJbgDd5eVvMxghCODVwDe76duBw7vp44DFQ+z/fmDJQhUrqQ3/apZ6UFU3JvlL4J+SPMhgxKM/BT6e5DTgR8Ap3eYfBi5Ici3wFQYDn8/nHOArSf61ql6w8C2Q1AcfGZIkqREvL0uS1IihK0lSI4auJEmNGLqSJDVi6EqS1IihK0lSI4auJEmN/H/knqrmbsTOwAAAAABJRU5ErkJggg==\n",
1219 | "text/plain": [
1220 | ""
1221 | ]
1222 | },
1223 | "metadata": {
1224 | "needs_background": "light"
1225 | },
1226 | "output_type": "display_data"
1227 | },
1228 | {
1229 | "data": {
1230 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAd0AAAEYCAYAAAAZGCxpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAYoUlEQVR4nO3de7SddX3n8ffHXLiENIhSyQQi0bLIgINcUgR1uUSnFi0Drpa2MEoU7cqadqowjs7AqFG6lu104lh1jVWjooiIVoQRqYNaRG1xQBIEwkVKRJTgBfEGOlW5fOeP5znxEJJz9g7nefY+J+/XWnvt5/58N3uHz/k9l9+TqkKSJHXvcaMuQJKkXYWhK0lSTwxdSZJ6YuhKktQTQ1eSpJ7MH3UBkx1//PF1+eWXj7oMSVL/MuoC+jBWLd1777131CVIktSZsQpdSZLmMkNXkqSeZJx6pFq034paedo5oy5j1tm4bvWoS5Ckx8pzupIkaeYYupIk9cTQlSSpJ4auJEk9MXQlSeqJoStJUk8MXUmSemLoSpLUE0NXkqSedB66SY5PcluSzUnO6np/kiSNq05DN8k84F3AC4FDgFOTHNLlPiVJGlddt3SPBjZX1R1V9SvgY8BJHe9TkqSx1PVD7JcBd00a3wI8Y/ICSdYAawCWLVnAJYvXzdjOl6/dNGPbkiTpsRr5hVRVtb6qVlXVqn0WzRt1OZIkdabr0L0bOGDS+P7tNEmSdjldh+61wEFJViRZCJwCXNrxPiVJGkudntOtqgeT/DnwWWAecG5V3dzlPiVJGlddX0hFVX0G+EzX+5EkadyN/EIqSZJ2FYauJEk9MXQlSeqJoStJUk8MXUmSemLoSpLUE0NXkqSeGLqSJPWk884xhrFw6aEsX7th1GVIktQJW7qSJPXE0JUkqSeGriRJPTF0JUnqiaErSVJPUlWjrmGrRfutqJWnnfOYt7Nx3eoZqEaS1KOMuoA+2NKVJKknhq4kST0xdCVJ6omhK0lSTwxdSZJ6YuhKktQTQ1eSpJ4YupIk9cTQlSSpJ4auJEk96TR0k5yb5J4kN3W5H0mSZoOuW7ofAo7veB+SJM0KnYZuVX0Z+FGX+5AkabaYP+oCkqwB1gAsW7KASxave8T85Ws3jaIsSZJm3MgvpKqq9VW1qqpW7bNo3qjLkSSpMyMPXUmSdhWGriRJPen6lqELgf8LHJxkS5JXdrk/SZLGWacXUlXVqV1uX5Kk2cTDy5Ik9cTQlSSpJ4auJEk9MXQlSeqJoStJUk8MXUmSemLoSpLUk5E/8GCyhUsPZfnaDaMuQ5KkTtjSlSSpJ4auJEk9MXQlSeqJoStJUk8MXUmSemLoSpLUk1TVqGvYatF+K2rlaedsd97Gdat7rkaS1KOMuoA+2NKVJKknhq4kST0xdCVJ6omhK0lSTwxdSZJ6YuhKktQTQ1eSpJ4YupIk9WTg0E3jpUnWtuPLkxzdXWmSJM0tw7R0/xY4Fji1Hb8feNdUKyQ5IMmVSW5JcnOSM3ayTkmSZr35Qyz7jKo6MsnXAKrqx0kWTrPOg8B/rqrrkiwGNib5fFXdsrMFS5I0Ww3T0n0gyTygAJLsCzw81QpV9d2quq4dvh+4FVi2k7VKkjSrDdPSfSdwCfCbSd4CnAy8YdCVkxwIHAFcs830NcAagGVLFnDJ4nUsX7tpiLIkSZodBg7dqrogyUbg+TRPg3hxVd06yLpJ9gI+CZxZVfdts931wHqAw5btMT6PPJIkaYYN09IFuB24b2K9JMur6ttTrZBkAU3gXlBVF+9UlZIkzQEDh26SVwFvAr4PPETT2i3gsCnWCfAB4NaqettjK1WSpNltmJbuGcDBVfXDIdZ5FnAasCnJ9e20/1ZVnxliG5IkzQnDhO5dwE+H2XhV/RNNi1iSpF3eMKF7B/DFJH8P/HJiooeNJUkazDCh++32tbB9SZKkIQxzy9A5XRYiSdJcN23oJnl7VZ2
Z5NO0vVFNVlUndlKZJElzzCAt3fPb97d2WYgkSXPdtKFbVRvb9y91X44kSXPXIIeXN7Gdw8q0nWNU1Q47x5AkSb82yOHlEzqvQpKkXcAgh5e/NTGcZD/gaJqW77VV9b2ZLGbh0kNZvnbDTG5SkqSxMfDzdJP8CfBV4PdpHut3dZJXdFWYJElzzTCdY7wOOGKi7+UkTwC+ApzbRWGSJM01A7d0gR8C908av7+dJkmSBjBMS3czcE2ST9Gc0z0JuDHJa8A+mCVJms4wofuN9jXhU+374pkrR5Kkucu+lyVJ6kmqttfvxaQFeux7edF+K2rlaY/O9o3rVs/ULiRJ42mXePa6fS9LktSTgfteBjYA/1JVDwMkmQfs1mFtkiTNKcPcMnQFsOek8T2Af5jZciRJmruGCd3dq+pnEyPt8J5TLC9JkiYZJnR/nuTIiZEkRwH/MvMlSZI0Nw1zn+6ZwCeSfIfmKrP9gD/uoihJkuaiYe7TvTbJSuDgdtJtVfVAN2VJkjT3DPOUoT+kOa97E/Bi4OOTDzdLkqSpDXNO941VdX+SZwPPBz4AvLubsiRJmlqSvZP8WQ/7eXGSQ2ZiW8OE7kPt++8B76uqvwcWTrVCkt2TfDXJDUluTmJXkpKkmbI3MHDopjFM7k14MdB76N6d5L00F099JsluA6z/S+B5VfV04HDg+CTH7FSlkiQ90n8Hnprk+iR/k+SKJNcl2ZTkJIAkBya5LcmHgZuAA5K8sZ32T0kuTPLadtmnJrk8ycYk/5hkZZJnAicC69r9PPWxFDzM1ct/BBwPvLWqfpJkKc2D7Xeomo6dJ+7tXdC+pu7sWZKkwZwFPK2qDk8yH9izqu5L8kTg6iSXtssdBLysqq5O8tvAHwBPp8mk64CJnhfXA/+hqm5P8gzgb6vqee12Lquqix5rwdOGbpLfqKr7gN2BL7bT9qFpxW4YYP15NB/ot4B3VdU128xfA6wBWLZkAZcsXsfytZuG/BiSpF1cgL9M8hzgYWAZ8KR23req6up2+FnAp6rqF8Av2of5kGQv4Jk0t8ZObHPGuzoepKX7UeAEmuAsHvkkiAKeMtXKVfUQcHiSvYFLkjytvQJ6Yv56mr8uOGzZHraCJUk74yXAvsBRVfVAkjtpGosAPx9g/ccBP6mqw7sp79c7mVJVndC+r6iqp7TvE68pA3eb7fwEuJLmELUkSY/V/cDidngJcE8buMcBT97BOlcB/6690HcvmkYl7RHdb7a3x05cdPX07eznMRnk8PKU9+JW1XVTrLsv8EB7DngP4HeAvx66SkmStlFVP0xyVZKbgGuBlUk20Zz6/PoO1rm2PUd7I/B9YBPw03b2S4B3J3kDzfnejwE3tO/vS/Jq4OSq+sbO1jzI4eX/2b7vDqxqCwhwGM0HO3aKdZcC57XndR8H/F1VXbazxUqSNFlV/fsBFnvaNuNvrao3J9kT+DLthVRV9U22czS2qq5ihm4ZGuR5uscBJLkYOLKqNrXjTwPePM26NwJHPPYyJUmaMevbzi52B86b6ojtTBvmlqGDJwIXoKpuSvKvO6hJkqTODNg67sQwoXtjkvcDH2nHX0JzTFySJA1gmNA9HfhT4Ix2/MvY97IkSQMb5tF+vwD+pn09SpJPVtUfzFRhkiTNNTvT8fOODHzPriRJu6KZDF17k5IkzQlJjm8firA5yVnbmb9bko+3869JcuAg2x3mnK4kSb066nUfntEG3cZ1qzPdMm3fEu+i6dBpC3Btkkur6pZJi70S+HFV/VaSU2g6fvrj6bY9ky3daT+IJEmzwNHA5qq6o6p+RdMj1UnbLHMScF47fBHw/Ex6UsKODBS6SeYluWCaxf7rINuaysKlh/qEIUnSqC0D7po0vqWdtt1lqupBmq4knzDdhgcK3fZJQU9OsnCKZT43yLYkSdpVDXNO9w7gqraj6K2PSaqqt814VZIkjc7dwAGTxvdvp21vmS1J5tM85eiH0214mHO63wAua9dZPOk
lSdJcci1wUJIV7RHeU4BLt1nmUuBl7fDJwBeqatqLvobpHOMcgPb5g1TVzwZdV5Kk2aKqHkzy58BngXnAuVV1c5K/ADZU1aXAB4Dzk2wGfkQTzNPKAMHcLNg8Veh8YJ920r3A6qq6eahPM4VVq1bVhg0bZmpzkqTZY5e4A2aY0P0K8PqqurIdfy7wl1X1zJkqZtF+K2rlaefM1OYkSR3buG71TG1qlwjdYc7pLpoIXICq+iKwaMYrkiRpjhrq6uUkb6Q5xAzwUpormiVJ0gCGaem+AtgXuLh97dtOkyRJAxjm6uUfA6/usBZJkua0aUM3ydur6swkn+bRTxIqmkul31tVV3dRoCRJc8UgLd2Jc7hv3cH8JwLnAofMSEWSJI1QknOBE4B7qupp25kf4B3Ai4D/B7y8qq4bZNvThm5VbWzfvzRFgb8aZGeSJA3j23/xb2b00X7L124a5NakDwH/C/jwDua/EDiofT0DeHf7Pq2BL6RKclCSi5LckuSOiRdAVX160O1IkjTOqurLNKdOd+Qk4MPVuBrYO8nSQbY9zNXLH6RJ8weB42j+AvjIEOtLkjQXDPLov+0aJnT3qKoraHqx+lZVvRn4vSHWlyRplzZM5xi/TPI44Pa2I+i7gb0GWTHJPGADcHdVnTB8mZIkjY1BHv23XcO0dM8A9qS5V/comh6pBu108wzg1iH2JUnSuLoUWJ3GMcBPq+q7g6w4TEu3aG4fejKwoJ32PuCwqVZKsj/NYei3AK8ZYn+SJPUuyYXAc4EnJtkCvIk296rqPcBnaG4X2kxzy9DpA297iKcM3Qa8DtgEPDwxvaq+Nc16FwF/RfPA+9due3g5yRpgDcCyJQuO+sprDh60ds1Ry9duGnUJkvq3SzxlaJiW7g/aB/cOLMnEzcUb20cBPkpVrQfWAxy2bI8ZvR9LkqRxMkzovinJ+4ErgF9OTKyqi6dY51nAiUleBOwO/EaSj1TVS3eqWkmSZrFhQvd0YCXNce2Jw8tF88Sh7aqqs4GzYetD719r4EqSdlXDhO5vV5UnXCVJ2knD3DL0lSQ7/VCDqvqi9+hKknZlw7R0jwGuT/JNmnO6AaqqprxlSJIkNYYJ3eM7q0KSpF3AwKE73f24kiRpasOc05UkSY+BoStJUk8MXUmSemLoSpLUk2GuXu7cwqWHsnzthlGXIUlSJ2zpSpLUE0NXkqSeGLqSJPXE0JUkqSeGriRJPTF0JUnqSapq1DVstWi/FbXytHOmXW7jutU9VCNJ6lFGXUAfbOlKktQTQ1eSpJ4YupIk9cTQlSSpJ4auJEk9MXQlSeqJoStJUk8MXUmSemLoSpLUk84fYp/kTuB+4CHgwapa1fU+JUkaR52Hbuu4qrq3p31JkjSWPLwsSVJP+mjpFvC5JAW8t6rWT56ZZA2wBmDZkgVcsnjd1nnL127qoTxJkvrRR+g+u6ruTvKbwOeTfL2qvjwxsw3h9QCHLdtjfB55JEnSDOv88HJV3d2+3wNcAhzd9T4lSRpHnYZukkVJFk8MAy8Abupyn5IkjauuDy8/CbgkycS+PlpVl3e8T0mSxlKnoVtVdwBP73IfkiTNFt4yJElSTwxdSZJ6YuhKktQTQ1eSpJ4YupIk9cTQlSSpJ4auJEk9MXQlSepJX8/THcjCpYeyfO2GUZchSVInbOlKktQTQ1eSpJ4YupIk9cTQlSSpJ4auJEk9MXQlSepJqmrUNWy1aL8VtfK0c6ZcZuO61T1VI0nqUUZdQB9s6UqS1BNDV5Kknhi6kiT1xNCVJKknhq4kST0xdCVJ6omhK0lSTwxdSZJ6YuhKktSTzkM3yd5JLkry9SS3Jjm2631KkjSO5vewj3cAl1fVyUkWAnv2sE9JksZOp6GbZAnwHODlAFX1K+BXXe5TkqRx1XVLdwXwA+CDSZ4ObATOqKqfTyyQZA2wBmDZkgVcsnjd1pWXr93UcXmSJPWn63O684EjgXdX1RHAz4GzJi9QVeuralVVrdp
n0byOy5EkaXS6Dt0twJaquqYdv4gmhCVJ2uV0GrpV9T3griQHt5OeD9zS5T4lSRpXfVy9/CrggvbK5TuA03vYpyRJY6fz0K2q64FVXe9HkqRxZ49UkiT1xNCVJKknhq4kST0xdCVJ6omhK0lSTwxdSZJ6YuhKktSTPjrHGNjCpYeyfO2GUZchSVInbOlKktQTQ1eSpJ4YupIk9SRVNeoatkpyP3DbqOsY0BOBe0ddxICsdebNljrBWrswW+qE2VPrvVV1/KiL6NpYXUgF3FZVs+LhCEk2WOvMmy21zpY6wVq7MFvqhNlV667Aw8uSJPXE0JUkqSfjFrrrR13AEKy1G7Ol1tlSJ1hrF2ZLnTC7ap3zxupCKkmS5rJxa+lKkjRnGbqSJPVkbEI3yfFJbkuyOclZI6rh3CT3JLlp0rR9knw+ye3t++Pb6UnyzrbeG5McOWmdl7XL357kZR3UeUCSK5PckuTmJGeMca27J/lqkhvaWs9pp69Ick1b08eTLGyn79aOb27nHzhpW2e3029L8rszXWu7j3lJvpbksjGv884km5Jcn2RDO23svv92H3snuSjJ15PcmuTYcaw1ycHtf8+J131JzhzTWv9T++/ppiQXtv/OxvK3qm1U1chfwDzgG8BTgIXADcAhI6jjOcCRwE2Tpv0P4Kx2+Czgr9vhFwH/BwhwDHBNO30f4I72/fHt8ONnuM6lwJHt8GLgn4FDxrTWAHu1wwuAa9oa/g44pZ3+HuBP2+E/A97TDp8CfLwdPqT9XewGrGh/L/M6+A28BvgocFk7Pq513gk8cZtpY/f9t/s5D/iTdnghsPe41jqp5nnA94Anj1utwDLgm8Aek36jLx/X36qvbb6/URfQfvnHAp+dNH42cPaIajmQR4bubcDSdngpTQceAO8FTt12OeBU4L2Tpj9iuY5q/hTwO+NeK7AncB3wDJoecuZv+/0DnwWObYfnt8tl29/E5OVmsL79gSuA5wGXtfsduzrb7d7Jo0N37L5/YAlNQGTca92mvhcAV41jrTShexdNqM9vf6u/O66/VV+PfI3L4eWJH9GELe20cfCkqvpuO/w94Ent8I5q7vWztIeKjqBpQY5lre0h2+uBe4DP0/xF/ZOqenA7+91aUzv/p8ATeqr17cB/AR5ux58wpnUCFPC5JBuTrGmnjeP3vwL4AfDB9rD9+5MsGtNaJzsFuLAdHqtaq+pu4K3At4Hv0vz2NjK+v1VNMi6hOytU8+fg2NxjlWQv4JPAmVV13+R541RrVT1UVYfTtCSPBlaOtqJHS3ICcE9VbRx1LQN6dlUdCbwQ+I9JnjN55hh9//NpTtm8u6qOAH5Oc4h2qzGqFYD2XOiJwCe2nTcOtbbnlE+i+YPmXwGLgDnfZ/FcMS6hezdwwKTx/dtp4+D7SZYCtO/3tNN3VHMvnyXJAprAvaCqLh7nWidU1U+AK2kOfe2dZKLv78n73VpTO38J8MMean0WcGKSO4GP0RxifscY1glsbe1QVfcAl9D8MTOO3/8WYEtVXdOOX0QTwuNY64QXAtdV1ffb8XGr9d8C36yqH1TVA8DFNL/fsfyt6pHGJXSvBQ5qr75bSHNo59IR1zThUmDi6sOX0Zw/nZi+ur2C8Rjgp+0hqM8CL0jy+PYv0he002ZMkgAfAG6tqreNea37Jtm7Hd6D5tzzrTThe/IOap34DCcDX2hbF5cCp7RXYq4ADgK+OlN1VtXZVbV/VR1I8/v7QlW9ZNzqBEiyKMniiWGa7+0mxvD7r6rvAXclObid9HzglnGsdZJT+fWh5YmaxqnWbwPHJNmz/X/BxH/TsfutajtGfVJ54kVzJeA/05zve/2IariQ5hzJAzR/ob+S5tzHFcDtwD8A+7TLBnhXW+8mYNWk7bwC2Ny+Tu+gzmfTHOK6Ebi+fb1oTGs9DPhaW+tNwNp2+lNo/oFvpjmMt1s7ffd2fHM7/ymTtvX69jPcBryww9/Bc/n11ctjV2db0w3t6+a
Jfy/j+P23+zgc2ND+Bv43zRW941rrIppW4JJJ08auVuAc4Ovtv6nzaa5AHrvfqq9Hv+wGUpKknozL4WVJkuY8Q1eSpJ4YupIk9cTQlSSpJ4auJEk9MXSlWSjN02/2HHUdkobjLUPSLNT2nLWqqu4ddS2SBmdLV+pIktXtc1ZvSHJ+kgOTfKGddkWS5e1yH0py8qT1fta+PzfJF/PrZ9Fe0PZ+9GqaPnevTHLlaD6dpJ0xf/pFJA0ryaHAG4BnVtW9Sfahea7seVV1XpJXAO8EXjzNpo4ADgW+A1wFPKuq3pnkNcBxtnSl2cWWrtSN5wGfmAjFqvoRzYMePtrOP5+mO8/pfLWqtlTVwzTdfR4486VK6ouhK43eg7T/FpM8Dlg4ad4vJw0/hEenpFnN0JW68QXgD5M8AaA9vPwVmicYAbwE+Md2+E7gqHb4RGDBANu/H1g8U8VK6od/NUsdqKqbk7wF+FKSh2ietPQq4INJXgf8ADi9Xfx9wKeS3ABcTvOg9+msBy5P8p2qOm7mP4GkLnjLkCRJPfHwsiRJPTF0JUnqiaErSVJPDF1Jknpi6EqS1BNDV5Kknhi6kiT15P8DrSIpalWeOyUAAAAASUVORK5CYII=\n",
1231 | "text/plain": [
1232 | ""
1233 | ]
1234 | },
1235 | "metadata": {
1236 | "needs_background": "light"
1237 | },
1238 | "output_type": "display_data"
1239 | }
1240 | ],
1241 | "source": [
1242 | "X_train_ = X_train.copy()\n",
1243 | "X_train_['target'] = y_train\n",
1244 | "for var in cat_vars_ordinal:\n",
1245 | " sns.catplot(y=var, hue='target', data=X_train_, kind=\"count\", height=4, aspect=1.5)\n",
1246 | " plt.show()"
1247 | ]
1248 | },
1249 | {
1250 | "cell_type": "markdown",
1251 | "id": "f51aeebc-1104-47ec-a56a-661b8acb625d",
1252 | "metadata": {},
1253 | "source": [
1254 | "## Feature Scaling"
1255 | ]
1256 | },
1257 | {
1258 | "cell_type": "code",
1259 | "execution_count": 24,
1260 | "id": "b29e497f",
1261 | "metadata": {},
1262 | "outputs": [],
1263 | "source": [
1264 | "min_max_scaler = MinMaxScaler()\n",
1265 | "min_max_scaler.fit(X_train) \n",
1266 | "\n",
1267 | "X_train = pd.DataFrame(min_max_scaler.transform(X_train), columns=X_train.columns)\n",
1268 | "\n",
1269 | "X_test = pd.DataFrame(min_max_scaler.transform(X_test), columns=X_train.columns)"
1270 | ]
1271 | },
1272 | {
1273 | "cell_type": "code",
1274 | "execution_count": 25,
1275 | "id": "28fdef63-9f00-484f-a11f-7b7b89c6b861",
1276 | "metadata": {},
1277 | "outputs": [
1278 | {
1279 | "data": {
1280 | "text/html": [
1281 | "\n",
1282 | "\n",
1295 | "
\n",
1296 | " \n",
1297 | " \n",
1298 | " \n",
1299 | " city \n",
1300 | " city_development_index \n",
1301 | " relevent_experience \n",
1302 | " enrolled_university \n",
1303 | " education_level \n",
1304 | " major_discipline \n",
1305 | " experience \n",
1306 | " company_size \n",
1307 | " company_type \n",
1308 | " last_new_job \n",
1309 | " training_hours \n",
1310 | " gender_Male \n",
1311 | " gender_Missing \n",
1312 | " gender_Other \n",
1313 | " gender_Female \n",
1314 | " \n",
1315 | " \n",
1316 | " \n",
1317 | " \n",
1318 | " 0 \n",
1319 | " 0.000000 \n",
1320 | " 0.351297 \n",
1321 | " 1.0 \n",
1322 | " 1.0 \n",
1323 | " 1.00 \n",
1324 | " 0.666667 \n",
1325 | " 0.047619 \n",
1326 | " 0.375 \n",
1327 | " 1.000000 \n",
1328 | " 0.2 \n",
1329 | " 0.551260 \n",
1330 | " 1.0 \n",
1331 | " 0.0 \n",
1332 | " 0.0 \n",
1333 | " 0.0 \n",
1334 | " \n",
1335 | " \n",
1336 | " 1 \n",
1337 | " 0.000000 \n",
1338 | " 0.351297 \n",
1339 | " 0.0 \n",
1340 | " 1.0 \n",
1341 | " 1.00 \n",
1342 | " 0.666667 \n",
1343 | " 0.000000 \n",
1344 | " 0.125 \n",
1345 | " 1.000000 \n",
1346 | " 0.2 \n",
1347 | " 0.675148 \n",
1348 | " 0.0 \n",
1349 | " 1.0 \n",
1350 | " 0.0 \n",
1351 | " 0.0 \n",
1352 | " \n",
1353 | " \n",
1354 | " 2 \n",
1355 | " 0.008197 \n",
1356 | " 0.922156 \n",
1357 | " 0.0 \n",
1358 | " 0.0 \n",
1359 | " 0.75 \n",
1360 | " 0.666667 \n",
1361 | " 0.428571 \n",
1362 | " 0.000 \n",
1363 | " 1.000000 \n",
1364 | " 0.2 \n",
1365 | " 0.477224 \n",
1366 | " 1.0 \n",
1367 | " 0.0 \n",
1368 | " 0.0 \n",
1369 | " 0.0 \n",
1370 | " \n",
1371 | " \n",
1372 | " 3 \n",
1373 | " 0.016393 \n",
1374 | " 0.546906 \n",
1375 | " 0.0 \n",
1376 | " 0.5 \n",
1377 | " 0.75 \n",
1378 | " 0.666667 \n",
1379 | " 0.476190 \n",
1380 | " 0.500 \n",
1381 | " 1.000000 \n",
1382 | " 0.6 \n",
1383 | " 0.359127 \n",
1384 | " 0.0 \n",
1385 | " 1.0 \n",
1386 | " 0.0 \n",
1387 | " 0.0 \n",
1388 | " \n",
1389 | " \n",
1390 | " 4 \n",
1391 | " 0.024590 \n",
1392 | " 0.942116 \n",
1393 | " 1.0 \n",
1394 | " 0.0 \n",
1395 | " 0.25 \n",
1396 | " 0.000000 \n",
1397 | " 0.095238 \n",
1398 | " 0.000 \n",
1399 | " 0.615819 \n",
1400 | " 0.0 \n",
1401 | " 0.611289 \n",
1402 | " 0.0 \n",
1403 | " 1.0 \n",
1404 | " 0.0 \n",
1405 | " 0.0 \n",
1406 | " \n",
1407 | " \n",
1408 | " ... \n",
1409 | " ... \n",
1410 | " ... \n",
1411 | " ... \n",
1412 | " ... \n",
1413 | " ... \n",
1414 | " ... \n",
1415 | " ... \n",
1416 | " ... \n",
1417 | " ... \n",
1418 | " ... \n",
1419 | " ... \n",
1420 | " ... \n",
1421 | " ... \n",
1422 | " ... \n",
1423 | " ... \n",
1424 | " \n",
1425 | " \n",
1426 | " 15321 \n",
1427 | " 0.254098 \n",
1428 | " 0.061876 \n",
1429 | " 1.0 \n",
1430 | " 1.0 \n",
1431 | " 1.00 \n",
1432 | " 0.666667 \n",
1433 | " 0.476190 \n",
1434 | " 0.000 \n",
1435 | " 0.615819 \n",
1436 | " 1.0 \n",
1437 | " 0.471737 \n",
1438 | " 1.0 \n",
1439 | " 0.0 \n",
1440 | " 0.0 \n",
1441 | " 0.0 \n",
1442 | " \n",
1443 | " \n",
1444 | " 15322 \n",
1445 | " 0.000000 \n",
1446 | " 0.351297 \n",
1447 | " 0.0 \n",
1448 | " 0.0 \n",
1449 | " 1.00 \n",
1450 | " 0.666667 \n",
1451 | " 0.285714 \n",
1452 | " 0.750 \n",
1453 | " 1.000000 \n",
1454 | " 0.2 \n",
1455 | " 0.368112 \n",
1456 | " 1.0 \n",
1457 | " 0.0 \n",
1458 | " 0.0 \n",
1459 | " 0.0 \n",
1460 | " \n",
1461 | " \n",
1462 | " 15323 \n",
1463 | " 0.024590 \n",
1464 | " 0.942116 \n",
1465 | " 0.0 \n",
1466 | " 0.0 \n",
1467 | " 1.00 \n",
1468 | " 0.666667 \n",
1469 | " 0.285714 \n",
1470 | " 0.750 \n",
1471 | " 1.000000 \n",
1472 | " 0.0 \n",
1473 | " 0.534762 \n",
1474 | " 0.0 \n",
1475 | " 0.0 \n",
1476 | " 1.0 \n",
1477 | " 0.0 \n",
1478 | " \n",
1479 | " \n",
1480 | " 15324 \n",
1481 | " 0.000000 \n",
1482 | " 0.351297 \n",
1483 | " 1.0 \n",
1484 | " 0.0 \n",
1485 | " 1.00 \n",
1486 | " 0.666667 \n",
1487 | " 0.095238 \n",
1488 | " 0.250 \n",
1489 | " 0.615819 \n",
1490 | " 0.2 \n",
1491 | " 0.629503 \n",
1492 | " 0.0 \n",
1493 | " 1.0 \n",
1494 | " 0.0 \n",
1495 | " 0.0 \n",
1496 | " \n",
1497 | " \n",
1498 | " 15325 \n",
1499 | " 0.008197 \n",
1500 | " 0.922156 \n",
1501 | " 0.0 \n",
1502 | " 0.0 \n",
1503 | " 1.00 \n",
1504 | " 0.666667 \n",
1505 | " 0.571429 \n",
1506 | " 0.000 \n",
1507 | " 0.615819 \n",
1508 | " 0.2 \n",
1509 | " 0.095769 \n",
1510 | " 1.0 \n",
1511 | " 0.0 \n",
1512 | " 0.0 \n",
1513 | " 0.0 \n",
1514 | " \n",
1515 | " \n",
1516 | "
\n",
1517 | "
15326 rows × 15 columns
\n",
1518 | "
"
1519 | ],
1520 | "text/plain": [
1521 | " city city_development_index relevent_experience \\\n",
1522 | "0 0.000000 0.351297 1.0 \n",
1523 | "1 0.000000 0.351297 0.0 \n",
1524 | "2 0.008197 0.922156 0.0 \n",
1525 | "3 0.016393 0.546906 0.0 \n",
1526 | "4 0.024590 0.942116 1.0 \n",
1527 | "... ... ... ... \n",
1528 | "15321 0.254098 0.061876 1.0 \n",
1529 | "15322 0.000000 0.351297 0.0 \n",
1530 | "15323 0.024590 0.942116 0.0 \n",
1531 | "15324 0.000000 0.351297 1.0 \n",
1532 | "15325 0.008197 0.922156 0.0 \n",
1533 | "\n",
1534 | " enrolled_university education_level major_discipline experience \\\n",
1535 | "0 1.0 1.00 0.666667 0.047619 \n",
1536 | "1 1.0 1.00 0.666667 0.000000 \n",
1537 | "2 0.0 0.75 0.666667 0.428571 \n",
1538 | "3 0.5 0.75 0.666667 0.476190 \n",
1539 | "4 0.0 0.25 0.000000 0.095238 \n",
1540 | "... ... ... ... ... \n",
1541 | "15321 1.0 1.00 0.666667 0.476190 \n",
1542 | "15322 0.0 1.00 0.666667 0.285714 \n",
1543 | "15323 0.0 1.00 0.666667 0.285714 \n",
1544 | "15324 0.0 1.00 0.666667 0.095238 \n",
1545 | "15325 0.0 1.00 0.666667 0.571429 \n",
1546 | "\n",
1547 | " company_size company_type last_new_job training_hours gender_Male \\\n",
1548 | "0 0.375 1.000000 0.2 0.551260 1.0 \n",
1549 | "1 0.125 1.000000 0.2 0.675148 0.0 \n",
1550 | "2 0.000 1.000000 0.2 0.477224 1.0 \n",
1551 | "3 0.500 1.000000 0.6 0.359127 0.0 \n",
1552 | "4 0.000 0.615819 0.0 0.611289 0.0 \n",
1553 | "... ... ... ... ... ... \n",
1554 | "15321 0.000 0.615819 1.0 0.471737 1.0 \n",
1555 | "15322 0.750 1.000000 0.2 0.368112 1.0 \n",
1556 | "15323 0.750 1.000000 0.0 0.534762 0.0 \n",
1557 | "15324 0.250 0.615819 0.2 0.629503 0.0 \n",
1558 | "15325 0.000 0.615819 0.2 0.095769 1.0 \n",
1559 | "\n",
1560 | " gender_Missing gender_Other gender_Female \n",
1561 | "0 0.0 0.0 0.0 \n",
1562 | "1 1.0 0.0 0.0 \n",
1563 | "2 0.0 0.0 0.0 \n",
1564 | "3 1.0 0.0 0.0 \n",
1565 | "4 1.0 0.0 0.0 \n",
1566 | "... ... ... ... \n",
1567 | "15321 0.0 0.0 0.0 \n",
1568 | "15322 0.0 0.0 0.0 \n",
1569 | "15323 0.0 1.0 0.0 \n",
1570 | "15324 1.0 0.0 0.0 \n",
1571 | "15325 0.0 0.0 0.0 \n",
1572 | "\n",
1573 | "[15326 rows x 15 columns]"
1574 | ]
1575 | },
1576 | "execution_count": 25,
1577 | "metadata": {},
1578 | "output_type": "execute_result"
1579 | }
1580 | ],
1581 | "source": [
1582 | "X_train"
1583 | ]
1584 | },
1585 | {
1586 | "cell_type": "code",
1587 | "execution_count": null,
1588 | "id": "3d88ee15-f39c-428a-a919-a5b51262bd37",
1589 | "metadata": {},
1590 | "outputs": [],
1591 | "source": []
1592 | }
1593 | ],
1594 | "metadata": {
1595 | "kernelspec": {
1596 | "display_name": "Python 3 (ipykernel)",
1597 | "language": "python",
1598 | "name": "python3"
1599 | },
1600 | "language_info": {
1601 | "codemirror_mode": {
1602 | "name": "ipython",
1603 | "version": 3
1604 | },
1605 | "file_extension": ".py",
1606 | "mimetype": "text/x-python",
1607 | "name": "python",
1608 | "nbconvert_exporter": "python",
1609 | "pygments_lexer": "ipython3",
1610 | "version": "3.9.10"
1611 | }
1612 | },
1613 | "nbformat": 4,
1614 | "nbformat_minor": 5
1615 | }
1616 |
--------------------------------------------------------------------------------
/notebooks/3. Feature Engineering Pipeline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 7,
6 | "id": "7c6d8ff9-e3b1-4abe-8ac9-6271a5598527",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import pandas as pd\n",
11 | "import numpy as np\n",
12 | "import scipy.stats as stats\n",
13 | "\n",
14 | "from sklearn.model_selection import train_test_split\n",
15 | "from sklearn.preprocessing import MinMaxScaler\n",
16 | "from sklearn.pipeline import Pipeline\n",
17 | "\n",
18 | "from feature_engine.imputation import (\n",
19 | " CategoricalImputer,\n",
20 | ")\n",
21 | "\n",
22 | "from feature_engine.transformation import (\n",
23 | " YeoJohnsonTransformer,\n",
24 | ")\n",
25 | "\n",
26 | "from feature_engine.encoding import (\n",
27 | " RareLabelEncoder,\n",
28 | " OrdinalEncoder,\n",
29 | " OneHotEncoder,\n",
30 | " CountFrequencyEncoder\n",
31 | ")\n",
32 | "\n",
33 | "import joblib\n",
34 | "\n",
35 | "import preprocess as pp"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "id": "ad1fb6dc-c5cf-4306-8adf-71c51ac67e0a",
41 | "metadata": {},
42 | "source": [
43 | "## Read Data"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 21,
49 | "id": "08560625-7ac7-48fa-8d03-b9b3f8b17f59",
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "name": "stdout",
54 | "output_type": "stream",
55 | "text": [
56 | "(19158, 14)\n"
57 | ]
58 | },
59 | {
60 | "data": {
61 | "text/html": [
62 | "\n",
63 | "\n",
76 | "
\n",
77 | " \n",
78 | " \n",
79 | " \n",
80 | " enrollee_id \n",
81 | " city \n",
82 | " city_development_index \n",
83 | " gender \n",
84 | " relevent_experience \n",
85 | " enrolled_university \n",
86 | " education_level \n",
87 | " major_discipline \n",
88 | " experience \n",
89 | " company_size \n",
90 | " company_type \n",
91 | " last_new_job \n",
92 | " training_hours \n",
93 | " target \n",
94 | " \n",
95 | " \n",
96 | " \n",
97 | " \n",
98 | " 0 \n",
99 | " 8949 \n",
100 | " city_103 \n",
101 | " 0.920 \n",
102 | " Male \n",
103 | " Has relevent experience \n",
104 | " no_enrollment \n",
105 | " Graduate \n",
106 | " STEM \n",
107 | " >20 \n",
108 | " NaN \n",
109 | " NaN \n",
110 | " 1 \n",
111 | " 36 \n",
112 | " 1.0 \n",
113 | " \n",
114 | " \n",
115 | " 1 \n",
116 | " 29725 \n",
117 | " city_40 \n",
118 | " 0.776 \n",
119 | " Male \n",
120 | " No relevent experience \n",
121 | " no_enrollment \n",
122 | " Graduate \n",
123 | " STEM \n",
124 | " 15 \n",
125 | " 50-99 \n",
126 | " Pvt Ltd \n",
127 | " >4 \n",
128 | " 47 \n",
129 | " 0.0 \n",
130 | " \n",
131 | " \n",
132 | " 2 \n",
133 | " 11561 \n",
134 | " city_21 \n",
135 | " 0.624 \n",
136 | " NaN \n",
137 | " No relevent experience \n",
138 | " Full time course \n",
139 | " Graduate \n",
140 | " STEM \n",
141 | " 5 \n",
142 | " NaN \n",
143 | " NaN \n",
144 | " never \n",
145 | " 83 \n",
146 | " 0.0 \n",
147 | " \n",
148 | " \n",
149 | " 3 \n",
150 | " 33241 \n",
151 | " city_115 \n",
152 | " 0.789 \n",
153 | " NaN \n",
154 | " No relevent experience \n",
155 | " NaN \n",
156 | " Graduate \n",
157 | " Business Degree \n",
158 | " <1 \n",
159 | " NaN \n",
160 | " Pvt Ltd \n",
161 | " never \n",
162 | " 52 \n",
163 | " 1.0 \n",
164 | " \n",
165 | " \n",
166 | " 4 \n",
167 | " 666 \n",
168 | " city_162 \n",
169 | " 0.767 \n",
170 | " Male \n",
171 | " Has relevent experience \n",
172 | " no_enrollment \n",
173 | " Masters \n",
174 | " STEM \n",
175 | " >20 \n",
176 | " 50-99 \n",
177 | " Funded Startup \n",
178 | " 4 \n",
179 | " 8 \n",
180 | " 0.0 \n",
181 | " \n",
182 | " \n",
183 | "
\n",
184 | "
"
185 | ],
186 | "text/plain": [
187 | " enrollee_id city city_development_index gender \\\n",
188 | "0 8949 city_103 0.920 Male \n",
189 | "1 29725 city_40 0.776 Male \n",
190 | "2 11561 city_21 0.624 NaN \n",
191 | "3 33241 city_115 0.789 NaN \n",
192 | "4 666 city_162 0.767 Male \n",
193 | "\n",
194 | " relevent_experience enrolled_university education_level \\\n",
195 | "0 Has relevent experience no_enrollment Graduate \n",
196 | "1 No relevent experience no_enrollment Graduate \n",
197 | "2 No relevent experience Full time course Graduate \n",
198 | "3 No relevent experience NaN Graduate \n",
199 | "4 Has relevent experience no_enrollment Masters \n",
200 | "\n",
201 | " major_discipline experience company_size company_type last_new_job \\\n",
202 | "0 STEM >20 NaN NaN 1 \n",
203 | "1 STEM 15 50-99 Pvt Ltd >4 \n",
204 | "2 STEM 5 NaN NaN never \n",
205 | "3 Business Degree <1 NaN Pvt Ltd never \n",
206 | "4 STEM >20 50-99 Funded Startup 4 \n",
207 | "\n",
208 | " training_hours target \n",
209 | "0 36 1.0 \n",
210 | "1 47 0.0 \n",
211 | "2 83 0.0 \n",
212 | "3 52 1.0 \n",
213 | "4 8 0.0 "
214 | ]
215 | },
216 | "execution_count": 21,
217 | "metadata": {},
218 | "output_type": "execute_result"
219 | }
220 | ],
221 | "source": [
222 | "data = pd.read_csv('../src/data/train.csv')\n",
223 | "print(data.shape)\n",
224 | "data.head()"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "id": "2e71c3f7-ea54-44b9-ad7f-6d7a5e9bbda6",
230 | "metadata": {},
231 | "source": [
232 | "## Train-Test Split"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 22,
238 | "id": "2c5dce77-a9a3-467d-a688-a26bceecf6ac",
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "X_train, X_test, y_train, y_test = train_test_split(\n",
243 | " data.drop(['enrollee_id', 'target'], axis=1),\n",
244 | " data['target'],\n",
245 | " test_size=0.2,\n",
246 | " random_state=0,\n",
247 | ")"
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "id": "95436658-505f-49b7-baad-4d6c143bdf4c",
253 | "metadata": {},
254 | "source": [
255 | "## Config"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": 11,
261 | "id": "c7089a44-fe95-446e-90b8-0618cea9352e",
262 | "metadata": {},
263 | "outputs": [],
264 | "source": [
265 | "CAT_VARS_REPLACE_NA_WITH_STRING_MISSING = ['gender', 'major_discipline', 'company_size', 'company_type']\n",
266 | "\n",
267 | "CAT_VARS_REPLACE_NA_WITH_FREQUENT = ['enrolled_university', 'education_level', 'experience', 'last_new_job']\n",
268 | "\n",
269 | "NUM_VARS = ['city_development_index', 'training_hours']\n",
270 | "\n",
271 | "NUM_VARS_YEO_JOHNSON = ['training_hours']\n",
272 | "\n",
273 | "CAT_VARS_ORDINAL = ['relevent_experience', 'enrolled_university', 'education_level', 'major_discipline']\n",
274 | "CAT_VARS_ORDINAL_ARBITRARY = ['city']\n",
275 | "CAT_VARS_ONEHOT = ['gender']\n",
276 | "CAT_VARS_COUNT_FREQUENCY = ['company_type']\n",
277 | "\n",
278 | "EXPERIENCE_VAR = ['experience']\n",
279 | "\n",
280 | "EXPERIENCE_MAP = {\n",
281 | " '<1': 0,\n",
282 | " '1': 1, \n",
283 | " '2': 2, \n",
284 | " '3': 3, \n",
285 | " '4': 4, \n",
286 | " '5': 5,\n",
287 | " '6': 6,\n",
288 | " '7': 7,\n",
289 | " '8': 8, \n",
290 | " '9': 9, \n",
291 | " '10': 10, \n",
292 | " '11': 11,\n",
293 | " '12': 12,\n",
294 | " '13': 13, \n",
295 | " '14': 14, \n",
296 | " '15': 15, \n",
297 | " '16': 16,\n",
298 | " '17': 17,\n",
299 | " '18': 18,\n",
300 | " '19': 19, \n",
301 | " '20': 20, \n",
302 | " '>20': 21\n",
303 | "} \n",
304 | "LAST_NEW_JOB_VAR = ['last_new_job']\n",
305 | "\n",
306 | "LAST_NEW_JOB_MAP = {\n",
307 | " 'never': 0,\n",
308 | " '1': 1, \n",
309 | " '2': 2, \n",
310 | " '3': 3, \n",
311 | " '4': 4, \n",
312 | " '>4': 5\n",
313 | "}\n",
314 | "\n",
315 | "COMPANY_SIZE_VAR = ['company_size']\n",
316 | "\n",
317 | "COMPANY_SIZE_MAP = {  # ordinal codes: a larger company-size bucket gets a larger integer ('Missing' = 0)\n",
318 | " 'Missing': 0,\n",
319 | " '<10': 1,\n",
320 | " '10/49': 2, \n",
321 | " '50-99': 3, \n",
322 | " '100-500': 4, \n",
323 | " '500-999': 5, \n",
324 | " '1000-4999': 6, \n",
325 | " '5000-9999': 7,\n",
326 | " '10000+': 8\n",
327 | "}"
328 | ]
329 | },
330 | {
331 | "cell_type": "markdown",
332 | "id": "30198f77-bab4-498d-aba2-344cf450b1ef",
333 | "metadata": {},
334 | "source": [
335 | "## Feature Engineering Pipeline"
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": 23,
341 | "id": "8141f99c-d63c-4eeb-a5a3-33d78cd0428e",
342 | "metadata": {},
343 | "outputs": [],
344 | "source": [
345 | "fe_pipe = Pipeline([\n",
346 | " ('cat_imputer_missing', CategoricalImputer(imputation_method='missing', variables=CAT_VARS_REPLACE_NA_WITH_STRING_MISSING)),\n",
347 | " ('cat_imputer_frequent', CategoricalImputer(imputation_method='frequent', variables=CAT_VARS_REPLACE_NA_WITH_FREQUENT)),\n",
348 | " ('num_transformer_yeo_johnson', YeoJohnsonTransformer(variables=NUM_VARS_YEO_JOHNSON)),\n",
349 | " ('ordinal_encoder', OrdinalEncoder(encoding_method='ordered', variables=CAT_VARS_ORDINAL)),\n",
350 | " ('ordinal_encoder_arbitrary', OrdinalEncoder(encoding_method='arbitrary', variables=CAT_VARS_ORDINAL_ARBITRARY)),\n",
351 | " ('count_frequency_encoder', CountFrequencyEncoder(encoding_method='frequency', variables=CAT_VARS_COUNT_FREQUENCY)),\n",
352 | " ('onehot_encoder', OneHotEncoder(variables=CAT_VARS_ONEHOT)),\n",
353 | " ('experience_map', pp.Mapper(variables=EXPERIENCE_VAR, mappings=EXPERIENCE_MAP)),\n",
354 | " ('last_new_job_map', pp.Mapper(variables=LAST_NEW_JOB_VAR, mappings=LAST_NEW_JOB_MAP)),\n",
355 | " ('company_size_map', pp.Mapper(variables=COMPANY_SIZE_VAR, mappings=COMPANY_SIZE_MAP)),\n",
356 | " # ('min_max_scaler', MinMaxScaler())\n",
357 | "])"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "execution_count": 24,
363 | "id": "9b4aee44-401a-48b4-8292-d6ff4096fc6f",
364 | "metadata": {},
365 | "outputs": [
366 | {
367 | "data": {
368 | "text/html": [
369 | "\n",
370 | "\n",
383 | "
\n",
384 | " \n",
385 | " \n",
386 | " \n",
387 | " city \n",
388 | " city_development_index \n",
389 | " gender \n",
390 | " relevent_experience \n",
391 | " enrolled_university \n",
392 | " education_level \n",
393 | " major_discipline \n",
394 | " experience \n",
395 | " company_size \n",
396 | " company_type \n",
397 | " last_new_job \n",
398 | " training_hours \n",
399 | " \n",
400 | " \n",
401 | " \n",
402 | " \n",
403 | " 19147 \n",
404 | " city_21 \n",
405 | " 0.624 \n",
406 | " Male \n",
407 | " No relevent experience \n",
408 | " Full time course \n",
409 | " Graduate \n",
410 | " STEM \n",
411 | " 1 \n",
412 | " 100-500 \n",
413 | " Pvt Ltd \n",
414 | " 1 \n",
415 | " 52 \n",
416 | " \n",
417 | " \n",
418 | " 8464 \n",
419 | " city_21 \n",
420 | " 0.624 \n",
421 | " NaN \n",
422 | " Has relevent experience \n",
423 | " Full time course \n",
424 | " Graduate \n",
425 | " STEM \n",
426 | " <1 \n",
427 | " <10 \n",
428 | " Pvt Ltd \n",
429 | " NaN \n",
430 | " 92 \n",
431 | " \n",
432 | " \n",
433 | " 8869 \n",
434 | " city_16 \n",
435 | " 0.910 \n",
436 | " Male \n",
437 | " Has relevent experience \n",
438 | " no_enrollment \n",
439 | " Masters \n",
440 | " STEM \n",
441 | " 9 \n",
442 | " NaN \n",
443 | " Pvt Ltd \n",
444 | " 1 \n",
445 | " 36 \n",
446 | " \n",
447 | " \n",
448 | " 11645 \n",
449 | " city_118 \n",
450 | " 0.722 \n",
451 | " NaN \n",
452 | " Has relevent experience \n",
453 | " Part time course \n",
454 | " Masters \n",
455 | " STEM \n",
456 | " 10 \n",
457 | " 1000-4999 \n",
458 | " Pvt Ltd \n",
459 | " 3 \n",
460 | " 19 \n",
461 | " \n",
462 | " \n",
463 | " 7743 \n",
464 | " city_103 \n",
465 | " 0.920 \n",
466 | " NaN \n",
467 | " No relevent experience \n",
468 | " no_enrollment \n",
469 | " Primary School \n",
470 | " NaN \n",
471 | " 2 \n",
472 | " NaN \n",
473 | " NaN \n",
474 | " never \n",
475 | " 69 \n",
476 | " \n",
477 | " \n",
478 | "
\n",
479 | "
"
480 | ],
481 | "text/plain": [
482 | " city city_development_index gender relevent_experience \\\n",
483 | "19147 city_21 0.624 Male No relevent experience \n",
484 | "8464 city_21 0.624 NaN Has relevent experience \n",
485 | "8869 city_16 0.910 Male Has relevent experience \n",
486 | "11645 city_118 0.722 NaN Has relevent experience \n",
487 | "7743 city_103 0.920 NaN No relevent experience \n",
488 | "\n",
489 | " enrolled_university education_level major_discipline experience \\\n",
490 | "19147 Full time course Graduate STEM 1 \n",
491 | "8464 Full time course Graduate STEM <1 \n",
492 | "8869 no_enrollment Masters STEM 9 \n",
493 | "11645 Part time course Masters STEM 10 \n",
494 | "7743 no_enrollment Primary School NaN 2 \n",
495 | "\n",
496 | " company_size company_type last_new_job training_hours \n",
497 | "19147 100-500 Pvt Ltd 1 52 \n",
498 | "8464 <10 Pvt Ltd NaN 92 \n",
499 | "8869 NaN Pvt Ltd 1 36 \n",
500 | "11645 1000-4999 Pvt Ltd 3 19 \n",
501 | "7743 NaN NaN never 69 "
502 | ]
503 | },
504 | "execution_count": 24,
505 | "metadata": {},
506 | "output_type": "execute_result"
507 | }
508 | ],
509 | "source": [
510 | "X_train.head()"
511 | ]
512 | },
513 | {
514 | "cell_type": "code",
515 | "execution_count": 25,
516 | "id": "30c79ad3-af73-434e-9425-5c8d8142f2bb",
517 | "metadata": {},
518 | "outputs": [
519 | {
520 | "data": {
521 | "text/plain": [
522 | "Pipeline(steps=[('cat_imputer_missing',\n",
523 | " CategoricalImputer(variables=['gender', 'major_discipline',\n",
524 | " 'company_size',\n",
525 | " 'company_type'])),\n",
526 | " ('cat_imputer_frequent',\n",
527 | " CategoricalImputer(imputation_method='frequent',\n",
528 | " variables=['enrolled_university',\n",
529 | " 'education_level', 'experience',\n",
530 | " 'last_new_job'])),\n",
531 | " ('num_transformer_yeo_johnson',\n",
532 | " YeoJohnsonTransformer(variables=['t...\n",
533 | " '20': 20, '3': 3, '4': 4, '5': 5, '6': 6,\n",
534 | " '7': 7, '8': 8, '9': 9, '<1': 0, '>20': 21},\n",
535 | " variables=['experience'])),\n",
536 | " ('last_new_job_map',\n",
537 | " Mapper(mappings={'1': 1, '2': 2, '3': 3, '4': 4, '>4': 5,\n",
538 | " 'never': 0},\n",
539 | " variables=['last_new_job'])),\n",
540 | " ('company_size_map',\n",
541 | " Mapper(mappings={'10/49': 2, '100-500': 3, '1000-4999': 4,\n",
542 | " '10000+': 5, '50-99': 6, '500-999': 7,\n",
543 | " '5000-9999': 8, '<10': 1, 'Missing': 0},\n",
544 | " variables=['company_size']))])"
545 | ]
546 | },
547 | "execution_count": 25,
548 | "metadata": {},
549 | "output_type": "execute_result"
550 | }
551 | ],
552 | "source": [
553 | "fe_pipe.fit(X_train, y_train)"
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": 26,
559 | "id": "5e070fd7-5a92-4cb0-8e3c-03632c84875f",
560 | "metadata": {},
561 | "outputs": [],
562 | "source": [
563 | "X_train = fe_pipe.transform(X_train)\n",
564 | "X_test = fe_pipe.transform(X_test)"
565 | ]
566 | },
567 | {
568 | "cell_type": "code",
569 | "execution_count": 27,
570 | "id": "5313bf6c-7c5a-4ae9-b9b9-74a1bda12326",
571 | "metadata": {},
572 | "outputs": [
573 | {
574 | "data": {
575 | "text/html": [
576 | "\n",
577 | "\n",
590 | "
\n",
591 | " \n",
592 | " \n",
593 | " \n",
594 | " city \n",
595 | " city_development_index \n",
596 | " relevent_experience \n",
597 | " enrolled_university \n",
598 | " education_level \n",
599 | " major_discipline \n",
600 | " experience \n",
601 | " company_size \n",
602 | " company_type \n",
603 | " last_new_job \n",
604 | " training_hours \n",
605 | " gender_Male \n",
606 | " gender_Missing \n",
607 | " gender_Other \n",
608 | " gender_Female \n",
609 | " \n",
610 | " \n",
611 | " \n",
612 | " \n",
613 | " 19147 \n",
614 | " 0 \n",
615 | " 0.624 \n",
616 | " 1 \n",
617 | " 2 \n",
618 | " 4 \n",
619 | " 4 \n",
620 | " 1 \n",
621 | " 3 \n",
622 | " 0.514746 \n",
623 | " 1 \n",
624 | " 5.371921 \n",
625 | " 1 \n",
626 | " 0 \n",
627 | " 0 \n",
628 | " 0 \n",
629 | " \n",
630 | " \n",
631 | " 8464 \n",
632 | " 0 \n",
633 | " 0.624 \n",
634 | " 0 \n",
635 | " 2 \n",
636 | " 4 \n",
637 | " 4 \n",
638 | " 0 \n",
639 | " 1 \n",
640 | " 0.514746 \n",
641 | " 1 \n",
642 | " 6.415291 \n",
643 | " 0 \n",
644 | " 1 \n",
645 | " 0 \n",
646 | " 0 \n",
647 | " \n",
648 | " \n",
649 | " 8869 \n",
650 | " 1 \n",
651 | " 0.910 \n",
652 | " 0 \n",
653 | " 0 \n",
654 | " 3 \n",
655 | " 4 \n",
656 | " 9 \n",
657 | " 0 \n",
658 | " 0.514746 \n",
659 | " 1 \n",
660 | " 4.748399 \n",
661 | " 1 \n",
662 | " 0 \n",
663 | " 0 \n",
664 | " 0 \n",
665 | " \n",
666 | " \n",
667 | " 11645 \n",
668 | " 2 \n",
669 | " 0.722 \n",
670 | " 0 \n",
671 | " 1 \n",
672 | " 3 \n",
673 | " 4 \n",
674 | " 10 \n",
675 | " 4 \n",
676 | " 0.514746 \n",
677 | " 3 \n",
678 | " 3.753794 \n",
679 | " 0 \n",
680 | " 1 \n",
681 | " 0 \n",
682 | " 0 \n",
683 | " \n",
684 | " \n",
685 | " 7743 \n",
686 | " 3 \n",
687 | " 0.920 \n",
688 | " 1 \n",
689 | " 0 \n",
690 | " 1 \n",
691 | " 0 \n",
692 | " 2 \n",
693 | " 0 \n",
694 | " 0.319522 \n",
695 | " 0 \n",
696 | " 5.877477 \n",
697 | " 0 \n",
698 | " 1 \n",
699 | " 0 \n",
700 | " 0 \n",
701 | " \n",
702 | " \n",
703 | " ... \n",
704 | " ... \n",
705 | " ... \n",
706 | " ... \n",
707 | " ... \n",
708 | " ... \n",
709 | " ... \n",
710 | " ... \n",
711 | " ... \n",
712 | " ... \n",
713 | " ... \n",
714 | " ... \n",
715 | " ... \n",
716 | " ... \n",
717 | " ... \n",
718 | " ... \n",
719 | " \n",
720 | " \n",
721 | " 9225 \n",
722 | " 31 \n",
723 | " 0.479 \n",
724 | " 1 \n",
725 | " 2 \n",
726 | " 4 \n",
727 | " 4 \n",
728 | " 10 \n",
729 | " 0 \n",
730 | " 0.319522 \n",
731 | " 5 \n",
732 | " 4.702184 \n",
733 | " 1 \n",
734 | " 0 \n",
735 | " 0 \n",
736 | " 0 \n",
737 | " \n",
738 | " \n",
739 | " 13123 \n",
740 | " 0 \n",
741 | " 0.624 \n",
742 | " 0 \n",
743 | " 0 \n",
744 | " 4 \n",
745 | " 4 \n",
746 | " 6 \n",
747 | " 6 \n",
748 | " 0.514746 \n",
749 | " 1 \n",
750 | " 3.829470 \n",
751 | " 1 \n",
752 | " 0 \n",
753 | " 0 \n",
754 | " 0 \n",
755 | " \n",
756 | " \n",
757 | " 9845 \n",
758 | " 3 \n",
759 | " 0.920 \n",
760 | " 0 \n",
761 | " 0 \n",
762 | " 4 \n",
763 | " 4 \n",
764 | " 6 \n",
765 | " 6 \n",
766 | " 0.514746 \n",
767 | " 0 \n",
768 | " 5.232979 \n",
769 | " 0 \n",
770 | " 0 \n",
771 | " 1 \n",
772 | " 0 \n",
773 | " \n",
774 | " \n",
775 | " 10799 \n",
776 | " 0 \n",
777 | " 0.624 \n",
778 | " 1 \n",
779 | " 0 \n",
780 | " 4 \n",
781 | " 4 \n",
782 | " 2 \n",
783 | " 2 \n",
784 | " 0.319522 \n",
785 | " 1 \n",
786 | " 6.030879 \n",
787 | " 0 \n",
788 | " 1 \n",
789 | " 0 \n",
790 | " 0 \n",
791 | " \n",
792 | " \n",
793 | " 2732 \n",
794 | " 1 \n",
795 | " 0.910 \n",
796 | " 0 \n",
797 | " 0 \n",
798 | " 4 \n",
799 | " 4 \n",
800 | " 12 \n",
801 | " 0 \n",
802 | " 0.319522 \n",
803 | " 1 \n",
804 | " 1.535819 \n",
805 | " 1 \n",
806 | " 0 \n",
807 | " 0 \n",
808 | " 0 \n",
809 | " \n",
810 | " \n",
811 | "
\n",
812 | "
15326 rows × 15 columns
\n",
813 | "
"
814 | ],
815 | "text/plain": [
816 | " city city_development_index relevent_experience enrolled_university \\\n",
817 | "19147 0 0.624 1 2 \n",
818 | "8464 0 0.624 0 2 \n",
819 | "8869 1 0.910 0 0 \n",
820 | "11645 2 0.722 0 1 \n",
821 | "7743 3 0.920 1 0 \n",
822 | "... ... ... ... ... \n",
823 | "9225 31 0.479 1 2 \n",
824 | "13123 0 0.624 0 0 \n",
825 | "9845 3 0.920 0 0 \n",
826 | "10799 0 0.624 1 0 \n",
827 | "2732 1 0.910 0 0 \n",
828 | "\n",
829 | " education_level major_discipline experience company_size \\\n",
830 | "19147 4 4 1 3 \n",
831 | "8464 4 4 0 1 \n",
832 | "8869 3 4 9 0 \n",
833 | "11645 3 4 10 4 \n",
834 | "7743 1 0 2 0 \n",
835 | "... ... ... ... ... \n",
836 | "9225 4 4 10 0 \n",
837 | "13123 4 4 6 6 \n",
838 | "9845 4 4 6 6 \n",
839 | "10799 4 4 2 2 \n",
840 | "2732 4 4 12 0 \n",
841 | "\n",
842 | " company_type last_new_job training_hours gender_Male \\\n",
843 | "19147 0.514746 1 5.371921 1 \n",
844 | "8464 0.514746 1 6.415291 0 \n",
845 | "8869 0.514746 1 4.748399 1 \n",
846 | "11645 0.514746 3 3.753794 0 \n",
847 | "7743 0.319522 0 5.877477 0 \n",
848 | "... ... ... ... ... \n",
849 | "9225 0.319522 5 4.702184 1 \n",
850 | "13123 0.514746 1 3.829470 1 \n",
851 | "9845 0.514746 0 5.232979 0 \n",
852 | "10799 0.319522 1 6.030879 0 \n",
853 | "2732 0.319522 1 1.535819 1 \n",
854 | "\n",
855 | " gender_Missing gender_Other gender_Female \n",
856 | "19147 0 0 0 \n",
857 | "8464 1 0 0 \n",
858 | "8869 0 0 0 \n",
859 | "11645 1 0 0 \n",
860 | "7743 1 0 0 \n",
861 | "... ... ... ... \n",
862 | "9225 0 0 0 \n",
863 | "13123 0 0 0 \n",
864 | "9845 0 1 0 \n",
865 | "10799 1 0 0 \n",
866 | "2732 0 0 0 \n",
867 | "\n",
868 | "[15326 rows x 15 columns]"
869 | ]
870 | },
871 | "execution_count": 27,
872 | "metadata": {},
873 | "output_type": "execute_result"
874 | }
875 | ],
876 | "source": [
877 | "X_train"
878 | ]
879 | },
880 | {
881 | "cell_type": "code",
882 | "execution_count": null,
883 | "id": "cc449c65-ac72-4c2c-96a6-3b184315601e",
884 | "metadata": {},
885 | "outputs": [],
886 | "source": []
887 | }
888 | ],
889 | "metadata": {
890 | "kernelspec": {
891 | "display_name": "Python 3 (ipykernel)",
892 | "language": "python",
893 | "name": "python3"
894 | },
895 | "language_info": {
896 | "codemirror_mode": {
897 | "name": "ipython",
898 | "version": 3
899 | },
900 | "file_extension": ".py",
901 | "mimetype": "text/x-python",
902 | "name": "python",
903 | "nbconvert_exporter": "python",
904 | "pygments_lexer": "ipython3",
905 | "version": "3.9.7"
906 | }
907 | },
908 | "nbformat": 4,
909 | "nbformat_minor": 5
910 | }
911 |
--------------------------------------------------------------------------------
/notebooks/4. Machine Learning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "4419c01c",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import pandas as pd\n",
11 | "import numpy as np\n",
12 | "import scipy.stats as stats\n",
13 | "\n",
14 | "from sklearn.model_selection import train_test_split\n",
15 | "from sklearn.preprocessing import MinMaxScaler\n",
16 | "from sklearn.pipeline import Pipeline\n",
17 | "from sklearn.linear_model import LogisticRegression\n",
18 | "from sklearn.metrics import accuracy_score\n",
19 | "\n",
20 | "from feature_engine.imputation import (\n",
21 | " CategoricalImputer,\n",
22 | ")\n",
23 | "\n",
24 | "from feature_engine.transformation import (\n",
25 | " YeoJohnsonTransformer,\n",
26 | ")\n",
27 | "\n",
28 | "from feature_engine.encoding import (\n",
29 | " RareLabelEncoder,\n",
30 | " OrdinalEncoder,\n",
31 | " OneHotEncoder,\n",
32 | " CountFrequencyEncoder\n",
33 | ")\n",
34 | "\n",
35 | "import joblib\n",
36 | "\n",
37 | "import preprocess as pp"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "id": "a9613b01",
43 | "metadata": {},
44 | "source": [
45 | "## Read Data"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 2,
51 | "id": "9c4bb14b",
52 | "metadata": {},
53 | "outputs": [
54 | {
55 | "name": "stdout",
56 | "output_type": "stream",
57 | "text": [
58 | "(19158, 14)\n"
59 | ]
60 | },
61 | {
62 | "data": {
63 | "text/html": [
64 | "\n",
65 | "\n",
78 | "
\n",
79 | " \n",
80 | " \n",
81 | " \n",
82 | " enrollee_id \n",
83 | " city \n",
84 | " city_development_index \n",
85 | " gender \n",
86 | " relevent_experience \n",
87 | " enrolled_university \n",
88 | " education_level \n",
89 | " major_discipline \n",
90 | " experience \n",
91 | " company_size \n",
92 | " company_type \n",
93 | " last_new_job \n",
94 | " training_hours \n",
95 | " target \n",
96 | " \n",
97 | " \n",
98 | " \n",
99 | " \n",
100 | " 0 \n",
101 | " 8949 \n",
102 | " city_103 \n",
103 | " 0.920 \n",
104 | " Male \n",
105 | " Has relevent experience \n",
106 | " no_enrollment \n",
107 | " Graduate \n",
108 | " STEM \n",
109 | " >20 \n",
110 | " NaN \n",
111 | " NaN \n",
112 | " 1 \n",
113 | " 36 \n",
114 | " 1.0 \n",
115 | " \n",
116 | " \n",
117 | " 1 \n",
118 | " 29725 \n",
119 | " city_40 \n",
120 | " 0.776 \n",
121 | " Male \n",
122 | " No relevent experience \n",
123 | " no_enrollment \n",
124 | " Graduate \n",
125 | " STEM \n",
126 | " 15 \n",
127 | " 50-99 \n",
128 | " Pvt Ltd \n",
129 | " >4 \n",
130 | " 47 \n",
131 | " 0.0 \n",
132 | " \n",
133 | " \n",
134 | " 2 \n",
135 | " 11561 \n",
136 | " city_21 \n",
137 | " 0.624 \n",
138 | " NaN \n",
139 | " No relevent experience \n",
140 | " Full time course \n",
141 | " Graduate \n",
142 | " STEM \n",
143 | " 5 \n",
144 | " NaN \n",
145 | " NaN \n",
146 | " never \n",
147 | " 83 \n",
148 | " 0.0 \n",
149 | " \n",
150 | " \n",
151 | " 3 \n",
152 | " 33241 \n",
153 | " city_115 \n",
154 | " 0.789 \n",
155 | " NaN \n",
156 | " No relevent experience \n",
157 | " NaN \n",
158 | " Graduate \n",
159 | " Business Degree \n",
160 | " <1 \n",
161 | " NaN \n",
162 | " Pvt Ltd \n",
163 | " never \n",
164 | " 52 \n",
165 | " 1.0 \n",
166 | " \n",
167 | " \n",
168 | " 4 \n",
169 | " 666 \n",
170 | " city_162 \n",
171 | " 0.767 \n",
172 | " Male \n",
173 | " Has relevent experience \n",
174 | " no_enrollment \n",
175 | " Masters \n",
176 | " STEM \n",
177 | " >20 \n",
178 | " 50-99 \n",
179 | " Funded Startup \n",
180 | " 4 \n",
181 | " 8 \n",
182 | " 0.0 \n",
183 | " \n",
184 | " \n",
185 | "
\n",
186 | "
"
187 | ],
188 | "text/plain": [
189 | " enrollee_id city city_development_index gender \\\n",
190 | "0 8949 city_103 0.920 Male \n",
191 | "1 29725 city_40 0.776 Male \n",
192 | "2 11561 city_21 0.624 NaN \n",
193 | "3 33241 city_115 0.789 NaN \n",
194 | "4 666 city_162 0.767 Male \n",
195 | "\n",
196 | " relevent_experience enrolled_university education_level \\\n",
197 | "0 Has relevent experience no_enrollment Graduate \n",
198 | "1 No relevent experience no_enrollment Graduate \n",
199 | "2 No relevent experience Full time course Graduate \n",
200 | "3 No relevent experience NaN Graduate \n",
201 | "4 Has relevent experience no_enrollment Masters \n",
202 | "\n",
203 | " major_discipline experience company_size company_type last_new_job \\\n",
204 | "0 STEM >20 NaN NaN 1 \n",
205 | "1 STEM 15 50-99 Pvt Ltd >4 \n",
206 | "2 STEM 5 NaN NaN never \n",
207 | "3 Business Degree <1 NaN Pvt Ltd never \n",
208 | "4 STEM >20 50-99 Funded Startup 4 \n",
209 | "\n",
210 | " training_hours target \n",
211 | "0 36 1.0 \n",
212 | "1 47 0.0 \n",
213 | "2 83 0.0 \n",
214 | "3 52 1.0 \n",
215 | "4 8 0.0 "
216 | ]
217 | },
218 | "execution_count": 2,
219 | "metadata": {},
220 | "output_type": "execute_result"
221 | }
222 | ],
223 | "source": [
224 | "data = pd.read_csv('../src/data/train.csv')\n",
225 | "print(data.shape)\n",
226 | "data.head()"
227 | ]
228 | },
229 | {
230 | "cell_type": "markdown",
231 | "id": "f50f7905",
232 | "metadata": {},
233 | "source": [
234 | "## Train-Test Split"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": 3,
240 | "id": "cfb4ac7f",
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "X_train, X_test, y_train, y_test = train_test_split(\n",
245 | " data.drop(['enrollee_id', 'target'], axis=1),\n",
246 | " data['target'],\n",
247 | " test_size=0.2,\n",
248 | " random_state=0,\n",
249 | ")"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 4,
255 | "id": "2bf8009e",
256 | "metadata": {},
257 | "outputs": [
258 | {
259 | "data": {
260 | "text/html": [
261 | "\n",
262 | "\n",
275 | "
\n",
276 | " \n",
277 | " \n",
278 | " \n",
279 | " enrollee_id \n",
280 | " city_development_index \n",
281 | " training_hours \n",
282 | " target \n",
283 | " \n",
284 | " \n",
285 | " \n",
286 | " \n",
287 | " count \n",
288 | " 19158.000000 \n",
289 | " 19158.000000 \n",
290 | " 19158.000000 \n",
291 | " 19158.000000 \n",
292 | " \n",
293 | " \n",
294 | " mean \n",
295 | " 16875.358179 \n",
296 | " 0.828848 \n",
297 | " 65.366896 \n",
298 | " 0.249348 \n",
299 | " \n",
300 | " \n",
301 | " std \n",
302 | " 9616.292592 \n",
303 | " 0.123362 \n",
304 | " 60.058462 \n",
305 | " 0.432647 \n",
306 | " \n",
307 | " \n",
308 | " min \n",
309 | " 1.000000 \n",
310 | " 0.448000 \n",
311 | " 1.000000 \n",
312 | " 0.000000 \n",
313 | " \n",
314 | " \n",
315 | " 25% \n",
316 | " 8554.250000 \n",
317 | " 0.740000 \n",
318 | " 23.000000 \n",
319 | " 0.000000 \n",
320 | " \n",
321 | " \n",
322 | " 50% \n",
323 | " 16982.500000 \n",
324 | " 0.903000 \n",
325 | " 47.000000 \n",
326 | " 0.000000 \n",
327 | " \n",
328 | " \n",
329 | " 75% \n",
330 | " 25169.750000 \n",
331 | " 0.920000 \n",
332 | " 88.000000 \n",
333 | " 0.000000 \n",
334 | " \n",
335 | " \n",
336 | " max \n",
337 | " 33380.000000 \n",
338 | " 0.949000 \n",
339 | " 336.000000 \n",
340 | " 1.000000 \n",
341 | " \n",
342 | " \n",
343 | "
\n",
344 | "
"
345 | ],
346 | "text/plain": [
347 | " enrollee_id city_development_index training_hours target\n",
348 | "count 19158.000000 19158.000000 19158.000000 19158.000000\n",
349 | "mean 16875.358179 0.828848 65.366896 0.249348\n",
350 | "std 9616.292592 0.123362 60.058462 0.432647\n",
351 | "min 1.000000 0.448000 1.000000 0.000000\n",
352 | "25% 8554.250000 0.740000 23.000000 0.000000\n",
353 | "50% 16982.500000 0.903000 47.000000 0.000000\n",
354 | "75% 25169.750000 0.920000 88.000000 0.000000\n",
355 | "max 33380.000000 0.949000 336.000000 1.000000"
356 | ]
357 | },
358 | "execution_count": 4,
359 | "metadata": {},
360 | "output_type": "execute_result"
361 | }
362 | ],
363 | "source": [
364 | "data.describe()"
365 | ]
366 | },
367 | {
368 | "cell_type": "markdown",
369 | "id": "200a7584",
370 | "metadata": {},
371 | "source": [
372 | "## Config"
373 | ]
374 | },
375 | {
376 | "cell_type": "code",
377 | "execution_count": 6,
378 | "id": "563d7aac",
379 | "metadata": {},
380 | "outputs": [],
381 | "source": [
382 | "CAT_VARS_REPLACE_NA_WITH_STRING_MISSING = ['gender', 'major_discipline', 'company_size', 'company_type']\n",
383 | "\n",
384 | "CAT_VARS_REPLACE_NA_WITH_FREQUENT = ['enrolled_university', 'education_level', 'experience', 'last_new_job']\n",
385 | "\n",
386 | "NUM_VARS = ['city_development_index', 'training_hours']\n",
387 | "\n",
388 | "NUM_VARS_YEO_JOHNSON = ['training_hours']\n",
389 | "\n",
390 | "CAT_VARS_ORDINAL = ['relevent_experience', 'enrolled_university', 'education_level', 'major_discipline']\n",
391 | "CAT_VARS_ORDINAL_ARBITRARY = ['city']\n",
392 | "CAT_VARS_ONEHOT = ['gender']\n",
393 | "CAT_VARS_COUNT_FREQUENCY = ['company_type']\n",
394 | "\n",
395 | "EXPERIENCE_VAR = ['experience']\n",
396 | "\n",
397 | "EXPERIENCE_MAP = {\n",
398 | " '<1': 0,\n",
399 | " '1': 1, \n",
400 | " '2': 2, \n",
401 | " '3': 3, \n",
402 | " '4': 4, \n",
403 | " '5': 5,\n",
404 | " '6': 6,\n",
405 | " '7': 7,\n",
406 | " '8': 8, \n",
407 | " '9': 9, \n",
408 | " '10': 10, \n",
409 | " '11': 11,\n",
410 | " '12': 12,\n",
411 | " '13': 13, \n",
412 | " '14': 14, \n",
413 | " '15': 15, \n",
414 | " '16': 16,\n",
415 | " '17': 17,\n",
416 | " '18': 18,\n",
417 | " '19': 19, \n",
418 | " '20': 20, \n",
419 | " '>20': 21\n",
420 | "} \n",
421 | "LAST_NEW_JOB_VAR = ['last_new_job']\n",
422 | "\n",
423 | "LAST_NEW_JOB_MAP = {\n",
424 | " 'never': 0,\n",
425 | " '1': 1, \n",
426 | " '2': 2, \n",
427 | " '3': 3, \n",
428 | " '4': 4, \n",
429 | " '>4': 5\n",
430 | "}\n",
431 | "\n",
432 | "COMPANY_SIZE_VAR = ['company_size']\n",
433 | "\n",
434 | "COMPANY_SIZE_MAP = {\n",
435 | " 'Missing': 0,\n",
436 | " '<10': 1,\n",
437 | " '10/49': 2, \n",
438 | " '50-99': 3, \n",
439 | " '100-500': 4, \n",
440 | " '500-999': 5, \n",
441 | " '1000-4999': 6, \n",
442 | " '5000-9999': 7,\n",
443 | " '10000+': 8, \n",
444 | "}"
445 | ]
446 | },
447 | {
448 | "cell_type": "markdown",
449 | "id": "e16607b7",
450 | "metadata": {},
451 | "source": [
452 | "## Pipeline"
453 | ]
454 | },
455 | {
456 | "cell_type": "code",
457 | "execution_count": 7,
458 | "id": "1b60d336",
459 | "metadata": {},
460 | "outputs": [],
461 | "source": [
462 | "pipe = Pipeline([\n",
463 | " ('cat_imputer_missing', CategoricalImputer(imputation_method='missing', variables=CAT_VARS_REPLACE_NA_WITH_STRING_MISSING)),\n",
464 | " ('cat_imputer_frequent', CategoricalImputer(imputation_method='frequent', variables=CAT_VARS_REPLACE_NA_WITH_FREQUENT)),\n",
465 | " ('num_transformer_yeo_johnson', YeoJohnsonTransformer(variables=NUM_VARS_YEO_JOHNSON)),\n",
466 | " ('ordinal_encoder', OrdinalEncoder(encoding_method='ordered', variables=CAT_VARS_ORDINAL)),\n",
467 | " ('ordinal_encoder_arbitrary', OrdinalEncoder(encoding_method='arbitrary', variables=CAT_VARS_ORDINAL_ARBITRARY)),\n",
468 | " ('count_frequency_encoder', CountFrequencyEncoder(encoding_method='frequency', variables=CAT_VARS_COUNT_FREQUENCY)),\n",
469 | " ('onehot_encoder', OneHotEncoder(variables=CAT_VARS_ONEHOT)),\n",
470 | " ('experience_map', pp.Mapper(variables=EXPERIENCE_VAR, mappings=EXPERIENCE_MAP)),\n",
471 | " ('last_new_job_map', pp.Mapper(variables=LAST_NEW_JOB_VAR, mappings=LAST_NEW_JOB_MAP)),\n",
472 | " ('company_size_map', pp.Mapper(variables=COMPANY_SIZE_VAR, mappings=COMPANY_SIZE_MAP)),\n",
473 | " ('min_max_scaler', MinMaxScaler()),\n",
474 | " \n",
475 | " ('logistic_regression', LogisticRegression(random_state=0))\n",
476 | "])"
477 | ]
478 | },
479 | {
480 | "cell_type": "code",
481 | "execution_count": 8,
482 | "id": "dda370e1",
483 | "metadata": {},
484 | "outputs": [
485 | {
486 | "data": {
487 | "text/plain": [
488 | "Pipeline(steps=[('cat_imputer_missing',\n",
489 | " CategoricalImputer(variables=['gender', 'major_discipline',\n",
490 | " 'company_size',\n",
491 | " 'company_type'])),\n",
492 | " ('cat_imputer_frequent',\n",
493 | " CategoricalImputer(imputation_method='frequent',\n",
494 | " variables=['enrolled_university',\n",
495 | " 'education_level', 'experience',\n",
496 | " 'last_new_job'])),\n",
497 | " ('num_transformer_yeo_johnson',\n",
498 | " YeoJohnsonTransformer(variables=['t...\n",
499 | " Mapper(mappings={'1': 1, '2': 2, '3': 3, '4': 4, '>4': 5,\n",
500 | " 'never': 0},\n",
501 | " variables=['last_new_job'])),\n",
502 | " ('company_size_map',\n",
503 | " Mapper(mappings={'10/49': 2, '100-500': 4, '1000-4999': 6,\n",
504 | " '10000+': 8, '50-99': 3, '500-999': 5,\n",
505 | " '5000-9999': 7, '<10': 1, 'Missing': 0},\n",
506 | " variables=['company_size'])),\n",
507 | " ('min_max_scaler', MinMaxScaler()),\n",
508 | " ('logistic_regression', LogisticRegression(random_state=0))])"
509 | ]
510 | },
511 | "execution_count": 8,
512 | "metadata": {},
513 | "output_type": "execute_result"
514 | }
515 | ],
516 | "source": [
517 | "pipe.fit(X_train, y_train)"
518 | ]
519 | },
520 | {
521 | "cell_type": "code",
522 | "execution_count": 9,
523 | "id": "2624855f",
524 | "metadata": {},
525 | "outputs": [],
526 | "source": [
527 | "preds = pipe.predict(X_test)"
528 | ]
529 | },
530 | {
531 | "cell_type": "code",
532 | "execution_count": 10,
533 | "id": "d24ccde7",
534 | "metadata": {},
535 | "outputs": [
536 | {
537 | "data": {
538 | "text/plain": [
539 | "0.774008350730689"
540 | ]
541 | },
542 | "execution_count": 10,
543 | "metadata": {},
544 | "output_type": "execute_result"
545 | }
546 | ],
547 | "source": [
548 | "accuracy_score(y_test, preds)"
549 | ]
550 | },
551 | {
552 | "cell_type": "markdown",
553 | "id": "25bd4047",
554 | "metadata": {},
555 | "source": [
556 | "## Save the pipe"
557 | ]
558 | },
559 | {
560 | "cell_type": "code",
561 | "execution_count": 11,
562 | "id": "943b6d87",
563 | "metadata": {},
564 | "outputs": [
565 | {
566 | "data": {
567 | "text/plain": [
568 | "['pipe.joblib']"
569 | ]
570 | },
571 | "execution_count": 11,
572 | "metadata": {},
573 | "output_type": "execute_result"
574 | }
575 | ],
576 | "source": [
577 | "joblib.dump(pipe, 'pipe.joblib') "
578 | ]
579 | },
580 | {
581 | "cell_type": "markdown",
582 | "id": "d92a3839",
583 | "metadata": {},
584 | "source": [
585 | "## Score new data"
586 | ]
587 | },
588 | {
589 | "cell_type": "code",
590 | "execution_count": 12,
591 | "id": "b3988421",
592 | "metadata": {},
593 | "outputs": [],
594 | "source": [
595 | "new_data = pd.read_csv('../src/data/test.csv')"
596 | ]
597 | },
598 | {
599 | "cell_type": "code",
600 | "execution_count": 13,
601 | "id": "6f5ba36f",
602 | "metadata": {
603 | "tags": []
604 | },
605 | "outputs": [
606 | {
607 | "data": {
608 | "text/html": [
609 | "\n",
610 | "\n",
623 | "
\n",
624 | " \n",
625 | " \n",
626 | " \n",
627 | " city \n",
628 | " city_development_index \n",
629 | " gender \n",
630 | " relevent_experience \n",
631 | " enrolled_university \n",
632 | " education_level \n",
633 | " major_discipline \n",
634 | " experience \n",
635 | " company_size \n",
636 | " company_type \n",
637 | " last_new_job \n",
638 | " training_hours \n",
639 | " \n",
640 | " \n",
641 | " \n",
642 | " \n",
643 | " 0 \n",
644 | " city_41 \n",
645 | " 0.827 \n",
646 | " Male \n",
647 | " Has relevent experience \n",
648 | " Full time course \n",
649 | " Graduate \n",
650 | " STEM \n",
651 | " 9 \n",
652 | " <10 \n",
653 | " NaN \n",
654 | " 1 \n",
655 | " 21 \n",
656 | " \n",
657 | " \n",
658 | " 1 \n",
659 | " city_103 \n",
660 | " 0.920 \n",
661 | " Female \n",
662 | " Has relevent experience \n",
663 | " no_enrollment \n",
664 | " Graduate \n",
665 | " STEM \n",
666 | " 5 \n",
667 | " NaN \n",
668 | " Pvt Ltd \n",
669 | " 1 \n",
670 | " 98 \n",
671 | " \n",
672 | " \n",
673 | " 2 \n",
674 | " city_21 \n",
675 | " 0.624 \n",
676 | " Male \n",
677 | " No relevent experience \n",
678 | " no_enrollment \n",
679 | " High School \n",
680 | " NaN \n",
681 | " <1 \n",
682 | " NaN \n",
683 | " Pvt Ltd \n",
684 | " never \n",
685 | " 15 \n",
686 | " \n",
687 | " \n",
688 | " 3 \n",
689 | " city_13 \n",
690 | " 0.827 \n",
691 | " Male \n",
692 | " Has relevent experience \n",
693 | " no_enrollment \n",
694 | " Masters \n",
695 | " STEM \n",
696 | " 11 \n",
697 | " 10/49 \n",
698 | " Pvt Ltd \n",
699 | " 1 \n",
700 | " 39 \n",
701 | " \n",
702 | " \n",
703 | " 4 \n",
704 | " city_103 \n",
705 | " 0.920 \n",
706 | " Male \n",
707 | " Has relevent experience \n",
708 | " no_enrollment \n",
709 | " Graduate \n",
710 | " STEM \n",
711 | " >20 \n",
712 | " 10000+ \n",
713 | " Pvt Ltd \n",
714 | " >4 \n",
715 | " 72 \n",
716 | " \n",
717 | " \n",
718 | " ... \n",
719 | " ... \n",
720 | " ... \n",
721 | " ... \n",
722 | " ... \n",
723 | " ... \n",
724 | " ... \n",
725 | " ... \n",
726 | " ... \n",
727 | " ... \n",
728 | " ... \n",
729 | " ... \n",
730 | " ... \n",
731 | " \n",
732 | " \n",
733 | " 2124 \n",
734 | " city_103 \n",
735 | " 0.920 \n",
736 | " Male \n",
737 | " No relevent experience \n",
738 | " no_enrollment \n",
739 | " Graduate \n",
740 | " Humanities \n",
741 | " 16 \n",
742 | " NaN \n",
743 | " Public Sector \n",
744 | " 4 \n",
745 | " 15 \n",
746 | " \n",
747 | " \n",
748 | " 2125 \n",
749 | " city_136 \n",
750 | " 0.897 \n",
751 | " Male \n",
752 | " Has relevent experience \n",
753 | " no_enrollment \n",
754 | " Masters \n",
755 | " STEM \n",
756 | " 18 \n",
757 | " NaN \n",
758 | " NaN \n",
759 | " 2 \n",
760 | " 30 \n",
761 | " \n",
762 | " \n",
763 | " 2126 \n",
764 | " city_100 \n",
765 | " 0.887 \n",
766 | " Male \n",
767 | " No relevent experience \n",
768 | " no_enrollment \n",
769 | " Primary School \n",
770 | " NaN \n",
771 | " 3 \n",
772 | " NaN \n",
773 | " Pvt Ltd \n",
774 | " never \n",
775 | " 18 \n",
776 | " \n",
777 | " \n",
778 | " 2127 \n",
779 | " city_102 \n",
780 | " 0.804 \n",
781 | " Male \n",
782 | " Has relevent experience \n",
783 | " Full time course \n",
784 | " High School \n",
785 | " NaN \n",
786 | " 7 \n",
787 | " 100-500 \n",
788 | " Public Sector \n",
789 | " 1 \n",
790 | " 84 \n",
791 | " \n",
792 | " \n",
793 | " 2128 \n",
794 | " city_102 \n",
795 | " 0.804 \n",
796 | " Male \n",
797 | " Has relevent experience \n",
798 | " no_enrollment \n",
799 | " Masters \n",
800 | " STEM \n",
801 | " 15 \n",
802 | " 10000+ \n",
803 | " Pvt Ltd \n",
804 | " 2 \n",
805 | " 11 \n",
806 | " \n",
807 | " \n",
808 | "
\n",
809 | "
2129 rows × 12 columns
\n",
810 | "
"
811 | ],
812 | "text/plain": [
813 | " city city_development_index gender relevent_experience \\\n",
814 | "0 city_41 0.827 Male Has relevent experience \n",
815 | "1 city_103 0.920 Female Has relevent experience \n",
816 | "2 city_21 0.624 Male No relevent experience \n",
817 | "3 city_13 0.827 Male Has relevent experience \n",
818 | "4 city_103 0.920 Male Has relevent experience \n",
819 | "... ... ... ... ... \n",
820 | "2124 city_103 0.920 Male No relevent experience \n",
821 | "2125 city_136 0.897 Male Has relevent experience \n",
822 | "2126 city_100 0.887 Male No relevent experience \n",
823 | "2127 city_102 0.804 Male Has relevent experience \n",
824 | "2128 city_102 0.804 Male Has relevent experience \n",
825 | "\n",
826 | " enrolled_university education_level major_discipline experience \\\n",
827 | "0 Full time course Graduate STEM 9 \n",
828 | "1 no_enrollment Graduate STEM 5 \n",
829 | "2 no_enrollment High School NaN <1 \n",
830 | "3 no_enrollment Masters STEM 11 \n",
831 | "4 no_enrollment Graduate STEM >20 \n",
832 | "... ... ... ... ... \n",
833 | "2124 no_enrollment Graduate Humanities 16 \n",
834 | "2125 no_enrollment Masters STEM 18 \n",
835 | "2126 no_enrollment Primary School NaN 3 \n",
836 | "2127 Full time course High School NaN 7 \n",
837 | "2128 no_enrollment Masters STEM 15 \n",
838 | "\n",
839 | " company_size company_type last_new_job training_hours \n",
840 | "0 <10 NaN 1 21 \n",
841 | "1 NaN Pvt Ltd 1 98 \n",
842 | "2 NaN Pvt Ltd never 15 \n",
843 | "3 10/49 Pvt Ltd 1 39 \n",
844 | "4 10000+ Pvt Ltd >4 72 \n",
845 | "... ... ... ... ... \n",
846 | "2124 NaN Public Sector 4 15 \n",
847 | "2125 NaN NaN 2 30 \n",
848 | "2126 NaN Pvt Ltd never 18 \n",
849 | "2127 100-500 Public Sector 1 84 \n",
850 | "2128 10000+ Pvt Ltd 2 11 \n",
851 | "\n",
852 | "[2129 rows x 12 columns]"
853 | ]
854 | },
855 | "execution_count": 13,
856 | "metadata": {},
857 | "output_type": "execute_result"
858 | }
859 | ],
860 | "source": [
861 | "new_data = new_data.drop(['enrollee_id'], axis=1)\n",
862 | "new_data"
863 | ]
864 | },
865 | {
866 | "cell_type": "code",
867 | "execution_count": 14,
868 | "id": "67b39187",
869 | "metadata": {},
870 | "outputs": [],
871 | "source": [
872 | "new_preds = pipe.predict(new_data)"
873 | ]
874 | },
875 | {
876 | "cell_type": "code",
877 | "execution_count": null,
878 | "id": "21ebea6c",
879 | "metadata": {},
880 | "outputs": [],
881 | "source": [
882 | "new_preds"
883 | ]
884 | },
885 | {
886 | "cell_type": "markdown",
887 | "id": "e2e394d7",
888 | "metadata": {},
889 | "source": [
890 | "TODO:\n",
891 | " \n",
892 | "- hyperparameter tuning\n",
893 | "- multiple algorithms\n",
894 | "- cross validation\n",
895 | "- sklearn similar projects\n",
896 | "- optuna"
897 | ]
898 | },
899 | {
900 | "cell_type": "code",
901 | "execution_count": 15,
902 | "id": "17a6a178",
903 | "metadata": {},
904 | "outputs": [
905 | {
906 | "data": {
907 | "text/plain": [
908 | "array([0., 0., 1., ..., 0., 0., 0.])"
909 | ]
910 | },
911 | "execution_count": 15,
912 | "metadata": {},
913 | "output_type": "execute_result"
914 | }
915 | ],
916 | "source": [
917 | "pipe.predict(new_data)"
918 | ]
919 | },
920 | {
921 | "cell_type": "code",
922 | "execution_count": 20,
923 | "id": "e8b02e6b",
924 | "metadata": {},
925 | "outputs": [
926 | {
927 | "data": {
928 | "text/plain": [
929 | "numpy.ndarray"
930 | ]
931 | },
932 | "execution_count": 20,
933 | "metadata": {},
934 | "output_type": "execute_result"
935 | }
936 | ],
937 | "source": [
938 | "type(pipe.predict(data.drop(['enrollee_id', 'target'], axis=1)))"
939 | ]
940 | },
941 | {
942 | "cell_type": "code",
943 | "execution_count": 21,
944 | "id": "b497132c",
945 | "metadata": {},
946 | "outputs": [
947 | {
948 | "data": {
949 | "text/plain": [
950 | "True"
951 | ]
952 | },
953 | "execution_count": 21,
954 | "metadata": {},
955 | "output_type": "execute_result"
956 | }
957 | ],
958 | "source": [
959 | "isinstance(pipe.predict(data.drop(['enrollee_id', 'target'], axis=1))[:10], np.ndarray)"
960 | ]
961 | },
962 | {
963 | "cell_type": "code",
964 | "execution_count": 35,
965 | "id": "7b8396cc",
966 | "metadata": {},
967 | "outputs": [],
968 | "source": [
969 | "REQUIREMENTS_DIR = '../requirements'\n",
970 | "fname=\"production.txt\"\n",
971 | "with open(f'{REQUIREMENTS_DIR}/{fname}') as fd:\n",
972 | " reqs = fd.read().splitlines()\n",
973 | "reqs = list(filter(None, reqs))"
974 | ]
975 | },
976 | {
977 | "cell_type": "code",
978 | "execution_count": 36,
979 | "id": "82d589f2-5682-4d41-a416-40107feff6db",
980 | "metadata": {},
981 | "outputs": [],
982 | "source": [
983 | "for req in reqs:\n",
984 | " if '-r' in req:\n",
985 | " with open(f\"{REQUIREMENTS_DIR}/{req.split(' ')[1]}\") as fd:\n",
986 | " extra_reqs = fd.read().splitlines()\n",
987 | " reqs.remove(req)"
988 | ]
989 | },
990 | {
991 | "cell_type": "code",
992 | "execution_count": 37,
993 | "id": "fbcb5bf8-f71c-443c-96b6-1fa2a6603108",
994 | "metadata": {},
995 | "outputs": [],
996 | "source": [
997 | "reqs=extra_reqs+reqs"
998 | ]
999 | },
1000 | {
1001 | "cell_type": "code",
1002 | "execution_count": 38,
1003 | "id": "d3573470-3693-40a2-99c9-07893ae991f5",
1004 | "metadata": {},
1005 | "outputs": [
1006 | {
1007 | "data": {
1008 | "text/plain": [
1009 | "['feature-engine==1.2.0',\n",
1010 | " 'scikit-learn==1.0.2',\n",
1011 | " 'scipy==1.8.0',\n",
1012 | " 'seaborn==0.11.2',\n",
1013 | " 'pandas==1.4.1',\n",
1014 | " 'numpy==1.22.3',\n",
1015 | " 'joblib==1.1.0',\n",
1016 | " 'loguru==0.6.0',\n",
1017 | " 'tox==3.24.5',\n",
1018 | " 'pytest==7.0.1',\n",
1019 | " 'black==22.1.0',\n",
1020 | " 'flake8==4.0.1',\n",
1021 | " 'mypy==0.931',\n",
1022 | " 'isort==5.10.1',\n",
1023 | " 'pydantic==1.9.0',\n",
1024 | " 'strictyaml==1.6.1']"
1025 | ]
1026 | },
1027 | "execution_count": 38,
1028 | "metadata": {},
1029 | "output_type": "execute_result"
1030 | }
1031 | ],
1032 | "source": [
1033 | "reqs"
1034 | ]
1035 | },
1036 | {
1037 | "cell_type": "code",
1038 | "execution_count": null,
1039 | "id": "51185c06-71e0-4fde-bef1-328ff6b745a9",
1040 | "metadata": {},
1041 | "outputs": [],
1042 | "source": []
1043 | }
1044 | ],
1045 | "metadata": {
1046 | "kernelspec": {
1047 | "display_name": "Python 3 (ipykernel)",
1048 | "language": "python",
1049 | "name": "python3"
1050 | },
1051 | "language_info": {
1052 | "codemirror_mode": {
1053 | "name": "ipython",
1054 | "version": 3
1055 | },
1056 | "file_extension": ".py",
1057 | "mimetype": "text/x-python",
1058 | "name": "python",
1059 | "nbconvert_exporter": "python",
1060 | "pygments_lexer": "ipython3",
1061 | "version": "3.9.10"
1062 | }
1063 | },
1064 | "nbformat": 4,
1065 | "nbformat_minor": 5
1066 | }
1067 |
--------------------------------------------------------------------------------
/notebooks/pipe.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Deffro/end-to-end-ML-project/5cea993ce8e49f7244ad056de5803cf9b1dcbd46/notebooks/pipe.joblib
--------------------------------------------------------------------------------
/notebooks/preprocess.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | from sklearn.base import BaseEstimator, TransformerMixin
5 |
6 |
class Mapper(BaseEstimator, TransformerMixin):
    """Transformer that recodes the values of selected columns through a lookup table.

    ``variables`` is the list of column names to recode and ``mappings`` is
    the object handed to ``Series.map`` for each of those columns.
    """

    def __init__(self, variables, mappings):
        # Only a real list is accepted for the column names.
        if isinstance(variables, list):
            self.variables = variables
            self.mappings = mappings
        else:
            raise ValueError('variables should be a list')

    def fit(self, X, y=None):
        """Nothing is learned; the method exists so the class fits into a sklearn pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which every configured column has been mapped."""
        recoded = X.copy()
        for column in self.variables:
            recoded[column] = recoded[column].map(self.mappings)
        return recoded
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 | "setuptools>=42",
4 | "wheel"
5 | ]
6 | build-backend = "setuptools.build_meta"
7 |
8 | [tool.pytest.ini_options]
9 | minversion = "2.0"
10 | addopts = "-rfEX -p pytester --strict-markers"
11 | python_files = ["test_*.py", "*_test.py"]
12 | python_classes = ["Test", "Acceptance"]
13 | python_functions = ["test"]
14 | # NOTE: "doc" is not included here, but gets tested explicitly via "doctesting".
15 | testpaths = ["tests"]
16 | xfail_strict = true
17 | filterwarnings = [
18 | "error",
19 | "default:Using or importing the ABCs:DeprecationWarning:unittest2.*",
20 | # produced by older pyparsing<=2.2.0.
21 | "default:Using or importing the ABCs:DeprecationWarning:pyparsing.*",
22 | "default:the imp module is deprecated in favour of importlib:DeprecationWarning:nose.*",
23 | # distutils is deprecated in 3.10, scheduled for removal in 3.12
24 | "ignore:The distutils package is deprecated:DeprecationWarning",
25 | # produced by python3.6/site.py itself (3.6.7 on Travis, could not trigger it with 3.6.8)."
26 | "ignore:.*U.*mode is deprecated:DeprecationWarning:(?!(pytest|_pytest))",
27 | # produced by pytest-xdist
28 | "ignore:.*type argument to addoption.*:DeprecationWarning",
29 | # produced on execnet (pytest-xdist)
30 | "ignore:.*inspect.getargspec.*deprecated, use inspect.signature.*:DeprecationWarning",
31 | # pytest's own futurewarnings
32 | "ignore::pytest.PytestExperimentalApiWarning",
33 | # Do not cause SyntaxError for invalid escape sequences in py37.
34 | # Those are caught/handled by pyupgrade, and not easy to filter with the
35 | # module being the filename (with .py removed).
36 | "default:invalid escape sequence:DeprecationWarning",
37 | # ignore use of unregistered marks, because we use many to test the implementation
38 | "ignore::_pytest.warning_types.PytestUnknownMarkWarning",
39 | ]
40 |
41 | [tool.black]
42 | target-version = ['py39']
43 |
44 | [tool.isort]
45 | profile = "black"
46 | line_length = 120
47 | lines_between_sections = 1
48 | known_first_party = "sentry"
49 | skip = "migrations"
--------------------------------------------------------------------------------
/requirements/deployment.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 | -r production.txt
3 |
4 | fastapi==0.75.0
5 | uvicorn==0.17.5
6 | python-multipart==0.0.5
7 | typing_extensions==3.10.0
--------------------------------------------------------------------------------
/requirements/production.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 |
3 | tox==3.24.5
4 | pytest==7.0.1
5 | black==22.1.0
6 | flake8==4.0.1
7 | mypy==0.931
8 | isort==5.10.1
9 | pydantic==1.9.0
10 | strictyaml==1.6.1
11 |
--------------------------------------------------------------------------------
/requirements/requirements.txt:
--------------------------------------------------------------------------------
1 | feature-engine==1.2.0
2 | scikit-learn==1.0.2
3 | scipy==1.8.0
4 | seaborn==0.11.2
5 | pandas==1.4.1
6 | numpy==1.22.3
7 | joblib==1.1.0
8 | loguru==0.6.0
9 |
--------------------------------------------------------------------------------
/requirements/research-env.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 |
3 | jupyterlab==3.3.0
4 | jupyterlab-lsp==3.10.0
5 | jupyter-lsp==1.5.1
6 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from pathlib import Path
5 |
6 | from setuptools import find_packages, setup
7 |
# Package meta-data.
# These constants are passed straight to setup() further down in this file.
NAME = 'end-to-end-ML-project'
DESCRIPTION = "End to End ML Project"
URL = "https://github.com/Deffro/end-to-end-ML-project"
EMAIL = "dimitris.effrosynidis@gmail.com"
AUTHOR = "Dimitris Effrosynidis"
REQUIRES_PYTHON = ">=3.7.0"


# The rest you shouldn't have to touch too much :)
# ------------------------------------------------
# Except, perhaps the License and Trove Classifiers!
# If you do change the License, remember to change the
# Trove Classifier for that!
# The long description is simply the one-line DESCRIPTION (no README is read here).
long_description = DESCRIPTION

# Load the package's VERSION file as a dictionary.
about = {}
# All paths are resolved relative to the directory that contains this setup.py.
ROOT_DIR = Path(__file__).resolve().parent
REQUIREMENTS_DIR = ROOT_DIR / 'requirements'
PACKAGE_DIR = ROOT_DIR / 'src'
# The version string is the stripped content of src/VERSION.
with open(PACKAGE_DIR / "VERSION") as f:
    _version = f.read().strip()
    about["__version__"] = _version
32 |
33 |
34 | # What packages are required for this module to be executed?
def list_reqs(fname="production.txt"):
    """Return the requirement specifiers declared in a requirements file.

    Lines of the form ``-r other.txt`` (or ``--requirement other.txt``) are
    replaced, in place, by the requirements of the referenced file, which is
    looked up in ``REQUIREMENTS_DIR``. Includes are followed recursively and
    each file is read at most once, so circular includes cannot loop forever.
    Blank lines and ``#`` comment lines are dropped.

    Args:
        fname: Name of the requirements file inside ``REQUIREMENTS_DIR``.

    Returns:
        A list of requirement strings suitable for ``install_requires``.

    Raises:
        OSError: If ``fname`` or any included file cannot be opened.
    """

    def _expand(name, seen):
        # Collect the requirements of one file, expanding its includes inline.
        if name in seen:
            return []
        seen.add(name)

        collected = []
        with open(REQUIREMENTS_DIR / name) as fd:
            for raw_line in fd.read().splitlines():
                line = raw_line.strip()
                if not line or line.startswith("#"):
                    continue
                parts = line.split(None, 1)
                # Compare the first token exactly: a substring test such as
                # `'-r' in line` also matches package names containing "-r".
                if parts[0] in ("-r", "--requirement"):
                    if len(parts) == 2:
                        collected.extend(_expand(parts[1].strip(), seen))
                    continue
                collected.append(line)
        return collected

    return _expand(fname, set())
48 |
49 |
50 | # Where the magic happens:
# Register the distribution with setuptools using the metadata defined above.
setup(
    name=NAME,
    version=about["__version__"],
    description=DESCRIPTION,
    long_description=long_description,
    long_description_content_type="text/markdown",
    author=AUTHOR,
    author_email=EMAIL,
    python_requires=REQUIRES_PYTHON,
    url=URL,
    packages=find_packages(exclude=("tests",)),
    # Ship the VERSION file inside the package so src/__init__.py can read it.
    package_data={"src": ["VERSION"]},
    install_requires=list_reqs(),
    extras_require={},
    include_package_data=True,
    # NOTE(review): "BSD-3" disagrees with the MIT trove classifier below;
    # confirm the intended license and align the two.
    license="BSD-3",
    classifiers=[
        # Trove classifiers
        # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        # No 3.6 classifier: REQUIRES_PYTHON is ">=3.7.0".
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: Implementation :: CPython",
        "Programming Language :: Python :: Implementation :: PyPy",
    ],
)
81 |
--------------------------------------------------------------------------------
/src/VERSION:
--------------------------------------------------------------------------------
1 | 0.0.7
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | from src.config.core import PACKAGE_ROOT
2 |
# Expose the package version: the stripped content of the VERSION file that
# lives in the package root directory.
with open(PACKAGE_ROOT / "VERSION") as version_file:
    __version__ = version_file.read().strip()
5 |
--------------------------------------------------------------------------------
/src/config.yml:
--------------------------------------------------------------------------------
1 | # Data Files
2 | training_data_file: train.csv
3 | test_data_file: test.csv
4 | pipeline_save_file: model_v
5 |
6 | # Variables
7 | target: target
8 |
9 | var_to_drop: enrollee_id
10 |
11 | cat_vars_replace_na_with_string_missing:
12 | - gender
13 | - major_discipline
14 | - company_size
15 | - company_type
16 |
17 | cat_vars_replace_na_with_frequent:
18 | - enrolled_university
19 | - education_level
20 | - experience
21 | - last_new_job
22 |
23 | num_vars:
24 | - city_development_index
25 | - training_hours
26 |
27 | num_vars_yeo_johnson:
28 | - training_hours
29 |
30 | cat_vars_ordinal:
31 | - relevent_experience
32 | - enrolled_university
33 | - education_level
34 | - major_discipline
35 |
36 | cat_vars_ordinal_arbitrary:
37 | - city
38 |
39 | cat_vars_onehot:
40 | - gender
41 |
42 | cat_vars_count_frequency:
43 | - company_type
44 |
45 | experience_var:
46 | - experience
47 |
48 | experience_map:
49 | <1: 0
50 | 1: 1
51 | 2: 2
52 | 3: 3
53 | 4: 4
54 | 5: 5
55 | 6: 6
56 | 7: 7
57 | 8: 8
58 | 9: 9
59 | 10: 10
60 | 11: 11
61 | 12: 12
62 | 13: 13
63 | 14: 14
64 | 15: 15
65 | 16: 16
66 | 17: 17
67 | 18: 18
68 | 19: 19
69 | 20: 20
70 | '>20': 21
71 |
72 | last_new_job_var:
73 | - last_new_job
74 |
75 | last_new_job_map:
76 | never: 0
77 | 1: 1
78 | 2: 2
79 | 3: 3
80 | 4: 4
81 | '>4': 5
82 |
83 | company_size_var:
84 | - company_size
85 |
86 | company_size_map:
87 | Missing: 0
88 | <10: 1
89 | 10/49: 2
90 | 50-99: 3
91 | 100-500: 4
92 | 500-999: 5
93 | 1000-4999: 6
94 | 5000-9999: 7
95 | 10000+: 8
96 |
97 | # Initializations
98 |
99 | test_size: 0.1
100 |
101 | # Model specific
102 |
103 | random_state: 43
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
--------------------------------------------------------------------------------
/src/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Deffro/end-to-end-ML-project/5cea993ce8e49f7244ad056de5803cf9b1dcbd46/src/config/__init__.py
--------------------------------------------------------------------------------
/src/config/core.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import Dict, List
3 |
4 | from pydantic import BaseModel
5 | from strictyaml import YAML, load
6 |
7 | import src
8 |
# Directory that contains the `src` package's __init__.py, resolved to an absolute path.
PACKAGE_ROOT = Path(src.__file__).resolve().parent
# Parent directory of the package.
ROOT = PACKAGE_ROOT.parent
# Files and folders addressed relative to the package directory.
CONFIG_FILE_PATH = PACKAGE_ROOT / "config.yml"
DATASET_DIR = PACKAGE_ROOT / "data"
TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models"
14 |
15 |
class AppConfig(BaseModel):
    """
    Application-level config.
    """

    # File name of the training CSV; run_training passes it to load_dataset,
    # which looks it up under DATASET_DIR.
    training_data_file: str
    # File name of the test CSV (presumably also under DATASET_DIR; confirm against callers).
    test_data_file: str
    # Prefix of the persisted pipeline file; the package version and ".pkl"
    # are appended when the file name is built.
    pipeline_save_file: str
24 |
25 |
class ModelConfig(BaseModel):
    # Name of the label column; separated from the features before training
    # and dropped from prediction input when present.
    target: str
    # Column removed from every dataset by load_dataset.
    var_to_drop: str
    # Columns for CategoricalImputer(imputation_method="missing").
    cat_vars_replace_na_with_string_missing: List[str]
    # Columns for CategoricalImputer(imputation_method="frequent").
    cat_vars_replace_na_with_frequent: List[str]
    # Numerical columns (not referenced by the pipeline definition in pipeline.py).
    num_vars: List[str]
    # Columns passed to YeoJohnsonTransformer.
    num_vars_yeo_johnson: List[str]
    # Columns for OrdinalEncoder(encoding_method="ordered").
    cat_vars_ordinal: List[str]
    # Columns for OrdinalEncoder(encoding_method="arbitrary").
    cat_vars_ordinal_arbitrary: List[str]
    # Columns passed to OneHotEncoder.
    cat_vars_onehot: List[str]
    # Columns for CountFrequencyEncoder(encoding_method="frequency").
    cat_vars_count_frequency: List[str]
    # Each *_var list names the column(s) recoded by a Mapper step using the
    # matching *_map lookup table.
    experience_var: List[str]
    experience_map: Dict[str, int]
    last_new_job_var: List[str]
    last_new_job_map: Dict[str, int]
    company_size_var: List[str]
    company_size_map: Dict[str, int]
    # test_size argument given to train_test_split.
    test_size: float
    # Seed used for both train_test_split and LogisticRegression.
    random_state: int
45 |
46 |
class Config(BaseModel):
    """Master config object. Name and match the pydantic configs"""

    # Both sections are built from the same parsed YAML mapping in
    # create_and_validate_config.
    app_config: AppConfig
    model_config: ModelConfig
52 |
53 |
def find_config_file() -> Path:
    """Return ``CONFIG_FILE_PATH`` if it points to an existing file.

    Raises:
        Exception: When no file exists at ``CONFIG_FILE_PATH``.
    """
    # Guard clause: fail first, return the path on the happy path.
    if not CONFIG_FILE_PATH.is_file():
        raise Exception(f"Config not found at {CONFIG_FILE_PATH}")
    return CONFIG_FILE_PATH
60 |
61 |
def fetch_config_from_yaml(cfg_path: Path = None) -> YAML:
    """Read a YAML config file and return the strictyaml parse result.

    Args:
        cfg_path: File to read. When falsy, the location reported by
            ``find_config_file`` is used instead.

    Returns:
        The object produced by ``strictyaml.load`` for the file's text.

    Raises:
        OSError: If no usable path is available, or the file cannot be opened.
    """
    resolved_path = cfg_path or find_config_file()
    if not resolved_path:
        raise OSError(f"Did not find config file at path: {resolved_path}")

    with open(resolved_path, "r") as handle:
        yaml_text = handle.read()
    return load(yaml_text)
73 |
74 |
def create_and_validate_config(parsed_config: YAML = None) -> Config:
    """Build the validated ``Config`` object from parsed YAML.

    Args:
        parsed_config: Already parsed YAML; when ``None`` the package config
            is loaded through ``fetch_config_from_yaml``.

    Returns:
        A ``Config`` whose two sections are both populated from the same
        YAML mapping.
    """
    yaml_doc = fetch_config_from_yaml() if parsed_config is None else parsed_config

    # `.data` is the attribute of the strictyaml YAML type that holds the values.
    return Config(
        app_config=AppConfig(**yaml_doc.data),
        model_config=ModelConfig(**yaml_doc.data),
    )
87 |
88 |
89 | config = create_and_validate_config()
90 |
--------------------------------------------------------------------------------
/src/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Deffro/end-to-end-ML-project/5cea993ce8e49f7244ad056de5803cf9b1dcbd46/src/data/__init__.py
--------------------------------------------------------------------------------
/src/pipeline.py:
--------------------------------------------------------------------------------
1 | from feature_engine.encoding import CountFrequencyEncoder, OneHotEncoder, OrdinalEncoder
2 | from feature_engine.imputation import CategoricalImputer
3 | from feature_engine.transformation import YeoJohnsonTransformer
4 | from sklearn.linear_model import LogisticRegression
5 | from sklearn.pipeline import Pipeline
6 | from sklearn.preprocessing import MinMaxScaler
7 |
8 | from src.config.core import config
9 | from src.processing import features as pp
10 |
# Full modelling pipeline. Steps run in the order listed; the step names are
# part of the fitted object and are kept exactly as they were.
pipe = Pipeline(
    steps=[
        # --- missing-value imputation for categorical columns ---
        (
            "cat_imputer_missing",
            CategoricalImputer(
                variables=config.model_config.cat_vars_replace_na_with_string_missing,
                imputation_method="missing",
            ),
        ),
        (
            "cat_imputer_frequent",
            CategoricalImputer(
                variables=config.model_config.cat_vars_replace_na_with_frequent,
                imputation_method="frequent",
            ),
        ),
        # --- numerical transformation ---
        (
            "num_transformer_yeo_johnson",
            YeoJohnsonTransformer(variables=config.model_config.num_vars_yeo_johnson),
        ),
        # --- categorical encoders ---
        (
            "ordinal_encoder",
            OrdinalEncoder(
                variables=config.model_config.cat_vars_ordinal,
                encoding_method="ordered",
            ),
        ),
        (
            "ordinal_encoder_arbitrary",
            OrdinalEncoder(
                variables=config.model_config.cat_vars_ordinal_arbitrary,
                encoding_method="arbitrary",
            ),
        ),
        (
            "count_frequency_encoder",
            CountFrequencyEncoder(
                variables=config.model_config.cat_vars_count_frequency,
                encoding_method="frequency",
            ),
        ),
        ("onehot_encoder", OneHotEncoder(variables=config.model_config.cat_vars_onehot)),
        # --- explicit value -> integer lookups taken from the config ---
        (
            "experience_map",
            pp.Mapper(
                mappings=config.model_config.experience_map,
                variables=config.model_config.experience_var,
            ),
        ),
        (
            "last_new_job_map",
            pp.Mapper(
                mappings=config.model_config.last_new_job_map,
                variables=config.model_config.last_new_job_var,
            ),
        ),
        (
            "company_size_map",
            pp.Mapper(
                mappings=config.model_config.company_size_map,
                variables=config.model_config.company_size_var,
            ),
        ),
        # --- scaling and the final estimator ---
        ("min_max_scaler", MinMaxScaler()),
        (
            "logistic_regression",
            LogisticRegression(random_state=config.model_config.random_state),
        ),
    ]
)
84 |
--------------------------------------------------------------------------------
/src/predict.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | from src import __version__ as _version
4 | from src.config.core import config
5 | from src.processing.data_manager import load_pipeline
6 |
7 | pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"
8 | trained_pipe = load_pipeline(file_name=pipeline_file_name)
9 |
10 |
def make_prediction(input_data) -> dict:
    """Score ``input_data`` with the pipeline loaded at import time.

    Args:
        input_data: Anything ``pandas.DataFrame`` accepts as its data argument.

    Returns:
        A dict with the list of predictions under ``"predictions"`` and the
        package version under ``"version"``.
    """
    frame = pd.DataFrame(input_data)

    # The label column is not a feature; remove it when the caller sent it along.
    target_column = config.model_config.target
    if target_column in frame.columns:
        frame = frame.drop([target_column], axis=1)

    return {
        "predictions": list(trained_pipe.predict(X=frame)),
        "version": _version,
    }
26 |
--------------------------------------------------------------------------------
/src/processing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Deffro/end-to-end-ML-project/5cea993ce8e49f7244ad056de5803cf9b1dcbd46/src/processing/__init__.py
--------------------------------------------------------------------------------
/src/processing/data_manager.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 | from pathlib import Path
3 |
4 | import joblib
5 | import pandas as pd
6 | from sklearn.pipeline import Pipeline
7 |
8 | from src import __version__ as _version
9 | from src.config.core import DATASET_DIR, TRAINED_MODEL_DIR, config
10 |
11 |
def load_dataset(file_name: str) -> pd.DataFrame:
    """Read a CSV from ``DATASET_DIR`` and drop the configured ``var_to_drop`` column."""
    csv_path = Path(f"{DATASET_DIR}/{file_name}")
    unwanted_columns = [config.model_config.var_to_drop]
    return pd.read_csv(csv_path).drop(columns=unwanted_columns)
16 |
17 |
def save_pipeline(pipeline_to_persist: Pipeline) -> None:
    """Write the pipeline to ``TRAINED_MODEL_DIR`` under a versioned file name.

    The file name is the configured ``pipeline_save_file`` prefix followed by
    the package version and ``.pkl``. Every other model file in the directory
    is removed first, so exactly one trained model ships with a given package
    version and it is clear how that model was built.
    """
    versioned_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"

    # Clear out stale models, then persist the current one.
    remove_old_pipelines(files_to_keep=[versioned_name])
    joblib.dump(pipeline_to_persist, TRAINED_MODEL_DIR / versioned_name)
32 |
33 |
def load_pipeline(file_name: str) -> Pipeline:
    """Return the object stored in ``TRAINED_MODEL_DIR / file_name``.

    The file is read with ``joblib.load``, which unpickles its content, so
    only files produced by this project should be loaded.
    """
    return joblib.load(filename=TRAINED_MODEL_DIR / file_name)
40 |
41 |
def remove_old_pipelines(files_to_keep: t.List[str]) -> None:
    """
    Remove old model pipelines.
    This is to ensure there is a simple one-to-one
    mapping between the package version and the model
    version to be imported and used by other applications.

    Every entry of ``TRAINED_MODEL_DIR`` is deleted unless its name is in
    ``files_to_keep`` or is ``__init__.py`` / ``.ipynb_checkpoints``.
    Sub-directories are never deleted.

    Args:
        files_to_keep: Names (not paths) of files that must survive.
    """
    do_not_delete = set(files_to_keep) | {"__init__.py", ".ipynb_checkpoints"}
    for model_file in TRAINED_MODEL_DIR.iterdir():
        if model_file.name in do_not_delete:
            continue
        # Path.unlink() raises on a directory (for example a __pycache__
        # folder created when the package is imported), which would abort
        # saving the new model. Leave real directories alone.
        if model_file.is_dir() and not model_file.is_symlink():
            continue
        model_file.unlink()
53 |
--------------------------------------------------------------------------------
/src/processing/features.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import pandas as pd
4 | from sklearn.base import BaseEstimator, TransformerMixin
5 |
6 |
class Mapper(BaseEstimator, TransformerMixin):
    """Categorical variable mapper.

    Recodes each column listed in ``variables`` by passing it through
    ``Series.map`` with ``mappings``.
    """

    def __init__(self, variables: List[str], mappings: dict):
        # Reject anything that is not a list before storing the parameters.
        if not isinstance(variables, list):
            raise ValueError("variables should be a list")
        self.variables, self.mappings = variables, mappings

    def fit(self, x: pd.DataFrame, y: pd.Series = None):
        """No-op fit, present so the class can be used as a sklearn pipeline step."""
        return self

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``x`` with the configured columns mapped."""
        mapped = x.copy()
        for column in self.variables:
            mapped[column] = mapped[column].map(self.mappings)
        return mapped
28 |
--------------------------------------------------------------------------------
/src/train_pipeline.py:
--------------------------------------------------------------------------------
1 | from sklearn.model_selection import train_test_split
2 |
3 | from config.core import config
4 | from pipeline import pipe
5 | from processing.data_manager import load_dataset, save_pipeline
6 |
7 |
8 | def run_training() -> None:
9 | """Train the model."""
10 |
11 | # read training data
12 | data = load_dataset(file_name=config.app_config.training_data_file)
13 |
14 | # divide train and test
15 | X_train, X_test, y_train, y_test = train_test_split(
16 | data[[c for c in data.columns if c != config.model_config.target]],
17 | data[config.model_config.target],
18 | test_size=config.model_config.test_size,
19 | random_state=config.model_config.random_state,
20 | )
21 |
22 | # fit model
23 | pipe.fit(X_train, y_train)
24 |
25 | # persist trained model
26 | save_pipeline(pipeline_to_persist=pipe)
27 |
28 |
29 | if __name__ == "__main__":  # train only when run as a script, not on import
30 |     run_training()
31 |
--------------------------------------------------------------------------------
/src/trained_models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Deffro/end-to-end-ML-project/5cea993ce8e49f7244ad056de5803cf9b1dcbd46/src/trained_models/__init__.py
--------------------------------------------------------------------------------
/src/trained_models/model_v0.0.7.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Deffro/end-to-end-ML-project/5cea993ce8e49f7244ad056de5803cf9b1dcbd46/src/trained_models/model_v0.0.7.pkl
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Deffro/end-to-end-ML-project/5cea993ce8e49f7244ad056de5803cf9b1dcbd46/tests/__init__.py
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from src.config.core import config
4 | from src.processing.data_manager import load_dataset
5 |
6 |
7 | @pytest.fixture()
8 | def train_data():  # loads the file named by app_config.training_data_file
9 |     return load_dataset(file_name=config.app_config.training_data_file)
10 |
11 |
12 | @pytest.fixture()
13 | def test_data():  # loads the file named by app_config.test_data_file
14 |     return load_dataset(file_name=config.app_config.test_data_file)
15 |
--------------------------------------------------------------------------------
/tests/test_features.py:
--------------------------------------------------------------------------------
1 | from feature_engine.transformation import YeoJohnsonTransformer
2 |
3 | from src.config.core import config
4 |
5 |
6 | def test_yeo_johnson(train_data):
7 | assert train_data[config.model_config.num_vars_yeo_johnson].iloc[0].values[0] == 36
8 |
9 | yeo_transformer = YeoJohnsonTransformer(
10 | variables=config.model_config.num_vars_yeo_johnson
11 | )
12 | subject = yeo_transformer.fit_transform(train_data)
13 |
14 | assert (
15 | subject[config.model_config.num_vars_yeo_johnson].iloc[0].values[0]
16 | == 4.719119791024215
17 | )
18 |
--------------------------------------------------------------------------------
/tests/test_input_data.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | from src.config.core import config
4 |
5 |
6 | def test_column_types(train_data):
7 | # Check if data is DataFrame
8 | assert isinstance(train_data, pd.DataFrame)
9 | cat_vars = [
10 | f
11 | for f in train_data.columns
12 | if f
13 | not in (
14 | config.model_config.num_vars
15 | + [config.model_config.target]
16 | + [config.model_config.var_to_drop]
17 | )
18 | ]
19 | # Check column types
20 | for f in cat_vars:
21 | assert train_data[f].dtype == "O"
22 |
23 | assert train_data["training_hours"].dtype == "int64"
24 | assert train_data["city_development_index"].dtype == "float64"
25 |
26 |
27 | def test_number_of_columns(train_data):
28 | assert train_data.drop([config.model_config.target], axis=1).shape[1] == 12
29 |
--------------------------------------------------------------------------------
/tests/test_prediction.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | import numpy as np
4 | from sklearn.metrics import accuracy_score
5 | from sklearn.model_selection import train_test_split
6 |
7 | from src.config.core import config
8 | from src.predict import make_prediction
9 |
10 |
11 | def test_make_prediction(train_data):
12 |
13 | expected_first_10_predictions = [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
14 |
15 | result = make_prediction(input_data=train_data)
16 | predictions = result.get("predictions")
17 |
18 | assert predictions[:10] == expected_first_10_predictions
19 | assert isinstance(predictions, list)
20 | assert isinstance(predictions[0], np.float64)
21 |
22 |
23 | def test_accuracy_over_threshold(train_data):
24 |
25 | X_train, X_test, y_train, y_test = train_test_split(
26 | train_data.drop([config.model_config.target], axis=1),
27 | train_data[config.model_config.target],
28 | test_size=config.model_config.test_size,
29 | random_state=config.model_config.random_state,
30 | )
31 |
32 | result = make_prediction(input_data=X_test)
33 | predictions = result.get("predictions")
34 |
35 | assert accuracy_score(y_test, predictions) > 0.77
36 |
37 |
38 | def test_serving_latency(train_data):
39 |
40 | s = time.time()
41 | for i in range(100):
42 | make_prediction(input_data=train_data[:1])
43 | elapsed_time = time.time() - s
44 | assert elapsed_time < 5
45 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = test_package, typechecks, stylechecks, lint
3 | skipsdist = True
4 |
5 | [testenv]
6 | install_command = pip install {opts} {packages}
7 |
8 | [testenv:test_package]
9 | deps =
10 | -rrequirements/production.txt
11 |
12 | setenv =
13 | PYTHONPATH=.
14 | PYTHONHASHSEED=0
15 |
16 | commands=
17 | python src/train_pipeline.py
18 | pytest \
19 | -s \
20 | -vv \
21 | {posargs:tests/}
22 |
23 | [testenv:train]
24 | envdir = {toxworkdir}/test_package
25 | deps =
26 | {[testenv:test_package]deps}
27 |
28 | setenv =
29 | {[testenv:test_package]setenv}
30 |
31 | commands=
32 | python src/train_pipeline.py
33 |
34 |
35 | [testenv:typechecks]
36 | envdir = {toxworkdir}/test_package
37 |
38 | deps =
39 | {[testenv:test_package]deps}
40 |
41 | commands = {posargs:mypy src}
42 |
43 |
44 | [testenv:stylechecks]
45 | envdir = {toxworkdir}/test_package
46 |
47 | deps =
48 | {[testenv:test_package]deps}
49 |
50 | commands = {posargs:flake8 src tests}
51 |
52 |
53 | [testenv:lint]
54 | envdir = {toxworkdir}/test_package
55 |
56 | deps =
57 | {[testenv:test_package]deps}
58 |
59 | commands =
60 | isort src tests
61 | black src tests
62 | mypy src
63 | flake8 src
64 |
65 | [flake8]
66 | exclude = .git,env
67 | max-line-length = 120
--------------------------------------------------------------------------------