├── .dockerignore ├── .gitignore ├── Dockerfile ├── MANIFEST.in ├── README.md ├── app-fastapi ├── Procfile ├── app │ ├── __init__.py │ ├── api.py │ ├── config.py │ ├── main.py │ ├── schemas │ │ ├── __init__.py │ │ ├── health.py │ │ └── predict.py │ └── tests │ │ ├── __init__.py │ │ ├── conftest.py │ │ └── test_api.py ├── mypy.ini ├── requirements.txt ├── run.sh ├── runtime.txt └── tox.ini ├── heroku.yml ├── mypy.ini ├── notebooks ├── 1. Data Analysis.ipynb ├── 2. Feature Engineering.ipynb ├── 3. Feature Engineering Pipeline.ipynb ├── 4. Machine Learning.ipynb ├── pipe.joblib └── preprocess.py ├── pyproject.toml ├── requirements ├── deployment.txt ├── production.txt ├── requirements.txt └── research-env.txt ├── setup.py ├── src ├── VERSION ├── __init__.py ├── config.yml ├── config │ ├── __init__.py │ └── core.py ├── data │ ├── __init__.py │ ├── test.csv │ └── train.csv ├── pipeline.py ├── predict.py ├── processing │ ├── __init__.py │ ├── data_manager.py │ └── features.py ├── train_pipeline.py └── trained_models │ ├── __init__.py │ └── model_v0.0.7.pkl ├── tests ├── __init__.py ├── conftest.py ├── test_features.py ├── test_input_data.py └── test_prediction.py └── tox.ini /.dockerignore: -------------------------------------------------------------------------------- 1 | notebooks* 2 | */env* 3 | */venv* 4 | .circleci* 5 | packages/src 6 | *.env 7 | *.log 8 | .git 9 | .gitignore 10 | .tox -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | 4 | 5 | # Folders 6 | 7 | 8 | # Jupyter notebook 9 | notebooks_research/ 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | *$py.class 15 | 16 | # C extensions 17 | *.so 18 | 19 | # Distribution / packaging 20 | .Python 21 | env/ 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 
27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *,cover 56 | .hypothesis/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # IPython Notebook 80 | .ipynb_checkpoints 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # dotenv 89 | .env 90 | 91 | # virtualenv 92 | venv/ 93 | ENV/ 94 | .virtual_documents 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | ### VirtualEnv template 102 | # Virtualenv 103 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 104 | .Python 105 | [Bb]in 106 | [Ii]nclude 107 | [Ll]ib 108 | [Ll]ib64 109 | [Ll]ocal 110 | [Ss]cripts 111 | pyvenv.cfg 112 | .venv 113 | pip-selfcheck.json 114 | ### JetBrains template 115 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 116 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 117 | 118 | # User-specific stuff: 119 | .idea/workspace.xml 120 | .idea/tasks.xml 121 | .idea/dictionaries 122 | .idea/vcs.xml 123 | .idea/jsLibraryMappings.xml 124 | 125 | # Sensitive or high-churn files: 126 | .idea/dataSources.ids 127 | .idea/dataSources.xml 
128 | .idea/dataSources.local.xml 129 | .idea/sqlDataSources.xml 130 | .idea/dynamic.xml 131 | .idea/uiDesigner.xml 132 | 133 | # Gradle: 134 | .idea/gradle.xml 135 | .idea/libraries 136 | 137 | # Mongo Explorer plugin: 138 | .idea/mongoSettings.xml 139 | 140 | .idea/ 141 | 142 | ## File-based project format: 143 | *.iws 144 | 145 | ## Plugin-specific files: 146 | 147 | # IntelliJ 148 | /out/ 149 | 150 | # mpeltonen/sbt-idea plugin 151 | .idea_modules/ 152 | 153 | # JIRA plugin 154 | atlassian-ide-plugin.xml 155 | 156 | # Crashlytics plugin (for Android Studio and IntelliJ) 157 | com_crashlytics_export_strings.xml 158 | crashlytics.properties 159 | crashlytics-build.properties 160 | fabric.properties 161 | 162 | # Darts 163 | .darts/ -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # the base image that we inherit from 2 | FROM python:3.9.5-slim 3 | 4 | #best practice: create a user 5 | RUN adduser --disabled-password --gecos '' ml-api-user 6 | 7 | WORKDIR /opt/app-fastapi 8 | 9 | # copy our project inside the container 10 | ADD ./app-fastapi /opt/app-fastapi/ 11 | RUN pip install --upgrade pip 12 | RUN pip install -r /opt/app-fastapi/requirements.txt 13 | 14 | RUN chmod +x /opt/app-fastapi/run.sh 15 | RUN chown -R ml-api-user:ml-api-user ./ 16 | 17 | USER ml-api-user 18 | 19 | EXPOSE 8001 20 | 21 | CMD ["bash", "./run.sh"] -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.md 3 | include *.pkl 4 | recursive-include ./src/* 5 | 6 | include src/data/train.csv 7 | include src/data/test.csv 8 | include src/trained_models/*.pkl 9 | include src/VERSION 10 | include src/config.yml 11 | 12 | include ./requirements/research-env.txt 13 | include ./requirements/production.txt 14 | include 
./requirements/requirements.txt 15 | exclude *.log 16 | exclude *.cfg 17 | 18 | recursive-exclude * __pycache__ 19 | recursive-exclude * *.py[co] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # End to End Machine Learning Project 2 | 3 | This project aims to apply the best software engineering practices in a Machine Learning project in order to deploy the model. 4 | 5 | We are developing a model to predict if a Data Scientist is willing to leave his/her current job. 6 | We are not interested in the accuracy of the model (which is 77%), but rather to transition from the research environment to production code, packaging, and finally deployment of the model. 7 | 8 | [https://end-to-end-ml-project.herokuapp.com/](https://end-to-end-ml-project.herokuapp.com/) 9 | 10 | 11 |

Research Code ➙ Production Code ➙ Deployment

12 | 13 | Python 14 | Scikit-learn 15 | Jupyter 16 | Anaconda 17 | PyCharm 18 | Docker 19 | Git 20 | Heroku 21 | FastAPI 22 | Pytest 23 | tox 24 | 25 | 26 | ### Project Structure 27 | 28 | ``` 29 | end-to-end-ML-project 30 | │ README.md 31 | │ MANIFEST.in 32 | │ mypy.ini 33 | │ pyproject.toml 34 | │ setup.py 35 | │ .gitignore 36 | │ tox.ini 37 | │ Dockerfile 38 | │ 39 | └───notebooks 40 | │ │ 1. Data Analysis.ipynb 41 | │ │ 2. Feature Engineering.ipynb 42 | │ │ 3. Feature Engineering Pipeline.ipynb 43 | │ │ 4. Machine Learning.ipynb 44 | │ │ preprocess.py 45 | │ 46 | └───requirements 47 | │ │ requirements.txt 48 | │ │ research-env.txt 49 | │ │ production.txt 50 | │ │ deployment.txt 51 | │ 52 | └───src 53 | │ │ VERSION 54 | │ │ __init__.py 55 | │ │ config.yml 56 | │ │ pipeline.py 57 | │ │ train_pipeline.py 58 | │ │ predict.py 59 | │ │ 60 | │ └───config 61 | │ │ │ __init__.py 62 | │ │ │ core.py 63 | │ │ 64 | │ └───data 65 | │ │ │ __init__.py 66 | │ │ │ train.csv 67 | │ │ │ test.csv 68 | │ │ 69 | │ └───processing 70 | │ │ │ __init__.py 71 | │ │ │ data_manager.py 72 | │ │ │ features.py 73 | │ │ 74 | │ └───trained_models 75 | │ │ │ __init__.py 76 | │ 77 | └───app-fastapi 78 | │ ... 79 | ``` 80 | 81 | ### Steps in An End-to-end ML Project 82 | 83 | 1. Start with jupyter notebooks and finalize a model. 84 | 2. Transform research code to production code. 85 | 3. Make the project a package. 86 | 4. Serve it via a REST API. 87 | 5. Dockerize it and deploy it. 88 | 89 | ### 1. Start with jupyter notebooks and finalize a model 90 | 91 | The ```notebooks``` folder is the research which is often done by a Data Scientist. 92 | 93 | Usually a Data Analysis notebook for EDA and data understanding is the first step. 94 | Then, features are created in a pipeline. Here, scikit-learn and feature-engine were used. 95 | Finally, the ML model is placed at the end of the pipeline. 96 | 97 | Research can be very time-consuming. 
Here, a simple pipeline is created, 98 | because the creation of a 95% accuracy model is out of the scope of this work. 99 | 100 | ### 2. Transform research code to production code 101 | 102 | The ```src``` folder is the transformation of the jupyter notebooks to a python project. 103 | 104 | Some good practices: 105 | - Create a ```config.yml``` file that contains all the constants and configurations derived from the notebooks. Accompany it with a .py file to parse it (Here it is the ```src/config/core.py```). 106 | - Tidy all extra functions written and place them in a ```processing``` folder. For example, in ```src/processing/data_manager.py``` there are functions to read the data, save, read, and remove the pipeline. 107 | - Create separate files for ```train_pipeline.py``` and ```predict.py```. 108 | - Always create very small functions so they are easier to test and the code stays readable. 109 | - Create a ```trained_models``` folder to deposit the models. 110 | - Have a ```VERSION``` file, to track the version of the project, e.g. 0.0.4 111 | - Write ```tests```. Now write more tests. 112 | - Make a ```tox.ini``` file to make life easier, test code faster, get rid of styling, type checks, linting, and PEP8 concerns. 113 | 114 | Note: In order to import your python files as packages in other python files, we need to add the project's filepath to the ```PYTHONPATH``` environment variable. 115 | 116 | ### 3. Make the project a package 117 | 118 | We need 3 files in the root of the project: 119 | 120 | 1. ```MANIFEST.in```: Define which files to include and exclude from the package. 121 | 2. ```pyproject.toml```: Specify basic dependencies and configure tooling. 122 | 3. ```setup.py```: Package metadata, version, requirements, how to create the package. 123 | 124 | From the project directory: ```python -m build``` 125 | 126 | Then, create an account on PyPI. 
Install twine: ```pip install twine``` 127 | 128 | Upload: ```twine upload dist/end_to_end_ML_project-0.0.4-py3-none-any.whl``` 129 | 130 | Now the package can be installed like any other package with ```pip install end-to-end-ML-project``` 131 | 132 | It can be imported like: ```import src``` 133 | 134 | 135 | ### 4. Serve it via a REST API 136 | 137 | The API should be a different repository or at least a different folder. Here it is located in the folder ```app-fastapi```. 138 | 139 | The first thing to note is the ```requirements.txt```, where we install the ```end-to-end-ML-project``` package, 140 | which we published earlier. 141 | 142 | Three key files of the api are: 143 | 144 | - ```config.py```: Specify metadata of the api, and logging settings. 145 | - ```main.py```: Define the main app and the index page router. 146 | - ```api.py```: Define a health and a predict endpoint. 147 | 148 | We define some ```schemas``` for automatic validation of variable types. 149 | 150 | 151 | We also define ```tests``` with predefined input data to predict. 152 | 153 | We also use ```logging``` and the package ```loguru```. 154 | 155 | The ```Procfile``` and ```runtime.txt``` are necessary files to deploy on Heroku. 156 | 157 | 158 | ### 5. Dockerize it and deploy it 159 | 160 | We create a ```Dockerfile``` and build the image: 161 | 162 | ```docker build -t end-to-end-ml-project:latest .``` 163 | 164 | We run the image: 165 | 166 | ```docker run -p 8001:8001 -e PORT=8001 end-to-end-ml-project``` 167 | 168 | We can see the output on localhost:8001/ 169 | 170 | Now to deploy on Heroku, create a ```heroku.yml``` file. 
171 | 172 | ``` 173 | heroku login 174 | heroku container:login 175 | heroku container:push web --app end-to-end-ml-project 176 | heroku container:release web --app end-to-end-ml-project 177 | heroku open --app end-to-end-ml-project 178 | ``` 179 | -------------------------------------------------------------------------------- /app-fastapi/Procfile: -------------------------------------------------------------------------------- 1 | web: uvicorn app.main:app --host 0.0.0.0 --port $PORT -------------------------------------------------------------------------------- /app-fastapi/app/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.2" -------------------------------------------------------------------------------- /app-fastapi/app/api.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from fastapi import APIRouter, HTTPException 7 | from fastapi.encoders import jsonable_encoder 8 | from loguru import logger 9 | from src import __version__ as model_version 10 | from src.predict import make_prediction 11 | 12 | from app import __version__, schemas 13 | from app.config import settings 14 | 15 | api_router = APIRouter() 16 | 17 | 18 | @api_router.get("/health", response_model=schemas.Health, status_code=200) 19 | def health() -> dict: 20 | """ 21 | Root Get 22 | """ 23 | health = schemas.Health( 24 | name=settings.PROJECT_NAME, api_version=__version__, model_version=model_version 25 | ) 26 | 27 | return health.dict() 28 | 29 | 30 | @api_router.post("/predict", response_model=schemas.PredictionResults, status_code=200) 31 | async def predict(input_data: schemas.MultipleDataInputs) -> Any: 32 | """ 33 | Make predictions with the end-to-end-ML-project model 34 | """ 35 | # load pydantic data 36 | input_df = pd.DataFrame(jsonable_encoder(input_data.inputs)) 37 | 38 | # 
Advanced: You can improve performance of your API by rewriting the 39 | # `make prediction` function to be async and using await here. 40 | logger.info(f"Making prediction on inputs: {input_data.inputs}") 41 | results = make_prediction(input_data=input_df.replace({np.nan: None})) 42 | 43 | logger.info(f"Prediction results: {results.get('predictions')}") 44 | 45 | return results 46 | -------------------------------------------------------------------------------- /app-fastapi/app/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | from types import FrameType 4 | from typing import List, cast 5 | 6 | from loguru import logger 7 | from pydantic import AnyHttpUrl, BaseSettings 8 | 9 | 10 | class LoggingSettings(BaseSettings): 11 | LOGGING_LEVEL: int = logging.INFO # logging levels are type int 12 | 13 | 14 | class Settings(BaseSettings): 15 | API_V1_STR: str = "/api/v1" 16 | 17 | # Meta 18 | logging: LoggingSettings = LoggingSettings() 19 | 20 | # BACKEND_CORS_ORIGINS is a comma-separated list of origins 21 | # e.g: http://localhost,http://localhost:4200,http://localhost:3000 22 | BACKEND_CORS_ORIGINS: List[AnyHttpUrl] = [ 23 | "http://localhost:3000", # type: ignore 24 | "http://localhost:8000", # type: ignore 25 | "https://localhost:3000", # type: ignore 26 | "https://localhost:8000", # type: ignore 27 | ] 28 | 29 | PROJECT_NAME: str = "End to End ML Project" 30 | 31 | class Config: 32 | case_sensitive = True 33 | 34 | 35 | # See: https://loguru.readthedocs.io/en/stable/overview.html#entirely-compatible-with-standard-logging # noqa 36 | class InterceptHandler(logging.Handler): 37 | def emit(self, record: logging.LogRecord) -> None: # pragma: no cover 38 | # Get corresponding Loguru level if it exists 39 | try: 40 | level = logger.level(record.levelname).name 41 | except ValueError: 42 | level = str(record.levelno) 43 | 44 | # Find caller from where originated the logged message 45 | frame, 
depth = logging.currentframe(), 2 46 | while frame.f_code.co_filename == logging.__file__: # noqa: WPS609 47 | frame = cast(FrameType, frame.f_back) 48 | depth += 1 49 | 50 | logger.opt(depth=depth, exception=record.exc_info).log( 51 | level, 52 | record.getMessage(), 53 | ) 54 | 55 | 56 | def setup_app_logging(config: Settings) -> None: 57 | """Prepare custom logging for our application.""" 58 | 59 | LOGGERS = ("uvicorn.asgi", "uvicorn.access") 60 | logging.getLogger().handlers = [InterceptHandler()] 61 | for logger_name in LOGGERS: 62 | logging_logger = logging.getLogger(logger_name) 63 | logging_logger.handlers = [InterceptHandler(level=config.logging.LOGGING_LEVEL)] 64 | 65 | logger.configure( 66 | handlers=[{"sink": sys.stderr, "level": config.logging.LOGGING_LEVEL}] 67 | ) 68 | 69 | 70 | settings = Settings() -------------------------------------------------------------------------------- /app-fastapi/app/main.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from fastapi import APIRouter, FastAPI, Request 4 | from fastapi.middleware.cors import CORSMiddleware 5 | from fastapi.responses import HTMLResponse 6 | from loguru import logger 7 | 8 | from app.api import api_router 9 | from app.config import settings, setup_app_logging 10 | 11 | # setup logging as early as possible 12 | setup_app_logging(config=settings) 13 | 14 | 15 | app = FastAPI( 16 | title=settings.PROJECT_NAME, openapi_url=f"{settings.API_V1_STR}/openapi.json" 17 | ) 18 | 19 | root_router = APIRouter() 20 | 21 | 22 | @root_router.get("/") 23 | def index(request: Request) -> Any: 24 | """Basic HTML response.""" 25 | body = ( 26 | "" 27 | "" 28 | "

Welcome to the API

" 29 | "
" 30 | "Check the docs: here" 31 | "
" 32 | "" 33 | "" 34 | ) 35 | 36 | return HTMLResponse(content=body) 37 | 38 | 39 | app.include_router(api_router, prefix=settings.API_V1_STR) 40 | app.include_router(root_router) 41 | 42 | # Set all CORS enabled origins 43 | if settings.BACKEND_CORS_ORIGINS: 44 | app.add_middleware( 45 | CORSMiddleware, 46 | allow_origins=[str(origin) for origin in settings.BACKEND_CORS_ORIGINS], 47 | allow_credentials=True, 48 | allow_methods=["*"], 49 | allow_headers=["*"], 50 | ) 51 | 52 | 53 | if __name__ == "__main__": 54 | # Use this for debugging purposes only 55 | logger.warning("Running in development mode. Do not run like this in production.") 56 | import uvicorn 57 | 58 | uvicorn.run(app, host="localhost", port=8001, log_level="debug") -------------------------------------------------------------------------------- /app-fastapi/app/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | from .health import Health 2 | from .predict import MultipleDataInputs, PredictionResults -------------------------------------------------------------------------------- /app-fastapi/app/schemas/health.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class Health(BaseModel): 5 | name: str 6 | api_version: str 7 | model_version: str 8 | -------------------------------------------------------------------------------- /app-fastapi/app/schemas/predict.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List, Optional 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class DataInputSchema(BaseModel): 7 | city: Optional[str] 8 | city_development_index: Optional[float] 9 | gender: Optional[str] 10 | relevent_experience: Optional[str] 11 | enrolled_university: Optional[str] 12 | education_level: Optional[str] 13 | major_discipline: Optional[str] 14 | experience: Optional[str] 15 | company_size: 
Optional[str] 16 | company_type: Optional[str] 17 | last_new_job: Optional[str] 18 | training_hours: Optional[int] 19 | 20 | 21 | class PredictionResults(BaseModel): 22 | errors: Optional[Any] 23 | version: str 24 | predictions: Optional[List[float]] 25 | 26 | 27 | class MultipleDataInputs(BaseModel): 28 | inputs: List[DataInputSchema] 29 | 30 | class Config: 31 | schema_extra = { 32 | "example": { 33 | "inputs": [ 34 | { 35 | "city": "city_41", 36 | "city_development_index": 0.8270000000000001, 37 | "gender": "Male", 38 | "relevent_experience": "Has relevent experience", 39 | "enrolled_university": "Full time course", 40 | "education_level": "Graduate", 41 | "major_discipline": "STEM", 42 | "experience": "9", 43 | "company_size": "<10", 44 | "company_type": "Funded Startup", 45 | "last_new_job": "1", 46 | "training_hours": 21 47 | } 48 | ] 49 | } 50 | } -------------------------------------------------------------------------------- /app-fastapi/app/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Deffro/end-to-end-ML-project/5cea993ce8e49f7244ad056de5803cf9b1dcbd46/app-fastapi/app/tests/__init__.py -------------------------------------------------------------------------------- /app-fastapi/app/tests/conftest.py: -------------------------------------------------------------------------------- 1 | from typing import Generator 2 | 3 | import pandas as pd 4 | import pytest 5 | from fastapi.testclient import TestClient 6 | from src.config.core import config 7 | from src.processing.data_manager import load_dataset 8 | 9 | from app.main import app 10 | 11 | 12 | @pytest.fixture(scope="module") 13 | def test_data() -> pd.DataFrame: 14 | return load_dataset(file_name=config.app_config.test_data_file) 15 | 16 | 17 | @pytest.fixture() 18 | def client() -> Generator: 19 | with TestClient(app) as _client: 20 | yield _client 21 | app.dependency_overrides = {} 22 | 
-------------------------------------------------------------------------------- /app-fastapi/app/tests/test_api.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from fastapi.testclient import TestClient 6 | 7 | 8 | def test_make_prediction(client: TestClient, test_data: pd.DataFrame) -> None: 9 | # Given 10 | payload = { 11 | # ensure pydantic plays well with np.nan 12 | "inputs": test_data.replace({np.nan: None}).to_dict(orient="records") 13 | } 14 | 15 | # When 16 | response = client.post( 17 | "http://localhost:8001/api/v1/predict", 18 | json=payload, 19 | ) 20 | 21 | # Then 22 | assert response.status_code == 200 23 | prediction_data = response.json() 24 | assert prediction_data["predictions"] 25 | assert prediction_data["errors"] is None 26 | # assert math.isclose(prediction_data["predictions"][0], 1500, rel_tol=100) 27 | -------------------------------------------------------------------------------- /app-fastapi/mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | plugins = pydantic.mypy 3 | ignore_missing_imports = True 4 | disallow_untyped_defs = True -------------------------------------------------------------------------------- /app-fastapi/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.75.0 2 | uvicorn==0.17.5 3 | python-multipart==0.0.5 4 | typing_extensions==3.10.0 5 | requests 6 | end-to-end-ML-project 7 | -------------------------------------------------------------------------------- /app-fastapi/run.sh: -------------------------------------------------------------------------------- 1 | uvicorn app.main:app --host 0.0.0.0 --port $PORT -------------------------------------------------------------------------------- /app-fastapi/runtime.txt: -------------------------------------------------------------------------------- 1 | 
python-3.9.5 -------------------------------------------------------------------------------- /app-fastapi/tox.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | log_cli_level=WARNING 3 | 4 | [tox] 5 | envlist = test_app, typechecks, stylechecks, lint 6 | skipsdist = True 7 | 8 | [testenv] 9 | install_command = pip install {opts} {packages} 10 | 11 | [testenv:test_app] 12 | deps = 13 | -rrequirements.txt 14 | 15 | setenv = 16 | PYTHONPATH=. 17 | PYTHONHASHSEED=0 18 | 19 | commands= 20 | pytest \ 21 | -vv \ 22 | {posargs:app/tests/} 23 | 24 | [testenv:run] 25 | envdir = {toxworkdir}/test_app 26 | deps = 27 | {[testenv:test_app]deps} 28 | 29 | setenv = 30 | {[testenv:test_app]setenv} 31 | 32 | commands= 33 | python app/main.py 34 | 35 | 36 | [testenv:typechecks] 37 | envdir = {toxworkdir}/test_app 38 | 39 | deps = 40 | {[testenv:test_app]deps} 41 | 42 | commands = {posargs:mypy app} 43 | 44 | 45 | [testenv:stylechecks] 46 | envdir = {toxworkdir}/test_app 47 | 48 | deps = 49 | {[testenv:test_app]deps} 50 | 51 | commands = {posargs:flake8 app} 52 | 53 | 54 | [testenv:lint] 55 | envdir = {toxworkdir}/test_app 56 | 57 | deps = 58 | {[testenv:test_app]deps} 59 | 60 | commands = 61 | isort app 62 | black app 63 | mypy app 64 | flake8 app 65 | 66 | [flake8] 67 | exclude = .git,__pycache__,__init__.py,.mypy_cache,.pytest_cache,.venv,alembic 68 | max-line-length = 120 -------------------------------------------------------------------------------- /heroku.yml: -------------------------------------------------------------------------------- 1 | build: 2 | docker: 3 | web: Dockerfile -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | # warn_unreachable = True 3 | warn_unused_ignores = True 4 | follow_imports = skip 5 | show_error_context = True 6 | warn_incomplete_stub = True 7 | 
ignore_missing_imports = True 8 | check_untyped_defs = True 9 | cache_dir = /dev/null 10 | # Cannot enable this one as we still allow defining functions without any types. 11 | # disallow_untyped_defs = True 12 | warn_redundant_casts = True 13 | warn_unused_configs = True 14 | strict_optional = True -------------------------------------------------------------------------------- /notebooks/2. Feature Engineering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "cdc43d05", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "import scipy.stats as stats\n", 13 | "\n", 14 | "from sklearn.model_selection import train_test_split\n", 15 | "from sklearn.preprocessing import MinMaxScaler\n", 16 | "\n", 17 | "from feature_engine.imputation import (\n", 18 | " CategoricalImputer,\n", 19 | ")\n", 20 | "\n", 21 | "from feature_engine.transformation import (\n", 22 | " YeoJohnsonTransformer,\n", 23 | ")\n", 24 | "\n", 25 | "from feature_engine.encoding import (\n", 26 | " RareLabelEncoder,\n", 27 | " OrdinalEncoder,\n", 28 | " OneHotEncoder,\n", 29 | " CountFrequencyEncoder\n", 30 | ")\n", 31 | "\n", 32 | "import joblib\n", 33 | "\n", 34 | "import matplotlib.pyplot as plt\n", 35 | "import seaborn as sns" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "id": "b7976053", 41 | "metadata": {}, 42 | "source": [ 43 | "## Read Data" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "id": "5bf686d7", 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "(19158, 14)\n" 57 | ] 58 | }, 59 | { 60 | "data": { 61 | "text/html": [ 62 | "
\n", 63 | "\n", 76 | "\n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | "
enrollee_idcitycity_development_indexgenderrelevent_experienceenrolled_universityeducation_levelmajor_disciplineexperiencecompany_sizecompany_typelast_new_jobtraining_hourstarget
08949city_1030.920MaleHas relevent experienceno_enrollmentGraduateSTEM>20NaNNaN1361.0
129725city_400.776MaleNo relevent experienceno_enrollmentGraduateSTEM1550-99Pvt Ltd>4470.0
211561city_210.624NaNNo relevent experienceFull time courseGraduateSTEM5NaNNaNnever830.0
333241city_1150.789NaNNo relevent experienceNaNGraduateBusiness Degree<1NaNPvt Ltdnever521.0
4666city_1620.767MaleHas relevent experienceno_enrollmentMastersSTEM>2050-99Funded Startup480.0
\n", 184 | "
" 185 | ], 186 | "text/plain": [ 187 | " enrollee_id city city_development_index gender \\\n", 188 | "0 8949 city_103 0.920 Male \n", 189 | "1 29725 city_40 0.776 Male \n", 190 | "2 11561 city_21 0.624 NaN \n", 191 | "3 33241 city_115 0.789 NaN \n", 192 | "4 666 city_162 0.767 Male \n", 193 | "\n", 194 | " relevent_experience enrolled_university education_level \\\n", 195 | "0 Has relevent experience no_enrollment Graduate \n", 196 | "1 No relevent experience no_enrollment Graduate \n", 197 | "2 No relevent experience Full time course Graduate \n", 198 | "3 No relevent experience NaN Graduate \n", 199 | "4 Has relevent experience no_enrollment Masters \n", 200 | "\n", 201 | " major_discipline experience company_size company_type last_new_job \\\n", 202 | "0 STEM >20 NaN NaN 1 \n", 203 | "1 STEM 15 50-99 Pvt Ltd >4 \n", 204 | "2 STEM 5 NaN NaN never \n", 205 | "3 Business Degree <1 NaN Pvt Ltd never \n", 206 | "4 STEM >20 50-99 Funded Startup 4 \n", 207 | "\n", 208 | " training_hours target \n", 209 | "0 36 1.0 \n", 210 | "1 47 0.0 \n", 211 | "2 83 0.0 \n", 212 | "3 52 1.0 \n", 213 | "4 8 0.0 " 214 | ] 215 | }, 216 | "execution_count": 2, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "data = pd.read_csv('../src/data/train.csv')\n", 223 | "print(data.shape)\n", 224 | "data.head()" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "id": "d64ee1e2", 230 | "metadata": {}, 231 | "source": [ 232 | "## Train-Test Split" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 3, 238 | "id": "a39a4c06", 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "X_train, X_test, y_train, y_test = train_test_split(\n", 243 | " data.drop(['enrollee_id', 'target'], axis=1),\n", 244 | " data['target'],\n", 245 | " test_size=0.2,\n", 246 | " random_state=0,\n", 247 | ")" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "id": "8356eff9", 253 | "metadata": {}, 254 | 
"source": [ 255 | "## Missing Values" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "id": "a9dc024a", 261 | "metadata": {}, 262 | "source": [ 263 | "### Categorical Variables" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 4, 269 | "id": "47dca2c6", 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "cat_vars = [var for var in data.columns if data[var].dtype == 'O']\n", 274 | "cat_vars_with_na = [var for var in cat_vars if X_train[var].isnull().sum() > 0]" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 5, 280 | "id": "a3a026be", 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "['city',\n", 287 | " 'gender',\n", 288 | " 'relevent_experience',\n", 289 | " 'enrolled_university',\n", 290 | " 'education_level',\n", 291 | " 'major_discipline',\n", 292 | " 'experience',\n", 293 | " 'company_size',\n", 294 | " 'company_type',\n", 295 | " 'last_new_job']" 296 | ] 297 | }, 298 | "execution_count": 5, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "cat_vars" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 6, 310 | "id": "104a1eb1", 311 | "metadata": {}, 312 | "outputs": [ 313 | { 314 | "data": { 315 | "text/plain": [ 316 | "company_type 0.320493\n", 317 | "company_size 0.309949\n", 318 | "gender 0.235306\n", 319 | "major_discipline 0.146832\n", 320 | "education_level 0.024011\n", 321 | "last_new_job 0.022080\n", 322 | "enrolled_university 0.020148\n", 323 | "experience 0.003393\n", 324 | "dtype: float64" 325 | ] 326 | }, 327 | "execution_count": 6, 328 | "metadata": {}, 329 | "output_type": "execute_result" 330 | } 331 | ], 332 | "source": [ 333 | "data[cat_vars_with_na].isnull().mean().sort_values(ascending=False)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 7, 339 | "id": "f912c97f", 340 | "metadata": {}, 341 | "outputs": [], 
342 | "source": [ 343 | "cat_vars_replace_na_with_string_missing = [var for var in cat_vars_with_na if X_train[var].isnull().mean() > 0.1]\n", 344 | "cat_vars_replace_na_with_frequent = [var for var in cat_vars_with_na if X_train[var].isnull().mean() <= 0.1]" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 8, 350 | "id": "5a00f5c2", 351 | "metadata": {}, 352 | "outputs": [ 353 | { 354 | "data": { 355 | "text/plain": [ 356 | "['gender', 'major_discipline', 'company_size', 'company_type']" 357 | ] 358 | }, 359 | "execution_count": 8, 360 | "metadata": {}, 361 | "output_type": "execute_result" 362 | } 363 | ], 364 | "source": [ 365 | "cat_vars_replace_na_with_string_missing" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 9, 371 | "id": "c6a92e10", 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "data": { 376 | "text/plain": [ 377 | "['enrolled_university', 'education_level', 'experience', 'last_new_job']" 378 | ] 379 | }, 380 | "execution_count": 9, 381 | "metadata": {}, 382 | "output_type": "execute_result" 383 | } 384 | ], 385 | "source": [ 386 | "cat_vars_replace_na_with_frequent" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 10, 392 | "id": "adbf6063", 393 | "metadata": {}, 394 | "outputs": [ 395 | { 396 | "data": { 397 | "text/plain": [ 398 | "{'gender': 'Missing',\n", 399 | " 'major_discipline': 'Missing',\n", 400 | " 'company_size': 'Missing',\n", 401 | " 'company_type': 'Missing'}" 402 | ] 403 | }, 404 | "execution_count": 10, 405 | "metadata": {}, 406 | "output_type": "execute_result" 407 | } 408 | ], 409 | "source": [ 410 | "cat_imputer_missing = CategoricalImputer(imputation_method='missing', variables=cat_vars_replace_na_with_string_missing)\n", 411 | "cat_imputer_missing.fit(X_train)\n", 412 | "cat_imputer_missing.imputer_dict_" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": 11, 418 | "id": "6937ec37", 419 | "metadata": {}, 420 | 
"outputs": [], 421 | "source": [ 422 | "X_train = cat_imputer_missing.transform(X_train)\n", 423 | "X_test = cat_imputer_missing.transform(X_test)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 12, 429 | "id": "bd06a11c", 430 | "metadata": {}, 431 | "outputs": [ 432 | { 433 | "data": { 434 | "text/plain": [ 435 | "{'enrolled_university': 'no_enrollment',\n", 436 | " 'education_level': 'Graduate',\n", 437 | " 'experience': '>20',\n", 438 | " 'last_new_job': '1'}" 439 | ] 440 | }, 441 | "execution_count": 12, 442 | "metadata": {}, 443 | "output_type": "execute_result" 444 | } 445 | ], 446 | "source": [ 447 | "cat_imputer_frequent = CategoricalImputer(imputation_method='frequent', variables=cat_vars_replace_na_with_frequent)\n", 448 | "cat_imputer_frequent.fit(X_train)\n", 449 | "cat_imputer_frequent.imputer_dict_" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 13, 455 | "id": "afa6f611", 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [ 459 | "X_train = cat_imputer_frequent.transform(X_train)\n", 460 | "X_test = cat_imputer_frequent.transform(X_test)" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 14, 466 | "id": "5e12a3e9", 467 | "metadata": {}, 468 | "outputs": [ 469 | { 470 | "data": { 471 | "text/plain": [ 472 | "gender 0\n", 473 | "enrolled_university 0\n", 474 | "education_level 0\n", 475 | "major_discipline 0\n", 476 | "experience 0\n", 477 | "company_size 0\n", 478 | "company_type 0\n", 479 | "last_new_job 0\n", 480 | "dtype: int64" 481 | ] 482 | }, 483 | "execution_count": 14, 484 | "metadata": {}, 485 | "output_type": "execute_result" 486 | } 487 | ], 488 | "source": [ 489 | "X_train[cat_vars_with_na].isnull().sum()" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 15, 495 | "id": "ab937bc7", 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "data": { 500 | "text/plain": [ 501 | "gender 0\n", 502 | "enrolled_university 0\n", 
503 | "education_level 0\n", 504 | "major_discipline 0\n", 505 | "experience 0\n", 506 | "company_size 0\n", 507 | "company_type 0\n", 508 | "last_new_job 0\n", 509 | "dtype: int64" 510 | ] 511 | }, 512 | "execution_count": 15, 513 | "metadata": {}, 514 | "output_type": "execute_result" 515 | } 516 | ], 517 | "source": [ 518 | "X_test[cat_vars_with_na].isnull().sum()" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "id": "73552e80", 524 | "metadata": {}, 525 | "source": [ 526 | "### Numerical Variables" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 16, 532 | "id": "b108a026", 533 | "metadata": {}, 534 | "outputs": [], 535 | "source": [ 536 | "num_vars = [var for var in data.columns if var not in cat_vars + ['enrollee_id', 'target']]\n", 537 | "num_vars_with_na = [var for var in num_vars if X_train[var].isnull().sum() > 0]" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 17, 543 | "id": "5c17948b", 544 | "metadata": {}, 545 | "outputs": [ 546 | { 547 | "data": { 548 | "text/plain": [ 549 | "['city_development_index', 'training_hours']" 550 | ] 551 | }, 552 | "execution_count": 17, 553 | "metadata": {}, 554 | "output_type": "execute_result" 555 | } 556 | ], 557 | "source": [ 558 | "num_vars" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": 18, 564 | "id": "a10fb9e6", 565 | "metadata": {}, 566 | "outputs": [ 567 | { 568 | "data": { 569 | "text/plain": [ 570 | "[]" 571 | ] 572 | }, 573 | "execution_count": 18, 574 | "metadata": {}, 575 | "output_type": "execute_result" 576 | } 577 | ], 578 | "source": [ 579 | "num_vars_with_na" 580 | ] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "id": "8d13b020", 585 | "metadata": {}, 586 | "source": [ 587 | "## Transformations" 588 | ] 589 | }, 590 | { 591 | "cell_type": "markdown", 592 | "id": "ba920de7", 593 | "metadata": {}, 594 | "source": [ 595 | "### Numerical Variables" 596 | ] 597 | }, 598 | { 599 | "cell_type": 
"code", 600 | "execution_count": 19, 601 | "id": "90d1a49d", 602 | "metadata": {}, 603 | "outputs": [ 604 | { 605 | "data": { 606 | "text/plain": [ 607 | "{'training_hours': 0.14533500139326166}" 608 | ] 609 | }, 610 | "execution_count": 19, 611 | "metadata": {}, 612 | "output_type": "execute_result" 613 | } 614 | ], 615 | "source": [ 616 | "num_vars_yeo_johnson = ['training_hours']\n", 617 | "\n", 618 | "yeo_transformer = YeoJohnsonTransformer(variables=num_vars_yeo_johnson)\n", 619 | "\n", 620 | "X_train = yeo_transformer.fit_transform(X_train)\n", 621 | "X_test = yeo_transformer.transform(X_test)\n", 622 | "\n", 623 | "yeo_transformer.lambda_dict_" 624 | ] 625 | }, 626 | { 627 | "cell_type": "markdown", 628 | "id": "f5e4f137", 629 | "metadata": {}, 630 | "source": [ 631 | "### Categorical Variables" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 20, 637 | "id": "f0ca44ac", 638 | "metadata": {}, 639 | "outputs": [ 640 | { 641 | "data": { 642 | "text/html": [ 643 | "
\n", 644 | "\n", 657 | "\n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | "
citygenderrelevent_experienceenrolled_universityeducation_levelmajor_disciplineexperiencecompany_sizecompany_typelast_new_job
19147city_21MaleNo relevent experienceFull time courseGraduateSTEM1100-500Pvt Ltd1
8464city_21MissingHas relevent experienceFull time courseGraduateSTEM<1<10Pvt Ltd1
8869city_16MaleHas relevent experienceno_enrollmentMastersSTEM9MissingPvt Ltd1
11645city_118MissingHas relevent experiencePart time courseMastersSTEM101000-4999Pvt Ltd3
7743city_103MissingNo relevent experienceno_enrollmentPrimary SchoolMissing2MissingMissingnever
\n", 741 | "
" 742 | ], 743 | "text/plain": [ 744 | " city gender relevent_experience enrolled_university \\\n", 745 | "19147 city_21 Male No relevent experience Full time course \n", 746 | "8464 city_21 Missing Has relevent experience Full time course \n", 747 | "8869 city_16 Male Has relevent experience no_enrollment \n", 748 | "11645 city_118 Missing Has relevent experience Part time course \n", 749 | "7743 city_103 Missing No relevent experience no_enrollment \n", 750 | "\n", 751 | " education_level major_discipline experience company_size company_type \\\n", 752 | "19147 Graduate STEM 1 100-500 Pvt Ltd \n", 753 | "8464 Graduate STEM <1 <10 Pvt Ltd \n", 754 | "8869 Masters STEM 9 Missing Pvt Ltd \n", 755 | "11645 Masters STEM 10 1000-4999 Pvt Ltd \n", 756 | "7743 Primary School Missing 2 Missing Missing \n", 757 | "\n", 758 | " last_new_job \n", 759 | "19147 1 \n", 760 | "8464 1 \n", 761 | "8869 1 \n", 762 | "11645 3 \n", 763 | "7743 never " 764 | ] 765 | }, 766 | "execution_count": 20, 767 | "metadata": {}, 768 | "output_type": "execute_result" 769 | } 770 | ], 771 | "source": [ 772 | "X_train[cat_vars].head()" 773 | ] 774 | }, 775 | { 776 | "cell_type": "code", 777 | "execution_count": 21, 778 | "id": "a9f8e55b", 779 | "metadata": {}, 780 | "outputs": [], 781 | "source": [ 782 | "experience_map = {\n", 783 | " '<1': 0,\n", 784 | " '1': 1, \n", 785 | " '2': 2, \n", 786 | " '3': 3, \n", 787 | " '4': 4, \n", 788 | " '5': 5,\n", 789 | " '6': 6,\n", 790 | " '7': 7,\n", 791 | " '8': 8, \n", 792 | " '9': 9, \n", 793 | " '10': 10, \n", 794 | " '11': 11,\n", 795 | " '12': 12,\n", 796 | " '13': 13, \n", 797 | " '14': 14, \n", 798 | " '15': 15, \n", 799 | " '16': 16,\n", 800 | " '17': 17,\n", 801 | " '18': 18,\n", 802 | " '19': 19, \n", 803 | " '20': 20, \n", 804 | " '>20': 21\n", 805 | "} \n", 806 | "\n", 807 | "last_new_job_map = {\n", 808 | " 'never': 0,\n", 809 | " '1': 1, \n", 810 | " '2': 2, \n", 811 | " '3': 3, \n", 812 | " '4': 4, \n", 813 | " '>4': 5\n", 814 | "}\n", 815 | 
"\n", 816 | "company_size_map = {\n", 817 | " 'Missing': 0,\n", 818 | " '<10': 1,\n", 819 | " '10/49': 2, \n", 820 | " '100-500': 3, \n", 821 | " '1000-4999': 4, \n", 822 | " '10000+': 5, \n", 823 | " '50-99': 6, \n", 824 | " '500-999': 7, \n", 825 | " '5000-9999': 8\n", 826 | "}\n", 827 | "\n", 828 | "cat_vars_ordinal = ['relevent_experience', 'enrolled_university', 'education_level', 'major_discipline']\n", 829 | "cat_vars_ordinal_arbitrary = ['city']\n", 830 | "cat_vars_onehot = ['gender']\n", 831 | "cat_vars_count_frequency = ['company_type']\n", 832 | "\n", 833 | "ordinal_encoder = OrdinalEncoder(encoding_method='ordered', variables=cat_vars_ordinal)\n", 834 | "ordinal_encoder.fit(X_train, y_train)\n", 835 | "\n", 836 | "ordinal_encoder_arbitrary = OrdinalEncoder(encoding_method='arbitrary', variables=cat_vars_ordinal_arbitrary)\n", 837 | "ordinal_encoder_arbitrary.fit(X_train, y_train)\n", 838 | "\n", 839 | "count_frequency_encoder = CountFrequencyEncoder(encoding_method='frequency', variables=cat_vars_count_frequency)\n", 840 | "count_frequency_encoder.fit(X_train)\n", 841 | "\n", 842 | "onehot_encoder = OneHotEncoder(variables=cat_vars_onehot)\n", 843 | "onehot_encoder.fit(X_train)\n", 844 | "\n", 845 | "X_train = ordinal_encoder.transform(X_train)\n", 846 | "X_test = ordinal_encoder.transform(X_test)\n", 847 | "\n", 848 | "X_train = ordinal_encoder_arbitrary.transform(X_train)\n", 849 | "X_test = ordinal_encoder_arbitrary.transform(X_test)\n", 850 | "\n", 851 | "X_train = count_frequency_encoder.transform(X_train)\n", 852 | "X_test = count_frequency_encoder.transform(X_test)\n", 853 | "\n", 854 | "X_train = onehot_encoder.transform(X_train)\n", 855 | "X_test = onehot_encoder.transform(X_test)\n", 856 | "\n", 857 | "var = 'experience'\n", 858 | "X_train[var] = X_train[var].map(experience_map)\n", 859 | "X_test[var] = X_test[var].map(experience_map)\n", 860 | "\n", 861 | "var = 'last_new_job'\n", 862 | "X_train[var] = X_train[var].map(last_new_job_map)\n", 
863 | "X_test[var] = X_test[var].map(last_new_job_map)\n", 864 | "\n", 865 | "var = 'company_size'\n", 866 | "X_train[var] = X_train[var].map(company_size_map)\n", 867 | "X_test[var] = X_test[var].map(company_size_map)" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": 22, 873 | "id": "2d28bbcd", 874 | "metadata": {}, 875 | "outputs": [ 876 | { 877 | "data": { 878 | "text/html": [ 879 | "
\n", 880 | "\n", 893 | "\n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 
| " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | "
citycity_development_indexrelevent_experienceenrolled_universityeducation_levelmajor_disciplineexperiencecompany_sizecompany_typelast_new_jobtraining_hoursgender_Malegender_Missinggender_Othergender_Female
1914700.6241244130.51474615.3719211000
846400.6240244010.51474616.4152910100
886910.9100034900.51474614.7483991000
1164520.72201341040.51474633.7537940100
774330.9201010200.31952205.8774770100
................................................
9225310.47912441000.31952254.7021841000
1312300.6240044660.51474613.8294701000
984530.9200044660.51474605.2329790010
1079900.6241044220.31952216.0308790100
273210.91000441200.31952211.5358191000
\n", 1115 | "

15326 rows × 15 columns

\n", 1116 | "
" 1117 | ], 1118 | "text/plain": [ 1119 | " city city_development_index relevent_experience enrolled_university \\\n", 1120 | "19147 0 0.624 1 2 \n", 1121 | "8464 0 0.624 0 2 \n", 1122 | "8869 1 0.910 0 0 \n", 1123 | "11645 2 0.722 0 1 \n", 1124 | "7743 3 0.920 1 0 \n", 1125 | "... ... ... ... ... \n", 1126 | "9225 31 0.479 1 2 \n", 1127 | "13123 0 0.624 0 0 \n", 1128 | "9845 3 0.920 0 0 \n", 1129 | "10799 0 0.624 1 0 \n", 1130 | "2732 1 0.910 0 0 \n", 1131 | "\n", 1132 | " education_level major_discipline experience company_size \\\n", 1133 | "19147 4 4 1 3 \n", 1134 | "8464 4 4 0 1 \n", 1135 | "8869 3 4 9 0 \n", 1136 | "11645 3 4 10 4 \n", 1137 | "7743 1 0 2 0 \n", 1138 | "... ... ... ... ... \n", 1139 | "9225 4 4 10 0 \n", 1140 | "13123 4 4 6 6 \n", 1141 | "9845 4 4 6 6 \n", 1142 | "10799 4 4 2 2 \n", 1143 | "2732 4 4 12 0 \n", 1144 | "\n", 1145 | " company_type last_new_job training_hours gender_Male \\\n", 1146 | "19147 0.514746 1 5.371921 1 \n", 1147 | "8464 0.514746 1 6.415291 0 \n", 1148 | "8869 0.514746 1 4.748399 1 \n", 1149 | "11645 0.514746 3 3.753794 0 \n", 1150 | "7743 0.319522 0 5.877477 0 \n", 1151 | "... ... ... ... ... \n", 1152 | "9225 0.319522 5 4.702184 1 \n", 1153 | "13123 0.514746 1 3.829470 1 \n", 1154 | "9845 0.514746 0 5.232979 0 \n", 1155 | "10799 0.319522 1 6.030879 0 \n", 1156 | "2732 0.319522 1 1.535819 1 \n", 1157 | "\n", 1158 | " gender_Missing gender_Other gender_Female \n", 1159 | "19147 0 0 0 \n", 1160 | "8464 1 0 0 \n", 1161 | "8869 0 0 0 \n", 1162 | "11645 1 0 0 \n", 1163 | "7743 1 0 0 \n", 1164 | "... ... ... ... 
\n", 1165 | "9225 0 0 0 \n", 1166 | "13123 0 0 0 \n", 1167 | "9845 0 1 0 \n", 1168 | "10799 1 0 0 \n", 1169 | "2732 0 0 0 \n", 1170 | "\n", 1171 | "[15326 rows x 15 columns]" 1172 | ] 1173 | }, 1174 | "execution_count": 22, 1175 | "metadata": {}, 1176 | "output_type": "execute_result" 1177 | } 1178 | ], 1179 | "source": [ 1180 | "X_train" 1181 | ] 1182 | }, 1183 | { 1184 | "cell_type": "code", 1185 | "execution_count": 23, 1186 | "id": "d86f015b", 1187 | "metadata": { 1188 | "scrolled": true, 1189 | "tags": [] 1190 | }, 1191 | "outputs": [ 1192 | { 1193 | "data": { 1194 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAd0AAAEYCAYAAAAZGCxpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAUxElEQVR4nO3df/QldX3f8eeLhQVB5JdW6QIFLcWCIshGEK1RjAFt6tqICUoF0cqxaqLllBQaDx5s09MEYxNajcVIBJsIgqas1GBQMXqIID/k9w/dgMIiBhcFCZ6A4Lt/3Fn2282u3ztf7nzu93v3+Tjnnp2ZO3fu+86Z3dfOzGc+n1QVkiRpeFtNuwBJkrYUhq4kSY0YupIkNWLoSpLUiKErSVIjW0+7gLmOOuqouuSSS6ZdhiSpvUy7gBYW1ZnuunXrpl2CJEmDWVShK0nSLDN0JUlqxNCVJKkRQ1eSpEYMXUmSGjF0JUlqxNCVJKkRQ1eSpEYMXUmSGjF0JUlqJFU17RqesMOz9qnnvvn0aZchSRrTNWccN6lN2feyJEmaHENXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoZPHSTHJXk9iRrkpwy9PdJkrRYDRq6SZYBHwZeDewPvDHJ/kN+pyRJi9XQZ7ovAtZU1R1V9ShwHrBq4O+UJGlR2nrg7a8A7p4zvxY4dO4KSU4ETgRYsdM2/PmOZwxcksa112k3TrsESZopU29IVVVnVdXKqlq56w7Lpl2OJEmDGTp07wH2nDO/R7dMkqQtztChexWwb5J9kiwHjgFWD/ydkiQtSoPe062qx5K8G/gCsAw4u6puHvI7JUlarIZuSEVVfR74/NDfI0nSYjf1hlSSJG0pDF1JkhoxdCVJasTQlSSpEUNXkqRGeoduku2HKESSpFk3dugmOTzJLcBt3fwLknxksMokSZoxfc50/ztwJHA/QFVdD7xsiKIkSZpFvS4vV9XdGy16fIK1SJI00/r0SHV3ksOBSrIN8B7g1mHKkiRp9vQ5030H8C5GY+TeAxzUzUuSpDGMfaZbVeuAYwesRZKkmdan9fI5SXaeM79LkrMHqUqSpBnU5/LygVX1wPqZqvoRcPDEK5IkaUb1Cd2tkuyyfibJrjQYGlCSpFnRJzR/H/h
6kguAAEcDvzNIVZIkzaA+DanOTXIN8Ipu0a9W1S3DlCVJ0uzpe3n4NuBH6z+XZK+qumviVUmSNIPGDt0kvwG8H/hbRj1RBSjgwGFKkyRptvQ5030PsF9V3T9UMZIkzbI+rZfvBh4cqhBJkmZdnzPdO4CvJPm/wCPrF1bVhyZelSRJM6hP6N7VvZZ3L0mS1EOfR4ZOB0iyfVX9ZLiSJEmaTX36Xn5xklsYPTZEkhck+chglUmSNGP6NKT6A+BI4H6AqroeeNkANUmSNJP6hC5VdfdGix6fYC2SJM20Pg2p7k5yOFBJtmH03O6tw5QlSdLs6XOm+w7gXcAK4B7goG5ekiSNoU/r5XXAsQPWIknSTJs3dJP8VlX9XpL/waiv5f9PVf3mIJVJkjRjxjnTXX/f9uohC5EkadbNG7pV9bkky4DnV9V/aFCTJEkzaayGVFX1OPCSgWuRJGmm9Xlk6Lokq4ELgIfXL6yqz068KkmSZlCf0N2OUW9UR8xZVoChK0nSGPo8MnTCkIVIkjTr+gx48M+SfCnJTd38gUneN1xpkiTNlj49Un0MOBX4KUBV3QAcM0RRkiTNoj6hu31VfWOjZY9NshhJkmZZn9Bdl+Q5dL1SJTkauHeQqiRJmkF9Wi+/CzgLeG6Se4A7mXBfzMt3P4C9TrPjK0nSbOrTevkO4JeS7ABsVVUPDVeWJEmzp0/r5d2SnAl8DfhKkj9MsttwpUmSNFv63NM9D/gB8Hrg6G76/CGKkiRpFvW5p7t7Vf3nOfP/JcmvT7ogSZJmVZ8z3b9MckySrbrXrwFfGKowSZJmTar+wbj0m14xeQjYAXi8W7SMDQMfVFU97ckWs3Llyrr6alsvS9IWKNMuoIU+rZd3HLIQSZJmXZ/Wy2/baH5ZkvdPviRJkuaXZOck72zwPa9Lsv8kttXnnu4rk3w+ye5JngdcAXj2K0malp2BsUM3I31yb73XARMJ3T6Xl9/UtVa+kdG93DdV1eWTKEKSpAX4b8BzklwHXAYcCOwCbAO8r6ouSrI3o0a/VwKHAK9Jchzwbxg9+no3cE1VfbDr6vjDwDOAnwBvB3YFXgv8Yjey3uur6m8WWvDYoZtkX+A9wGeAfw68Ock3q+onC/1ySZKehFOA51XVQUm2ZjQwz4+TPB24Isnqbr19geOr6ookv8Cov4kXMArna4FruvXOAt5RVd9Ocijwkao6otvOxVV14ZMtuM9zup8D3l1VX0wS4CTgKuCAJ1uEJElPUoD/muRlwM+AFcAzu/e+W1VXdNMvAS6qqr8H/j7J5wCSPBU4HLhgFHEAbDvpIvuE7ouq6scwej4I+P31xUqSNGXHMrosfEhV/TTJd4Dtuvce3uynNtgKeKCqDhqmvA1fMq6nJPl4kksAupZc/2KYsiRJmtdDbGjQuxNwXxe4rwD+yWY+cznwr5Js153d/gpAd1J5Z5I3wBONrl6wie95UvqE7icY3YzevZv/FvDeSRQhSVJfVXU/cHmSm4CDgJVJbgSOA27bzGeuAlYDNwB/wahx8IPd28cCb0tyPXAzsKpbfh5wcpJvdo2tFqxPj1RXVdUvdI2nDu6WXTfJU3F7pJKkLVazHqmSPLWq/i7J9sBXgROr6toW393nnu7D3VB+BZDkMDb870CSpKXirO4W6XbAOa0CF/qF7kmMTsmfk+RyRjesjx6kKkmSBlJVb5rWd/fpHOPaJL8I7MfoMsDtVfXT9e8neVVVXTpAjZIkzYRe3WFV1WNVdXNV3TQ3cDu/O8G6JEmaOQvpg3JztohhmSRJWqhJhu54zaAlSdpCTTJ0JUmaCUmOSnJ7kjVJTtnE+9smOb97/8puYIV59RnwYNuqeuTnLPvOuNvanFvX3s8hJ5/7ZDejReyaM46bdgmSlpBDTj53oldRrznjuHlvhSZZxmi0oVcBa4GrkqyuqlvmrPY24EdV9U+THMOoXdOvz7ftPme6X/95y6rqV3tsS5KkxepFwJqquqOqHmXUI9W
qjdZZBZzTTV/IaMz5eQN93jPdJM9iNFrDU5IczIYGU08Dth+vfkmSlowVjMbZXW8tcOjm1qmqx5I8COwGrPt5Gx7n8vKRwFuAPYAPzVn+EPCfxvi8JElijNCtqnOAc5K8vqo+06AmSZKm6R5gzznze3TLNrXO2iRbMxrl6P75NtynG8iLk7wJ2Hvu56rqAz22IUnSYncVsG+SfRiF6zHAxl1HrgaOZ9S26WjgyzXGCEJ9QvciRgMcXAM8Ms+6kiQtSd092nczGs52GXB2Vd2c5APA1VW1Gvg48Mkka4AfMgrmefUZ2u+mqnregn7BmHZ41j713DefPuRXaMp8ZEjSZmwRvRr2eWTor5M8f7BKJEmacX0uL78UeEuSOxldXg5QVXXgIJVJkjRj+oTuqwerQpKkLcDYl5er6ruMmkcf0U3/pM/nJUna0o0dmkneD/xH4NRu0TbA/x6iKEmSZlGfM9V/DbwWeBigqr4H7DhEUZIkzaI+ofto9+BvASTZYZiSJEmaniRnJ7kvyU2beT9JzuyG9bshyQvH3XafhlSfTvK/gJ2TvB14K/CxHp+XJKmXuz7w/IkO7bfXaTeO8zzwJ4D/CWxurNlXA/t2r0OBP+IfDoiwSWOHblV9MMmrgB8D+wGnVdWl435ekqSloKq+Os+g9KuAc7urv1ck2TnJ7lV173zb7jOI/UnA+QatJGkLt6mh/1YA84Zun3u6OwJ/meRrSd6d5Jn9apQkacvW5znd06vqAOBdwO7AXyX54mCVSZK0OI0z9N8mLaRzi/uA7zMaN/AfLeDzkiQtZauB47pWzIcBD45zPxf63dN9J/BrwDOAC4C3V9UtC6lWkqTFKsmngJcDT0+yFng/ow6hqKqPAp8HXgOsYdQ74wnjbrvPI0N7Au+tqut6fEaSpAUb8xGfiaqqN87zfjG61dpbn3u6pwJPTXICQJJnJNlnIV8qSdKWyL6XJUlqxL6XJUlqxL6XJUlqpE/obtz38hex72VJksZm38uSJDXS55EhupA1aCVJWoB5QzfJQ3T3cTd+i9HjSk+beFWSJM2geUO3qmyhLEnSBPTqeznJS+d0jvF0O8eQJGl8T6ZzjOXYOYYkSWOzcwxJkhqxcwxJkhoZK3STBLjYzjEkSVq4sZ7TrapK8gbgJOwcQ5KkBenTOca1wANVdfJQxUiSNMv6hO6hwLFJvkvXmAqgqg6ceFWSJM2gPqF75GBVSJK0Begz4MF3hyxEkqRZ16tHKkmStHCGriRJjQwauknOTnJfkpuG/B5JkpaCoc90PwEcNfB3SJK0JAwaulX1VeCHQ36HJElLRZ9HhgaR5ETgRIAVO23Dn+94xpQram+v026cdgmSpAam3pCqqs6qqpVVtXLXHZZNuxxJkgYz9dCVJGlLYehKktTI0I8MfQr4OrBfkrVJ3jbk90mStJgN2pCqqt445PYlSVpKvLwsSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjW0+7gLmW734Ae5129bTLkCRpEJ7pSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY2kqqZdwxOSPATcPu06ZszTgXXTLmKGuD8nz306WUt1f66rqqOmXcTQFtWAB8DtVbVy2kXMkiRXu08nx/05ee7TyXJ/Lm5eXpYkqRFDV5KkRhZb6J417QJmkPt0styfk+c+nSz35yK2qBpSSZI0yxbbma4kSTP
L0JUkqZFFE7pJjkpye5I1SU6Zdj2LVZI9k1yW5JYkNyd5T7d81ySXJvl29+cu3fIkObPbrzckeeGcbR3frf/tJMdP6zctBkmWJflmkou7+X2SXNntt/OTLO+Wb9vNr+ne33vONk7tlt+e5Mgp/ZRFIcnOSS5McluSW5O82GN04ZL8++7v+01JPpVkO4/RJaqqpv4ClgF/AzwbWA5cD+w/7boW4wvYHXhhN70j8C1gf+D3gFO65acAv9tNvwb4CyDAYcCV3fJdgTu6P3fppneZ9u+b4n49Cfgz4OJu/tPAMd30R4F/102/E/hoN30McH43vX933G4L7NMdz8um/bumuD/PAf5tN70c2NljdMH7cgVwJ/CUbv7TwFs8Rpfma7Gc6b4IWFNVd1TVo8B5wKop17QoVdW9VXVtN/0QcCujv5SrGP1DR/fn67rpVcC5NXIFsHOS3YEjgUur6odV9SPgUmDme4PZlCR7AP8S+ONuPsARwIXdKhvvz/X7+ULgld36q4DzquqRqroTWMPouN7iJNkJeBnwcYCqerSqHsBj9MnYGnhKkq2B7YF78RhdkhZL6K4A7p4zv7Zbpp+ju2x0MHAl8Myqurd76/vAM7vpze1b9/kGfwD8FvCzbn434IGqeqybn7tvnthv3fsPduu7PzfYB/gB8CfdJfs/TrIDHqMLUlX3AB8E7mIUtg8C1+AxuiQtltBVT0meCnwGeG9V/XjuezW6luSzYGNI8ivAfVV1zbRrmSFbAy8E/qiqDgYeZnQ5+Qkeo+Pr7n2vYvSfmX8M7MCWe8a/5C2W0L0H2HPO/B7dMm1Ckm0YBe6fVtVnu8V/212So/vzvm755vat+3zkJcBrk3yH0W2NI4A/ZHSJc33f5HP3zRP7rXt/J+B+3J9zrQXWVtWV3fyFjELYY3Rhfgm4s6p+UFU/BT7L6Lj1GF2CFkvoXgXs27XGW87o5v/qKde0KHX3Zj4O3FpVH5rz1mpgfevO44GL5iw/rmshehjwYHeJ7wvALyfZpfuf9C93y7YoVXVqVe1RVXszOu6+XFXHApcBR3erbbw/1+/no7v1q1t+TNdydB9gX+AbjX7GolJV3wfuTrJft+iVwC14jC7UXcBhSbbv/v6v358eo0vRtFtyrX8xasH4LUYt6n572vUs1hfwUkaX5W4Aruter2F0z+ZLwLeBLwK7dusH+HC3X28EVs7Z1lsZNaZYA5ww7d827Rfwcja0Xn42o3+Q1gAXANt2y7fr5td07z97zud/u9vPtwOvnvbvmfK+PAi4ujtO/w+j1sceowvfn6cDtwE3AZ9k1ALZY3QJvuwGUpKkRhbL5WVJkmaeoStJUiOGriRJjRi6kiQ1YuhKktSIoSstQUnem2T7adchqR8fGZKWoK4HrZVVtW7atUgan2e60kCSHNeND3t9kk8m2TvJl7tlX0qyV7feJ5IcPedzf9f9+fIkX5kzLu2fdr02/SajPngvS3LZdH6dpIXYev5VJPWV5ADgfcDhVbUuya6Mhls7p6rOSfJW4Ew2DMe2OQcDBwDfAy4HXlJVZyY5CXiFZ7rS0uKZrjSMI4AL1odiVf0QeDHwZ937n2TUped8vlFVa6vqZ4y6/Nx78qVKasXQlabvMbq/i0m2ApbPee+ROdOP49UpaUkzdKVhfBl4Q5LdALrLy3/NaCQjgGOBr3XT3wEO6aZfC2wzxvYfAnacVLGS2vB/zdIAqurmJL8D/FWSx4FvAr8B/EmSk4EfACd0q38MuCjJ9cAljAZ9n89ZwCVJvldVr5j8L5A0BB8ZkiSpES8vS5LUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY38P3MU1Oc0v5hnAAAAAElFTkSuQmCC\n", 1195 | "text/plain": [ 1196 | "
" 1197 | ] 1198 | }, 1199 | "metadata": { 1200 | "needs_background": "light" 1201 | }, 1202 | "output_type": "display_data" 1203 | }, 1204 | { 1205 | "data": { 1206 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAd0AAAEYCAYAAAAZGCxpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAVfElEQVR4nO3de9RldX3f8feHuXAZyQCaKhkYGQgNC4zKpYpiE8E0IkWGpVghFlBbiY1GjK2WLF0oriQrCeiqVquLCAoWJIpYkBosBoKJKQQGBOQWJ0B0kIgQBaqGm9/+cfbg42SY52zm7N95OPN+rXXWOfv+PXvtmc+zf3uf305VIUmShrfVtAuQJGlLYehKktSIoStJUiOGriRJjRi6kiQ1snjaBcx16KGH1iWXXDLtMiRJ7WXaBbSwoM5077333mmXIEnSYBZU6EqSNMsMXUmSGjF0JUlqxNCVJKkRQ1eSpEYMXUmSGslCesrQsmetqr2OPWXaZUiSxrTm1OMmtSp/pytJkibH0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgYP3SSHJrktydokJw29PUmSFqpBQzfJIuCjwCuAvYFjkuw95DYlSVqohj7TfQGwtqpur6qHgfOA1QNvU5KkBWno0F0BfHvO8LpunCRJW5zF0y4gyQnACQArli/hC9ufOuWKBLDy5BunXYIkzZyhz3TvAnadM7xLN+5xVXV6VR1QVQfstGzRwOVIkjQ9Q4fu1cCeSVYlWQocDVw08DYlSVqQBm1erqpHk7wV+DKwCDizqm4acpuSJC1Ug1/TraovAV8aejuSJC109kglSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0snnYBcy3deR9WnnzNtMuQJGkQnulKktSIoStJUiNjh26SNUnekmTHIQuSJGlW9TnTfS3wC8DVSc5L8vIkGaguSZJmztihW1Vrq+rdwL8EzgXOBP4+ySlJdhqqQEmSZkWva7pJngt8ADgV+DzwGuAB4LLJlyZJ0mwZ+ydDSdYAPwDOAE6qqoe6SVclOWiA2iRJmil9fqf7mqq6fe6IJKuq6o6qetWE65Ikaeb0aV4+f8xxkiRpI+Y9002yF7APsDzJ3DPanwO2GaowSZJmzTjNy78EHA7sALxyzvgHgTcNUJMkSTNp3tCtqguBC5O8qKr+b4OaJEmaSeM0L7+rqv4Y+I0kx2w4vareNkhlkiTNmHGal2/p3n38jyRJm2Gc5uUvdu9nrR+XZCvgaVX1wIC1SZI0U1JV482YnAu8GXgMuJrR3csfqqpTJ1XMsmetqr2OPWVSqxvLmlOPa7o9SdJGbRF9+ff5ne7e3ZntkcCfAauAY4coSpKkWdQndJckWcIodC+qqkeA8U6TJUlSr9D9OHAnsAz4apJnM3rYgSRJGsNYfS93N059t6pWzBn3LeDgoQqTJGnWjHWmW1U/Ad61wbiqqkcHqUqSpBnUp3n5K0n+S5Jdk+y0/jVYZZIkzZg+j/Z7bff+ljnjCth9cuVIkjS7xg7
dqlo1ZCGSJM26sZuXk2yX5D1JTu+G90xy+HClSZI0W/pc0/0k8DDw4m74LuD3Jl6RJEkzqk/o7tE9begRgKr6EVtIt12SJE1Cn9B9OMm2dL1QJdkDeGiQqiRJmkF97l5+H3AJsGuSc4CDgNcPUJMkSTOpz93L/yfJGuBARs3KJ1bVvYNVJknSjBk7dJN8ETiX0cMOfjhcSZIkzaY+13RPA/41cHOS85MclWSbgeqSJGnm9GlevgK4Iski4BDgTcCZjB5mL0mS5tHnRiq6u5dfyahLyP2As4YoSpKkWdTnmu5ngRcwuoP5I8AV3dOHJEnSGPqc6Z4BHFNVjw1VjCRJs2ze0E1ySFVdBiwDVic/2wlVVV0wUG2SJM2Ucc50fxW4jNG13A0VYOhKkjSGeUO3qt7bvb9h+HIkSZpdfW6k2hp4NbDb3OWq6v2TL0uSpNnT50aqC4H7gTX4oANJknrrE7q7VNWhg1UiSdKM69MN5F8n+eXBKpEkqYckOyT5rQbbOTLJ3pNYV5/QfQmwJsltSW5IcmOSGza1QJIzk9yT5BubV6YkSf/MDsDYoZuRPrm33pHAREK3T/PyK57E+j/FqPeqs5/EspIkbcofAnsk+TpwOfBcYEdgCfCeqrowyW7Al4GrgP2Bw5IcB/x74HvAt4E1VXVakj2AjwI/D/yI0TMGdgKOAH41yXuAV1fV3z3ZgvuEbvVdeVV9tfvCkiRN2knAc6rq+UkWA9tV1QNJngFcmeSibr49geOr6sok/4rRL3Gexyicr2V0gzDA6cCbq+qbSV4I/I+qOqRbz8VVdf7mFtwndP83o+ANsA2wCrgN2GdzCkhyAnACwIrlS/jC9qfOu8zKk2/cnE1KkmZPgD9I8ivAT4AVwDO7aX9fVVd2nw8CLqyqfwL+qXtWPEmeBrwY+Nycnhe3nnSRfR7t9zM3USXZjx5t6ZtY7+mM/rrguSu27X02LUkS8DpGzcL7V9UjSe5kdIII8MMxlt8K+EFVPX+Y8n66kSelqq4FXjjBWiRJ6uNBYPvu83Lgni5wDwae/QTLfA14ZZJturPbwwGq6gHgjiSvgcdvunreRrazWfr0SPWOOYNbMXqe7ncmUYQkSX1V1X1Jvtb9QuZqYK8kNwLXALc+wTJXd9dobwC+C9zIqOMnGJ0tf6y7YWoJcB5wfff+J0neBhzV6kaquSn/KKNrvJ/f1AJJPgO8FHhGknXAe6vqjL5FSpK0MVX1G2PM9pwNhk+rqvcl2Q74Kt2NVFV1B/DPOoGqqq/R+idDVXXKpqYn+e9V9dsbLHPMky1MkqSBnN51drENcFZ3ubSJPme68zloguuSJGkQY54dD+JJ30glSZL6MXQlSWpkkqGb+WeRJGnLNcnQ/dAE1yVJ0syZ90aqrousJ+wpqqqO6N4/NbmyJEmaniSHMjqZXAR8oqr+cIPpWzN6mM/+wH3Aa6vqzvnWO87dy6d1768CngX8z274GEY/LJYkaRD7v/PsiXYPvObU4+a9FJpkEaOnDf0bYB1wdZKLqurmObP9B+D7VfWLSY4G/gh47Xzrnjd0q+qKrogPVNUBcyZ9Mck18y0vSdJTzAuAtVV1O0CS84DVwNzQXQ28r/t8PvCRJKmqTf6R0Oea7rIku68fSLIKWNZjeUmSngpWMHrO7nrrunEbnaeqHmXUleTT51txn84xfgf4iyS3M7pT+dnAb/ZYXpKkLVqfbiAvSbInsFc36taqemiYsiRJmpq7gF3nDO/SjdvYPOuSLGb0lKP75lvx2M3LXcfQ7wTeWlXXAyuTHD7u8pIkPUVcDeyZZFWSpcDRwEUbzHMRcHz3+Sjgsvmu50K/a7qfBB4GXtQN3wX8Xo/lJUla8LprtG8FvgzcAny2qm5K8v4kR3SznQE8Pcla4B3ASeOsO2ME82jG5JqqOiDJdVW1bzfu+qp63nzLjuu5K7ati3/zF+edb+XJN05qk5KkhWGL6NWwz5nuw0m2pesoI8kegNd0JUk
aU5+7l98LXALsmuQcRo/ye/0QRUmSNIv63L18aZJrgQMZNQOcWFX3DlaZJEkzZpy+l/fbYNTd3fvKJCur6trJlyVJ0uwZ50z3A5uYVsAhE6pFkqSZNk7fywe3KESSpFk3TvPyqzY1vaoumFw5kiRNV5IzgcOBe6rqORuZHkaP/TsM+BHw+nEvtY7TvPzKTUwrwNCVJA3iW+//5Yk+2m/lyTeO83vgTwEfYfS83I15BbBn93oh8LHufV7jNC+/YZwVSZI0C6rqq0l228Qsq4Gzu24fr0yyQ5Kdq+ruTSwD9Ot7eXmSDya5pnt9IMnycZeXJGlGjPPov43q0yPVmcCDwL/rXg8w6o9ZkiSNoU+PVHtU1avnDJ+S5OuTLGbpzvuw8uRrJrlKSZImbZxH/21UnzPdHyd5yfqBJAcBP+6xvCRJs+Ai4LiMHAjcP871XOh3pvtm4Ow513G/z0+fJShJ0kxI8hngpcAzkqxj9OyBJQBV9XHgS4x+LrSW0U+Gxr7heKzQTbIIOLaqnpfk57oNP9DjO0iS1NuYP/GZqKo6Zp7pBbzlyax7rNCtqsfWNy0btpIkPTl9mpevS3IR8Dngh+tH2iOVJEnj6RO62wD38bMPOLBHKkmSxtTnebr2TCVJ0mYYO3ST/DzwJmC3uctV1RsnX5YkSbOnT/PyhcBfAl8BHhumHEmSZlef0N2uqv7rYJVIkjTj+vRIdXGSwwarRJKkGdcndE8Evpjkx0keSPJgEn+zK0nSmPo0Ly8HXgesqqr3J1kJ7DxMWZIkzZ4+Z7ofBQ4E1neP9SDwkYlXJEnSjOpzpvvCqtovyXUAVfX9JEsHqkuSpJnTJ3Qf6R58UPD473Z/Mslibll3H/u/8+xJrnKmrTn1uGmXIEnqoU/z8oeBLwD/IsnvA38F/MEgVUmSNIP6dAN5TpI1wMuAAEdW1S2DVSZJ0ozp07xMVd0K3DpQLZIkzbQ+zcuSJGkzGLqSJDVi6EqS1IihK0lSI4auJEmNGLqSJDVi6EqS1IihK0lSI4auJEmNGLqSJDVi6EqS1IihK0lSI4auJEmNGLqSJDVi6EqS1IihK0lSI4auJEmNGLqSJDVi6EqS1IihK0lSI4auJEmNGLqSJDVi6EqS1IihK0lSI4OGbpJdk1ye5OYkNyU5ccjtSZK0kC0eeP2PAv+5qq5Nsj2wJsmlVXXzwNuVJGnBGfRMt6rurqpru88PArcAK4bcpiRJC1Wza7pJdgP2Ba5qtU1JkhaSoZuXAUjyNODzwNur6oENpp0AnACwYvkSvrD9qS1KGsTKk2+cdgmSpAVs8DPdJEsYBe45VXXBhtOr6vSqOqCqDthp2aKhy5EkaWqGvns5wBnALVX1wSG3JUnSQjf0me5BwLHAIUm+3r0OG3ibkiQtSINe062qvwIy5DYkSXqqsEcqSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWpk8bQLmGvpzvuw8uRrpl2GJEmD8ExXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJaiRVNe0aHpfkQeC2adcxA54B3DvtImaA+3Ey3I+bb0vYh/dW1aHTLmJoC6pzDOC2qjpg2kU81SW5xv24+dyPk+F+3Hzuw9lh87IkSY0YupIkNbLQQvf0aRcwI9yPk+F+nAz34+ZzH86IBXUjlSRJs2yhnelKkjSzDF1JkhpZMKGb5NAktyVZm+SkadezkCTZNcnlSW5OclOSE7vxOyW5NMk3u/cdu/FJ8uFuX96QZL856zq+m/+bSY6f1neapiSLklyX5OJueFWSq7r99adJlnbjt+6G13bTd5uzjt/txt+W5OVT+ipTk2SHJOcnuTX
JLUle5PHYT5Lf6f49fyPJZ5Js47G4Baiqqb+ARcDfAbsDS4Hrgb2nXddCeQE7A/t1n7cH/hbYG/hj4KRu/EnAH3WfDwP+DAhwIHBVN34n4Pbufcfu847T/n5T2J/vAM4FLu6GPwsc3X3+OPCfus+/BXy8+3w08Kfd5727Y3RrYFV37C6a9vdqvA/PAv5j93kpsIPHY6/9twK4A9h2zjH4eo/F2X8tlDPdFwBrq+r2qnoYOA9YPeWaFoyquruqru0+Pwjcwugf7WpG//nRvR/ZfV4NnF0jVwI7JNkZeDlwaVX9Y1V9H7gUmPkeYOZKsgvwb4FPdMMBDgHO72bZcD+u37/nAy/r5l8NnFdVD1XVHcBaRsfwFiHJcuBXgDMAqurhqvoBHo99LQa2TbIY2A64G4/FmbdQQncF8O05w+u6cdpA16y0L3AV8Myqurub9A/AM7vPT7Q/3c/w34B3AT/php8O/KCqHu2G5+6Tx/dXN/3+bv4tfT+uAr4HfLJrpv9EkmV4PI6tqu4CTgO+xShs7wfW4LE48xZK6GoMSZ4GfB54e1U9MHdaVRXg7782IcnhwD1VtWbatTzFLQb2Az5WVfsCP2TUnPw4j8dN6653r2b0B8wvAMvYss7yt1gLJXTvAnadM7xLN06dJEsYBe45VXVBN/q7XTMd3fs93fgn2p9b+n4+CDgiyZ2MLmEcAnyIUXPn+n7I5+6Tx/dXN305cB/ux3XAuqq6qhs+n1EIezyO79eAO6rqe1X1CHABo+PTY3HGLZTQvRrYs7tzbymjGwUumnJNC0Z37eYM4Jaq+uCcSRcB6+/4PB64cM7447q7Rg8E7u+a/b4M/HqSHbu/tH+9G7dFqKrfrapdqmo3RsfYZVX1OuBy4Khutg334/r9e1Q3f3Xjj+7uKF0F7An8TaOvMXVV9Q/At5P8UjfqZcDNeDz28S3gwCTbdf++1+9Dj8VZN+07uda/GN3h+LeM7r5797TrWUgv4CWMmupuAL7evQ5jdE3nz4FvAl8BdurmD/DRbl/eCBwwZ11vZHSzxVrgDdP+blPcpy/lp3cv787oP6q1wOeArbvx23TDa7vpu89Z/t3d/r0NeMW0v88U9t/zgWu6Y/J/Mbr72OOx3z48BbgV+AbwaUZ3IHsszvjLbiAlSWpkoTQvS5I08wxdSZIaMXQlSWrE0JUkqRFDV5KkRgxd6SkoyduTbDftOiT140+GpKegrletA6rq3mnXIml8nulKA0lyXPf82OuTfDrJbkku68b9eZKV3XyfSnLUnOX+X/f+0iR/Mee5ted0vTq9jVF/vZcnuXw6307Sk7F4/lkk9ZVkH+A9wIur6t4kOzF6NNtZVXVWkjcCH+anj257IvsC+wDfAb4GHFRVH07yDuBgz3SlpxbPdKVhHAJ8bn0oVtU/Ai8Czu2mf5pR957z+ZuqWldVP2HU/eduky9VUiuGrjR9j9L9W0yyFbB0zrSH5nx+DFunpKc0Q1caxmXAa5I8HaBrXv5rRk83Angd8Jfd5zuB/bvPRwBLxlj/g8D2kypWUhv+1SwNoKpuSvL7wBVJHgOuA34b+GSSdwLfA97Qzf4nwIVJrgcuYfRQ+PmcDlyS5DtVdfDkv4GkIfiTIUmSGrF5WZKkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrk/wMYpOnoPJkZCQAAAABJRU5ErkJggg==\n", 1207 | "text/plain": [ 1208 | "
" 1209 | ] 1210 | }, 1211 | "metadata": { 1212 | "needs_background": "light" 1213 | }, 1214 | "output_type": "display_data" 1215 | }, 1216 | { 1217 | "data": { 1218 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAd0AAAEYCAYAAAAZGCxpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAWiklEQVR4nO3de7BlZX3m8e9D09xbGpAo04hAJFBgkEsLKhYqlopG0UpIBUaFQjM98RK0nCKFE6YzOhNL01MpxVEQr+gYjUoQQqnECEZFuXSD3CF0lBmaoAgGRS2NwG/+2Kvh0HZz9m7Oevelv5+qU2fd9lq/F3bXc9Za71pvqgpJktS/rcZdgCRJWwpDV5KkRgxdSZIaMXQlSWrE0JUkqZGtx13AXMcee2x95StfGXcZkqT2Mu4CWpioM9177rln3CVIktSbiQpdSZJmmaErSVIjhq4kSY1kkl4DueOT96kDXvuOzf78mlUnLWA1kqSG7EglSZIWjqErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY30HrpJjk1ya5K1SU7v+3iSJE2qXkM3ySLgA8BLgQOBE5Mc2OcxJUmaVH2f6R4BrK2q71XVvwOfBV7Z8zElSZpIW/e8/2XAHXPm1wFHzt0gyQpgBcCynRdz/pJVAOy18vqeS5Mkqa2xd6SqqnOqanlVLd91x0XjLkeSpN70Hbp3Ak+ZM79nt0ySpC1O36F7FbBfkn2SbAOcAFzY8zElSZpIvd7TraoHkrwZuBhYBHysqm7s85iSJE2qvjtSUVVfAr7U93EkSZp0Y+9IJUnSlsLQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJasTQlSSpEUNXkqRGDF1JkhoxdCVJaqT3UYZGsc0eB7HXytXjLkOSpF54pitJUiOGriRJjRi6kiQ1YuhKktSIoStJUiOGriRJjRi6kiQ1YuhKktRIqmrcNTxsxyfvUwe89h1Dbbtm1Uk9VyNJaijjLqAFz3QlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRnoN3SQfS3J3khv6PI4kSdOg7zPdTwDH9nwMSZKmQq+hW1XfAH7c5zEkSZoWW4+7gCQrgBUAy3ZezPlLVrHXyuvHXJUkSQtv7B2pquqcqlpeVct33XHRuMuRJKk3Yw9dSZK2FIauJEmN9P3I0GeA7wD7J1mX5PV9Hk+SpEnWa0eqqjqxz/1LkjRNvLwsSVIjhq4kSY0YupIkNWLoSpLUiKErSVIj8/ZeTvJ+oDa1vqpOXdCKJEmaUcM8MrS69yokSdoCzBu6VXXu3PkkO1TVL/orSZKk2TT0Pd0kz05yE3BLN/+MJB/srTJJkmbMKB2p3gu8BLgXoKquBY7uoSZJkmbSSL2Xq+qODRY9uIC1SJI000Z59/IdSZ4DVJLFwFuAm/spS5Kk2TPKme6fAG8ClgF3Aod085IkaQip2uQjuI/eMNm9qn7UZzHLly+v1at9QkmStkAZdwEtjHKme1mSf0jy+iRL+ypIkqRZNXToVtXvAGcABwFXJ7koyWt6q0ySpBkzau/lK6vqbcARwI+Bc+f5iCRJ6ozycownJDk5yZeBbwN3MQhfSZI0hFEeGboW+CLwzqr6Tj/lSJI0u0YJ3X2rqpLs0Fs1kiTNsFHu6T7Ldy9LkrT5fPeyJEmNjHJ
5maq6I3nU88sL+u7lm9fdy+GnfXIhdzn11qw6adwlSJIWiO9eliSpEd+9LElSI0Of6VbVPcCre6xFkqSZNm/oJnk/sMlREarq1AWtSJKkGTXMma7D/kiStADmDd2qGur9ykneX1V/+vhLkiRpNo004ME8jlrAfUmSNHMWMnQlSdJjMHQlSWpkIUM3828iSdKWayFD930LuC9JkmbO0C/HSPI7wGnAU+d+rqqO6X5/YqGLkyRplozy7uXPA2cDH2aBBzqQJGlLMEroPlBVZ/VWiSRJM26Ue7p/n+SNSfZIsuv6n94qkyTpMSRZmuSNDY7zqiQHLsS+Rgndkxnc0/02sKb7ecxXRCZ5SpJLk9yU5MYkb9n8UiVJepSlwNChm4HN6UD8KmBBQneUUYb22Yz9PwD8l6q6OskSYE2Sr1bVTZuxL0mS5no38NtJvgtcChwM7AIsBs6oqguS7A1cDFwBHA68LMlJwGuAHwF3AGuq6n8l+W3gA8DuwC+A/wTsChwHPC/JGcAfVNW/bG7Bo/ReXgy8ATi6W/R14ENV9etNfaaq7gLu6qbvT3Izg/F4DV1J0uN1OvD0qjokydbADlX10yRPBC5PcmG33X7AyVV1eZJnAn8APINBOF/N4MotwDnAn1TVbUmOBD5YVcd0+7moqr7weAsepSPVWV2BH+zmX9st++NhPtz9tXEog7825i5fAawAWLbzYs5fsmqEkn7TXiuvf1yflyRNpQDvSnI08BCDE7wndev+b1Vd3k0fBVxQVb8Efpnk7wGS7AQ8B/h88vC7nrZd6CJHCd1nVtUz5sxfkuTaYT7YNeY84K1V9dO566rqHAZ/XXDwsu03OW6vJEmP4dUMLgsfXlW/TnI7sF237udDfH4r4L6qOqSf8h45yLAe7K53A5BkX4Z4Xre7LH0e8Omq+rvRS5QkaaPuB5Z00zsDd3eB+wIGL3LamMuAVyTZrjshfDlAd0L4/SR/CA93ulp/ojn3OI/LKGe6pwGXJvkeg9P4pwKnPNYHMjhH/yhwc1X99WZXKUnSBqrq3iSXJbkBuAo4IMn1DJ6suWUTn7mqu0d7HfBD4HrgJ93qVwNndR2mFgOfBa7tfn84yanA8Y+nI1Wqhr+im2RbYP9u9taq+tU82z8X+CaDRj3ULf6vVfWljW1/8LLt66L//LSh69kY7+lK0lRqNmhOkp2q6mdJdgC+AayoqqtbHHveM90kx1TVJUl+f4NVT0vCY10yrqpv4ehDkqTJck73sovtgHNbBS4Md3n5ecAlwCs2sq4A79NKkqZGVf3HcR173tCtqr/oJt9ZVd+fuy7J5rwwQ5KkLdIovZfP28iyx/2gsCRJW4ph7ukeABwE7LzBfd0n8MgzUJIkaR7D3NPdn8FzTEt59H3d+xm8l1KSJA1hmHu6FwAXJHl2VX2nQU2SJI1VkmOB9wGLgI9U1bs3WL8t8EkGgyjcC/xRVd0+335HeTnGNUnexOBS88OXlavqdSPsQ5KkoR1+2icX9PXAa1adNO9jrEkWMRht6EXAOuCqJBduMELe64F/q6qnJTkBeA/wR/Pte5SOVJ8Cngy8BPgnYE8Gl5glSZolRwBrq+p7VfXvDN5I9coNtnklcG43/QXghZkzUsKmjBK6T6uq/wb8vKrOBX4POHKEz0uSNA2WMRhnd7113bKNblNVDzB4leRu8+14lNBdP27ufUmezuDl0r81wuclSdqijRK65yTZBTgDuJDBQPR/1UtVkiSNz53AU+bM79kt2+g2SbZmcCJ673w7HrojVVV9pJv8BrDvsJ+TJGnKXAXs17118U7gBGDDV0deCJwMfAc4HrikhhhBaOgz3STvSrJ0zvwuSf7nsJ+XJGkadPdo3wxcDNwMfK6qbkzyziTHdZt9FNgtyVrgbcDpw+x76KH9klxTVYdusOzqqjpsyHbMa/ny5bV69eqF2p0kaXpsESPSjXJPd1H3MDAASbYHtn2M7SVJ0hyjvBzj08DXkny8mz+
FR55RkiRJ8xilI9V7klwHvLBb9D+q6uJ+ypIkafaMcqZLVX0Z+HJPtUiSNNOGDt0k9wPre11tAyxm8HaqJ/RRmCRJs2aUy8tL1k9375d8JfCsPoqSJGkWjdJ7+WE18EUGgx9IkjQzknwsyd1JbtjE+iQ5M8naJNclGfrR2VEuL//+nNmtgOXAL4f9vCRJo/p/7/zdBR3ab6+V1w/zPPAngP/NYLzcjXkpsF/3cyRwFkMOADRKR6pXzJl+ALid3xzq6HG5ed29HH7aptooDWfNqpPGXYKkKVZV30iy92Ns8krgk91rHy9PsjTJHlV113z7HuWe7inDbitJ0gzb1NB/jz90k7yfR3ot/4aqOnWIAiVJ2uIN05FqNbAG2A44DLit+zmEwaNDkiRtSYYZ+m+j5j3TrapzAZK8AXhuN/oCSc4GvjlyqZIkTbcLgTcn+SyDDlQ/GeZ+LozWkWoX4AnAj7v5nbplkiTNjCSfAZ4PPDHJOuAvGLwQiqo6G/gS8DJgLfALBmMRDGWU0H03cHWSrzMYgulo4L+P8HlJkkYy5CM+C6qqTpxnfQFv2px9j/JyjE8AK4GDgfOA5zEY3FeSJA1hlDPdDwIPAdtX1YVJdmEQvs/spTJJkmbMKKF7ZFUdluQagKr6tyT2XpYkaUijXF7+dZJFdM/sJtmdwZmvJEkawiiheyZwPvBbSf4S+Bbwrl6qkiRpBo3yGshPJ1kDvJBB7+VXVZUdqSRJGtIo93SpqluAW3qqRZKkmbZZ4+lKkqTRGbqSJDXSa+gm2S7JlUmuTXJjknf0eTxJkibZSPd0N8OvgGOq6mdJFgPfSvLlqrq85+NKkjRxeg3d7v2UP+tmF3c/mxybV5KkWdb3mS7dCzXWAE8DPlBVV2ywfgWwAmDZzos5f8mqvkvabHutvH7cJUiSpljvHamq6sGqOoTBIL9HJHn6BuvPqarlVbV81x0X9V2OJElj06z3clXdB1wKHNvqmJIkTZK+ey/vnmRpN7098CJ8uYYkaQvV9z3dPYBzu/u6WwGfq6qLej6mJEkTqe/ey9cBh/Z5DEmSpoVvpJIkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqZG+h/YbyTZ7HMReK1ePuwxJknrhma4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjqapx1/CwHZ+8Tx3w2neMuwxJ0pDWrDppoXaVhdrRJPNMV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdSZIaMXQlSWqkSegmWZTkmiQXtTieJEmTqNWZ7luAmxsdS5KkidR76CbZE/g94CN9H0uSpEm2dYNjvBf4M2DJxlYmWQGsAFi282LOX7KqQUmaJXutvH7cJUjSUHo9003ycuDuqlqzqW2q6pyqWl5Vy3fdcVGf5UiSNFZ9X14+Cjguye3AZ4Fjkvyfno8pSdJE6jV0q+rtVbVnVe0NnABcUlWv6fOYkiRNKp/TlSSpkRYdqQCoqq8DX291PEmSJo1nupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNWLoSpLUiKErSVIjhq4kSY0YupIkNdJslKFhbLPHQey1cvW4y5AkqRee6UqS1IihK0lSI4auJEmNGLqSJDWSqhp3DQ9Lcj9w67jrWGBPBO4ZdxELbNbaNGvtAds0DWatPfD42nRPVR27kMVMoonqvQzcWlXLx13EQkqy2jZNtllrD9imaTBr7YHZbNNC8/KyJEmNGLqSJDUyaaF7zrgL6IFtmnyz1h6wTdNg1toDs9mmBTVRHakkSZplk3amK0nSzDJ0JUlqZGJCN8mxSW5NsjbJ6eOu57Ek+ViSu5PcMGfZrkm+muS
27vcu3fIkObNr13VJDpvzmZO77W9LcvI42tLV8ZQklya5KcmNSd4yA23aLsmVSa7t2vSObvk+Sa7oav/bJNt0y7ft5td26/ees6+3d8tvTfKSMTVpfS2LklyT5KJuftrbc3uS65N8N8nqbtnUfu+6WpYm+UKSW5LcnOTZ09qmJPt3/2/W//w0yVuntT0ToarG/gMsAv4F2BfYBrgWOHDcdT1GvUcDhwE3zFn2V8Dp3fTpwHu66ZcBXwYCPAu4olu+K/C97vcu3fQuY2rPHsBh3fQS4J+BA6e8TQF26qYXA1d0tX4OOKFbfjbwhm76jcDZ3fQJwN920wd238dtgX267+miMX733gb8DXBRNz/t7bkdeOIGy6b2e9fVcy7wx930NsDSaW9TV9Mi4AfAU2ehPWP77zjuArr/Ic8GLp4z/3bg7eOua56a9+bRoXsrsEc3vQeDF30AfAg4ccPtgBOBD81Z/qjtxty2C4AXzUqbgB2Aq4EjGbwtZ+sNv3fAxcCzu+mtu+2y4Xdx7nZjaMeewNeAY4CLuvqmtj3d8W/nN0N3ar93wM7A9+k6qc5Cm+bU8GLgsllpz7h+JuXy8jLgjjnz67pl0+RJVXVXN/0D4End9KbaNpFt7i5DHsrgzHCq29Rdiv0ucDfwVQZndfdV1QPdJnPre7j2bv1PgN2YrDa9F/gz4KFufjemuz0ABfxDkjVJVnTLpvl7tw/wI+Dj3W2AjyTZkelu03onAJ/ppmehPWMxKaE7U2rwp9zUPYuVZCfgPOCtVfXTueumsU1V9WBVHcLgDPEI4IDxVrT5krwcuLuq1oy7lgX23Ko6DHgp8KYkR89dOYXfu60Z3Ho6q6oOBX7O4PLrw6awTXR9BY4DPr/humlszzhNSujeCTxlzvye3bJp8sMkewB0v+/ulm+qbRPV5iSLGQTup6vq77rFU92m9arqPuBSBpdflyZZ/87xufU9XHu3fmfgXianTUcBxyW5Hfgsg0vM72N62wNAVd3Z/b4bOJ/BH0fT/L1bB6yrqiu6+S8wCOFpbhMM/ii6uqp+2M1Pe3vGZlJC9ypgv64n5jYMLmNcOOaaRnUhsL5H3skM7ouuX35S16vvWcBPussyFwMvTrJL1/Pvxd2y5pIE+Chwc1X99ZxV09ym3ZMs7aa3Z3CP+mYG4Xt8t9mGbVrf1uOBS7q/4C8ETuh6A+8D7Adc2aQRc1TV26tqz6ram8G/j0uq6tVMaXsAkuyYZMn6aQbflxuY4u9dVf0AuCPJ/t2iFwI3McVt6pzII5eWYfrbMz7jvqm8/odBr7d/ZnDf7c/HXc88tX4GuAv4NYO/bF/P4H7Z14DbgH8Edu22DfCBrl3XA8vn7Od1wNru55Qxtue5DC4PXQd8t/t52ZS36WDgmq5NNwAru+X7MgiZtQwulW3bLd+um1/brd93zr7+vGvrrcBLJ+D793we6b08te3par+2+7lx/b/7af7edbUcAqzuvntfZNBbd2rbBOzI4CrJznOWTW17xv3jayAlSWpkUi4vS5I08wxdSZIaMXQlSWrE0JUkqRFDV5KkRgxdaQp1I73sMO46JI3GR4akKdS9mWp5Vd0z7lokDc8zXaknSU7qxhS9Nsmnkuyd5JJu2deS7NVt94kkx8/53M+6389P8vU8Mjbrp7s3/ZwK/Afg0iSXjqd1kjbH1vNvImlUSQ4CzgCeU1X3JNmVwTir51bVuUleB5wJvGqeXR0KHAT8K3AZcFRVnZnkbcALPNOVpotnulI/jgE+vz4Uq+rHDAZc+Jtu/acYvH5zPldW1bqqeojB6zn3XvhSJbVi6Erj9wDdv8UkWwHbzFn3qznTD+LVKWmqGbpSPy4B/jDJbgDd5eVvMxghCODVwDe76duBw7vp44DFQ+z/fmDJQhUrqQ3/apZ6UFU3JvlL4J+SPMhgxKM/BT6e5DTgR8Ap3eYfBi5Ici3wFQYDn8/nHOArSf61ql6w8C2Q1AcfGZIkqREvL0uS1Ii
hK0lSI4auJEmNGLqSJDVi6EqS1IihK0lSI4auJEmN/H/knqrmbsTOwAAAAABJRU5ErkJggg==\n", 1219 | "text/plain": [ 1220 | "
" 1221 | ] 1222 | }, 1223 | "metadata": { 1224 | "needs_background": "light" 1225 | }, 1226 | "output_type": "display_data" 1227 | }, 1228 | { 1229 | "data": { 1230 | "image/png": "\n", 1231 | "text/plain": [ 1232 | "
" 1233 | ] 1234 | }, 1235 | "metadata": { 1236 | "needs_background": "light" 1237 | }, 1238 | "output_type": "display_data" 1239 | } 1240 | ], 1241 | "source": [ 1242 | "X_train_ = X_train.copy()\n", 1243 | "X_train_['target'] = y_train\n", 1244 | "for var in cat_vars_ordinal:\n", 1245 | " sns.catplot(y=var, hue='target', data=X_train_, kind=\"count\", height=4, aspect=1.5)\n", 1246 | " plt.show()" 1247 | ] 1248 | }, 1249 | { 1250 | "cell_type": "markdown", 1251 | "id": "f51aeebc-1104-47ec-a56a-661b8acb625d", 1252 | "metadata": {}, 1253 | "source": [ 1254 | "## Feature Scaling" 1255 | ] 1256 | }, 1257 | { 1258 | "cell_type": "code", 1259 | "execution_count": 24, 1260 | "id": "b29e497f", 1261 | "metadata": {}, 1262 | "outputs": [], 1263 | "source": [ 1264 | "min_max_scaler = MinMaxScaler()\n", 1265 | "min_max_scaler.fit(X_train) \n", 1266 | "\n", 1267 | "X_train = pd.DataFrame(min_max_scaler.transform(X_train), columns=X_train.columns)\n", 1268 | "\n", 1269 | "X_test = pd.DataFrame(min_max_scaler.transform(X_test), columns=X_train.columns)" 1270 | ] 1271 | }, 1272 | { 1273 | "cell_type": "code", 1274 | "execution_count": 25, 1275 | "id": "28fdef63-9f00-484f-a11f-7b7b89c6b861", 1276 | "metadata": {}, 1277 | "outputs": [ 1278 | { 1279 | "data": { 1280 | "text/html": [ 1281 | "
\n", 1282 | "\n", 1295 | "\n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " 
\n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | "
citycity_development_indexrelevent_experienceenrolled_universityeducation_levelmajor_disciplineexperiencecompany_sizecompany_typelast_new_jobtraining_hoursgender_Malegender_Missinggender_Othergender_Female
00.0000000.3512971.01.01.000.6666670.0476190.3751.0000000.20.5512601.00.00.00.0
10.0000000.3512970.01.01.000.6666670.0000000.1251.0000000.20.6751480.01.00.00.0
20.0081970.9221560.00.00.750.6666670.4285710.0001.0000000.20.4772241.00.00.00.0
30.0163930.5469060.00.50.750.6666670.4761900.5001.0000000.60.3591270.01.00.00.0
40.0245900.9421161.00.00.250.0000000.0952380.0000.6158190.00.6112890.01.00.00.0
................................................
153210.2540980.0618761.01.01.000.6666670.4761900.0000.6158191.00.4717371.00.00.00.0
153220.0000000.3512970.00.01.000.6666670.2857140.7501.0000000.20.3681121.00.00.00.0
153230.0245900.9421160.00.01.000.6666670.2857140.7501.0000000.00.5347620.00.01.00.0
153240.0000000.3512971.00.01.000.6666670.0952380.2500.6158190.20.6295030.01.00.00.0
153250.0081970.9221560.00.01.000.6666670.5714290.0000.6158190.20.0957691.00.00.00.0
\n", 1517 | "

15326 rows × 15 columns

\n", 1518 | "
" 1519 | ], 1520 | "text/plain": [ 1521 | " city city_development_index relevent_experience \\\n", 1522 | "0 0.000000 0.351297 1.0 \n", 1523 | "1 0.000000 0.351297 0.0 \n", 1524 | "2 0.008197 0.922156 0.0 \n", 1525 | "3 0.016393 0.546906 0.0 \n", 1526 | "4 0.024590 0.942116 1.0 \n", 1527 | "... ... ... ... \n", 1528 | "15321 0.254098 0.061876 1.0 \n", 1529 | "15322 0.000000 0.351297 0.0 \n", 1530 | "15323 0.024590 0.942116 0.0 \n", 1531 | "15324 0.000000 0.351297 1.0 \n", 1532 | "15325 0.008197 0.922156 0.0 \n", 1533 | "\n", 1534 | " enrolled_university education_level major_discipline experience \\\n", 1535 | "0 1.0 1.00 0.666667 0.047619 \n", 1536 | "1 1.0 1.00 0.666667 0.000000 \n", 1537 | "2 0.0 0.75 0.666667 0.428571 \n", 1538 | "3 0.5 0.75 0.666667 0.476190 \n", 1539 | "4 0.0 0.25 0.000000 0.095238 \n", 1540 | "... ... ... ... ... \n", 1541 | "15321 1.0 1.00 0.666667 0.476190 \n", 1542 | "15322 0.0 1.00 0.666667 0.285714 \n", 1543 | "15323 0.0 1.00 0.666667 0.285714 \n", 1544 | "15324 0.0 1.00 0.666667 0.095238 \n", 1545 | "15325 0.0 1.00 0.666667 0.571429 \n", 1546 | "\n", 1547 | " company_size company_type last_new_job training_hours gender_Male \\\n", 1548 | "0 0.375 1.000000 0.2 0.551260 1.0 \n", 1549 | "1 0.125 1.000000 0.2 0.675148 0.0 \n", 1550 | "2 0.000 1.000000 0.2 0.477224 1.0 \n", 1551 | "3 0.500 1.000000 0.6 0.359127 0.0 \n", 1552 | "4 0.000 0.615819 0.0 0.611289 0.0 \n", 1553 | "... ... ... ... ... ... \n", 1554 | "15321 0.000 0.615819 1.0 0.471737 1.0 \n", 1555 | "15322 0.750 1.000000 0.2 0.368112 1.0 \n", 1556 | "15323 0.750 1.000000 0.0 0.534762 0.0 \n", 1557 | "15324 0.250 0.615819 0.2 0.629503 0.0 \n", 1558 | "15325 0.000 0.615819 0.2 0.095769 1.0 \n", 1559 | "\n", 1560 | " gender_Missing gender_Other gender_Female \n", 1561 | "0 0.0 0.0 0.0 \n", 1562 | "1 1.0 0.0 0.0 \n", 1563 | "2 0.0 0.0 0.0 \n", 1564 | "3 1.0 0.0 0.0 \n", 1565 | "4 1.0 0.0 0.0 \n", 1566 | "... ... ... ... 
\n", 1567 | "15321 0.0 0.0 0.0 \n", 1568 | "15322 0.0 0.0 0.0 \n", 1569 | "15323 0.0 1.0 0.0 \n", 1570 | "15324 1.0 0.0 0.0 \n", 1571 | "15325 0.0 0.0 0.0 \n", 1572 | "\n", 1573 | "[15326 rows x 15 columns]" 1574 | ] 1575 | }, 1576 | "execution_count": 25, 1577 | "metadata": {}, 1578 | "output_type": "execute_result" 1579 | } 1580 | ], 1581 | "source": [ 1582 | "X_train" 1583 | ] 1584 | }, 1585 | { 1586 | "cell_type": "code", 1587 | "execution_count": null, 1588 | "id": "3d88ee15-f39c-428a-a919-a5b51262bd37", 1589 | "metadata": {}, 1590 | "outputs": [], 1591 | "source": [] 1592 | } 1593 | ], 1594 | "metadata": { 1595 | "kernelspec": { 1596 | "display_name": "Python 3 (ipykernel)", 1597 | "language": "python", 1598 | "name": "python3" 1599 | }, 1600 | "language_info": { 1601 | "codemirror_mode": { 1602 | "name": "ipython", 1603 | "version": 3 1604 | }, 1605 | "file_extension": ".py", 1606 | "mimetype": "text/x-python", 1607 | "name": "python", 1608 | "nbconvert_exporter": "python", 1609 | "pygments_lexer": "ipython3", 1610 | "version": "3.9.10" 1611 | } 1612 | }, 1613 | "nbformat": 4, 1614 | "nbformat_minor": 5 1615 | } 1616 | -------------------------------------------------------------------------------- /notebooks/3. 
Feature Engineering Pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "id": "7c6d8ff9-e3b1-4abe-8ac9-6271a5598527", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "import scipy.stats as stats\n", 13 | "\n", 14 | "from sklearn.model_selection import train_test_split\n", 15 | "from sklearn.preprocessing import MinMaxScaler\n", 16 | "from sklearn.pipeline import Pipeline\n", 17 | "\n", 18 | "from feature_engine.imputation import (\n", 19 | " CategoricalImputer,\n", 20 | ")\n", 21 | "\n", 22 | "from feature_engine.transformation import (\n", 23 | " YeoJohnsonTransformer,\n", 24 | ")\n", 25 | "\n", 26 | "from feature_engine.encoding import (\n", 27 | " RareLabelEncoder,\n", 28 | " OrdinalEncoder,\n", 29 | " OneHotEncoder,\n", 30 | " CountFrequencyEncoder\n", 31 | ")\n", 32 | "\n", 33 | "import joblib\n", 34 | "\n", 35 | "import preprocess as pp" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "id": "ad1fb6dc-c5cf-4306-8adf-71c51ac67e0a", 41 | "metadata": {}, 42 | "source": [ 43 | "## Read Data" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 21, 49 | "id": "08560625-7ac7-48fa-8d03-b9b3f8b17f59", 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "(19158, 14)\n" 57 | ] 58 | }, 59 | { 60 | "data": { 61 | "text/html": [ 62 | "
\n", 63 | "\n", 76 | "\n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | "
enrollee_idcitycity_development_indexgenderrelevent_experienceenrolled_universityeducation_levelmajor_disciplineexperiencecompany_sizecompany_typelast_new_jobtraining_hourstarget
08949city_1030.920MaleHas relevent experienceno_enrollmentGraduateSTEM>20NaNNaN1361.0
129725city_400.776MaleNo relevent experienceno_enrollmentGraduateSTEM1550-99Pvt Ltd>4470.0
211561city_210.624NaNNo relevent experienceFull time courseGraduateSTEM5NaNNaNnever830.0
333241city_1150.789NaNNo relevent experienceNaNGraduateBusiness Degree<1NaNPvt Ltdnever521.0
4666city_1620.767MaleHas relevent experienceno_enrollmentMastersSTEM>2050-99Funded Startup480.0
\n", 184 | "
" 185 | ], 186 | "text/plain": [ 187 | " enrollee_id city city_development_index gender \\\n", 188 | "0 8949 city_103 0.920 Male \n", 189 | "1 29725 city_40 0.776 Male \n", 190 | "2 11561 city_21 0.624 NaN \n", 191 | "3 33241 city_115 0.789 NaN \n", 192 | "4 666 city_162 0.767 Male \n", 193 | "\n", 194 | " relevent_experience enrolled_university education_level \\\n", 195 | "0 Has relevent experience no_enrollment Graduate \n", 196 | "1 No relevent experience no_enrollment Graduate \n", 197 | "2 No relevent experience Full time course Graduate \n", 198 | "3 No relevent experience NaN Graduate \n", 199 | "4 Has relevent experience no_enrollment Masters \n", 200 | "\n", 201 | " major_discipline experience company_size company_type last_new_job \\\n", 202 | "0 STEM >20 NaN NaN 1 \n", 203 | "1 STEM 15 50-99 Pvt Ltd >4 \n", 204 | "2 STEM 5 NaN NaN never \n", 205 | "3 Business Degree <1 NaN Pvt Ltd never \n", 206 | "4 STEM >20 50-99 Funded Startup 4 \n", 207 | "\n", 208 | " training_hours target \n", 209 | "0 36 1.0 \n", 210 | "1 47 0.0 \n", 211 | "2 83 0.0 \n", 212 | "3 52 1.0 \n", 213 | "4 8 0.0 " 214 | ] 215 | }, 216 | "execution_count": 21, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "data = pd.read_csv('../src/data/train.csv')\n", 223 | "print(data.shape)\n", 224 | "data.head()" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "id": "2e71c3f7-ea54-44b9-ad7f-6d7a5e9bbda6", 230 | "metadata": {}, 231 | "source": [ 232 | "## Train-Test split" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 22, 238 | "id": "2c5dce77-a9a3-467d-a688-a26bceecf6ac", 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "X_train, X_test, y_train, y_test = train_test_split(\n", 243 | " data.drop(['enrollee_id', 'target'], axis=1),\n", 244 | " data['target'],\n", 245 | " test_size=0.2,\n", 246 | " random_state=0,\n", 247 | ")" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 
252 | "id": "95436658-505f-49b7-baad-4d6c143bdf4c", 253 | "metadata": {}, 254 | "source": [ 255 | "## Config" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 11, 261 | "id": "c7089a44-fe95-446e-90b8-0618cea9352e", 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "CAT_VARS_REPLACE_NA_WITH_STRING_MISSING = ['gender', 'major_discipline', 'company_size', 'company_type']\n", 266 | "\n", 267 | "CAT_VARS_REPLACE_NA_WITH_FREQUENT = ['enrolled_university', 'education_level', 'experience', 'last_new_job']\n", 268 | "\n", 269 | "NUM_VARS = ['city_development_index', 'training_hours']\n", 270 | "\n", 271 | "NUM_VARS_YEO_JOHNSON = ['training_hours']\n", 272 | "\n", 273 | "CAT_VARS_ORDINAL = ['relevent_experience', 'enrolled_university', 'education_level', 'major_discipline']\n", 274 | "CAT_VARS_ORDINAL_ARBITRARY = ['city']\n", 275 | "CAT_VARS_ONEHOT = ['gender']\n", 276 | "CAT_VARS_COUNT_FREQUENCY = ['company_type']\n", 277 | "\n", 278 | "EXPERIENCE_VAR = ['experience']\n", 279 | "\n", 280 | "EXPERIENCE_MAP = {\n", 281 | " '<1': 0,\n", 282 | " '1': 1, \n", 283 | " '2': 2, \n", 284 | " '3': 3, \n", 285 | " '4': 4, \n", 286 | " '5': 5,\n", 287 | " '6': 6,\n", 288 | " '7': 7,\n", 289 | " '8': 8, \n", 290 | " '9': 9, \n", 291 | " '10': 10, \n", 292 | " '11': 11,\n", 293 | " '12': 12,\n", 294 | " '13': 13, \n", 295 | " '14': 14, \n", 296 | " '15': 15, \n", 297 | " '16': 16,\n", 298 | " '17': 17,\n", 299 | " '18': 18,\n", 300 | " '19': 19, \n", 301 | " '20': 20, \n", 302 | " '>20': 21\n", 303 | "} \n", 304 | "LAST_NEW_JOB_VAR = ['last_new_job']\n", 305 | "\n", 306 | "LAST_NEW_JOB_MAP = {\n", 307 | " 'never': 0,\n", 308 | " '1': 1, \n", 309 | " '2': 2, \n", 310 | " '3': 3, \n", 311 | " '4': 4, \n", 312 | " '>4': 5\n", 313 | "}\n", 314 | "\n", 315 | "COMPANY_SIZE_VAR = ['company_size']\n", 316 | "\n", 317 | "COMPANY_SIZE_MAP = {\n", 318 | " 'Missing': 0,\n", 319 | " '<10': 1,\n", 320 | " '10/49': 2, \n", 321 | " '100-500': 3, \n", 322 | " 
'1000-4999': 4, \n", 323 | " '10000+': 5, \n", 324 | " '50-99': 6, \n", 325 | " '500-999': 7, \n", 326 | " '5000-9999': 8\n", 327 | "}" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "id": "30198f77-bab4-498d-aba2-344cf450b1ef", 333 | "metadata": {}, 334 | "source": [ 335 | "## Feature Engineering Pipeline" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 23, 341 | "id": "8141f99c-d63c-4eeb-a5a3-33d78cd0428e", 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "fe_pipe = Pipeline([\n", 346 | " ('cat_imputer_missing', CategoricalImputer(imputation_method='missing', variables=CAT_VARS_REPLACE_NA_WITH_STRING_MISSING)),\n", 347 | " ('cat_imputer_frequent', CategoricalImputer(imputation_method='frequent', variables=CAT_VARS_REPLACE_NA_WITH_FREQUENT)),\n", 348 | " ('num_transformer_yeo_johnson', YeoJohnsonTransformer(variables=NUM_VARS_YEO_JOHNSON)),\n", 349 | " ('ordinal_encoder', OrdinalEncoder(encoding_method='ordered', variables=CAT_VARS_ORDINAL)),\n", 350 | " ('ordinal_encoder_arbitrary', OrdinalEncoder(encoding_method='arbitrary', variables=CAT_VARS_ORDINAL_ARBITRARY)),\n", 351 | " ('count_frequency_encoder', CountFrequencyEncoder(encoding_method='frequency', variables=CAT_VARS_COUNT_FREQUENCY)),\n", 352 | " ('onehot_encoder', OneHotEncoder(variables=CAT_VARS_ONEHOT)),\n", 353 | " ('experience_map', pp.Mapper(variables=EXPERIENCE_VAR, mappings=EXPERIENCE_MAP)),\n", 354 | " ('last_new_job_map', pp.Mapper(variables=LAST_NEW_JOB_VAR, mappings=LAST_NEW_JOB_MAP)),\n", 355 | " ('company_size_map', pp.Mapper(variables=COMPANY_SIZE_VAR, mappings=COMPANY_SIZE_MAP)),\n", 356 | " # ('min_max_scaler', MinMaxScaler())\n", 357 | "])" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 24, 363 | "id": "9b4aee44-401a-48b4-8292-d6ff4096fc6f", 364 | "metadata": {}, 365 | "outputs": [ 366 | { 367 | "data": { 368 | "text/html": [ 369 | "
\n", 370 | "\n", 383 | "\n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | "
citycity_development_indexgenderrelevent_experienceenrolled_universityeducation_levelmajor_disciplineexperiencecompany_sizecompany_typelast_new_jobtraining_hours
19147city_210.624MaleNo relevent experienceFull time courseGraduateSTEM1100-500Pvt Ltd152
8464city_210.624NaNHas relevent experienceFull time courseGraduateSTEM<1<10Pvt LtdNaN92
8869city_160.910MaleHas relevent experienceno_enrollmentMastersSTEM9NaNPvt Ltd136
11645city_1180.722NaNHas relevent experiencePart time courseMastersSTEM101000-4999Pvt Ltd319
7743city_1030.920NaNNo relevent experienceno_enrollmentPrimary SchoolNaN2NaNNaNnever69
\n", 479 | "
" 480 | ], 481 | "text/plain": [ 482 | " city city_development_index gender relevent_experience \\\n", 483 | "19147 city_21 0.624 Male No relevent experience \n", 484 | "8464 city_21 0.624 NaN Has relevent experience \n", 485 | "8869 city_16 0.910 Male Has relevent experience \n", 486 | "11645 city_118 0.722 NaN Has relevent experience \n", 487 | "7743 city_103 0.920 NaN No relevent experience \n", 488 | "\n", 489 | " enrolled_university education_level major_discipline experience \\\n", 490 | "19147 Full time course Graduate STEM 1 \n", 491 | "8464 Full time course Graduate STEM <1 \n", 492 | "8869 no_enrollment Masters STEM 9 \n", 493 | "11645 Part time course Masters STEM 10 \n", 494 | "7743 no_enrollment Primary School NaN 2 \n", 495 | "\n", 496 | " company_size company_type last_new_job training_hours \n", 497 | "19147 100-500 Pvt Ltd 1 52 \n", 498 | "8464 <10 Pvt Ltd NaN 92 \n", 499 | "8869 NaN Pvt Ltd 1 36 \n", 500 | "11645 1000-4999 Pvt Ltd 3 19 \n", 501 | "7743 NaN NaN never 69 " 502 | ] 503 | }, 504 | "execution_count": 24, 505 | "metadata": {}, 506 | "output_type": "execute_result" 507 | } 508 | ], 509 | "source": [ 510 | "X_train.head()" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 25, 516 | "id": "30c79ad3-af73-434e-9425-5c8d8142f2bb", 517 | "metadata": {}, 518 | "outputs": [ 519 | { 520 | "data": { 521 | "text/plain": [ 522 | "Pipeline(steps=[('cat_imputer_missing',\n", 523 | " CategoricalImputer(variables=['gender', 'major_discipline',\n", 524 | " 'company_size',\n", 525 | " 'company_type'])),\n", 526 | " ('cat_imputer_frequent',\n", 527 | " CategoricalImputer(imputation_method='frequent',\n", 528 | " variables=['enrolled_university',\n", 529 | " 'education_level', 'experience',\n", 530 | " 'last_new_job'])),\n", 531 | " ('num_transformer_yeo_johnson',\n", 532 | " YeoJohnsonTransformer(variables=['t...\n", 533 | " '20': 20, '3': 3, '4': 4, '5': 5, '6': 6,\n", 534 | " '7': 7, '8': 8, '9': 9, '<1': 0, '>20': 21},\n", 535 
| " variables=['experience'])),\n", 536 | " ('last_new_job_map',\n", 537 | " Mapper(mappings={'1': 1, '2': 2, '3': 3, '4': 4, '>4': 5,\n", 538 | " 'never': 0},\n", 539 | " variables=['last_new_job'])),\n", 540 | " ('company_size_map',\n", 541 | " Mapper(mappings={'10/49': 2, '100-500': 3, '1000-4999': 4,\n", 542 | " '10000+': 5, '50-99': 6, '500-999': 7,\n", 543 | " '5000-9999': 8, '<10': 1, 'Missing': 0},\n", 544 | " variables=['company_size']))])" 545 | ] 546 | }, 547 | "execution_count": 25, 548 | "metadata": {}, 549 | "output_type": "execute_result" 550 | } 551 | ], 552 | "source": [ 553 | "fe_pipe.fit(X_train, y_train)" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 26, 559 | "id": "5e070fd7-5a92-4cb0-8e3c-03632c84875f", 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [ 563 | "X_train = fe_pipe.transform(X_train)\n", 564 | "X_test = fe_pipe.transform(X_test)" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": 27, 570 | "id": "5313bf6c-7c5a-4ae9-b9b9-74a1bda12326", 571 | "metadata": {}, 572 | "outputs": [ 573 | { 574 | "data": { 575 | "text/html": [ 576 | "
\n", 577 | "\n", 590 | "\n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " 
\n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | "
citycity_development_indexrelevent_experienceenrolled_universityeducation_levelmajor_disciplineexperiencecompany_sizecompany_typelast_new_jobtraining_hoursgender_Malegender_Missinggender_Othergender_Female
1914700.6241244130.51474615.3719211000
846400.6240244010.51474616.4152910100
886910.9100034900.51474614.7483991000
1164520.72201341040.51474633.7537940100
774330.9201010200.31952205.8774770100
................................................
9225310.47912441000.31952254.7021841000
1312300.6240044660.51474613.8294701000
984530.9200044660.51474605.2329790010
1079900.6241044220.31952216.0308790100
273210.91000441200.31952211.5358191000
\n", 812 | "

15326 rows × 15 columns

\n", 813 | "
" 814 | ], 815 | "text/plain": [ 816 | " city city_development_index relevent_experience enrolled_university \\\n", 817 | "19147 0 0.624 1 2 \n", 818 | "8464 0 0.624 0 2 \n", 819 | "8869 1 0.910 0 0 \n", 820 | "11645 2 0.722 0 1 \n", 821 | "7743 3 0.920 1 0 \n", 822 | "... ... ... ... ... \n", 823 | "9225 31 0.479 1 2 \n", 824 | "13123 0 0.624 0 0 \n", 825 | "9845 3 0.920 0 0 \n", 826 | "10799 0 0.624 1 0 \n", 827 | "2732 1 0.910 0 0 \n", 828 | "\n", 829 | " education_level major_discipline experience company_size \\\n", 830 | "19147 4 4 1 3 \n", 831 | "8464 4 4 0 1 \n", 832 | "8869 3 4 9 0 \n", 833 | "11645 3 4 10 4 \n", 834 | "7743 1 0 2 0 \n", 835 | "... ... ... ... ... \n", 836 | "9225 4 4 10 0 \n", 837 | "13123 4 4 6 6 \n", 838 | "9845 4 4 6 6 \n", 839 | "10799 4 4 2 2 \n", 840 | "2732 4 4 12 0 \n", 841 | "\n", 842 | " company_type last_new_job training_hours gender_Male \\\n", 843 | "19147 0.514746 1 5.371921 1 \n", 844 | "8464 0.514746 1 6.415291 0 \n", 845 | "8869 0.514746 1 4.748399 1 \n", 846 | "11645 0.514746 3 3.753794 0 \n", 847 | "7743 0.319522 0 5.877477 0 \n", 848 | "... ... ... ... ... \n", 849 | "9225 0.319522 5 4.702184 1 \n", 850 | "13123 0.514746 1 3.829470 1 \n", 851 | "9845 0.514746 0 5.232979 0 \n", 852 | "10799 0.319522 1 6.030879 0 \n", 853 | "2732 0.319522 1 1.535819 1 \n", 854 | "\n", 855 | " gender_Missing gender_Other gender_Female \n", 856 | "19147 0 0 0 \n", 857 | "8464 1 0 0 \n", 858 | "8869 0 0 0 \n", 859 | "11645 1 0 0 \n", 860 | "7743 1 0 0 \n", 861 | "... ... ... ... 
\n", 862 | "9225 0 0 0 \n", 863 | "13123 0 0 0 \n", 864 | "9845 0 1 0 \n", 865 | "10799 1 0 0 \n", 866 | "2732 0 0 0 \n", 867 | "\n", 868 | "[15326 rows x 15 columns]" 869 | ] 870 | }, 871 | "execution_count": 27, 872 | "metadata": {}, 873 | "output_type": "execute_result" 874 | } 875 | ], 876 | "source": [ 877 | "X_train" 878 | ] 879 | }, 880 | { 881 | "cell_type": "code", 882 | "execution_count": null, 883 | "id": "cc449c65-ac72-4c2c-96a6-3b184315601e", 884 | "metadata": {}, 885 | "outputs": [], 886 | "source": [] 887 | } 888 | ], 889 | "metadata": { 890 | "kernelspec": { 891 | "display_name": "Python 3 (ipykernel)", 892 | "language": "python", 893 | "name": "python3" 894 | }, 895 | "language_info": { 896 | "codemirror_mode": { 897 | "name": "ipython", 898 | "version": 3 899 | }, 900 | "file_extension": ".py", 901 | "mimetype": "text/x-python", 902 | "name": "python", 903 | "nbconvert_exporter": "python", 904 | "pygments_lexer": "ipython3", 905 | "version": "3.9.7" 906 | } 907 | }, 908 | "nbformat": 4, 909 | "nbformat_minor": 5 910 | } 911 | -------------------------------------------------------------------------------- /notebooks/4. 
Machine Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4419c01c", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "import scipy.stats as stats\n", 13 | "\n", 14 | "from sklearn.model_selection import train_test_split\n", 15 | "from sklearn.preprocessing import MinMaxScaler\n", 16 | "from sklearn.pipeline import Pipeline\n", 17 | "from sklearn.linear_model import LogisticRegression\n", 18 | "from sklearn.metrics import accuracy_score\n", 19 | "\n", 20 | "from feature_engine.imputation import (\n", 21 | " CategoricalImputer,\n", 22 | ")\n", 23 | "\n", 24 | "from feature_engine.transformation import (\n", 25 | " YeoJohnsonTransformer,\n", 26 | ")\n", 27 | "\n", 28 | "from feature_engine.encoding import (\n", 29 | " RareLabelEncoder,\n", 30 | " OrdinalEncoder,\n", 31 | " OneHotEncoder,\n", 32 | " CountFrequencyEncoder\n", 33 | ")\n", 34 | "\n", 35 | "import joblib\n", 36 | "\n", 37 | "import preprocess as pp" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "a9613b01", 43 | "metadata": {}, 44 | "source": [ 45 | "## Read Data" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "id": "9c4bb14b", 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "(19158, 14)\n" 59 | ] 60 | }, 61 | { 62 | "data": { 63 | "text/html": [ 64 | "
\n", 65 | "\n", 78 | "\n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | "
enrollee_idcitycity_development_indexgenderrelevent_experienceenrolled_universityeducation_levelmajor_disciplineexperiencecompany_sizecompany_typelast_new_jobtraining_hourstarget
08949city_1030.920MaleHas relevent experienceno_enrollmentGraduateSTEM>20NaNNaN1361.0
129725city_400.776MaleNo relevent experienceno_enrollmentGraduateSTEM1550-99Pvt Ltd>4470.0
211561city_210.624NaNNo relevent experienceFull time courseGraduateSTEM5NaNNaNnever830.0
333241city_1150.789NaNNo relevent experienceNaNGraduateBusiness Degree<1NaNPvt Ltdnever521.0
4666city_1620.767MaleHas relevent experienceno_enrollmentMastersSTEM>2050-99Funded Startup480.0
\n", 186 | "
" 187 | ], 188 | "text/plain": [ 189 | " enrollee_id city city_development_index gender \\\n", 190 | "0 8949 city_103 0.920 Male \n", 191 | "1 29725 city_40 0.776 Male \n", 192 | "2 11561 city_21 0.624 NaN \n", 193 | "3 33241 city_115 0.789 NaN \n", 194 | "4 666 city_162 0.767 Male \n", 195 | "\n", 196 | " relevent_experience enrolled_university education_level \\\n", 197 | "0 Has relevent experience no_enrollment Graduate \n", 198 | "1 No relevent experience no_enrollment Graduate \n", 199 | "2 No relevent experience Full time course Graduate \n", 200 | "3 No relevent experience NaN Graduate \n", 201 | "4 Has relevent experience no_enrollment Masters \n", 202 | "\n", 203 | " major_discipline experience company_size company_type last_new_job \\\n", 204 | "0 STEM >20 NaN NaN 1 \n", 205 | "1 STEM 15 50-99 Pvt Ltd >4 \n", 206 | "2 STEM 5 NaN NaN never \n", 207 | "3 Business Degree <1 NaN Pvt Ltd never \n", 208 | "4 STEM >20 50-99 Funded Startup 4 \n", 209 | "\n", 210 | " training_hours target \n", 211 | "0 36 1.0 \n", 212 | "1 47 0.0 \n", 213 | "2 83 0.0 \n", 214 | "3 52 1.0 \n", 215 | "4 8 0.0 " 216 | ] 217 | }, 218 | "execution_count": 2, 219 | "metadata": {}, 220 | "output_type": "execute_result" 221 | } 222 | ], 223 | "source": [ 224 | "data = pd.read_csv('../src/data/train.csv')\n", 225 | "print(data.shape)\n", 226 | "data.head()" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "id": "f50f7905", 232 | "metadata": {}, 233 | "source": [ 234 | "## Train-Test Split" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 3, 240 | "id": "cfb4ac7f", 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "X_train, X_test, y_train, y_test = train_test_split(\n", 245 | " data.drop(['enrollee_id', 'target'], axis=1),\n", 246 | " data['target'],\n", 247 | " test_size=0.2,\n", 248 | " random_state=0,\n", 249 | ")" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 4, 255 | "id": "2bf8009e", 256 | 
"metadata": {}, 257 | "outputs": [ 258 | { 259 | "data": { 260 | "text/html": [ 261 | "
\n", 262 | "\n", 275 | "\n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | "
enrollee_idcity_development_indextraining_hourstarget
count19158.00000019158.00000019158.00000019158.000000
mean16875.3581790.82884865.3668960.249348
std9616.2925920.12336260.0584620.432647
min1.0000000.4480001.0000000.000000
25%8554.2500000.74000023.0000000.000000
50%16982.5000000.90300047.0000000.000000
75%25169.7500000.92000088.0000000.000000
max33380.0000000.949000336.0000001.000000
\n", 344 | "
" 345 | ], 346 | "text/plain": [ 347 | " enrollee_id city_development_index training_hours target\n", 348 | "count 19158.000000 19158.000000 19158.000000 19158.000000\n", 349 | "mean 16875.358179 0.828848 65.366896 0.249348\n", 350 | "std 9616.292592 0.123362 60.058462 0.432647\n", 351 | "min 1.000000 0.448000 1.000000 0.000000\n", 352 | "25% 8554.250000 0.740000 23.000000 0.000000\n", 353 | "50% 16982.500000 0.903000 47.000000 0.000000\n", 354 | "75% 25169.750000 0.920000 88.000000 0.000000\n", 355 | "max 33380.000000 0.949000 336.000000 1.000000" 356 | ] 357 | }, 358 | "execution_count": 4, 359 | "metadata": {}, 360 | "output_type": "execute_result" 361 | } 362 | ], 363 | "source": [ 364 | "data.describe()" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "id": "200a7584", 370 | "metadata": {}, 371 | "source": [ 372 | "## Config" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 6, 378 | "id": "563d7aac", 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "CAT_VARS_REPLACE_NA_WITH_STRING_MISSING = ['gender', 'major_discipline', 'company_size', 'company_type']\n", 383 | "\n", 384 | "CAT_VARS_REPLACE_NA_WITH_FREQUENT = ['enrolled_university', 'education_level', 'experience', 'last_new_job']\n", 385 | "\n", 386 | "NUM_VARS = ['city_development_index', 'training_hours']\n", 387 | "\n", 388 | "NUM_VARS_YEO_JOHNSON = ['training_hours']\n", 389 | "\n", 390 | "CAT_VARS_ORDINAL = ['relevent_experience', 'enrolled_university', 'education_level', 'major_discipline']\n", 391 | "CAT_VARS_ORDINAL_ARBITRARY = ['city']\n", 392 | "CAT_VARS_ONEHOT = ['gender']\n", 393 | "CAT_VARS_COUNT_FREQUENCY = ['company_type']\n", 394 | "\n", 395 | "EXPERIENCE_VAR = ['experience']\n", 396 | "\n", 397 | "EXPERIENCE_MAP = {\n", 398 | " '<1': 0,\n", 399 | " '1': 1, \n", 400 | " '2': 2, \n", 401 | " '3': 3, \n", 402 | " '4': 4, \n", 403 | " '5': 5,\n", 404 | " '6': 6,\n", 405 | " '7': 7,\n", 406 | " '8': 8, \n", 407 | " '9': 9, \n", 408 | " 
'10': 10, \n", 409 | " '11': 11,\n", 410 | " '12': 12,\n", 411 | " '13': 13, \n", 412 | " '14': 14, \n", 413 | " '15': 15, \n", 414 | " '16': 16,\n", 415 | " '17': 17,\n", 416 | " '18': 18,\n", 417 | " '19': 19, \n", 418 | " '20': 20, \n", 419 | " '>20': 21\n", 420 | "} \n", 421 | "LAST_NEW_JOB_VAR = ['last_new_job']\n", 422 | "\n", 423 | "LAST_NEW_JOB_MAP = {\n", 424 | " 'never': 0,\n", 425 | " '1': 1, \n", 426 | " '2': 2, \n", 427 | " '3': 3, \n", 428 | " '4': 4, \n", 429 | " '>4': 5\n", 430 | "}\n", 431 | "\n", 432 | "COMPANY_SIZE_VAR = ['company_size']\n", 433 | "\n", 434 | "COMPANY_SIZE_MAP = {\n", 435 | " 'Missing': 0,\n", 436 | " '<10': 1,\n", 437 | " '10/49': 2, \n", 438 | " '50-99': 3, \n", 439 | " '100-500': 4, \n", 440 | " '500-999': 5, \n", 441 | " '1000-4999': 6, \n", 442 | " '5000-9999': 7,\n", 443 | " '10000+': 8, \n", 444 | "}" 445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "id": "e16607b7", 450 | "metadata": {}, 451 | "source": [ 452 | "## Pipeline" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 7, 458 | "id": "1b60d336", 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [ 462 | "pipe = Pipeline([\n", 463 | " ('cat_imputer_missing', CategoricalImputer(imputation_method='missing', variables=CAT_VARS_REPLACE_NA_WITH_STRING_MISSING)),\n", 464 | " ('cat_imputer_frequent', CategoricalImputer(imputation_method='frequent', variables=CAT_VARS_REPLACE_NA_WITH_FREQUENT)),\n", 465 | " ('num_transformer_yeo_johnson', YeoJohnsonTransformer(variables=NUM_VARS_YEO_JOHNSON)),\n", 466 | " ('ordinal_encoder', OrdinalEncoder(encoding_method='ordered', variables=CAT_VARS_ORDINAL)),\n", 467 | " ('ordinal_encoder_arbitrary', OrdinalEncoder(encoding_method='arbitrary', variables=CAT_VARS_ORDINAL_ARBITRARY)),\n", 468 | " ('count_frequency_encoder', CountFrequencyEncoder(encoding_method='frequency', variables=CAT_VARS_COUNT_FREQUENCY)),\n", 469 | " ('onehot_encoder', OneHotEncoder(variables=CAT_VARS_ONEHOT)),\n", 470 | 
" ('experience_map', pp.Mapper(variables=EXPERIENCE_VAR, mappings=EXPERIENCE_MAP)),\n", 471 | " ('last_new_job_map', pp.Mapper(variables=LAST_NEW_JOB_VAR, mappings=LAST_NEW_JOB_MAP)),\n", 472 | " ('company_size_map', pp.Mapper(variables=COMPANY_SIZE_VAR, mappings=COMPANY_SIZE_MAP)),\n", 473 | " ('min_max_scaler', MinMaxScaler()),\n", 474 | " \n", 475 | " ('logistic_regression', LogisticRegression(random_state=0))\n", 476 | "])" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 8, 482 | "id": "dda370e1", 483 | "metadata": {}, 484 | "outputs": [ 485 | { 486 | "data": { 487 | "text/plain": [ 488 | "Pipeline(steps=[('cat_imputer_missing',\n", 489 | " CategoricalImputer(variables=['gender', 'major_discipline',\n", 490 | " 'company_size',\n", 491 | " 'company_type'])),\n", 492 | " ('cat_imputer_frequent',\n", 493 | " CategoricalImputer(imputation_method='frequent',\n", 494 | " variables=['enrolled_university',\n", 495 | " 'education_level', 'experience',\n", 496 | " 'last_new_job'])),\n", 497 | " ('num_transformer_yeo_johnson',\n", 498 | " YeoJohnsonTransformer(variables=['t...\n", 499 | " Mapper(mappings={'1': 1, '2': 2, '3': 3, '4': 4, '>4': 5,\n", 500 | " 'never': 0},\n", 501 | " variables=['last_new_job'])),\n", 502 | " ('company_size_map',\n", 503 | " Mapper(mappings={'10/49': 2, '100-500': 4, '1000-4999': 6,\n", 504 | " '10000+': 8, '50-99': 3, '500-999': 5,\n", 505 | " '5000-9999': 7, '<10': 1, 'Missing': 0},\n", 506 | " variables=['company_size'])),\n", 507 | " ('min_max_scaler', MinMaxScaler()),\n", 508 | " ('logistic_regression', LogisticRegression(random_state=0))])" 509 | ] 510 | }, 511 | "execution_count": 8, 512 | "metadata": {}, 513 | "output_type": "execute_result" 514 | } 515 | ], 516 | "source": [ 517 | "pipe.fit(X_train, y_train)" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 9, 523 | "id": "2624855f", 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [ 527 | "preds = 
pipe.predict(X_test)" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": 10, 533 | "id": "d24ccde7", 534 | "metadata": {}, 535 | "outputs": [ 536 | { 537 | "data": { 538 | "text/plain": [ 539 | "0.774008350730689" 540 | ] 541 | }, 542 | "execution_count": 10, 543 | "metadata": {}, 544 | "output_type": "execute_result" 545 | } 546 | ], 547 | "source": [ 548 | "accuracy_score(y_test, preds)" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "id": "25bd4047", 554 | "metadata": {}, 555 | "source": [ 556 | "## Save the pipe" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 11, 562 | "id": "943b6d87", 563 | "metadata": {}, 564 | "outputs": [ 565 | { 566 | "data": { 567 | "text/plain": [ 568 | "['pipe.joblib']" 569 | ] 570 | }, 571 | "execution_count": 11, 572 | "metadata": {}, 573 | "output_type": "execute_result" 574 | } 575 | ], 576 | "source": [ 577 | "joblib.dump(pipe, 'pipe.joblib') " 578 | ] 579 | }, 580 | { 581 | "cell_type": "markdown", 582 | "id": "d92a3839", 583 | "metadata": {}, 584 | "source": [ 585 | "## Score new data" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 12, 591 | "id": "b3988421", 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [ 595 | "new_data = pd.read_csv('../src/data/test.csv')" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": 13, 601 | "id": "6f5ba36f", 602 | "metadata": { 603 | "tags": [] 604 | }, 605 | "outputs": [ 606 | { 607 | "data": { 608 | "text/html": [ 609 | "
\n", 610 | "\n", 623 | "\n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " 
\n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | "
citycity_development_indexgenderrelevent_experienceenrolled_universityeducation_levelmajor_disciplineexperiencecompany_sizecompany_typelast_new_jobtraining_hours
0city_410.827MaleHas relevent experienceFull time courseGraduateSTEM9<10NaN121
1city_1030.920FemaleHas relevent experienceno_enrollmentGraduateSTEM5NaNPvt Ltd198
2city_210.624MaleNo relevent experienceno_enrollmentHigh SchoolNaN<1NaNPvt Ltdnever15
3city_130.827MaleHas relevent experienceno_enrollmentMastersSTEM1110/49Pvt Ltd139
4city_1030.920MaleHas relevent experienceno_enrollmentGraduateSTEM>2010000+Pvt Ltd>472
.......................................
2124city_1030.920MaleNo relevent experienceno_enrollmentGraduateHumanities16NaNPublic Sector415
2125city_1360.897MaleHas relevent experienceno_enrollmentMastersSTEM18NaNNaN230
2126city_1000.887MaleNo relevent experienceno_enrollmentPrimary SchoolNaN3NaNPvt Ltdnever18
2127city_1020.804MaleHas relevent experienceFull time courseHigh SchoolNaN7100-500Public Sector184
2128city_1020.804MaleHas relevent experienceno_enrollmentMastersSTEM1510000+Pvt Ltd211
\n", 809 | "

2129 rows × 12 columns

\n", 810 | "
" 811 | ], 812 | "text/plain": [ 813 | " city city_development_index gender relevent_experience \\\n", 814 | "0 city_41 0.827 Male Has relevent experience \n", 815 | "1 city_103 0.920 Female Has relevent experience \n", 816 | "2 city_21 0.624 Male No relevent experience \n", 817 | "3 city_13 0.827 Male Has relevent experience \n", 818 | "4 city_103 0.920 Male Has relevent experience \n", 819 | "... ... ... ... ... \n", 820 | "2124 city_103 0.920 Male No relevent experience \n", 821 | "2125 city_136 0.897 Male Has relevent experience \n", 822 | "2126 city_100 0.887 Male No relevent experience \n", 823 | "2127 city_102 0.804 Male Has relevent experience \n", 824 | "2128 city_102 0.804 Male Has relevent experience \n", 825 | "\n", 826 | " enrolled_university education_level major_discipline experience \\\n", 827 | "0 Full time course Graduate STEM 9 \n", 828 | "1 no_enrollment Graduate STEM 5 \n", 829 | "2 no_enrollment High School NaN <1 \n", 830 | "3 no_enrollment Masters STEM 11 \n", 831 | "4 no_enrollment Graduate STEM >20 \n", 832 | "... ... ... ... ... \n", 833 | "2124 no_enrollment Graduate Humanities 16 \n", 834 | "2125 no_enrollment Masters STEM 18 \n", 835 | "2126 no_enrollment Primary School NaN 3 \n", 836 | "2127 Full time course High School NaN 7 \n", 837 | "2128 no_enrollment Masters STEM 15 \n", 838 | "\n", 839 | " company_size company_type last_new_job training_hours \n", 840 | "0 <10 NaN 1 21 \n", 841 | "1 NaN Pvt Ltd 1 98 \n", 842 | "2 NaN Pvt Ltd never 15 \n", 843 | "3 10/49 Pvt Ltd 1 39 \n", 844 | "4 10000+ Pvt Ltd >4 72 \n", 845 | "... ... ... ... ... 
\n", 846 | "2124 NaN Public Sector 4 15 \n", 847 | "2125 NaN NaN 2 30 \n", 848 | "2126 NaN Pvt Ltd never 18 \n", 849 | "2127 100-500 Public Sector 1 84 \n", 850 | "2128 10000+ Pvt Ltd 2 11 \n", 851 | "\n", 852 | "[2129 rows x 12 columns]" 853 | ] 854 | }, 855 | "execution_count": 13, 856 | "metadata": {}, 857 | "output_type": "execute_result" 858 | } 859 | ], 860 | "source": [ 861 | "new_data = new_data.drop(['enrollee_id'], axis=1)\n", 862 | "new_data" 863 | ] 864 | }, 865 | { 866 | "cell_type": "code", 867 | "execution_count": 14, 868 | "id": "67b39187", 869 | "metadata": {}, 870 | "outputs": [], 871 | "source": [ 872 | "new_preds = pipe.predict(new_data)" 873 | ] 874 | }, 875 | { 876 | "cell_type": "code", 877 | "execution_count": null, 878 | "id": "21ebea6c", 879 | "metadata": {}, 880 | "outputs": [], 881 | "source": [ 882 | "new_preds" 883 | ] 884 | }, 885 | { 886 | "cell_type": "markdown", 887 | "id": "e2e394d7", 888 | "metadata": {}, 889 | "source": [ 890 | "TODO:\n", 891 | " \n", 892 | "- hyperparameter tuning\n", 893 | "- multiple algorithms\n", 894 | "- cross validation\n", 895 | "- sklearn similar projects\n", 896 | "- optuna" 897 | ] 898 | }, 899 | { 900 | "cell_type": "code", 901 | "execution_count": 15, 902 | "id": "17a6a178", 903 | "metadata": {}, 904 | "outputs": [ 905 | { 906 | "data": { 907 | "text/plain": [ 908 | "array([0., 0., 1., ..., 0., 0., 0.])" 909 | ] 910 | }, 911 | "execution_count": 15, 912 | "metadata": {}, 913 | "output_type": "execute_result" 914 | } 915 | ], 916 | "source": [ 917 | "pipe.predict(new_data)" 918 | ] 919 | }, 920 | { 921 | "cell_type": "code", 922 | "execution_count": 20, 923 | "id": "e8b02e6b", 924 | "metadata": {}, 925 | "outputs": [ 926 | { 927 | "data": { 928 | "text/plain": [ 929 | "numpy.ndarray" 930 | ] 931 | }, 932 | "execution_count": 20, 933 | "metadata": {}, 934 | "output_type": "execute_result" 935 | } 936 | ], 937 | "source": [ 938 | "type(pipe.predict(data.drop(['enrollee_id', 'target'], axis=1)))" 939 | 
] 940 | }, 941 | { 942 | "cell_type": "code", 943 | "execution_count": 21, 944 | "id": "b497132c", 945 | "metadata": {}, 946 | "outputs": [ 947 | { 948 | "data": { 949 | "text/plain": [ 950 | "True" 951 | ] 952 | }, 953 | "execution_count": 21, 954 | "metadata": {}, 955 | "output_type": "execute_result" 956 | } 957 | ], 958 | "source": [ 959 | "isinstance(pipe.predict(data.drop(['enrollee_id', 'target'], axis=1))[:10], np.ndarray)" 960 | ] 961 | }, 962 | { 963 | "cell_type": "code", 964 | "execution_count": 35, 965 | "id": "7b8396cc", 966 | "metadata": {}, 967 | "outputs": [], 968 | "source": [ 969 | "REQUIREMENTS_DIR = '../requirements'\n", 970 | "fname=\"production.txt\"\n", 971 | "with open(f'{REQUIREMENTS_DIR}/{fname}') as fd:\n", 972 | " reqs = fd.read().splitlines()\n", 973 | "reqs = list(filter(None, reqs))" 974 | ] 975 | }, 976 | { 977 | "cell_type": "code", 978 | "execution_count": 36, 979 | "id": "82d589f2-5682-4d41-a416-40107feff6db", 980 | "metadata": {}, 981 | "outputs": [], 982 | "source": [ 983 | "for req in reqs:\n", 984 | " if '-r' in req:\n", 985 | " with open(f\"{REQUIREMENTS_DIR}/{req.split(' ')[1]}\") as fd:\n", 986 | " extra_reqs = fd.read().splitlines()\n", 987 | " reqs.remove(req)" 988 | ] 989 | }, 990 | { 991 | "cell_type": "code", 992 | "execution_count": 37, 993 | "id": "fbcb5bf8-f71c-443c-96b6-1fa2a6603108", 994 | "metadata": {}, 995 | "outputs": [], 996 | "source": [ 997 | "reqs=extra_reqs+reqs" 998 | ] 999 | }, 1000 | { 1001 | "cell_type": "code", 1002 | "execution_count": 38, 1003 | "id": "d3573470-3693-40a2-99c9-07893ae991f5", 1004 | "metadata": {}, 1005 | "outputs": [ 1006 | { 1007 | "data": { 1008 | "text/plain": [ 1009 | "['feature-engine==1.2.0',\n", 1010 | " 'scikit-learn==1.0.2',\n", 1011 | " 'scipy==1.8.0',\n", 1012 | " 'seaborn==0.11.2',\n", 1013 | " 'pandas==1.4.1',\n", 1014 | " 'numpy==1.22.3',\n", 1015 | " 'joblib==1.1.0',\n", 1016 | " 'loguru==0.6.0',\n", 1017 | " 'tox==3.24.5',\n", 1018 | " 'pytest==7.0.1',\n", 1019 | " 
class Mapper(BaseEstimator, TransformerMixin):
    """Replace the values of selected columns through a fixed lookup.

    ``variables`` is the list of column names to transform and ``mappings``
    is handed unchanged to ``Series.map`` for each of those columns.
    """

    def __init__(self, variables, mappings):
        # Only a genuine list is accepted for the column names.
        if isinstance(variables, list):
            self.variables = variables
            self.mappings = mappings
        else:
            raise ValueError('variables should be a list')

    def fit(self, X, y=None):
        """Do nothing and return ``self`` so the class fits the sklearn pipeline API."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which every configured column has been mapped."""
        mapped = X.copy()
        for column in self.variables:
            mapped[column] = mapped[column].map(self.mappings)
        return mapped
34 | # Those are caught/handled by pyupgrade, and not easy to filter with the 35 | # module being the filename (with .py removed). 36 | "default:invalid escape sequence:DeprecationWarning", 37 | # ignore use of unregistered marks, because we use many to test the implementation 38 | "ignore::_pytest.warning_types.PytestUnknownMarkWarning", 39 | ] 40 | 41 | [tool.black] 42 | target-version = ['py39'] 43 | 44 | [tool.isort] 45 | profile = "black" 46 | line_length = 120 47 | lines_between_sections = 1 48 | known_first_party = "sentry" 49 | skip = "migrations" -------------------------------------------------------------------------------- /requirements/deployment.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | -r production.txt 3 | 4 | fastapi==0.75.0 5 | uvicorn==0.17.5 6 | python-multipart==0.0.5 7 | typing_extensions==3.10.0 -------------------------------------------------------------------------------- /requirements/production.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | 3 | tox==3.24.5 4 | pytest==7.0.1 5 | black==22.1.0 6 | flake8==4.0.1 7 | mypy==0.931 8 | isort==5.10.1 9 | pydantic==1.9.0 10 | strictyaml==1.6.1 11 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | feature-engine==1.2.0 2 | scikit-learn==1.0.2 3 | scipy==1.8.0 4 | seaborn==0.11.2 5 | pandas==1.4.1 6 | numpy==1.22.3 7 | joblib==1.1.0 8 | loguru==0.6.0 9 | -------------------------------------------------------------------------------- /requirements/research-env.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | 3 | jupyterlab==3.3.0 4 | jupyterlab-lsp==3.10.0 5 | jupyter-lsp==1.5.1 6 | 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | 6 | from setuptools import find_packages, setup 7 | 8 | # Package meta-data. 9 | NAME = 'end-to-end-ML-project' 10 | DESCRIPTION = "End to End ML Project" 11 | URL = "https://github.com/Deffro/end-to-end-ML-project" 12 | EMAIL = "dimitris.effrosynidis@gmail.com" 13 | AUTHOR = "Dimitris Effrosynidis" 14 | REQUIRES_PYTHON = ">=3.7.0" 15 | 16 | 17 | # The rest you shouldn't have to touch too much :) 18 | # ------------------------------------------------ 19 | # Except, perhaps the License and Trove Classifiers! 20 | # If you do change the License, remember to change the 21 | # Trove Classifier for that! 22 | long_description = DESCRIPTION 23 | 24 | # Load the package's VERSION file as a dictionary. 25 | about = {} 26 | ROOT_DIR = Path(__file__).resolve().parent 27 | REQUIREMENTS_DIR = ROOT_DIR / 'requirements' 28 | PACKAGE_DIR = ROOT_DIR / 'src' 29 | with open(PACKAGE_DIR / "VERSION") as f: 30 | _version = f.read().strip() 31 | about["__version__"] = _version 32 | 33 | 34 | # What packages are required for this module to be executed? 
# What packages are required for this module to be executed?
def list_reqs(fname="production.txt", requirements_dir=None):
    """Return the requirement specifiers listed in a requirements file.

    ``-r <file>`` include lines are expanded in place (recursively), so the
    requirements of an included file come before the lines that follow the
    include. Blank lines and ``#`` comments are skipped, and a specifier
    that is reached more than once is only reported the first time.

    Args:
        fname: Name of the requirements file, relative to the requirements
            directory.
        requirements_dir: Directory holding the requirements files. Defaults
            to the module-level ``REQUIREMENTS_DIR``.

    Returns:
        A list of requirement strings suitable for ``install_requires``.

    Raises:
        OSError: If ``fname`` or one of the files it includes cannot be read.
    """
    base_dir = Path(REQUIREMENTS_DIR if requirements_dir is None else requirements_dir)
    collected = []
    visited = set()

    def _collect(name):
        # Each file is read once; this also protects against include cycles.
        if name in visited:
            return
        visited.add(name)

        with open(base_dir / name) as fd:
            lines = fd.read().splitlines()

        for raw_line in lines:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            if line.startswith("-r"):
                # Accept both "-r file.txt" and "-rfile.txt".
                _collect(line[2:].strip())
            elif line not in collected:
                collected.append(line)

    _collect(fname)
    return collected
open(PACKAGE_ROOT / "VERSION") as version_file: 4 | __version__ = version_file.read().strip() 5 | -------------------------------------------------------------------------------- /src/config.yml: -------------------------------------------------------------------------------- 1 | # Data Files 2 | training_data_file: train.csv 3 | test_data_file: test.csv 4 | pipeline_save_file: model_v 5 | 6 | # Variables 7 | target: target 8 | 9 | var_to_drop: enrollee_id 10 | 11 | cat_vars_replace_na_with_string_missing: 12 | - gender 13 | - major_discipline 14 | - company_size 15 | - company_type 16 | 17 | cat_vars_replace_na_with_frequent: 18 | - enrolled_university 19 | - education_level 20 | - experience 21 | - last_new_job 22 | 23 | num_vars: 24 | - city_development_index 25 | - training_hours 26 | 27 | num_vars_yeo_johnson: 28 | - training_hours 29 | 30 | cat_vars_ordinal: 31 | - relevent_experience 32 | - enrolled_university 33 | - education_level 34 | - major_discipline 35 | 36 | cat_vars_ordinal_arbitrary: 37 | - city 38 | 39 | cat_vars_onehot: 40 | - gender 41 | 42 | cat_vars_count_frequency: 43 | - company_type 44 | 45 | experience_var: 46 | - experience 47 | 48 | experience_map: 49 | <1: 0 50 | 1: 1 51 | 2: 2 52 | 3: 3 53 | 4: 4 54 | 5: 5 55 | 6: 6 56 | 7: 7 57 | 8: 8 58 | 9: 9 59 | 10: 10 60 | 11: 11 61 | 12: 12 62 | 13: 13 63 | 14: 14 64 | 15: 15 65 | 16: 16 66 | 17: 17 67 | 18: 18 68 | 19: 19 69 | 20: 20 70 | '>20': 21 71 | 72 | last_new_job_var: 73 | - last_new_job 74 | 75 | last_new_job_map: 76 | never: 0 77 | 1: 1 78 | 2: 2 79 | 3: 3 80 | 4: 4 81 | '>4': 5 82 | 83 | company_size_var: 84 | - company_size 85 | 86 | company_size_map: 87 | Missing: 0 88 | <10: 1 89 | 10/49: 2 90 | 50-99: 3 91 | 100-500: 4 92 | 500-999: 5 93 | 1000-4999: 6 94 | 5000-9999: 7 95 | 10000+: 8 96 | 97 | # Initializations 98 | 99 | test_size: 0.1 100 | 101 | # Model specific 102 | 103 | random_state: 43 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 
class ModelConfig(BaseModel):
    """
    Model-level config.

    Every field is filled from the key of the same name in config.yml.
    """

    # Name of the label column and of the column removed when a dataset is loaded.
    target: str
    var_to_drop: str
    # Column groups for the two categorical imputers ("missing" / "frequent").
    cat_vars_replace_na_with_string_missing: List[str]
    cat_vars_replace_na_with_frequent: List[str]
    # Numerical columns, and the subset given to the Yeo-Johnson transformer.
    num_vars: List[str]
    num_vars_yeo_johnson: List[str]
    # Column groups for the categorical encoders used in the pipeline.
    cat_vars_ordinal: List[str]
    cat_vars_ordinal_arbitrary: List[str]
    cat_vars_onehot: List[str]
    cat_vars_count_frequency: List[str]
    # Columns handled by a Mapper step, each paired with its value -> int lookup.
    experience_var: List[str]
    experience_map: Dict[str, int]
    last_new_job_var: List[str]
    last_new_job_map: Dict[str, int]
    company_size_var: List[str]
    company_size_map: Dict[str, int]
    # Hold-out fraction and seed passed to train_test_split; the seed is also
    # given to the logistic regression.
    test_size: float
    random_state: int
def fetch_config_from_yaml(cfg_path: Path = None) -> YAML:
    """Parse YAML containing the package configuration.

    Args:
        cfg_path: Path of the YAML file to parse. When omitted, the path
            returned by ``find_config_file()`` is used.

    Returns:
        The parsed strictyaml ``YAML`` object.

    Raises:
        FileNotFoundError: If no file exists at ``cfg_path``. This is an
            ``OSError`` subclass, so callers that catch ``OSError`` keep working.
    """
    if not cfg_path:
        cfg_path = find_config_file()

    # The path is always set here, so the missing-file error has to come from
    # the open() call itself rather than from a truthiness check.
    try:
        with open(cfg_path, "r") as conf_file:
            return load(conf_file.read())
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"Did not find config file at path: {cfg_path}"
        ) from err
81 | _config = Config( 82 | app_config=AppConfig(**parsed_config.data), 83 | model_config=ModelConfig(**parsed_config.data), 84 | ) 85 | 86 | return _config 87 | 88 | 89 | config = create_and_validate_config() 90 | -------------------------------------------------------------------------------- /src/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Deffro/end-to-end-ML-project/5cea993ce8e49f7244ad056de5803cf9b1dcbd46/src/data/__init__.py -------------------------------------------------------------------------------- /src/pipeline.py: -------------------------------------------------------------------------------- 1 | from feature_engine.encoding import CountFrequencyEncoder, OneHotEncoder, OrdinalEncoder 2 | from feature_engine.imputation import CategoricalImputer 3 | from feature_engine.transformation import YeoJohnsonTransformer 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.pipeline import Pipeline 6 | from sklearn.preprocessing import MinMaxScaler 7 | 8 | from src.config.core import config 9 | from src.processing import features as pp 10 | 11 | pipe = Pipeline( 12 | [ 13 | ( 14 | "cat_imputer_missing", 15 | CategoricalImputer( 16 | imputation_method="missing", 17 | variables=config.model_config.cat_vars_replace_na_with_string_missing, 18 | ), 19 | ), 20 | ( 21 | "cat_imputer_frequent", 22 | CategoricalImputer( 23 | imputation_method="frequent", 24 | variables=config.model_config.cat_vars_replace_na_with_frequent, 25 | ), 26 | ), 27 | ( 28 | "num_transformer_yeo_johnson", 29 | YeoJohnsonTransformer(variables=config.model_config.num_vars_yeo_johnson), 30 | ), 31 | ( 32 | "ordinal_encoder", 33 | OrdinalEncoder( 34 | encoding_method="ordered", 35 | variables=config.model_config.cat_vars_ordinal, 36 | ), 37 | ), 38 | ( 39 | "ordinal_encoder_arbitrary", 40 | OrdinalEncoder( 41 | encoding_method="arbitrary", 42 | 
def make_prediction(input_data) -> dict:
    """Run the saved model pipeline on ``input_data``.

    The input is turned into a DataFrame and the target column, if present,
    is removed before predicting. The result holds the predictions as a list
    under "predictions" and the package version under "version".
    """
    target = config.model_config.target
    frame = pd.DataFrame(input_data)

    # The label must never be fed to the pipeline as a feature.
    if target in frame.columns:
        frame = frame.drop([target], axis=1)

    return {
        "predictions": list(trained_pipe.predict(X=frame)),
        "version": _version,
    }
def remove_old_pipelines(files_to_keep: t.List[str]) -> None:
    """
    Remove old model pipelines.
    This is to ensure there is a simple one-to-one
    mapping between the package version and the model
    version to be imported and used by other applications.

    Every regular file in ``TRAINED_MODEL_DIR`` is deleted unless its name
    is in ``files_to_keep`` or is one of the always-kept entries.
    Directories are never touched.
    """
    do_not_delete = set(files_to_keep) | {"__init__.py", ".ipynb_checkpoints"}
    for model_file in TRAINED_MODEL_DIR.iterdir():
        # Path.unlink() raises on directories (e.g. __pycache__), which would
        # abort saving the new pipeline, so only regular files are removed.
        if model_file.is_file() and model_file.name not in do_not_delete:
            model_file.unlink()
data[[c for c in data.columns if c != config.model_config.target]], 17 | data[config.model_config.target], 18 | test_size=config.model_config.test_size, 19 | random_state=config.model_config.random_state, 20 | ) 21 | 22 | # fit model 23 | pipe.fit(X_train, y_train) 24 | 25 | # persist trained model 26 | save_pipeline(pipeline_to_persist=pipe) 27 | 28 | 29 | if __name__ == "__main__": 30 | run_training() 31 | -------------------------------------------------------------------------------- /src/trained_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Deffro/end-to-end-ML-project/5cea993ce8e49f7244ad056de5803cf9b1dcbd46/src/trained_models/__init__.py -------------------------------------------------------------------------------- /src/trained_models/model_v0.0.7.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Deffro/end-to-end-ML-project/5cea993ce8e49f7244ad056de5803cf9b1dcbd46/src/trained_models/model_v0.0.7.pkl -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Deffro/end-to-end-ML-project/5cea993ce8e49f7244ad056de5803cf9b1dcbd46/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from src.config.core import config 4 | from src.processing.data_manager import load_dataset 5 | 6 | 7 | @pytest.fixture() 8 | def train_data(): 9 | return load_dataset(file_name=config.app_config.training_data_file) 10 | 11 | 12 | @pytest.fixture() 13 | def test_data(): 14 | return load_dataset(file_name=config.app_config.test_data_file) 15 | 
def test_yeo_johnson(train_data):
    """Check the Yeo-Johnson transform of the first configured variable."""
    import math  # local import: only needed for the tolerant float comparison

    yeo_vars = config.model_config.num_vars_yeo_johnson

    # Raw value of the first row before the transformation.
    assert train_data[yeo_vars].iloc[0].values[0] == 36

    yeo_transformer = YeoJohnsonTransformer(variables=yeo_vars)
    subject = yeo_transformer.fit_transform(train_data)

    # Compare with a tolerance: exact float equality breaks on tiny rounding
    # differences between platforms or library versions.
    assert math.isclose(
        subject[yeo_vars].iloc[0].values[0],
        4.719119791024215,
        rel_tol=1e-9,
    )
def test_serving_latency(train_data):
    """Check that 100 single-row predictions finish in under five seconds."""
    # Slice once so that only the prediction calls are timed.
    single_row = train_data[:1]

    # perf_counter() is monotonic; time.time() can jump when the system clock
    # is adjusted and would then give a wrong elapsed time.
    started = time.perf_counter()
    for _ in range(100):
        make_prediction(input_data=single_row)
    elapsed_time = time.perf_counter() - started

    assert elapsed_time < 5
14 | PYTHONHASHSEED=0 15 | 16 | commands= 17 | python src/train_pipeline.py 18 | pytest \ 19 | -s \ 20 | -vv \ 21 | {posargs:tests/} 22 | 23 | [testenv:train] 24 | envdir = {toxworkdir}/test_package 25 | deps = 26 | {[testenv:test_package]deps} 27 | 28 | setenv = 29 | {[testenv:test_package]setenv} 30 | 31 | commands= 32 | python src/train_pipeline.py 33 | 34 | 35 | [testenv:typechecks] 36 | envdir = {toxworkdir}/test_package 37 | 38 | deps = 39 | {[testenv:test_package]deps} 40 | 41 | commands = {posargs:mypy src} 42 | 43 | 44 | [testenv:stylechecks] 45 | envdir = {toxworkdir}/test_package 46 | 47 | deps = 48 | {[testenv:test_package]deps} 49 | 50 | commands = {posargs:flake8 src tests} 51 | 52 | 53 | [testenv:lint] 54 | envdir = {toxworkdir}/test_package 55 | 56 | deps = 57 | {[testenv:test_package]deps} 58 | 59 | commands = 60 | isort src tests 61 | black src tests 62 | mypy src 63 | flake8 src 64 | 65 | [flake8] 66 | exclude = .git,env 67 | max-line-length = 120 --------------------------------------------------------------------------------