├── templates
│   ├── eda.html
│   ├── experiment_history.html
│   ├── log.html
│   ├── train.html
│   ├── files.html
│   ├── log_files.html
│   ├── saved_models_files.html
│   ├── update_model.html
│   ├── index.html
│   ├── predict.html
│   └── header.html
├── housing
│   ├── __init__.py
│   ├── config
│   │   ├── __init__.py
│   │   └── configuration.py
│   ├── entity
│   │   ├── __init__.py
│   │   ├── experiment.py
│   │   ├── artifact_entity.py
│   │   ├── config_entity.py
│   │   ├── housing_predictor.py
│   │   └── model_factory.py
│   ├── pipeline
│   │   ├── __init__.py
│   │   └── pipeline.py
│   ├── util
│   │   ├── __init__.py
│   │   └── util.py
│   ├── component
│   │   ├── __init__.py
│   │   ├── model_pusher.py
│   │   ├── data_ingestion.py
│   │   ├── data_validation.py
│   │   ├── model_trainer.py
│   │   ├── model_evaluation.py
│   │   └── data_transformation.py
│   ├── logger
│   │   └── __init__.py
│   ├── exception
│   │   └── __init__.py
│   └── constant
│       └── __init__.py
├── notebook
│   ├── sample.json
│   ├── preprocessing.pkl
│   ├── Untitled.ipynb
│   ├── example3.ipynb
│   ├── prediction.ipynb
│   ├── log.ipynb
│   ├── model_training.ipynb
│   ├── EDA.ipynb
│   └── example.ipynb
├── .dockerignore
├── study
│   ├── project.png
│   └── code-writing-flow.png
├── requirements.txt
├── Dockerfile
├── config
│   ├── model.yaml
│   ├── schema.yaml
│   └── config.yaml
├── setup.py
├── demo.py
├── .github
│   └── workflows
│       └── main.yaml
├── README.md
├── .gitignore
├── app.py
└── LICENSE

/templates/eda.html:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/housing/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/notebook/sample.json:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/housing/config/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/housing/entity/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/housing/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/housing/util/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/housing/component/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | venv/
2 | .git
3 | .gitignore
4 | 
--------------------------------------------------------------------------------
/study/project.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/machine_learning_project/HEAD/study/project.png
--------------------------------------------------------------------------------
/notebook/preprocessing.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/machine_learning_project/HEAD/notebook/preprocessing.pkl
--------------------------------------------------------------------------------
/study/code-writing-flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/machine_learning_project/HEAD/study/code-writing-flow.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | Flask
3 | gunicorn
4 | scikit-learn
5 | pandas
6 | PyYAML
7 | evidently
8 | dill
9 | matplotlib
10 | -e .
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.7
2 | COPY . /app
3 | WORKDIR /app
4 | RUN pip install -r requirements.txt
5 | EXPOSE $PORT
6 | CMD gunicorn --workers=1 --bind 0.0.0.0:$PORT app:app
7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 
--------------------------------------------------------------------------------
/templates/experiment_history.html:
--------------------------------------------------------------------------------
1 | {% extends 'header.html' %} 2 | 3 | {% block head %} 4 | 5 | 6 | Train Status 7 | {% endblock %} 8 | 9 | {% block content %} 10 | 11 | 12 | Go to Home 13 |
14 | 15 |
16 | {{ context['experiment']|safe }} 17 |
18 |
19 | 20 | 21 | 22 | {% endblock %} -------------------------------------------------------------------------------- /templates/log.html: -------------------------------------------------------------------------------- 1 | {% extends 'header.html' %} 2 | 3 | {% block head %} 4 | 5 | 6 | Log Details 7 | {% endblock %} 8 | 9 | {% block content %} 10 | 11 | 12 | Go to Home 13 |
14 | 15 |
16 | {{ context['log']|safe }} 17 |
18 |
19 | 20 | 21 | 22 | {% endblock %} -------------------------------------------------------------------------------- /templates/train.html: -------------------------------------------------------------------------------- 1 | {% extends 'header.html' %} 2 | 3 | {% block head %} 4 | 5 | 6 | Train Status 7 | {% endblock %} 8 | 9 | 10 | 11 | 12 | {% block content %} 13 | 14 | 15 | Go to Home 16 |
17 | 20 | 21 | {{ context['experiment']|safe }} 22 | 23 | 24 | 25 |
26 | 27 | 28 | {% endblock %}
--------------------------------------------------------------------------------
/housing/entity/experiment.py:
--------------------------------------------------------------------------------
1 | 2 | 3 | 
4 | class Experiment:
5 |     running_status=False
6 |     def __new__(cls,*args,**kwargs):
7 |         if Experiment.running_status:
8 |             raise Exception("Experiment is already running, hence a new experiment cannot be created")
9 |         return super(Experiment,cls).__new__(cls)  # object.__new__() accepts no extra arguments; __init__ receives them instead
10 | 
11 |     def __init__(self,experiment_id):
12 |         self.experiment_id = experiment_id
13 |         self.running_status = Experiment.running_status
14 | 15 | 
--------------------------------------------------------------------------------
/config/model.yaml:
--------------------------------------------------------------------------------
1 | grid_search:
2 |   class: GridSearchCV
3 |   module: sklearn.model_selection
4 |   params:
5 |     cv: 5
6 |     verbose: 2
7 | model_selection:
8 |   module_0:
9 |     class: LinearRegression
10 |     module: sklearn.linear_model
11 |     params:
12 |       fit_intercept: true
13 |     search_param_grid:
14 |       fit_intercept:
15 |       - true
16 |       - false
17 |   module_1:
18 |     class: RandomForestRegressor
19 |     module: sklearn.ensemble
20 |     params:
21 |       min_samples_leaf: 3
22 |     search_param_grid:
23 |       min_samples_leaf:
24 |       - 6
25 | 
--------------------------------------------------------------------------------
/config/schema.yaml:
--------------------------------------------------------------------------------
1 | columns:
2 |   longitude: float
3 |   latitude: float
4 |   housing_median_age: float
5 |   total_rooms: float
6 |   total_bedrooms: float
7 |   population: float
8 |   households: float
9 |   median_income: float
10 |   median_house_value: float
11 |   ocean_proximity: category
12 | 
13 | numerical_columns:
14 |   - longitude
15 |   - latitude
16 |   - housing_median_age
17 |   - total_rooms
18 |   - total_bedrooms
19 |   - population
20 |   - households
21 |   - median_income
22 | 23 | 24 | 
25 | categorical_columns:
26 |   - ocean_proximity
27 | 28 | 29 | 30 | 
31 | target_column: median_house_value
32 | 
33 | domain_value:
34 |   ocean_proximity:
35 |   - <1H OCEAN
36 |   - INLAND
37 |   - ISLAND
38 |   - NEAR BAY
39 |   - NEAR OCEAN
40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 
--------------------------------------------------------------------------------
/templates/files.html:
--------------------------------------------------------------------------------
1 | {% extends 'header.html' %} 2 | 3 | {% block head %} 4 | 5 | 6 | Housing Estimator 7 | {% endblock %} 8 | 9 | {% block content %} 10 | 11 | 12 | 13 | {% if "housing" in result['parent_folder'] %} 14 | Back 15 | 16 |

17 | {{result['parent_label']}} 18 |

19 | {% endif %} 20 | 21 |
22 | {% for href,label in result["files"].items() %} 23 |
24 | 25 | 26 |
{{ label }} 27 | {% if '.' in label %} 28 | 29 | {% endif %} 30 |
31 |
32 | {% endfor %} 33 | 34 |
35 | 36 | {% endblock %} -------------------------------------------------------------------------------- /templates/log_files.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | {% extends 'header.html' %} 5 | 6 | {% block head %} 7 | 8 | 9 | Log Files 10 | {% endblock %} 11 | 12 | {% block content %} 13 | 14 | 15 | 16 | 17 | {% if "logs" in result['parent_folder'] %} 18 | Back 19 | 20 |

21 | {{result['parent_label']}} 22 |

23 | {% endif %} 24 | 25 |
26 | {% for href,label in result["files"].items() %} 27 |
28 | 29 | 30 |
{{ label }} 31 | {% if '.' in label %} 32 | 33 | {% endif %} 34 |
35 |
36 | {% endfor %} 37 | 38 |
39 | 40 | 41 | {% endblock %} -------------------------------------------------------------------------------- /templates/saved_models_files.html: -------------------------------------------------------------------------------- 1 | {% extends 'header.html' %} 2 | 3 | {% block head %} 4 | 5 | 6 | Model List 7 | {% endblock %} 8 | 9 | {% block content %} 10 | 11 | 12 | 13 | 14 | {% if "saved_models" in result['parent_folder'] %} 15 | Back 16 | 17 |

18 | {{result['parent_label']}} 19 |

20 | {% endif %} 21 | 22 |
23 | {% for href,label in result["files"].items() %} 24 |
25 | 26 | 27 |
{{ label }} 28 | {% if '.' in label %} 29 | 30 | {% endif %} 31 |
32 |
33 | {% endfor %} 34 | 35 |
36 | 37 | {% endblock %} -------------------------------------------------------------------------------- /housing/logger/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime 3 | import os 4 | import pandas as pd 5 | from housing.constant import get_current_time_stamp 6 | LOG_DIR="logs" 7 | 8 | def get_log_file_name(): 9 | return f"log_{get_current_time_stamp()}.log" 10 | 11 | LOG_FILE_NAME=get_log_file_name() 12 | 13 | os.makedirs(LOG_DIR,exist_ok=True) 14 | 15 | LOG_FILE_PATH = os.path.join(LOG_DIR,LOG_FILE_NAME) 16 | 17 | 18 | 19 | logging.basicConfig(filename=LOG_FILE_PATH, 20 | filemode="w", 21 | format='[%(asctime)s]^;%(levelname)s^;%(lineno)d^;%(filename)s^;%(funcName)s()^;%(message)s', 22 | level=logging.INFO 23 | ) 24 | 25 | def get_log_dataframe(file_path): 26 | data=[] 27 | with open(file_path) as log_file: 28 | for line in log_file.readlines(): 29 | data.append(line.split("^;")) 30 | 31 | log_df = pd.DataFrame(data) 32 | columns=["Time stamp","Log Level","line number","file name","function name","message"] 33 | log_df.columns=columns 34 | 35 | log_df["log_message"] = log_df['Time stamp'].astype(str) +":$"+ log_df["message"] 36 | 37 | return log_df[["log_message"]] 38 | 39 | 40 | -------------------------------------------------------------------------------- /housing/entity/artifact_entity.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | 4 | 5 | DataIngestionArtifact = namedtuple("DataIngestionArtifact", 6 | [ "train_file_path", "test_file_path", "is_ingested", "message"]) 7 | 8 | 9 | DataValidationArtifact = namedtuple("DataValidationArtifact", 10 | ["schema_file_path","report_file_path","report_page_file_path","is_validated","message"]) 11 | 12 | 13 | DataTransformationArtifact = namedtuple("DataTransformationArtifact", 14 | ["is_transformed", "message", "transformed_train_file_path","transformed_test_file_path", 15 | "preprocessed_object_file_path"]) 16 | 17 | ModelTrainerArtifact = namedtuple("ModelTrainerArtifact", ["is_trained", "message", "trained_model_file_path", 18 | "train_rmse", "test_rmse", "train_accuracy", "test_accuracy", 19 | "model_accuracy"]) 20 | 21 | ModelEvaluationArtifact = namedtuple("ModelEvaluationArtifact", ["is_model_accepted", "evaluated_model_path"]) 22 | 23 | ModelPusherArtifact = namedtuple("ModelPusherArtifact", ["is_model_pusher", "export_model_file_path"]) 24 | 25 | -------------------------------------------------------------------------------- /housing/entity/config_entity.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | 4 | DataIngestionConfig=namedtuple("DataIngestionConfig", 5 | ["dataset_download_url","tgz_download_dir","raw_data_dir","ingested_train_dir","ingested_test_dir"]) 6 | 7 | 8 | DataValidationConfig = namedtuple("DataValidationConfig", ["schema_file_path","report_file_path","report_page_file_path"]) 9 | 10 | DataTransformationConfig = namedtuple("DataTransformationConfig", ["add_bedroom_per_room", 11 | "transformed_train_dir", 12 | "transformed_test_dir", 13 | "preprocessed_object_file_path"]) 14 | 15 | 16 | ModelTrainerConfig = namedtuple("ModelTrainerConfig", ["trained_model_file_path","base_accuracy","model_config_file_path"]) 17 | 18 | ModelEvaluationConfig = namedtuple("ModelEvaluationConfig", ["model_evaluation_file_path","time_stamp"]) 19 | 20 | 21 | 
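# Usage sketch (added for illustration; hypothetical values, not part of the original
# source file): each of these namedtuples is an immutable config record, built from
# config/config.yaml by the Configuartion class and read with dot access, e.g.:
#   eval_config = ModelEvaluationConfig(
#       model_evaluation_file_path="artifact/model_evaluation/model_evaluation.yaml",
#       time_stamp="2022-06-27-19-13-17")
#   eval_config.time_stamp   # -> "2022-06-27-19-13-17"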
ModelPusherConfig = namedtuple("ModelPusherConfig", ["export_dir_path"])
22 | 
23 | TrainingPipelineConfig = namedtuple("TrainingPipelineConfig", ["artifact_dir"])
--------------------------------------------------------------------------------
/config/config.yaml:
--------------------------------------------------------------------------------
1 | training_pipeline_config:
2 |   pipeline_name: housing
3 |   artifact_dir: artifact
4 | 
5 | data_ingestion_config:
6 |   dataset_download_url: https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz
7 |   raw_data_dir: raw_data
8 |   tgz_download_dir: tgz_data
9 |   ingested_dir: ingested_data
10 |   ingested_train_dir: train
11 |   ingested_test_dir: test
12 | 13 | 14 | 15 | 
16 | data_validation_config:
17 |   schema_dir: config
18 |   schema_file_name: schema.yaml
19 |   report_file_name: report.json
20 |   report_page_file_name: report.html
21 | 
22 | data_transformation_config:
23 |   add_bedroom_per_room: true
24 |   transformed_dir: transformed_data
25 |   transformed_train_dir: train
26 |   transformed_test_dir: test
27 |   preprocessing_dir: preprocessed
28 |   preprocessed_object_file_name: preprocessed.pkl
29 | 
30 | model_trainer_config:
31 |   trained_model_dir: trained_model
32 |   model_file_name: model.pkl
33 |   base_accuracy: 0.6
34 |   model_config_dir: config
35 |   model_config_file_name: model.yaml
36 | 37 | 
38 | model_evaluation_config:
39 |   model_evaluation_file_name: model_evaluation.yaml
40 | 41 | 
42 | model_pusher_config:
43 |   model_export_dir: saved_models
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup,find_packages
2 | from typing import List
3 | 
4 | #Declaring variables for setup functions
5 | PROJECT_NAME="housing-predictor"
6 | VERSION="0.0.3"
7 | AUTHOR="Avnish Yadav"
8 | DESCRIPTION="This is a first FSDS Nov batch Machine Learning Project"
9 | 
10 | REQUIREMENT_FILE_NAME="requirements.txt"
11 | 
12 | HYPHEN_E_DOT = "-e ."
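# Note added for clarity (not in the original file): "-e ." in requirements.txt tells
# pip to install this project itself in editable/development mode. It is filtered out
# below because setuptools' install_requires cannot interpret pip's "-e" flag.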
13 | 14 | 
15 | def get_requirements_list() -> List[str]:
16 |     """
17 |     Description: This function is going to return the list of requirements
18 |     mentioned in the requirements.txt file
19 |     return: This function is going to return a list which contains the names
20 |     of the libraries mentioned in the requirements.txt file
21 |     """
22 |     with open(REQUIREMENT_FILE_NAME) as requirement_file:
23 |         requirement_list = requirement_file.readlines()
24 |         requirement_list = [requirement_name.replace("\n", "") for requirement_name in requirement_list]
25 |         if HYPHEN_E_DOT in requirement_list:
26 |             requirement_list.remove(HYPHEN_E_DOT)
27 |         return requirement_list
28 | 29 | 30 | 
31 | setup(
32 | name=PROJECT_NAME,
33 | version=VERSION,
34 | author=AUTHOR,
35 | description=DESCRIPTION,
36 | packages=find_packages(),
37 | install_requires=get_requirements_list()
38 | )
39 | 40 | 
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
1 | from housing.pipeline.pipeline import Pipeline
2 | from housing.exception import HousingException
3 | from housing.logger import logging
4 | from housing.config.configuration import Configuartion
5 | from housing.component.data_transformation import DataTransformation
6 | import os
7 | def main():
8 |     try:
9 |         config_path = os.path.join("config","config.yaml")
10 |         pipeline = Pipeline(Configuartion(config_file_path=config_path))
11 |         #pipeline.run_pipeline()
12 |         pipeline.start()
13 |         logging.info("main function execution completed.")
14 |         # # data_validation_config = Configuartion().get_data_transformation_config()
15 |         # # print(data_validation_config)
16 |         # schema_file_path=r"D:\Project\machine_learning_project\config\schema.yaml"
17 |         # file_path=r"D:\Project\machine_learning_project\housing\artifact\data_ingestion\2022-06-27-19-13-17\ingested_data\train\housing.csv"
18 | 
19 |         # df= DataTransformation.load_data(file_path=file_path,schema_file_path=schema_file_path)
20 |         # print(df.columns)
21 |         # print(df.dtypes)
22 | 
23 |     except Exception as e:
24 |         logging.error(f"{e}")
25 |         print(e)
26 | 27 | 28 | 
29 | if __name__=="__main__":
30 |     main()
31 | 
--------------------------------------------------------------------------------
/housing/exception/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | 
4 | class HousingException(Exception):
5 | 
6 |     def __init__(self, error_message:Exception,error_detail:sys):
7 |         super().__init__(error_message)
8 |         self.error_message=HousingException.get_detailed_error_message(error_message=error_message,
9 |                                                                        error_detail=error_detail
10 |                                                                        )
11 | 12 | 
13 |     @staticmethod
14 |     def get_detailed_error_message(error_message:Exception,error_detail:sys)->str:
15 |         """
16 |         error_message: Exception object
17 |         error_detail: object of sys module
18 |         """
19 |         _,_ ,exec_tb = error_detail.exc_info()
20 |         exception_block_line_number = exec_tb.tb_frame.f_lineno
21 |         try_block_line_number = exec_tb.tb_lineno
22 |         file_name = exec_tb.tb_frame.f_code.co_filename
23 |         error_message = f"""
24 |         Error occurred in script:
25 |         [ {file_name} ] at
26 |         try block line number: [{try_block_line_number}] and exception block line number: [{exception_block_line_number}]
27 |         error message: [{error_message}]
28 |         """
29 |         return error_message
30 | 
31 |     def __str__(self):
32 |         return self.error_message
33 | 34 | 
35 |     def __repr__(self) -> str:
36 |         return HousingException.__name__
37 | 38 | 
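A minimal usage sketch (illustrative, not part of the repository): HousingException is raised with the caught exception plus the sys module, from which get_detailed_error_message() pulls the traceback via sys.exc_info(). This is the pattern used throughout the components:

```python
import sys
from housing.exception import HousingException

def divide(a: float, b: float) -> float:
    try:
        return a / b
    except Exception as e:
        # wrap the original error so the message carries file name and line numbers
        raise HousingException(e, sys) from e
```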
-------------------------------------------------------------------------------- /templates/update_model.html: -------------------------------------------------------------------------------- 1 | {% extends 'header.html' %} 2 | 3 | {% block head %} 4 | 5 | 6 | Model Config 7 | {% endblock %} 8 | 9 | {% block content %} 10 | 11 | 12 | Go to Home 13 |
14 | 15 |
16 |
17 |
18 | 19 | 23 |
24 | 25 | 28 | 29 |
30 | 31 |
32 |
33 | 34 | 35 |
36 | 37 |
38 |
39 |
40 | 41 | 42 | 43 | {% endblock %} -------------------------------------------------------------------------------- /.github/workflows/main.yaml: -------------------------------------------------------------------------------- 1 | # Your workflow name. 2 | name: Deploy to heroku. 3 | 4 | # Run workflow on every push to main branch. 5 | on: 6 | push: 7 | branches: [main] 8 | 9 | # Your workflows jobs. 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | # Check-out your repository. 15 | - name: Checkout 16 | uses: actions/checkout@v2 17 | 18 | 19 | ### ⬇ IMPORTANT PART ⬇ ### 20 | 21 | - name: Build, Push and Release a Docker container to Heroku. # Your custom step name 22 | uses: gonuit/heroku-docker-deploy@v1.3.3 # GitHub action name (leave it as it is). 23 | with: 24 | # Below you must provide variables for your Heroku app. 25 | 26 | # The email address associated with your Heroku account. 27 | # If you don't want to use repository secrets (which is recommended) you can do: 28 | # email: my.email@example.com 29 | email: ${{ secrets.HEROKU_EMAIL }} 30 | 31 | # Heroku API key associated with provided user's email. 32 | # Api Key is available under your Heroku account settings. 33 | heroku_api_key: ${{ secrets.HEROKU_API_KEY }} 34 | 35 | # Name of the heroku application to which the build is to be sent. 36 | heroku_app_name: ${{ secrets.HEROKU_APP_NAME }} 37 | 38 | # (Optional, default: "./") 39 | # Dockerfile directory. 40 | # For example, if you have a Dockerfile in the root of your project, leave it as follows: 41 | dockerfile_directory: ./ 42 | 43 | # (Optional, default: "Dockerfile") 44 | # Dockerfile name. 45 | dockerfile_name: Dockerfile 46 | 47 | # (Optional, default: "") 48 | # Additional options of docker build command. 49 | docker_options: "--no-cache" 50 | 51 | # (Optional, default: "web") 52 | # Select the process type for which you want the docker container to be uploaded. 53 | # By default, this argument is set to "web". 54 | # For more information look at https://devcenter.heroku.com/articles/process-model 55 | process_type: web 56 | 57 | 58 | 59 | ### ⬆ IMPORTANT PART ⬆ ### -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Application url: 2 | [HousingPredictor](https://ml-regression-app.herokuapp.com/) 3 | 4 | ## Start Machine Learning project. 5 | 6 | ### Software and account Requirement. 7 | 8 | 1. [Github Account](https://github.com) 9 | 2. [Heroku Account](https://dashboard.heroku.com/login) 10 | 3. [VS Code IDE](https://code.visualstudio.com/download) 11 | 4. [GIT cli](https://git-scm.com/downloads) 12 | 5. [GIT Documentation](https://git-scm.com/docs/gittutorial) 13 | 14 | 15 | Creating conda environment 16 | ``` 17 | conda create -p venv python==3.7 -y 18 | ``` 19 | ``` 20 | conda activate venv/ 21 | ``` 22 | OR 23 | ``` 24 | conda activate venv 25 | ``` 26 | 27 | ``` 28 | pip install -r requirements.txt 29 | ``` 30 | 31 | To Add files to git 32 | ``` 33 | git add . 
34 | ```
35 | 
36 | OR
37 | ```
38 | git add <file_name>
39 | ```
40 | 
41 | > Note: To ignore a file or folder from git we can write the name of the file/folder in the .gitignore file
42 | 
43 | To check the git status
44 | ```
45 | git status
46 | ```
47 | To check all versions maintained by git
48 | ```
49 | git log
50 | ```
51 | 
52 | To create a version/commit all changes with git
53 | ```
54 | git commit -m "message"
55 | ```
56 | 
57 | To send versions/changes to GitHub
58 | ```
59 | git push origin main
60 | ```
61 | 
62 | To check the remote url
63 | ```
64 | git remote -v
65 | ```
66 | 
67 | To set up the CI/CD pipeline in Heroku we need 3 pieces of information
68 | 1. HEROKU_EMAIL = anishyadav7045075175@gmail.com
69 | 2. HEROKU_API_KEY = <>
70 | 3. HEROKU_APP_NAME = ml-regression-app
71 | 
72 | BUILD DOCKER IMAGE
73 | ```
74 | docker build -t <image_name>:<tagname> .
75 | ```
76 | > Note: Image name for docker must be lowercase
77 | 78 | 
79 | To list docker images
80 | ```
81 | docker images
82 | ```
83 | 
84 | Run a docker image
85 | ```
86 | docker run -p 5000:5000 -e PORT=5000 f8c749e73678
87 | ```
88 | 
89 | To check running containers in docker
90 | ```
91 | docker ps
92 | ```
93 | 
94 | To stop a docker container
95 | ```
96 | docker stop <container_id>
97 | ```
98 | 99 | 
100 | To install the project as a package
101 | ```
102 | python setup.py install
103 | ```
104 | 105 | 
106 | Install ipykernel
107 | 
108 | ```
109 | pip install ipykernel
110 | ```
111 | 112 | 
113 | Data Drift:
114 | When the statistics of your dataset change over time, we call it data drift
115 | 116 | 117 | 
118 | ## Write a function to get training file path from artifact dir
--------------------------------------------------------------------------------
/housing/component/model_pusher.py:
--------------------------------------------------------------------------------
1 | from housing.logger import logging
2 | from housing.exception import HousingException
3 | from housing.entity.artifact_entity import ModelPusherArtifact, ModelEvaluationArtifact
4 | from housing.entity.config_entity import ModelPusherConfig
5 | import os, sys
6 | import shutil
7 | 8 | 
9 | class ModelPusher:
10 | 
11 |     def __init__(self, model_pusher_config: ModelPusherConfig,
12 |                  model_evaluation_artifact: ModelEvaluationArtifact
13 |                  ):
14 |         try:
15 |             logging.info(f"{'>>' * 30}Model Pusher log started.{'<<' * 30} ")
16 |             self.model_pusher_config = model_pusher_config
17 |             self.model_evaluation_artifact = model_evaluation_artifact
18 | 
19 |         except Exception as e:
20 |             raise HousingException(e, sys) from e
21 | 
22 |     def export_model(self) -> ModelPusherArtifact:
23 |         try:
24 |             evaluated_model_file_path = self.model_evaluation_artifact.evaluated_model_path
25 |             export_dir = self.model_pusher_config.export_dir_path
26 |             model_file_name = os.path.basename(evaluated_model_file_path)
27 |             export_model_file_path = os.path.join(export_dir, model_file_name)
28 |             logging.info(f"Exporting model file: [{export_model_file_path}]")
29 |             os.makedirs(export_dir, exist_ok=True)
30 | 
31 |             shutil.copy(src=evaluated_model_file_path, dst=export_model_file_path)
32 |             # we could call a function here to save the model to Azure blob storage / Google cloud storage / an S3 bucket
33 |             logging.info(
34 |                 f"Trained model: {evaluated_model_file_path} is copied in export dir:[{export_model_file_path}]")
35 | 
36 |             model_pusher_artifact = ModelPusherArtifact(is_model_pusher=True,
37 |                                                         export_model_file_path=export_model_file_path
38 |                                                         )
39 |             logging.info(f"Model pusher artifact: [{model_pusher_artifact}]")
40 |             return model_pusher_artifact
41 |         except Exception as e:
42 |             raise HousingException(e, sys) from e
43 | 
44 |     def
initiate_model_pusher(self) -> ModelPusherArtifact: 45 | try: 46 | return self.export_model() 47 | except Exception as e: 48 | raise HousingException(e, sys) from e 49 | 50 | def __del__(self): 51 | logging.info(f"{'>>' * 20}Model Pusher log completed.{'<<' * 20} ") -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | saved_models/* 6 | # C extensions 7 | *.so 8 | housing/artifact 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | .idea/* 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /housing/entity/housing_predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from housing.exception import HousingException 5 | from housing.util.util import load_object 6 | 7 | import pandas as pd 8 | 9 | 10 | class HousingData: 11 | 12 | def __init__(self, 13 | longitude: float, 14 | latitude: float, 15 | housing_median_age: float, 16 | total_rooms: float, 17 | total_bedrooms: float, 18 | population: float, 19 | households: float, 20 | median_income: float, 21 | ocean_proximity: str, 22 | median_house_value: float = None 23 | ): 24 | try: 25 | self.longitude = longitude 26 | self.latitude = latitude 27 | self.housing_median_age = housing_median_age 28 | self.total_rooms = total_rooms 29 | self.total_bedrooms = total_bedrooms 30 | self.population = population 31 | self.households = households 32 | self.median_income = median_income 33 | self.ocean_proximity = ocean_proximity 34 | self.median_house_value = median_house_value 35 | except Exception as e: 36 | raise HousingException(e, sys) from e 37 | 38 | def get_housing_input_data_frame(self): 39 | 40 | try: 41 | housing_input_dict = self.get_housing_data_as_dict() 42 | return pd.DataFrame(housing_input_dict) 43 | except Exception as e: 44 | raise HousingException(e, sys) from e 45 | 46 | def get_housing_data_as_dict(self): 47 | try: 48 | input_data = { 49 | "longitude": [self.longitude], 50 | "latitude": [self.latitude], 51 | "housing_median_age": [self.housing_median_age], 52 | "total_rooms": [self.total_rooms], 53 | "total_bedrooms": [self.total_bedrooms], 54 | "population": [self.population], 55 | "households": [self.households], 56 | "median_income": [self.median_income], 57 | "ocean_proximity": [self.ocean_proximity]} 58 | return input_data 59 | except Exception as e: 60 | raise HousingException(e, sys) 61 | 62 | 63 | class HousingPredictor: 64 | 65 | def __init__(self, model_dir: str): 66 | try: 67 | self.model_dir = model_dir 68 | except Exception as e: 69 | raise HousingException(e, sys) from e 70 | 71 | def get_latest_model_path(self): 72 | try: 73 | folder_name = list(map(int, os.listdir(self.model_dir))) 74 | latest_model_dir = os.path.join(self.model_dir, f"{max(folder_name)}") 75 | file_name = os.listdir(latest_model_dir)[0] 76 | latest_model_path = os.path.join(latest_model_dir, file_name) 77 | return latest_model_path 78 | except Exception as e: 79 | raise HousingException(e, sys) from e 80 | 81 | def predict(self, X): 82 | try: 83 | model_path = self.get_latest_model_path() 84 | model = load_object(file_path=model_path) 85 | median_house_value = model.predict(X) 86 | return median_house_value 87 | except Exception as e: 88 | raise HousingException(e, sys) from e -------------------------------------------------------------------------------- /notebook/Untitled.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "id": "09acfffe", 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "da={\"a\":[1,2,3]}" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 4, 25 | "id": "5d96c8ba", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "df=pd.DataFrame(da)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 7, 35 | "id": "5422324a", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "df=df.astype('str')" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 11, 45 | "id": "df4cba4e", 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/html": [ 51 | "
\n", 52 | "\n", 65 | "\n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | "
a
01 1
12 2
23 3
\n", 87 | "
" 88 | ], 89 | "text/plain": [ 90 | " a\n", 91 | "0 1 1\n", 92 | "1 2 2\n", 93 | "2 3 3" 94 | ] 95 | }, 96 | "execution_count": 11, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "df+\" \"+df" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "654b3846", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3.7.0 (conda)", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.7.0" 131 | }, 132 | "vscode": { 133 | "interpreter": { 134 | "hash": "fc6fa6e48c86001677d15bc9af4f846353042d089527ab27e7c7a4474d3b154b" 135 | } 136 | } 137 | }, 138 | "nbformat": 4, 139 | "nbformat_minor": 5 140 | } 141 | -------------------------------------------------------------------------------- /housing/util/util.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | from housing.exception import HousingException 3 | import os,sys 4 | import numpy as np 5 | import dill 6 | import pandas as pd 7 | from housing.constant import * 8 | 9 | 10 | def write_yaml_file(file_path:str,data:dict=None): 11 | """ 12 | Create yaml file 13 | file_path: str 14 | data: dict 15 | """ 16 | try: 17 | os.makedirs(os.path.dirname(file_path), exist_ok=True) 18 | with open(file_path,"w") as yaml_file: 19 | if data is not None: 20 | yaml.dump(data,yaml_file) 21 | except Exception as e: 22 | raise HousingException(e,sys) 23 | 24 | 25 | def read_yaml_file(file_path:str)->dict: 26 | """ 27 | Reads a YAML file and returns the contents as a dictionary. 
28 |     file_path: str
29 |     """
30 |     try:
31 |         with open(file_path, 'rb') as yaml_file:
32 |             return yaml.safe_load(yaml_file)
33 |     except Exception as e:
34 |         raise HousingException(e,sys) from e
35 | 36 | 
37 | def save_numpy_array_data(file_path: str, array: np.array):
38 |     """
39 |     Save numpy array data to file
40 |     file_path: str location of file to save
41 |     array: np.array data to save
42 |     """
43 |     try:
44 |         dir_path = os.path.dirname(file_path)
45 |         os.makedirs(dir_path, exist_ok=True)
46 |         with open(file_path, 'wb') as file_obj:
47 |             np.save(file_obj, array)
48 |     except Exception as e:
49 |         raise HousingException(e, sys) from e
50 | 51 | 
52 | def load_numpy_array_data(file_path: str) -> np.array:
53 |     """
54 |     load numpy array data from file
55 |     file_path: str location of file to load
56 |     return: np.array data loaded
57 |     """
58 |     try:
59 |         with open(file_path, 'rb') as file_obj:
60 |             return np.load(file_obj)
61 |     except Exception as e:
62 |         raise HousingException(e, sys) from e
63 | 64 | 
65 | def save_object(file_path:str,obj):
66 |     """
67 |     file_path: str
68 |     obj: Any sort of object
69 |     """
70 |     try:
71 |         dir_path = os.path.dirname(file_path)
72 |         os.makedirs(dir_path, exist_ok=True)
73 |         with open(file_path, "wb") as file_obj:
74 |             dill.dump(obj, file_obj)
75 |     except Exception as e:
76 |         raise HousingException(e,sys) from e
77 | 78 | 
79 | def load_object(file_path:str):
80 |     """
81 |     file_path: str
82 |     """
83 |     try:
84 |         with open(file_path, "rb") as file_obj:
85 |             return dill.load(file_obj)
86 |     except Exception as e:
87 |         raise HousingException(e,sys) from e
88 | 89 | 
90 | def load_data(file_path: str, schema_file_path: str) -> pd.DataFrame:
91 |     try:
92 |         dataset_schema = read_yaml_file(schema_file_path)
93 | 
94 |         schema = dataset_schema[DATASET_SCHEMA_COLUMNS_KEY]
95 | 
96 |         dataframe = pd.read_csv(file_path)
97 | 
98 |         error_message = ""
99 | 100 | 
101 |         for column in dataframe.columns:
102 |             if column in list(schema.keys()):
103 |                 dataframe[column] = dataframe[column].astype(schema[column])  # assign back; astype returns a new Series
104 |             else:
105 |                 error_message = f"{error_message} \nColumn: [{column}] is not in the schema."
106 |         if len(error_message) > 0:
107 |             raise Exception(error_message)
108 |         return dataframe
109 | 
110 |     except Exception as e:
111 |         raise HousingException(e,sys) from e
112 | 
--------------------------------------------------------------------------------
/housing/constant/__init__.py:
--------------------------------------------------------------------------------
1 | 2 | 
3 | import os
4 | from datetime import datetime
5 | 6 | 
7 | def get_current_time_stamp():
8 |     return f"{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"
9 | 10 | 
11 | ROOT_DIR = os.getcwd()  # to get the current working directory
12 | CONFIG_DIR = "config"
13 | CONFIG_FILE_NAME = "config.yaml"
14 | CONFIG_FILE_PATH = os.path.join(ROOT_DIR,CONFIG_DIR,CONFIG_FILE_NAME)
15 | 16 | 17 | 
18 | CURRENT_TIME_STAMP = get_current_time_stamp()
19 | 20 | 21 | 22 | 
23 | # Training pipeline related variables
24 | TRAINING_PIPELINE_CONFIG_KEY = "training_pipeline_config"
25 | TRAINING_PIPELINE_ARTIFACT_DIR_KEY = "artifact_dir"
26 | TRAINING_PIPELINE_NAME_KEY = "pipeline_name"
27 | 28 | 
29 | # Data Ingestion related variables
30 | 
31 | DATA_INGESTION_CONFIG_KEY = "data_ingestion_config"
32 | DATA_INGESTION_ARTIFACT_DIR = "data_ingestion"
33 | DATA_INGESTION_DOWNLOAD_URL_KEY = "dataset_download_url"
34 | DATA_INGESTION_RAW_DATA_DIR_KEY = "raw_data_dir"
35 | DATA_INGESTION_TGZ_DOWNLOAD_DIR_KEY = "tgz_download_dir"
36 | DATA_INGESTION_INGESTED_DIR_NAME_KEY = "ingested_dir"
37 | DATA_INGESTION_TRAIN_DIR_KEY = "ingested_train_dir"
38 | DATA_INGESTION_TEST_DIR_KEY = "ingested_test_dir"
39 | 40 | 41 | 
42 | # Data Validation related variables
43 | DATA_VALIDATION_CONFIG_KEY = "data_validation_config"
44 | DATA_VALIDATION_SCHEMA_FILE_NAME_KEY = "schema_file_name"
45 | DATA_VALIDATION_SCHEMA_DIR_KEY = "schema_dir"
46 | DATA_VALIDATION_ARTIFACT_DIR_NAME="data_validation"
47 | DATA_VALIDATION_REPORT_FILE_NAME_KEY = "report_file_name"
48 | DATA_VALIDATION_REPORT_PAGE_FILE_NAME_KEY = "report_page_file_name"
49 | 50 | 51 | 
52 | # Data Transformation related variables
53 | DATA_TRANSFORMATION_ARTIFACT_DIR = "data_transformation"
54 | DATA_TRANSFORMATION_CONFIG_KEY = "data_transformation_config"
55 | DATA_TRANSFORMATION_ADD_BEDROOM_PER_ROOM_KEY = "add_bedroom_per_room"
56 | DATA_TRANSFORMATION_DIR_NAME_KEY = "transformed_dir"
57 | DATA_TRANSFORMATION_TRAIN_DIR_NAME_KEY = "transformed_train_dir"
58 | DATA_TRANSFORMATION_TEST_DIR_NAME_KEY = "transformed_test_dir"
59 | DATA_TRANSFORMATION_PREPROCESSING_DIR_KEY = "preprocessing_dir"
60 | DATA_TRANSFORMATION_PREPROCESSED_FILE_NAME_KEY = "preprocessed_object_file_name"
61 | 62 | 63 | 
64 | COLUMN_TOTAL_ROOMS = "total_rooms"
65 | COLUMN_POPULATION = "population"
66 | COLUMN_HOUSEHOLDS = "households"
67 | COLUMN_TOTAL_BEDROOM = "total_bedrooms"
68 | DATASET_SCHEMA_COLUMNS_KEY= "columns"
69 | 
70 | NUMERICAL_COLUMN_KEY="numerical_columns"
71 | CATEGORICAL_COLUMN_KEY = "categorical_columns"
72 | 73 | 
74 | TARGET_COLUMN_KEY="target_column"
75 | 76 | 
77 | # Model Training related variables
78 | 
79 | MODEL_TRAINER_ARTIFACT_DIR = "model_trainer"
80 | MODEL_TRAINER_CONFIG_KEY = "model_trainer_config"
81 | MODEL_TRAINER_TRAINED_MODEL_DIR_KEY = "trained_model_dir"
82 | MODEL_TRAINER_TRAINED_MODEL_FILE_NAME_KEY = "model_file_name"
83 | MODEL_TRAINER_BASE_ACCURACY_KEY = "base_accuracy"
84 | MODEL_TRAINER_MODEL_CONFIG_DIR_KEY = "model_config_dir"
85 | MODEL_TRAINER_MODEL_CONFIG_FILE_NAME_KEY = "model_config_file_name"
86 | 87 | 
88 | MODEL_EVALUATION_CONFIG_KEY = "model_evaluation_config"
89 | 
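# Usage sketch (added for illustration; not part of the original file): components pair
# these keys with housing.util.util.read_yaml_file to pull their section of config/config.yaml:
#   config = read_yaml_file(CONFIG_FILE_PATH)
#   config[MODEL_TRAINER_CONFIG_KEY][MODEL_TRAINER_BASE_ACCURACY_KEY]   # -> 0.6 per config.yaml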
MODEL_EVALUATION_FILE_NAME_KEY = "model_evaluation_file_name"
90 | MODEL_EVALUATION_ARTIFACT_DIR = "model_evaluation"
91 | # Model Pusher config keys
92 | MODEL_PUSHER_CONFIG_KEY = "model_pusher_config"
93 | MODEL_PUSHER_MODEL_EXPORT_DIR_KEY = "model_export_dir"
94 | 
95 | BEST_MODEL_KEY = "best_model"
96 | HISTORY_KEY = "history"
97 | MODEL_PATH_KEY = "model_path"
98 | 
99 | EXPERIMENT_DIR_NAME="experiment"
100 | EXPERIMENT_FILE_NAME="experiment.csv"
--------------------------------------------------------------------------------
/notebook/example3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 1,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "from scipy.stats import ks_2samp"
10 |    ]
11 |   },
12 |   {
13 |    "cell_type": "code",
14 |    "execution_count": 2,
15 |    "metadata": {},
16 |    "outputs": [],
17 |    "source": [
18 |     "import numpy as np"
19 |    ]
20 |   },
21 |   {
22 |    "cell_type": "code",
23 |    "execution_count": 16,
24 |    "metadata": {},
25 |    "outputs": [],
26 |    "source": [
27 |     "arr1 = np.arange(10)\n",
28 |     "arr2=np.arange(10)"
29 |    ]
30 |   },
31 |   {
32 |    "cell_type": "code",
33 |    "execution_count": 17,
34 |    "metadata": {},
35 |    "outputs": [
36 |     {
37 |      "data": {
38 |       "text/plain": [
39 |        "array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])"
40 |       ]
41 |      },
42 |      "execution_count": 17,
43 |      "metadata": {},
44 |      "output_type": "execute_result"
45 |     }
46 |    ],
47 |    "source": [
48 |     "arr1"
49 |    ]
50 |   },
51 |   {
52 |    "cell_type": "code",
53 |    "execution_count": 18,
54 |    "metadata": {},
55 |    "outputs": [
56 |     {
57 |      "data": {
58 |       "text/plain": [
59 |        "array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])"
60 |       ]
61 |      },
62 |      "execution_count": 18,
63 |      "metadata": {},
64 |      "output_type": "execute_result"
65 |     }
66 |    ],
67 |    "source": [
68 |     "arr2"
69 |    ]
70 |   },
71 |   {
72 |    "cell_type": "code",
73 |    "execution_count": 19,
74 |    "metadata": {},
75 |    "outputs": [],
76 |    "source": [
77 |     "res = ks_2samp(arr1,arr2)"
78 |    ]
79 |   },
80 |   {
81 |    "cell_type": "markdown",
82 |    "metadata": {},
83 |    "source": []
84 |   },
85 |   {
86 |    "cell_type": "code",
87 |    "execution_count": 20,
88 |    "metadata": {},
89 |    "outputs": [
90 |     {
91 |      "data": {
92 |       "text/plain": [
93 |        "1"
94 |       ]
95 |      },
96 |      "execution_count": 20,
97 |      "metadata": {},
98 |      "output_type": "execute_result"
99 |     }
100 |    ],
101 |    "source": [
102 |     "round(res.pvalue)"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": null,
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "Null: The two datasets are from the same distribution\n",
112 |     "\n",
113 |     "Alternate: The two datasets are not from the same distribution\n",
114 |     "\n",
115 |     "\n",
116 |     "if p >= 0.05 :\n",
117 |     "    we fail to reject the null hypothesis (consistent with no drift)\n",
118 |     "else:\n",
119 |     "    we reject the null hypothesis (evidence the distributions differ)\n"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": 25,
125 |    "metadata": {},
126 |    "outputs": [
127 |     {
128 |      "data": {
129 |       "text/plain": [
130 |        "1.0"
131 |       ]
132 |      },
133 |      "execution_count": 25,
134 |      "metadata": {},
135 |      "output_type": "execute_result"
136 |     }
137 |    ],
138 |    "source": [
139 |     "res= ks_2samp(arr1,arr2)\n",
140 |     "round(res.pvalue,3)"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": null,
146 |    "metadata": {},
147 |    "outputs": [],
148 |    "source": []
149 |   }
150 |  ],
151 |  "metadata": {
152 |   "kernelspec": {
153 |    "display_name": "Python 3.7.0",
154 | 
"language": "python", 155 | "name": "python3" 156 | }, 157 | "language_info": { 158 | "codemirror_mode": { 159 | "name": "ipython", 160 | "version": 3 161 | }, 162 | "file_extension": ".py", 163 | "mimetype": "text/x-python", 164 | "name": "python", 165 | "nbconvert_exporter": "python", 166 | "pygments_lexer": "ipython3", 167 | "version": "3.7.0" 168 | }, 169 | "orig_nbformat": 4, 170 | "vscode": { 171 | "interpreter": { 172 | "hash": "7a29293c9d4d8b93126739266382f07a312940ff8d40640417510f0b045f4058" 173 | } 174 | } 175 | }, 176 | "nbformat": 4, 177 | "nbformat_minor": 2 178 | } 179 | -------------------------------------------------------------------------------- /notebook/prediction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from housing.entity import housing_predictor" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 5, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "housing_data=housing_predictor.HousingData(-118.39,34.12,29.0,6447.0,1012.0,2184.0,960.0,8.2816,'<1H OCEAN')" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 6, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/plain": [ 29 | "{'longitude': [-118.39],\n", 30 | " 'latitude': [34.12],\n", 31 | " 'housing_median_age': [29.0],\n", 32 | " 'total_rooms': [6447.0],\n", 33 | " 'total_bedrooms': [1012.0],\n", 34 | " 'population': [2184.0],\n", 35 | " 'households': [960.0],\n", 36 | " 'median_income': [8.2816],\n", 37 | " 'ocean_proximity': ['<1H OCEAN']}" 38 | ] 39 | }, 40 | "execution_count": 6, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "housing_data.get_housing_data_as_dict()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 8, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "df=housing_data.get_housing_input_data_frame()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 9, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "from housing.entity.housing_predictor import HousingPredictor" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 12, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "model_path=\"/home/avnish/iNeuron_Private_Intelligence_Limited/MachineLearningProject/machine_learning_project/saved_models\"\n", 74 | "housing_predictor= HousingPredictor(model_dir=model_path)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 13, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "'/home/avnish/iNeuron_Private_Intelligence_Limited/MachineLearningProject/machine_learning_project/saved_models'" 86 | ] 87 | }, 88 | "execution_count": 13, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "housing_predictor.model_dir" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 14, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "data": { 104 | "text/plain": [ 105 | "'/home/avnish/iNeuron_Private_Intelligence_Limited/MachineLearningProject/machine_learning_project/saved_models/20220706202006/model.pkl'" 106 | ] 107 | }, 108 | "execution_count": 14, 109 | "metadata": {}, 110 | "output_type": "execute_result" 111 | } 112 | ], 113 | "source": [ 114 | "housing_predictor.get_latest_model_path()" 
115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 15, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "424328.0048828125" 126 | ] 127 | }, 128 | "execution_count": 15, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "housing_predictor.predict(df)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [] 143 | } 144 | ], 145 | "metadata": { 146 | "kernelspec": { 147 | "display_name": "Python 3.7.0 (conda)", 148 | "language": "python", 149 | "name": "python3" 150 | }, 151 | "language_info": { 152 | "codemirror_mode": { 153 | "name": "ipython", 154 | "version": 3 155 | }, 156 | "file_extension": ".py", 157 | "mimetype": "text/x-python", 158 | "name": "python", 159 | "nbconvert_exporter": "python", 160 | "pygments_lexer": "ipython3", 161 | "version": "3.7.0" 162 | }, 163 | "orig_nbformat": 4, 164 | "vscode": { 165 | "interpreter": { 166 | "hash": "fc6fa6e48c86001677d15bc9af4f846353042d089527ab27e7c7a4474d3b154b" 167 | } 168 | } 169 | }, 170 | "nbformat": 4, 171 | "nbformat_minor": 2 172 | } 173 | -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | {% extends 'header.html' %} 2 | 3 | {% block head %} 4 | 5 | 6 | Index 7 | {% endblock %} 8 | 9 | {% block content %} 10 | 11 |
12 |
13 |
14 |
15 |
16 |
17 | Want to learn project like this. 18 |
19 |
20 | 21 |
Check Full stack data science at iNeuron portal.
22 |

Full-stack data science is a live, mentor-led, job-guaranteed certification program with a full-time one-year internship provided by iNeuron. In this course you will learn the entire stack required to work in the data science, data analytics, and big data domains, including machine learning, deep learning, computer vision, NLP, and big data, as well as MLOps and cloud infrastructure. You will also work on real-time industry projects and product development with the iNeuron product development team, which will enable you to contribute on various levels.

23 | Check Course 24 |
25 |
26 |
27 |
28 | 29 |
30 | 31 | 32 |
33 |
34 |
35 |
36 |
37 | View Logs 38 |
39 |
40 | 41 |
Log Files
42 |

All model training log files can be downloaded.

43 | Check Logs 44 |
45 |
46 |
47 | 48 | 49 |
50 |
51 |
52 | View Artifacts 53 |
54 |
55 | 56 |
Artifact Files
57 |

All model artifact files can be downloaded.

58 | Check Artifacts 59 |
60 |
61 |
62 | 63 | 64 | 65 |
66 |
67 |
68 | View Trained Models 69 |
70 |
71 | 72 |
Trained Models
73 |

All model files can be downloaded.

74 | Check Models 75 |
76 |
77 |
78 | 79 | 80 |
81 |
82 |
83 | Estimate California price 84 |
85 |
86 | 87 |
Access California estimator
88 |

A form will be displayed. Submit the form to get the estimated price for a California house.

89 | Get Estimated Price 90 |
91 |
92 |
93 | 94 |
95 |
96 |
97 | Train housing estimator model. 98 |
99 |
100 | 101 |
Initiate model training.
102 |

Model training will be performed. Files such as logs, artifacts, and models can be viewed and downloaded using the appropriate links.

103 | Initiate Training 104 |
105 |
106 |
107 | 108 | 109 | 110 | 111 | 112 | 113 |
114 | 115 | {% endblock %} -------------------------------------------------------------------------------- /templates/predict.html: -------------------------------------------------------------------------------- 1 | {% extends 'header.html' %} 2 | 3 | {% block head %} 4 | 5 | 6 | Housing Estimator 7 | {% endblock %} 8 | 9 | {% block content %} 10 | 11 |
12 |
13 | 14 | 15 |
16 | Housing Estimation Form 17 |
18 | 19 | 21 |
22 |
23 | 24 | 26 | 27 |
28 |
29 | 30 | 32 | 33 |
34 |
35 | 36 | 38 | 39 |
40 |
41 | 42 | 44 |
45 |
46 | 47 | 49 |
50 |
51 | 52 | 54 |
55 |
56 | 57 | 59 |
60 |
61 | 62 | 82 |
83 |
84 | 85 | 86 |
87 | 88 |
89 |
90 | 91 |
92 |
93 |
94 | California housing price 95 |
96 |
97 | {% if context['housing_data'] is not none %} 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | {% for column,value in context['housing_data'].items() %} 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | {% endfor %} 115 | 116 | 117 | 118 | 121 | 122 |
California Housing Prediction
Input FeatureFeature Value
{{column}}{{value[0]}}
median_house_value 119 | {{ context['median_house_value'] }} 120 |
123 | 124 | {% else %} 125 | 126 |
Submit Form
127 |

Kindly provide the necessary information to estimate the housing price in California

128 | 129 | 130 | 131 | {% endif %} 132 | Go to Home 133 |
134 |
135 |
136 | 137 | {% endblock %} -------------------------------------------------------------------------------- /housing/component/data_ingestion.py: -------------------------------------------------------------------------------- 1 | from housing.entity.config_entity import DataIngestionConfig 2 | import sys,os 3 | from housing.exception import HousingException 4 | from housing.logger import logging 5 | from housing.entity.artifact_entity import DataIngestionArtifact 6 | import tarfile 7 | import numpy as np 8 | from six.moves import urllib 9 | import pandas as pd 10 | from sklearn.model_selection import StratifiedShuffleSplit 11 | 12 | class DataIngestion: 13 | 14 | def __init__(self,data_ingestion_config:DataIngestionConfig ): 15 | try: 16 | logging.info(f"{'>>'*20}Data Ingestion log started.{'<<'*20} ") 17 | self.data_ingestion_config = data_ingestion_config 18 | 19 | except Exception as e: 20 | raise HousingException(e,sys) 21 | 22 | 23 | def download_housing_data(self,) -> str: 24 | try: 25 | #extraction remote url to download dataset 26 | download_url = self.data_ingestion_config.dataset_download_url 27 | 28 | #folder location to download file 29 | tgz_download_dir = self.data_ingestion_config.tgz_download_dir 30 | 31 | os.makedirs(tgz_download_dir,exist_ok=True) 32 | 33 | housing_file_name = os.path.basename(download_url) 34 | 35 | tgz_file_path = os.path.join(tgz_download_dir, housing_file_name) 36 | 37 | logging.info(f"Downloading file from :[{download_url}] into :[{tgz_file_path}]") 38 | urllib.request.urlretrieve(download_url, tgz_file_path) 39 | logging.info(f"File :[{tgz_file_path}] has been downloaded successfully.") 40 | return tgz_file_path 41 | 42 | except Exception as e: 43 | raise HousingException(e,sys) from e 44 | 45 | def extract_tgz_file(self,tgz_file_path:str): 46 | try: 47 | raw_data_dir = self.data_ingestion_config.raw_data_dir 48 | 49 | if os.path.exists(raw_data_dir): 50 | os.remove(raw_data_dir) 51 | 52 | os.makedirs(raw_data_dir,exist_ok=True) 53 | 54 | logging.info(f"Extracting tgz file: [{tgz_file_path}] into dir: [{raw_data_dir}]") 55 | with tarfile.open(tgz_file_path) as housing_tgz_file_obj: 56 | housing_tgz_file_obj.extractall(path=raw_data_dir) 57 | logging.info(f"Extraction completed") 58 | 59 | except Exception as e: 60 | raise HousingException(e,sys) from e 61 | 62 | def split_data_as_train_test(self) -> DataIngestionArtifact: 63 | try: 64 | raw_data_dir = self.data_ingestion_config.raw_data_dir 65 | 66 | file_name = os.listdir(raw_data_dir)[0] 67 | 68 | housing_file_path = os.path.join(raw_data_dir,file_name) 69 | 70 | 71 | logging.info(f"Reading csv file: [{housing_file_path}]") 72 | housing_data_frame = pd.read_csv(housing_file_path) 73 | 74 | housing_data_frame["income_cat"] = pd.cut( 75 | housing_data_frame["median_income"], 76 | bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf], 77 | labels=[1,2,3,4,5] 78 | ) 79 | 80 | 81 | logging.info(f"Splitting data into train and test") 82 | strat_train_set = None 83 | strat_test_set = None 84 | 85 | split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42) 86 | 87 | for train_index,test_index in split.split(housing_data_frame, housing_data_frame["income_cat"]): 88 | strat_train_set = housing_data_frame.loc[train_index].drop(["income_cat"],axis=1) 89 | strat_test_set = housing_data_frame.loc[test_index].drop(["income_cat"],axis=1) 90 | 91 | train_file_path = os.path.join(self.data_ingestion_config.ingested_train_dir, 92 | file_name) 93 | 94 | test_file_path = 
os.path.join(self.data_ingestion_config.ingested_test_dir,
95 |                                          file_name)
96 | 
97 |             if strat_train_set is not None:
98 |                 os.makedirs(self.data_ingestion_config.ingested_train_dir,exist_ok=True)
99 |                 logging.info(f"Exporting training dataset to file: [{train_file_path}]")
100 |                 strat_train_set.to_csv(train_file_path,index=False)
101 | 
102 |             if strat_test_set is not None:
103 |                 os.makedirs(self.data_ingestion_config.ingested_test_dir, exist_ok= True)
104 |                 logging.info(f"Exporting test dataset to file: [{test_file_path}]")
105 |                 strat_test_set.to_csv(test_file_path,index=False)
106 | 107 | 
108 |             data_ingestion_artifact = DataIngestionArtifact(train_file_path=train_file_path,
109 |                                                             test_file_path=test_file_path,
110 |                                                             is_ingested=True,
111 |                                                             message=f"Data ingestion completed successfully."
112 |                                                             )
113 |             logging.info(f"Data Ingestion artifact:[{data_ingestion_artifact}]")
114 |             return data_ingestion_artifact
115 | 
116 |         except Exception as e:
117 |             raise HousingException(e,sys) from e
118 | 
119 |     def initiate_data_ingestion(self)-> DataIngestionArtifact:
120 |         try:
121 |             tgz_file_path = self.download_housing_data()
122 |             self.extract_tgz_file(tgz_file_path=tgz_file_path)
123 |             return self.split_data_as_train_test()
124 |         except Exception as e:
125 |             raise HousingException(e,sys) from e
126 | 127 | 128 | 
129 |     def __del__(self):
130 |         logging.info(f"{'>>'*20}Data Ingestion log completed.{'<<'*20} \n\n")
131 | 
--------------------------------------------------------------------------------
/housing/component/data_validation.py:
--------------------------------------------------------------------------------
1 | 2 | 3 | 
4 | from housing.logger import logging
5 | from housing.exception import HousingException
6 | from housing.entity.config_entity import DataValidationConfig
7 | from housing.entity.artifact_entity import DataIngestionArtifact,DataValidationArtifact
8 | import os,sys
9 | import pandas as pd
10 | from evidently.model_profile import Profile
11 | from evidently.model_profile.sections import DataDriftProfileSection
12 | from evidently.dashboard import Dashboard
13 | from evidently.dashboard.tabs import DataDriftTab
14 | import json
15 | 
16 | class DataValidation:
17 | 18 | 
19 |     def __init__(self, data_validation_config:DataValidationConfig,
20 |                  data_ingestion_artifact:DataIngestionArtifact):
21 |         try:
22 |             logging.info(f"{'>>'*30}Data Validation log started.{'<<'*30} \n\n")
23 |             self.data_validation_config = data_validation_config
24 |             self.data_ingestion_artifact = data_ingestion_artifact
25 |         except Exception as e:
26 |             raise HousingException(e,sys) from e
27 | 28 | 
29 |     def get_train_and_test_df(self):
30 |         try:
31 |             train_df = pd.read_csv(self.data_ingestion_artifact.train_file_path)
32 |             test_df = pd.read_csv(self.data_ingestion_artifact.test_file_path)
33 |             return train_df,test_df
34 |         except Exception as e:
35 |             raise HousingException(e,sys) from e
36 | 37 | 
38 |     def is_train_test_file_exists(self)->bool:
39 |         try:
40 |             logging.info("Checking if training and test files are available")
41 |             is_train_file_exist = False
42 |             is_test_file_exist = False
43 | 
44 |             train_file_path = self.data_ingestion_artifact.train_file_path
45 |             test_file_path = self.data_ingestion_artifact.test_file_path
46 | 
47 |             is_train_file_exist = os.path.exists(train_file_path)
48 |             is_test_file_exist = os.path.exists(test_file_path)
49 | 
50 |             is_available = is_train_file_exist and is_test_file_exist
51 | 
52 |             logging.info(f"Do train and test files exist? -> {is_available}")
53 | 
54 |             if not is_available:
85 |     def get_and_save_data_drift_report(self):
86 |         try:
87 |             profile = Profile(sections=[DataDriftProfileSection()])
88 |
89 |             train_df,test_df = self.get_train_and_test_df()
90 |
91 |             profile.calculate(train_df,test_df)
92 |
93 |             report = json.loads(profile.json())
94 |
95 |             report_file_path = self.data_validation_config.report_file_path
96 |             report_dir = os.path.dirname(report_file_path)
97 |             os.makedirs(report_dir,exist_ok=True)
98 |
99 |             with open(report_file_path,"w") as report_file:
100 |                 json.dump(report, report_file, indent=6)
101 |             return report
102 |         except Exception as e:
103 |             raise HousingException(e,sys) from e
104 |
105 |     def save_data_drift_report_page(self):
106 |         try:
107 |             dashboard = Dashboard(tabs=[DataDriftTab()])
108 |             train_df,test_df = self.get_train_and_test_df()
109 |             dashboard.calculate(train_df,test_df)
110 |
111 |             report_page_file_path = self.data_validation_config.report_page_file_path
112 |             report_page_dir = os.path.dirname(report_page_file_path)
113 |             os.makedirs(report_page_dir,exist_ok=True)
114 |
115 |             dashboard.save(report_page_file_path)
116 |         except Exception as e:
117 |             raise HousingException(e,sys) from e
118 |
119 |     def is_data_drift_found(self)->bool:
120 |         try:
121 |             report = self.get_and_save_data_drift_report()
122 |             self.save_data_drift_report_page()
123 |             return True  # NOTE: the saved report's drift verdict is never inspected here
124 |         except Exception as e:
125 |             raise HousingException(e,sys) from e
126 |
127 |     def initiate_data_validation(self)->DataValidationArtifact:
128 |         try:
129 |             self.is_train_test_file_exists()
130 |             self.validate_dataset_schema()
131 |             self.is_data_drift_found()
132 |
133 |             data_validation_artifact = DataValidationArtifact(
134 |                 schema_file_path=self.data_validation_config.schema_file_path,
135 |                 report_file_path=self.data_validation_config.report_file_path,
136 |                 report_page_file_path=self.data_validation_config.report_page_file_path,
137 |                 is_validated=True,
138 |                 message="Data Validation performed successfully."
139 |             )
140 |             logging.info(f"Data validation artifact: {data_validation_artifact}")
141 |             return data_validation_artifact
142 |         except Exception as e:
143 |             raise HousingException(e,sys) from e
144 |
145 |
146 |     def __del__(self):
147 |         logging.info(f"{'>>'*30}Data Validation log completed.{'<<'*30} \n\n")
148 |
149 |
150 |
151 |
152 |
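is_data_drift_found always returns True; with the evidently 0.1.x profile JSON written above, the dataset-level verdict could be read back roughly like this (the key path varies across evidently versions, so treat it as an assumption to verify):

def dataset_drift_from_report(report: dict) -> bool:
    # Key layout as produced by DataDriftProfileSection in evidently 0.1.x.
    return bool(report["data_drift"]["data"]["metrics"]["dataset_drift"])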
--------------------------------------------------------------------------------
/housing/component/model_trainer.py:
--------------------------------------------------------------------------------
1 |
2 | from housing.exception import HousingException
3 | import sys
4 | from housing.logger import logging
5 | from typing import List
6 | from housing.entity.artifact_entity import DataTransformationArtifact, ModelTrainerArtifact
7 | from housing.entity.config_entity import ModelTrainerConfig
8 | from housing.util.util import load_numpy_array_data,save_object,load_object
9 | from housing.entity.model_factory import MetricInfoArtifact, ModelFactory,GridSearchedBestModel
10 | from housing.entity.model_factory import evaluate_regression_model
11 |
12 |
13 |
14 | class HousingEstimatorModel:
15 |     def __init__(self, preprocessing_object, trained_model_object):
16 |         """
17 |         TrainedModel constructor
18 |         preprocessing_object: fitted preprocessing pipeline
19 |         trained_model_object: fitted model object
20 |         """
21 |         self.preprocessing_object = preprocessing_object
22 |         self.trained_model_object = trained_model_object
23 |
24 |     def predict(self, X):
25 |         """
26 |         Accepts raw inputs and transforms them using preprocessing_object,
27 |         which guarantees that the inputs are in the same format as the training data.
28 |         Finally, it performs prediction on the transformed features.
29 |         """
30 |         transformed_feature = self.preprocessing_object.transform(X)
31 |         return self.trained_model_object.predict(transformed_feature)
32 |
33 |     def __repr__(self):
34 |         return f"{type(self.trained_model_object).__name__}()"
35 |
36 |     def __str__(self):
37 |         return f"{type(self.trained_model_object).__name__}()"
38 |
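HousingEstimatorModel simply chains a fitted preprocessor and a fitted regressor so callers can pass raw features; a minimal usage sketch with stand-in scikit-learn objects (illustrative only, not repo code):

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

X = np.random.rand(100, 3)
y = X @ np.array([1.0, 2.0, 3.0])

scaler = StandardScaler().fit(X)                      # plays the preprocessing_object role
reg = LinearRegression().fit(scaler.transform(X), y)  # plays the trained_model_object role

wrapped = HousingEstimatorModel(preprocessing_object=scaler, trained_model_object=reg)
predictions = wrapped.predict(X)  # raw features in, predictions out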
39 |
40 |
41 |
42 | class ModelTrainer:
43 |
44 |     def __init__(self, model_trainer_config:ModelTrainerConfig, data_transformation_artifact: DataTransformationArtifact):
45 |         try:
46 |             logging.info(f"{'>>' * 30}Model trainer log started.{'<<' * 30} ")
47 |             self.model_trainer_config = model_trainer_config
48 |             self.data_transformation_artifact = data_transformation_artifact
49 |         except Exception as e:
50 |             raise HousingException(e, sys) from e
51 |
52 |     def initiate_model_trainer(self)->ModelTrainerArtifact:
53 |         try:
54 |             logging.info(f"Loading transformed training dataset")
55 |             transformed_train_file_path = self.data_transformation_artifact.transformed_train_file_path
56 |             train_array = load_numpy_array_data(file_path=transformed_train_file_path)
57 |
58 |             logging.info(f"Loading transformed testing dataset")
59 |             transformed_test_file_path = self.data_transformation_artifact.transformed_test_file_path
60 |             test_array = load_numpy_array_data(file_path=transformed_test_file_path)
61 |
62 |             logging.info(f"Splitting training and testing input and target feature")
63 |             x_train,y_train,x_test,y_test = train_array[:,:-1],train_array[:,-1],test_array[:,:-1],test_array[:,-1]
64 |
65 |
66 |             logging.info(f"Extracting model config file path")
67 |             model_config_file_path = self.model_trainer_config.model_config_file_path
68 |
69 |             logging.info(f"Initializing model factory class using above model config file: {model_config_file_path}")
70 |             model_factory = ModelFactory(model_config_path=model_config_file_path)
71 |
72 |
73 |             base_accuracy = self.model_trainer_config.base_accuracy
74 |             logging.info(f"Expected accuracy: {base_accuracy}")
75 |
76 |             logging.info(f"Initiating model selection operation")
77 |             best_model = model_factory.get_best_model(X=x_train,y=y_train,base_accuracy=base_accuracy)
78 |
79 |             logging.info(f"Best model found on training dataset: {best_model}")
80 |
81 |             logging.info(f"Extracting trained model list.")
82 |             grid_searched_best_model_list:List[GridSearchedBestModel]=model_factory.grid_searched_best_model_list
83 |
84 |             model_list = [model.best_model for model in grid_searched_best_model_list ]
85 |             logging.info(f"Evaluating all trained models on both training and testing datasets")
86 |             metric_info:MetricInfoArtifact = evaluate_regression_model(model_list=model_list,X_train=x_train,y_train=y_train,X_test=x_test,y_test=y_test,base_accuracy=base_accuracy)
87 |
88 |             logging.info(f"Best model found on both training and testing datasets.")
89 |
90 |             preprocessing_obj= load_object(file_path=self.data_transformation_artifact.preprocessed_object_file_path)
91 |             model_object = metric_info.model_object
92 |
93 |
94 |             trained_model_file_path=self.model_trainer_config.trained_model_file_path
95 |             housing_model = HousingEstimatorModel(preprocessing_object=preprocessing_obj,trained_model_object=model_object)
96 |             logging.info(f"Saving model at path: {trained_model_file_path}")
97 |             save_object(file_path=trained_model_file_path,obj=housing_model)
98 |
99 |
100 |             model_trainer_artifact= ModelTrainerArtifact(is_trained=True,message="Model Trained successfully",
101 |                                                          trained_model_file_path=trained_model_file_path,
102 |                                                          train_rmse=metric_info.train_rmse,
103 |                                                          test_rmse=metric_info.test_rmse,
104 |                                                          train_accuracy=metric_info.train_accuracy,
105 |                                                          test_accuracy=metric_info.test_accuracy,
106 |                                                          model_accuracy=metric_info.model_accuracy
107 |
108 |                                                          )
109 |
110 |             logging.info(f"Model Trainer Artifact: {model_trainer_artifact}")
111 |             return model_trainer_artifact
112 |         except Exception as e:
113 |             raise HousingException(e, sys) from e
114 |
115 |     def __del__(self):
116 |         logging.info(f"{'>>' * 30}Model trainer log completed.{'<<' * 30} ")
117 |
118 |
119 |
120 | # loading transformed training and testing datasets
121 | # reading model config file
122 | # getting best model on training dataset
123 | # evaluating models on both training & testing datasets --> model object
124 | # loading preprocessing object
125 | # custom model object combining both preprocessing obj and model obj
126 | # saving custom model object
127 | # return model_trainer_artifact
128 |
--------------------------------------------------------------------------------
/templates/header.html:
--------------------------------------------------------------------------------
[HTML markup was stripped when this dump was rendered; the surviving template content is:]
19 | {% block head %}{% endblock %}
119 | {% block content %}
122 | {% endblock %}
167 | © 2022 Copyright:
168 | iNeuron Intelligence Pvt Limited
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, request
2 | import sys
3 |
4 |
5 | from housing.util.util import read_yaml_file, write_yaml_file
6 |
7 | from housing.logger import logging
8 | from housing.exception import HousingException
9 | import os
10 | import json
11 | from housing.config.configuration import Configuartion
12 | from housing.constant import CONFIG_DIR, get_current_time_stamp
13 | from housing.pipeline.pipeline import Pipeline
14 | from housing.entity.housing_predictor import HousingPredictor, HousingData
15 | from flask import send_file, abort, render_template
16 |
17 |
18 | ROOT_DIR = os.getcwd()
19 | LOG_FOLDER_NAME = "logs"
20 | PIPELINE_FOLDER_NAME = "housing"
21 | SAVED_MODELS_DIR_NAME = "saved_models"
22 | MODEL_CONFIG_FILE_PATH = os.path.join(ROOT_DIR, CONFIG_DIR, "model.yaml")
23 | LOG_DIR = os.path.join(ROOT_DIR, LOG_FOLDER_NAME)
24 | PIPELINE_DIR = os.path.join(ROOT_DIR, PIPELINE_FOLDER_NAME)
25 | MODEL_DIR = os.path.join(ROOT_DIR, SAVED_MODELS_DIR_NAME)
26 |
27 |
28 | from housing.logger import get_log_dataframe
29 |
30 | HOUSING_DATA_KEY = "housing_data"
31 | MEDIAN_HOUSING_VALUE_KEY = "median_house_value"
32 |
33 | app = Flask(__name__)
34 |
35 |
36 | @app.route('/artifact', defaults={'req_path': 'housing'})
37 | @app.route('/artifact/<path:req_path>')
38 | def render_artifact_dir(req_path):
39 |     os.makedirs("housing", exist_ok=True)
40 |     # Joining the base and the requested path
41 |     print(f"req_path: {req_path}")
42 |     abs_path = os.path.join(req_path)
43 |     print(abs_path)
44 |     # Return 404 if path doesn't exist
45 |     if not os.path.exists(abs_path):
46 |         return abort(404)
47 |
48 |     # Check if path is a file and serve
49 |     if os.path.isfile(abs_path):
50 |         if ".html" in abs_path:
51 |             with open(abs_path, "r", encoding="utf-8") as file:
52 |                 content = ''
53 |                 for line in file.readlines():
54 |                     content = f"{content}{line}"
55 |                 return content
56 |         return send_file(abs_path)
57 |
58 |     # Show directory contents
59 |     files = {os.path.join(abs_path, file_name): file_name for file_name in os.listdir(abs_path) if
60 |              "artifact" in os.path.join(abs_path, file_name)}
61 |
62 |     result = {
63 |         "files": files,
64 |         "parent_folder": os.path.dirname(abs_path),
65 |         "parent_label": abs_path
66 |     }
67 |     return render_template('files.html', result=result)
68 |
69 |
70 | @app.route('/', methods=['GET', 'POST'])
71 | def index():
72 |     try:
73 |         return render_template('index.html')
74 |     except Exception as e:
75 |         return str(e)
76 |
77 |
78 | @app.route('/view_experiment_hist', methods=['GET', 'POST'])
79 | def view_experiment_history():
80 |     experiment_df = Pipeline.get_experiments_status()
81 |     context = {
82 |         "experiment": experiment_df.to_html(classes='table table-striped col-12')
83 |     }
84 |     return render_template('experiment_history.html', context=context)
85 |
86 |
87 | @app.route('/train', methods=['GET', 'POST'])
88 | def train():
89 |     message = ""
90 |     pipeline = Pipeline(config=Configuartion(current_time_stamp=get_current_time_stamp()))
91 |     if not Pipeline.experiment.running_status:
92 |         message = "Training started."
93 |         pipeline.start()
94 |     else:
95 |         message = "Training is already in progress."
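    # Note: Pipeline subclasses threading.Thread, so pipeline.start() above kicks off
    # run_pipeline() on a background thread and this request returns immediately; the
    # experiment table rendered below therefore shows the last saved status, not the
    # final result of the run that was just started.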
96 |     context = {
97 |         "experiment": pipeline.get_experiments_status().to_html(classes='table table-striped col-12'),
98 |         "message": message
99 |     }
100 |     return render_template('train.html', context=context)
101 |
102 |
103 | @app.route('/predict', methods=['GET', 'POST'])
104 | def predict():
105 |     context = {
106 |         HOUSING_DATA_KEY: None,
107 |         MEDIAN_HOUSING_VALUE_KEY: None
108 |     }
109 |
110 |     if request.method == 'POST':
111 |         longitude = float(request.form['longitude'])
112 |         latitude = float(request.form['latitude'])
113 |         housing_median_age = float(request.form['housing_median_age'])
114 |         total_rooms = float(request.form['total_rooms'])
115 |         total_bedrooms = float(request.form['total_bedrooms'])
116 |         population = float(request.form['population'])
117 |         households = float(request.form['households'])
118 |         median_income = float(request.form['median_income'])
119 |         ocean_proximity = request.form['ocean_proximity']
120 |
121 |         housing_data = HousingData(longitude=longitude,
122 |                                    latitude=latitude,
123 |                                    housing_median_age=housing_median_age,
124 |                                    total_rooms=total_rooms,
125 |                                    total_bedrooms=total_bedrooms,
126 |                                    population=population,
127 |                                    households=households,
128 |                                    median_income=median_income,
129 |                                    ocean_proximity=ocean_proximity,
130 |                                    )
131 |         housing_df = housing_data.get_housing_input_data_frame()
132 |         housing_predictor = HousingPredictor(model_dir=MODEL_DIR)
133 |         median_housing_value = housing_predictor.predict(X=housing_df)
134 |         context = {
135 |             HOUSING_DATA_KEY: housing_data.get_housing_data_as_dict(),
136 |             MEDIAN_HOUSING_VALUE_KEY: median_housing_value,
137 |         }
138 |         return render_template('predict.html', context=context)
139 |     return render_template("predict.html", context=context)
140 |
141 |
142 | @app.route('/saved_models', defaults={'req_path': 'saved_models'})
143 | @app.route('/saved_models/<path:req_path>')
144 | def saved_models_dir(req_path):
145 |     os.makedirs("saved_models", exist_ok=True)
146 |     # Joining the base and the requested path
147 |     print(f"req_path: {req_path}")
148 |     abs_path = os.path.join(req_path)
149 |     print(abs_path)
150 |     # Return 404 if path doesn't exist
151 |     if not os.path.exists(abs_path):
152 |         return abort(404)
153 |
154 |     # Check if path is a file and serve
155 |     if os.path.isfile(abs_path):
156 |         return send_file(abs_path)
157 |
158 |     # Show directory contents
159 |     files = {os.path.join(abs_path, file): file for file in os.listdir(abs_path)}
160 |
161 |     result = {
162 |         "files": files,
163 |         "parent_folder": os.path.dirname(abs_path),
164 |         "parent_label": abs_path
165 |     }
166 |     return render_template('saved_models_files.html', result=result)
167 |
168 |
169 | @app.route("/update_model_config", methods=['GET', 'POST'])
170 | def update_model_config():
171 |     try:
172 |         if request.method == 'POST':
173 |             model_config = request.form['new_model_config']
174 |             model_config = model_config.replace("'", '"')
175 |             print(model_config)
176 |             model_config = json.loads(model_config)
177 |
178 |             write_yaml_file(file_path=MODEL_CONFIG_FILE_PATH, data=model_config)
179 |
180 |         model_config = read_yaml_file(file_path=MODEL_CONFIG_FILE_PATH)
181 |         return render_template('update_model.html', result={"model_config": model_config})
182 |
183 |     except Exception as e:
184 |         logging.exception(e)
185 |         return str(e)
186 |
187 |
188 | @app.route(f'/logs', defaults={'req_path': f'{LOG_FOLDER_NAME}'})
189 | @app.route(f'/{LOG_FOLDER_NAME}/<path:req_path>')
190 | def render_log_dir(req_path):
191 |     os.makedirs(LOG_FOLDER_NAME, exist_ok=True)
192 |     # Joining the base and the
requested path 193 | logging.info(f"req_path: {req_path}") 194 | abs_path = os.path.join(req_path) 195 | print(abs_path) 196 | # Return 404 if path doesn't exist 197 | if not os.path.exists(abs_path): 198 | return abort(404) 199 | 200 | # Check if path is a file and serve 201 | if os.path.isfile(abs_path): 202 | log_df = get_log_dataframe(abs_path) 203 | context = {"log": log_df.to_html(classes="table-striped", index=False)} 204 | return render_template('log.html', context=context) 205 | 206 | # Show directory contents 207 | files = {os.path.join(abs_path, file): file for file in os.listdir(abs_path)} 208 | 209 | result = { 210 | "files": files, 211 | "parent_folder": os.path.dirname(abs_path), 212 | "parent_label": abs_path 213 | } 214 | return render_template('log_files.html', result=result) 215 | 216 | 217 | if __name__ == "__main__": 218 | app.run() 219 | -------------------------------------------------------------------------------- /housing/component/model_evaluation.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from housing.logger import logging 4 | from housing.exception import HousingException 5 | from housing.entity.config_entity import ModelEvaluationConfig 6 | from housing.entity.artifact_entity import DataIngestionArtifact,DataValidationArtifact,ModelTrainerArtifact,ModelEvaluationArtifact 7 | from housing.constant import * 8 | import numpy as np 9 | import os 10 | import sys 11 | from housing.util.util import write_yaml_file, read_yaml_file, load_object,load_data 12 | from housing.entity.model_factory import evaluate_regression_model 13 | 14 | 15 | 16 | 17 | class ModelEvaluation: 18 | 19 | def __init__(self, model_evaluation_config: ModelEvaluationConfig, 20 | data_ingestion_artifact: DataIngestionArtifact, 21 | data_validation_artifact: DataValidationArtifact, 22 | model_trainer_artifact: ModelTrainerArtifact): 23 | try: 24 | logging.info(f"{'>>' * 30}Model Evaluation log started.{'<<' * 30} ") 25 | self.model_evaluation_config = model_evaluation_config 26 | self.model_trainer_artifact = model_trainer_artifact 27 | self.data_ingestion_artifact = data_ingestion_artifact 28 | self.data_validation_artifact = data_validation_artifact 29 | except Exception as e: 30 | raise HousingException(e, sys) from e 31 | 32 | def get_best_model(self): 33 | try: 34 | model = None 35 | model_evaluation_file_path = self.model_evaluation_config.model_evaluation_file_path 36 | 37 | if not os.path.exists(model_evaluation_file_path): 38 | write_yaml_file(file_path=model_evaluation_file_path, 39 | ) 40 | return model 41 | model_eval_file_content = read_yaml_file(file_path=model_evaluation_file_path) 42 | 43 | model_eval_file_content = dict() if model_eval_file_content is None else model_eval_file_content 44 | 45 | if BEST_MODEL_KEY not in model_eval_file_content: 46 | return model 47 | 48 | model = load_object(file_path=model_eval_file_content[BEST_MODEL_KEY][MODEL_PATH_KEY]) 49 | return model 50 | except Exception as e: 51 | raise HousingException(e, sys) from e 52 | 53 | def update_evaluation_report(self, model_evaluation_artifact: ModelEvaluationArtifact): 54 | try: 55 | eval_file_path = self.model_evaluation_config.model_evaluation_file_path 56 | model_eval_content = read_yaml_file(file_path=eval_file_path) 57 | model_eval_content = dict() if model_eval_content is None else model_eval_content 58 | 59 | 60 | previous_best_model = None 61 | if BEST_MODEL_KEY in model_eval_content: 62 | previous_best_model = model_eval_content[BEST_MODEL_KEY] 63 
| 64 | logging.info(f"Previous eval result: {model_eval_content}") 65 | eval_result = { 66 | BEST_MODEL_KEY: { 67 | MODEL_PATH_KEY: model_evaluation_artifact.evaluated_model_path, 68 | } 69 | } 70 | 71 | if previous_best_model is not None: 72 | model_history = {self.model_evaluation_config.time_stamp: previous_best_model} 73 | if HISTORY_KEY not in model_eval_content: 74 | history = {HISTORY_KEY: model_history} 75 | eval_result.update(history) 76 | else: 77 | model_eval_content[HISTORY_KEY].update(model_history) 78 | 79 | model_eval_content.update(eval_result) 80 | logging.info(f"Updated eval result:{model_eval_content}") 81 | write_yaml_file(file_path=eval_file_path, data=model_eval_content) 82 | 83 | except Exception as e: 84 | raise HousingException(e, sys) from e 85 | 86 | def initiate_model_evaluation(self) -> ModelEvaluationArtifact: 87 | try: 88 | trained_model_file_path = self.model_trainer_artifact.trained_model_file_path 89 | trained_model_object = load_object(file_path=trained_model_file_path) 90 | 91 | train_file_path = self.data_ingestion_artifact.train_file_path 92 | test_file_path = self.data_ingestion_artifact.test_file_path 93 | 94 | schema_file_path = self.data_validation_artifact.schema_file_path 95 | 96 | train_dataframe = load_data(file_path=train_file_path, 97 | schema_file_path=schema_file_path, 98 | ) 99 | test_dataframe = load_data(file_path=test_file_path, 100 | schema_file_path=schema_file_path, 101 | ) 102 | schema_content = read_yaml_file(file_path=schema_file_path) 103 | target_column_name = schema_content[TARGET_COLUMN_KEY] 104 | 105 | # target_column 106 | logging.info(f"Converting target column into numpy array.") 107 | train_target_arr = np.array(train_dataframe[target_column_name]) 108 | test_target_arr = np.array(test_dataframe[target_column_name]) 109 | logging.info(f"Conversion completed target column into numpy array.") 110 | 111 | # dropping target column from the dataframe 112 | logging.info(f"Dropping target column from the dataframe.") 113 | train_dataframe.drop(target_column_name, axis=1, inplace=True) 114 | test_dataframe.drop(target_column_name, axis=1, inplace=True) 115 | logging.info(f"Dropping target column from the dataframe completed.") 116 | 117 | model = self.get_best_model() 118 | 119 | if model is None: 120 | logging.info("Not found any existing model. Hence accepting trained model") 121 | model_evaluation_artifact = ModelEvaluationArtifact(evaluated_model_path=trained_model_file_path, 122 | is_model_accepted=True) 123 | self.update_evaluation_report(model_evaluation_artifact) 124 | logging.info(f"Model accepted. Model eval artifact {model_evaluation_artifact} created") 125 | return model_evaluation_artifact 126 | 127 | model_list = [model, trained_model_object] 128 | 129 | metric_info_artifact = evaluate_regression_model(model_list=model_list, 130 | X_train=train_dataframe, 131 | y_train=train_target_arr, 132 | X_test=test_dataframe, 133 | y_test=test_target_arr, 134 | base_accuracy=self.model_trainer_artifact.model_accuracy, 135 | ) 136 | logging.info(f"Model evaluation completed. 
model metric artifact: {metric_info_artifact}")
137 |
138 |             if metric_info_artifact is None:
139 |                 response = ModelEvaluationArtifact(is_model_accepted=False,
140 |                                                    evaluated_model_path=trained_model_file_path
141 |                                                    )
142 |                 logging.info(response)
143 |                 return response
144 |
145 |             if metric_info_artifact.index_number == 1:
146 |                 model_evaluation_artifact = ModelEvaluationArtifact(evaluated_model_path=trained_model_file_path,
147 |                                                                     is_model_accepted=True)
148 |                 self.update_evaluation_report(model_evaluation_artifact)
149 |                 logging.info(f"Model accepted. Model eval artifact {model_evaluation_artifact} created")
150 |
151 |             else:
152 |                 logging.info("Trained model is no better than the existing model, hence not accepting it")
153 |                 model_evaluation_artifact = ModelEvaluationArtifact(evaluated_model_path=trained_model_file_path,
154 |                                                                     is_model_accepted=False)
155 |             return model_evaluation_artifact
156 |         except Exception as e:
157 |             raise HousingException(e, sys) from e
158 |
159 |     def __del__(self):
160 |         logging.info(f"{'=' * 20}Model Evaluation log completed.{'=' * 20} ")
--------------------------------------------------------------------------------
/housing/component/data_transformation.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | from housing.exception import HousingException
4 | from housing.logger import logging
5 | from housing.entity.config_entity import DataTransformationConfig
6 | from housing.entity.artifact_entity import DataIngestionArtifact,\
7 |     DataValidationArtifact,DataTransformationArtifact
8 | import sys,os
9 | import numpy as np
10 | from sklearn.base import BaseEstimator,TransformerMixin
11 | from sklearn.preprocessing import StandardScaler,OneHotEncoder
12 | from sklearn.pipeline import Pipeline
13 | from sklearn.compose import ColumnTransformer
14 | from sklearn.impute import SimpleImputer
15 | import pandas as pd
16 | from housing.constant import *
17 | from housing.util.util import read_yaml_file,save_object,save_numpy_array_data,load_data
18 |
19 |
20 | # longitude: float
21 | # latitude: float
22 | # housing_median_age: float
23 | # total_rooms: float
24 | # total_bedrooms: float
25 | # population: float
26 | # households: float
27 | # median_income: float
28 | # median_house_value: float
29 | # ocean_proximity: category
30 | # income_cat: float
31 |
32 |
33 | class FeatureGenerator(BaseEstimator, TransformerMixin):
34 |
35 |     def __init__(self, add_bedrooms_per_room=True,
36 |                  total_rooms_ix=3,
37 |                  population_ix=5,
38 |                  households_ix=6,
39 |                  total_bedrooms_ix=4, columns=None):
40 |         """
41 |         FeatureGenerator Initialization
42 |         add_bedrooms_per_room: bool
43 |         total_rooms_ix: int index number of the total rooms column
44 |         population_ix: int index number of the population column
45 |         households_ix: int index number of the households column
46 |         total_bedrooms_ix: int index number of the total bedrooms column
47 |         """
48 |         try:
49 |             self.columns = columns
50 |             if self.columns is not None:
51 |                 total_rooms_ix = self.columns.index(COLUMN_TOTAL_ROOMS)
52 |                 population_ix = self.columns.index(COLUMN_POPULATION)
53 |                 households_ix = self.columns.index(COLUMN_HOUSEHOLDS)
54 |                 total_bedrooms_ix = self.columns.index(COLUMN_TOTAL_BEDROOM)
55 |
56 |             self.add_bedrooms_per_room = add_bedrooms_per_room
57 |             self.total_rooms_ix = total_rooms_ix
58 |             self.population_ix = population_ix
59 |             self.households_ix = households_ix
60 |             self.total_bedrooms_ix = total_bedrooms_ix
61 |         except
Exception as e: 62 | raise HousingException(e, sys) from e 63 | 64 | def fit(self, X, y=None): 65 | return self 66 | 67 | def transform(self, X, y=None): 68 | try: 69 | room_per_household = X[:, self.total_rooms_ix] / \ 70 | X[:, self.households_ix] 71 | population_per_household = X[:, self.population_ix] / \ 72 | X[:, self.households_ix] 73 | if self.add_bedrooms_per_room: 74 | bedrooms_per_room = X[:, self.total_bedrooms_ix] / \ 75 | X[:, self.total_rooms_ix] 76 | generated_feature = np.c_[ 77 | X, room_per_household, population_per_household, bedrooms_per_room] 78 | else: 79 | generated_feature = np.c_[ 80 | X, room_per_household, population_per_household] 81 | 82 | return generated_feature 83 | except Exception as e: 84 | raise HousingException(e, sys) from e 85 | 86 | 87 | 88 | 89 | 90 | class DataTransformation: 91 | 92 | def __init__(self, data_transformation_config: DataTransformationConfig, 93 | data_ingestion_artifact: DataIngestionArtifact, 94 | data_validation_artifact: DataValidationArtifact 95 | ): 96 | try: 97 | logging.info(f"{'>>' * 30}Data Transformation log started.{'<<' * 30} ") 98 | self.data_transformation_config= data_transformation_config 99 | self.data_ingestion_artifact = data_ingestion_artifact 100 | self.data_validation_artifact = data_validation_artifact 101 | 102 | except Exception as e: 103 | raise HousingException(e,sys) from e 104 | 105 | 106 | 107 | def get_data_transformer_object(self)->ColumnTransformer: 108 | try: 109 | schema_file_path = self.data_validation_artifact.schema_file_path 110 | 111 | dataset_schema = read_yaml_file(file_path=schema_file_path) 112 | 113 | numerical_columns = dataset_schema[NUMERICAL_COLUMN_KEY] 114 | categorical_columns = dataset_schema[CATEGORICAL_COLUMN_KEY] 115 | 116 | 117 | num_pipeline = Pipeline(steps=[ 118 | ('imputer', SimpleImputer(strategy="median")), 119 | ('feature_generator', FeatureGenerator( 120 | add_bedrooms_per_room=self.data_transformation_config.add_bedroom_per_room, 121 | columns=numerical_columns 122 | )), 123 | ('scaler', StandardScaler()) 124 | ] 125 | ) 126 | 127 | cat_pipeline = Pipeline(steps=[ 128 | ('impute', SimpleImputer(strategy="most_frequent")), 129 | ('one_hot_encoder', OneHotEncoder()), 130 | ('scaler', StandardScaler(with_mean=False)) 131 | ] 132 | ) 133 | 134 | logging.info(f"Categorical columns: {categorical_columns}") 135 | logging.info(f"Numerical columns: {numerical_columns}") 136 | 137 | 138 | preprocessing = ColumnTransformer([ 139 | ('num_pipeline', num_pipeline, numerical_columns), 140 | ('cat_pipeline', cat_pipeline, categorical_columns), 141 | ]) 142 | return preprocessing 143 | 144 | except Exception as e: 145 | raise HousingException(e,sys) from e 146 | 147 | 148 | def initiate_data_transformation(self)->DataTransformationArtifact: 149 | try: 150 | logging.info(f"Obtaining preprocessing object.") 151 | preprocessing_obj = self.get_data_transformer_object() 152 | 153 | 154 | logging.info(f"Obtaining training and test file path.") 155 | train_file_path = self.data_ingestion_artifact.train_file_path 156 | test_file_path = self.data_ingestion_artifact.test_file_path 157 | 158 | 159 | schema_file_path = self.data_validation_artifact.schema_file_path 160 | 161 | logging.info(f"Loading training and test data as pandas dataframe.") 162 | train_df = load_data(file_path=train_file_path, schema_file_path=schema_file_path) 163 | 164 | test_df = load_data(file_path=test_file_path, schema_file_path=schema_file_path) 165 | 166 | schema = read_yaml_file(file_path=schema_file_path) 167 | 168 
| target_column_name = schema[TARGET_COLUMN_KEY]
169 |
170 |
171 |             logging.info(f"Splitting input and target feature from training and testing dataframe.")
172 |             input_feature_train_df = train_df.drop(columns=[target_column_name],axis=1)
173 |             target_feature_train_df = train_df[target_column_name]
174 |
175 |             input_feature_test_df = test_df.drop(columns=[target_column_name],axis=1)
176 |             target_feature_test_df = test_df[target_column_name]
177 |
178 |
179 |             logging.info(f"Applying preprocessing object on training dataframe and testing dataframe")
180 |             input_feature_train_arr=preprocessing_obj.fit_transform(input_feature_train_df)
181 |             input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)
182 |
183 |
184 |             train_arr = np.c_[ input_feature_train_arr, np.array(target_feature_train_df)]
185 |
186 |             test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]
187 |
188 |             transformed_train_dir = self.data_transformation_config.transformed_train_dir
189 |             transformed_test_dir = self.data_transformation_config.transformed_test_dir
190 |
191 |             train_file_name = os.path.basename(train_file_path).replace(".csv",".npz")
192 |             test_file_name = os.path.basename(test_file_path).replace(".csv",".npz")
193 |
194 |             transformed_train_file_path = os.path.join(transformed_train_dir, train_file_name)
195 |             transformed_test_file_path = os.path.join(transformed_test_dir, test_file_name)
196 |
197 |             logging.info(f"Saving transformed training and testing array.")
198 |
199 |             save_numpy_array_data(file_path=transformed_train_file_path,array=train_arr)
200 |             save_numpy_array_data(file_path=transformed_test_file_path,array=test_arr)
201 |
202 |             preprocessing_obj_file_path = self.data_transformation_config.preprocessed_object_file_path
203 |
204 |             logging.info(f"Saving preprocessing object.")
205 |             save_object(file_path=preprocessing_obj_file_path,obj=preprocessing_obj)
206 |
207 |             data_transformation_artifact = DataTransformationArtifact(is_transformed=True,
208 |                                                                       message="Data transformation successful.",
209 |                                                                       transformed_train_file_path=transformed_train_file_path,
210 |                                                                       transformed_test_file_path=transformed_test_file_path,
211 |                                                                       preprocessed_object_file_path=preprocessing_obj_file_path
212 |
213 |                                                                       )
214 |             logging.info(f"Data transformation artifact: {data_transformation_artifact}")
215 |             return data_transformation_artifact
216 |         except Exception as e:
217 |             raise HousingException(e,sys) from e
218 |
219 |     def __del__(self):
220 |         logging.info(f"{'>>'*30}Data Transformation log completed.{'<<'*30} \n\n")
221 |
--------------------------------------------------------------------------------
/housing/config/configuration.py:
--------------------------------------------------------------------------------
1 |
2 | from housing.entity.config_entity import DataIngestionConfig, DataTransformationConfig,DataValidationConfig, \
3 |     ModelTrainerConfig,ModelEvaluationConfig,ModelPusherConfig,TrainingPipelineConfig
4 | from housing.util.util import read_yaml_file
5 | from housing.logger import logging
6 | import sys,os
7 | from housing.constant import *
8 | from housing.exception import HousingException
9 |
10 |
11 | class Configuartion:
12 |
13 |     def __init__(self,
14 |                  config_file_path:str =CONFIG_FILE_PATH,
15 |                  current_time_stamp:str = CURRENT_TIME_STAMP
16 |                  ) -> None:
17 |         try:
18 |             self.config_info = read_yaml_file(file_path=config_file_path)
19 |             self.training_pipeline_config = self.get_training_pipeline_config()
20 |             self.time_stamp = current_time_stamp
21 |         except Exception as
e: 22 | raise HousingException(e,sys) from e 23 | 24 | 25 | def get_data_ingestion_config(self) ->DataIngestionConfig: 26 | try: 27 | artifact_dir = self.training_pipeline_config.artifact_dir 28 | data_ingestion_artifact_dir=os.path.join( 29 | artifact_dir, 30 | DATA_INGESTION_ARTIFACT_DIR, 31 | self.time_stamp 32 | ) 33 | data_ingestion_info = self.config_info[DATA_INGESTION_CONFIG_KEY] 34 | 35 | dataset_download_url = data_ingestion_info[DATA_INGESTION_DOWNLOAD_URL_KEY] 36 | tgz_download_dir = os.path.join( 37 | data_ingestion_artifact_dir, 38 | data_ingestion_info[DATA_INGESTION_TGZ_DOWNLOAD_DIR_KEY] 39 | ) 40 | raw_data_dir = os.path.join(data_ingestion_artifact_dir, 41 | data_ingestion_info[DATA_INGESTION_RAW_DATA_DIR_KEY] 42 | ) 43 | 44 | ingested_data_dir = os.path.join( 45 | data_ingestion_artifact_dir, 46 | data_ingestion_info[DATA_INGESTION_INGESTED_DIR_NAME_KEY] 47 | ) 48 | ingested_train_dir = os.path.join( 49 | ingested_data_dir, 50 | data_ingestion_info[DATA_INGESTION_TRAIN_DIR_KEY] 51 | ) 52 | ingested_test_dir =os.path.join( 53 | ingested_data_dir, 54 | data_ingestion_info[DATA_INGESTION_TEST_DIR_KEY] 55 | ) 56 | 57 | 58 | data_ingestion_config=DataIngestionConfig( 59 | dataset_download_url=dataset_download_url, 60 | tgz_download_dir=tgz_download_dir, 61 | raw_data_dir=raw_data_dir, 62 | ingested_train_dir=ingested_train_dir, 63 | ingested_test_dir=ingested_test_dir 64 | ) 65 | logging.info(f"Data Ingestion config: {data_ingestion_config}") 66 | return data_ingestion_config 67 | except Exception as e: 68 | raise HousingException(e,sys) from e 69 | 70 | def get_data_validation_config(self) -> DataValidationConfig: 71 | try: 72 | artifact_dir = self.training_pipeline_config.artifact_dir 73 | 74 | data_validation_artifact_dir=os.path.join( 75 | artifact_dir, 76 | DATA_VALIDATION_ARTIFACT_DIR_NAME, 77 | self.time_stamp 78 | ) 79 | data_validation_config = self.config_info[DATA_VALIDATION_CONFIG_KEY] 80 | 81 | 82 | schema_file_path = os.path.join(ROOT_DIR, 83 | data_validation_config[DATA_VALIDATION_SCHEMA_DIR_KEY], 84 | data_validation_config[DATA_VALIDATION_SCHEMA_FILE_NAME_KEY] 85 | ) 86 | 87 | report_file_path = os.path.join(data_validation_artifact_dir, 88 | data_validation_config[DATA_VALIDATION_REPORT_FILE_NAME_KEY] 89 | ) 90 | 91 | report_page_file_path = os.path.join(data_validation_artifact_dir, 92 | data_validation_config[DATA_VALIDATION_REPORT_PAGE_FILE_NAME_KEY] 93 | 94 | ) 95 | 96 | data_validation_config = DataValidationConfig( 97 | schema_file_path=schema_file_path, 98 | report_file_path=report_file_path, 99 | report_page_file_path=report_page_file_path, 100 | ) 101 | return data_validation_config 102 | except Exception as e: 103 | raise HousingException(e,sys) from e 104 | 105 | def get_data_transformation_config(self) -> DataTransformationConfig: 106 | try: 107 | artifact_dir = self.training_pipeline_config.artifact_dir 108 | 109 | data_transformation_artifact_dir=os.path.join( 110 | artifact_dir, 111 | DATA_TRANSFORMATION_ARTIFACT_DIR, 112 | self.time_stamp 113 | ) 114 | 115 | data_transformation_config_info=self.config_info[DATA_TRANSFORMATION_CONFIG_KEY] 116 | 117 | add_bedroom_per_room=data_transformation_config_info[DATA_TRANSFORMATION_ADD_BEDROOM_PER_ROOM_KEY] 118 | 119 | 120 | preprocessed_object_file_path = os.path.join( 121 | data_transformation_artifact_dir, 122 | data_transformation_config_info[DATA_TRANSFORMATION_PREPROCESSING_DIR_KEY], 123 | data_transformation_config_info[DATA_TRANSFORMATION_PREPROCESSED_FILE_NAME_KEY] 124 | ) 125 | 126 | 127 | 
transformed_train_dir=os.path.join( 128 | data_transformation_artifact_dir, 129 | data_transformation_config_info[DATA_TRANSFORMATION_DIR_NAME_KEY], 130 | data_transformation_config_info[DATA_TRANSFORMATION_TRAIN_DIR_NAME_KEY] 131 | ) 132 | 133 | 134 | transformed_test_dir = os.path.join( 135 | data_transformation_artifact_dir, 136 | data_transformation_config_info[DATA_TRANSFORMATION_DIR_NAME_KEY], 137 | data_transformation_config_info[DATA_TRANSFORMATION_TEST_DIR_NAME_KEY] 138 | 139 | ) 140 | 141 | 142 | data_transformation_config=DataTransformationConfig( 143 | add_bedroom_per_room=add_bedroom_per_room, 144 | preprocessed_object_file_path=preprocessed_object_file_path, 145 | transformed_train_dir=transformed_train_dir, 146 | transformed_test_dir=transformed_test_dir 147 | ) 148 | 149 | logging.info(f"Data transformation config: {data_transformation_config}") 150 | return data_transformation_config 151 | except Exception as e: 152 | raise HousingException(e,sys) from e 153 | 154 | def get_model_trainer_config(self) -> ModelTrainerConfig: 155 | try: 156 | artifact_dir = self.training_pipeline_config.artifact_dir 157 | 158 | model_trainer_artifact_dir=os.path.join( 159 | artifact_dir, 160 | MODEL_TRAINER_ARTIFACT_DIR, 161 | self.time_stamp 162 | ) 163 | model_trainer_config_info = self.config_info[MODEL_TRAINER_CONFIG_KEY] 164 | trained_model_file_path = os.path.join(model_trainer_artifact_dir, 165 | model_trainer_config_info[MODEL_TRAINER_TRAINED_MODEL_DIR_KEY], 166 | model_trainer_config_info[MODEL_TRAINER_TRAINED_MODEL_FILE_NAME_KEY] 167 | ) 168 | 169 | model_config_file_path = os.path.join(model_trainer_config_info[MODEL_TRAINER_MODEL_CONFIG_DIR_KEY], 170 | model_trainer_config_info[MODEL_TRAINER_MODEL_CONFIG_FILE_NAME_KEY] 171 | ) 172 | 173 | base_accuracy = model_trainer_config_info[MODEL_TRAINER_BASE_ACCURACY_KEY] 174 | 175 | model_trainer_config = ModelTrainerConfig( 176 | trained_model_file_path=trained_model_file_path, 177 | base_accuracy=base_accuracy, 178 | model_config_file_path=model_config_file_path 179 | ) 180 | logging.info(f"Model trainer config: {model_trainer_config}") 181 | return model_trainer_config 182 | except Exception as e: 183 | raise HousingException(e,sys) from e 184 | 185 | def get_model_evaluation_config(self) ->ModelEvaluationConfig: 186 | try: 187 | model_evaluation_config = self.config_info[MODEL_EVALUATION_CONFIG_KEY] 188 | artifact_dir = os.path.join(self.training_pipeline_config.artifact_dir, 189 | MODEL_EVALUATION_ARTIFACT_DIR, ) 190 | 191 | model_evaluation_file_path = os.path.join(artifact_dir, 192 | model_evaluation_config[MODEL_EVALUATION_FILE_NAME_KEY]) 193 | response = ModelEvaluationConfig(model_evaluation_file_path=model_evaluation_file_path, 194 | time_stamp=self.time_stamp) 195 | 196 | 197 | logging.info(f"Model Evaluation Config: {response}.") 198 | return response 199 | except Exception as e: 200 | raise HousingException(e,sys) from e 201 | 202 | 203 | def get_model_pusher_config(self) -> ModelPusherConfig: 204 | try: 205 | time_stamp = f"{datetime.now().strftime('%Y%m%d%H%M%S')}" 206 | model_pusher_config_info = self.config_info[MODEL_PUSHER_CONFIG_KEY] 207 | export_dir_path = os.path.join(ROOT_DIR, model_pusher_config_info[MODEL_PUSHER_MODEL_EXPORT_DIR_KEY], 208 | time_stamp) 209 | 210 | model_pusher_config = ModelPusherConfig(export_dir_path=export_dir_path) 211 | logging.info(f"Model pusher config {model_pusher_config}") 212 | return model_pusher_config 213 | 214 | except Exception as e: 215 | raise HousingException(e,sys) from e 216 | 
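    # Each get_*_config method in this class follows the same pattern: read one
    # section of config/config.yaml, join the relative directory names under the
    # time-stamped artifact directory, and wrap the resulting paths in the matching
    # namedtuple from housing.entity.config_entity.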
217 |     def get_training_pipeline_config(self) ->TrainingPipelineConfig:
218 |         try:
219 |             training_pipeline_config = self.config_info[TRAINING_PIPELINE_CONFIG_KEY]
220 |             artifact_dir = os.path.join(ROOT_DIR,
221 |                                         training_pipeline_config[TRAINING_PIPELINE_NAME_KEY],
222 |                                         training_pipeline_config[TRAINING_PIPELINE_ARTIFACT_DIR_KEY]
223 |                                         )
224 |
225 |             training_pipeline_config = TrainingPipelineConfig(artifact_dir=artifact_dir)
226 |             logging.info(f"Training pipeline config: {training_pipeline_config}")
227 |             return training_pipeline_config
228 |         except Exception as e:
229 |             raise HousingException(e,sys) from e
--------------------------------------------------------------------------------
/notebook/log.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 2,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "f=\"/home/avnish/iNeuron_Private_Intelligence_Limited/MachineLearningProject/machine_learning_project/logs/log_2022-07-06-19-40-36.log\""
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 10,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": []
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 21,
31 | "metadata": {},
32 | "outputs": [
33 | {
34 | "data": {
35 | "text/html": [
\n", 37 | "\n", 50 | "\n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | "
Time stampLog Levelline numberfile namefunction namemessage
0[2022-07-06 19:40:36,738]INFO226configuration.pyget_training_pipeline_config()Training pipleine config: TrainingPipelineConf...
1[2022-07-06 19:40:36,743]INFO224_internal.py_log()* Running on http://127.0.0.1:5000 (Press CTR...
2[2022-07-06 19:40:42,739]INFO224_internal.py_log()127.0.0.1 - - [06/Jul/2022 19:40:42] \"GET / HT...
3[2022-07-06 19:40:45,899]INFO224_internal.py_log()127.0.0.1 - - [06/Jul/2022 19:40:45] \"GET /vie...
4[2022-07-06 19:40:50,204]INFO226configuration.pyget_training_pipeline_config()Training pipleine config: TrainingPipelineConf...
.....................
134[2022-07-06 19:40:55,171]INFO38model_pusher.pyexport_model()Model pusher artifact: [ModelPusherArtifact(is...
135[2022-07-06 19:40:55,171]INFO50model_pusher.py__del__()>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Model ...
136[2022-07-06 19:40:55,171]INFO149pipeline.pyrun_pipeline()Model pusher artifact: ModelPusherArtifact(is_...
137[2022-07-06 19:40:55,171]INFO152pipeline.pyrun_pipeline()Pipeline completed.\\n
138[2022-07-06 19:40:55,171]INFO166pipeline.pyrun_pipeline()Pipeline experiment: Experiment(experiment_id=...
\n", 164 | "

139 rows × 6 columns

\n", 165 | "
" 166 | ], 167 | "text/plain": [ 168 | " Time stamp Log Level line number file name \\\n", 169 | "0 [2022-07-06 19:40:36,738] INFO 226 configuration.py \n", 170 | "1 [2022-07-06 19:40:36,743] INFO 224 _internal.py \n", 171 | "2 [2022-07-06 19:40:42,739] INFO 224 _internal.py \n", 172 | "3 [2022-07-06 19:40:45,899] INFO 224 _internal.py \n", 173 | "4 [2022-07-06 19:40:50,204] INFO 226 configuration.py \n", 174 | ".. ... ... ... ... \n", 175 | "134 [2022-07-06 19:40:55,171] INFO 38 model_pusher.py \n", 176 | "135 [2022-07-06 19:40:55,171] INFO 50 model_pusher.py \n", 177 | "136 [2022-07-06 19:40:55,171] INFO 149 pipeline.py \n", 178 | "137 [2022-07-06 19:40:55,171] INFO 152 pipeline.py \n", 179 | "138 [2022-07-06 19:40:55,171] INFO 166 pipeline.py \n", 180 | "\n", 181 | " function name \\\n", 182 | "0 get_training_pipeline_config() \n", 183 | "1 _log() \n", 184 | "2 _log() \n", 185 | "3 _log() \n", 186 | "4 get_training_pipeline_config() \n", 187 | ".. ... \n", 188 | "134 export_model() \n", 189 | "135 __del__() \n", 190 | "136 run_pipeline() \n", 191 | "137 run_pipeline() \n", 192 | "138 run_pipeline() \n", 193 | "\n", 194 | " message \n", 195 | "0 Training pipleine config: TrainingPipelineConf... \n", 196 | "1 * Running on http://127.0.0.1:5000 (Press CTR... \n", 197 | "2 127.0.0.1 - - [06/Jul/2022 19:40:42] \"GET / HT... \n", 198 | "3 127.0.0.1 - - [06/Jul/2022 19:40:45] \"GET /vie... \n", 199 | "4 Training pipleine config: TrainingPipelineConf... \n", 200 | ".. ... \n", 201 | "134 Model pusher artifact: [ModelPusherArtifact(is... \n", 202 | "135 >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Model ... \n", 203 | "136 Model pusher artifact: ModelPusherArtifact(is_... \n", 204 | "137 Pipeline completed.\\n \n", 205 | "138 Pipeline experiment: Experiment(experiment_id=... 
\n", 206 | "\n", 207 | "[139 rows x 6 columns]" 208 | ] 209 | }, 210 | "execution_count": 21, 211 | "metadata": {}, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 20, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 12, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "df=pd.DataFrame(data)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 17, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 18, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "df.columns=columns" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [] 269 | } 270 | ], 271 | "metadata": { 272 | "kernelspec": { 273 | "display_name": "Python 3.7.0 (conda)", 274 | "language": "python", 275 | "name": "python3" 276 | }, 277 | "language_info": { 278 | "codemirror_mode": { 279 | "name": "ipython", 280 | "version": 3 281 | }, 282 | "file_extension": ".py", 283 | "mimetype": "text/x-python", 284 | "name": "python", 285 | "nbconvert_exporter": "python", 286 | "pygments_lexer": "ipython3", 287 | "version": "3.7.0" 288 | }, 289 | "orig_nbformat": 4, 290 | "vscode": { 291 | "interpreter": { 292 | "hash": "fc6fa6e48c86001677d15bc9af4f846353042d089527ab27e7c7a4474d3b154b" 293 | } 294 | } 295 | }, 296 | "nbformat": 4, 297 | "nbformat_minor": 2 298 | } 299 | -------------------------------------------------------------------------------- /housing/pipeline/pipeline.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from datetime import datetime 3 | import uuid 4 | from housing.config.configuration import Configuartion 5 | from housing.logger import logging, get_log_file_name 6 | from housing.exception import HousingException 7 | from threading import Thread 8 | from typing import List 9 | 10 | from multiprocessing import Process 11 | from housing.entity.artifact_entity import ModelPusherArtifact, DataIngestionArtifact, ModelEvaluationArtifact 12 | from housing.entity.artifact_entity import DataValidationArtifact, DataTransformationArtifact, ModelTrainerArtifact 13 | from housing.entity.config_entity import DataIngestionConfig, ModelEvaluationConfig 14 | from housing.component.data_ingestion import DataIngestion 15 | from housing.component.data_validation import DataValidation 16 | from housing.component.data_transformation import DataTransformation 17 | from housing.component.model_trainer import ModelTrainer 18 | from housing.component.model_evaluation import ModelEvaluation 19 | from housing.component.model_pusher import ModelPusher 20 | import os, sys 21 | from collections import namedtuple 22 | from datetime import datetime 23 | import pandas as pd 24 | from housing.constant import EXPERIMENT_DIR_NAME, EXPERIMENT_FILE_NAME 25 | 26 | Experiment = namedtuple("Experiment", ["experiment_id", "initialization_timestamp", "artifact_time_stamp", 27 | "running_status", "start_time", "stop_time", 
"execution_time", "message", 28 | "experiment_file_path", "accuracy", "is_model_accepted"]) 29 | 30 | 31 | 32 | 33 | 34 | class Pipeline(Thread): 35 | experiment: Experiment = Experiment(*([None] * 11)) 36 | experiment_file_path = None 37 | 38 | def __init__(self, config: Configuartion ) -> None: 39 | try: 40 | os.makedirs(config.training_pipeline_config.artifact_dir, exist_ok=True) 41 | Pipeline.experiment_file_path=os.path.join(config.training_pipeline_config.artifact_dir,EXPERIMENT_DIR_NAME, EXPERIMENT_FILE_NAME) 42 | super().__init__(daemon=False, name="pipeline") 43 | self.config = config 44 | except Exception as e: 45 | raise HousingException(e, sys) from e 46 | 47 | def start_data_ingestion(self) -> DataIngestionArtifact: 48 | try: 49 | data_ingestion = DataIngestion(data_ingestion_config=self.config.get_data_ingestion_config()) 50 | return data_ingestion.initiate_data_ingestion() 51 | except Exception as e: 52 | raise HousingException(e, sys) from e 53 | 54 | def start_data_validation(self, data_ingestion_artifact: DataIngestionArtifact) \ 55 | -> DataValidationArtifact: 56 | try: 57 | data_validation = DataValidation(data_validation_config=self.config.get_data_validation_config(), 58 | data_ingestion_artifact=data_ingestion_artifact 59 | ) 60 | return data_validation.initiate_data_validation() 61 | except Exception as e: 62 | raise HousingException(e, sys) from e 63 | 64 | def start_data_transformation(self, 65 | data_ingestion_artifact: DataIngestionArtifact, 66 | data_validation_artifact: DataValidationArtifact 67 | ) -> DataTransformationArtifact: 68 | try: 69 | data_transformation = DataTransformation( 70 | data_transformation_config=self.config.get_data_transformation_config(), 71 | data_ingestion_artifact=data_ingestion_artifact, 72 | data_validation_artifact=data_validation_artifact 73 | ) 74 | return data_transformation.initiate_data_transformation() 75 | except Exception as e: 76 | raise HousingException(e, sys) 77 | 78 | def start_model_trainer(self, data_transformation_artifact: DataTransformationArtifact) -> ModelTrainerArtifact: 79 | try: 80 | model_trainer = ModelTrainer(model_trainer_config=self.config.get_model_trainer_config(), 81 | data_transformation_artifact=data_transformation_artifact 82 | ) 83 | return model_trainer.initiate_model_trainer() 84 | except Exception as e: 85 | raise HousingException(e, sys) from e 86 | 87 | def start_model_evaluation(self, data_ingestion_artifact: DataIngestionArtifact, 88 | data_validation_artifact: DataValidationArtifact, 89 | model_trainer_artifact: ModelTrainerArtifact) -> ModelEvaluationArtifact: 90 | try: 91 | model_eval = ModelEvaluation( 92 | model_evaluation_config=self.config.get_model_evaluation_config(), 93 | data_ingestion_artifact=data_ingestion_artifact, 94 | data_validation_artifact=data_validation_artifact, 95 | model_trainer_artifact=model_trainer_artifact) 96 | return model_eval.initiate_model_evaluation() 97 | except Exception as e: 98 | raise HousingException(e, sys) from e 99 | 100 | def start_model_pusher(self, model_eval_artifact: ModelEvaluationArtifact) -> ModelPusherArtifact: 101 | try: 102 | model_pusher = ModelPusher( 103 | model_pusher_config=self.config.get_model_pusher_config(), 104 | model_evaluation_artifact=model_eval_artifact 105 | ) 106 | return model_pusher.initiate_model_pusher() 107 | except Exception as e: 108 | raise HousingException(e, sys) from e 109 | 110 | def run_pipeline(self): 111 | try: 112 | if Pipeline.experiment.running_status: 113 | logging.info("Pipeline is already running") 
114 | return Pipeline.experiment 115 | # data ingestion 116 | logging.info("Pipeline starting.") 117 | 118 | experiment_id = str(uuid.uuid4()) 119 | 120 | Pipeline.experiment = Experiment(experiment_id=experiment_id, 121 | initialization_timestamp=self.config.time_stamp, 122 | artifact_time_stamp=self.config.time_stamp, 123 | running_status=True, 124 | start_time=datetime.now(), 125 | stop_time=None, 126 | execution_time=None, 127 | experiment_file_path=Pipeline.experiment_file_path, 128 | is_model_accepted=None, 129 | message="Pipeline has been started.", 130 | accuracy=None, 131 | ) 132 | logging.info(f"Pipeline experiment: {Pipeline.experiment}") 133 | 134 | self.save_experiment() 135 | 136 | data_ingestion_artifact = self.start_data_ingestion() 137 | data_validation_artifact = self.start_data_validation(data_ingestion_artifact=data_ingestion_artifact) 138 | data_transformation_artifact = self.start_data_transformation( 139 | data_ingestion_artifact=data_ingestion_artifact, 140 | data_validation_artifact=data_validation_artifact 141 | ) 142 | model_trainer_artifact = self.start_model_trainer(data_transformation_artifact=data_transformation_artifact) 143 | 144 | model_evaluation_artifact = self.start_model_evaluation(data_ingestion_artifact=data_ingestion_artifact, 145 | data_validation_artifact=data_validation_artifact, 146 | model_trainer_artifact=model_trainer_artifact) 147 | 148 | if model_evaluation_artifact.is_model_accepted: 149 | model_pusher_artifact = self.start_model_pusher(model_eval_artifact=model_evaluation_artifact) 150 | logging.info(f'Model pusher artifact: {model_pusher_artifact}') 151 | else: 152 | logging.info("Trained model rejected.") 153 | logging.info("Pipeline completed.") 154 | 155 | stop_time = datetime.now() 156 | Pipeline.experiment = Experiment(experiment_id=Pipeline.experiment.experiment_id, 157 | initialization_timestamp=self.config.time_stamp, 158 | artifact_time_stamp=self.config.time_stamp, 159 | running_status=False, 160 | start_time=Pipeline.experiment.start_time, 161 | stop_time=stop_time, 162 | execution_time=stop_time - Pipeline.experiment.start_time, 163 | message="Pipeline has been completed.", 164 | experiment_file_path=Pipeline.experiment_file_path, 165 | is_model_accepted=model_evaluation_artifact.is_model_accepted, 166 | accuracy=model_trainer_artifact.model_accuracy 167 | ) 168 | logging.info(f"Pipeline experiment: {Pipeline.experiment}") 169 | self.save_experiment() 170 | except Exception as e: 171 | raise HousingException(e, sys) from e 172 | 173 | def run(self): 174 | try: 175 | self.run_pipeline() 176 | except Exception as e: 177 | raise e 178 | 179 | def save_experiment(self): 180 | try: 181 | if Pipeline.experiment.experiment_id is not None: 182 | experiment = Pipeline.experiment 183 | experiment_dict = experiment._asdict() 184 | experiment_dict: dict = {key: [value] for key, value in experiment_dict.items()} 185 | 186 | experiment_dict.update({ 187 | "created_time_stamp": [datetime.now()], 188 | "experiment_file_path": [os.path.basename(Pipeline.experiment.experiment_file_path)]}) 189 | 190 | experiment_report = pd.DataFrame(experiment_dict) 191 | 192 | os.makedirs(os.path.dirname(Pipeline.experiment_file_path), exist_ok=True) 193 | if os.path.exists(Pipeline.experiment_file_path): 194 | experiment_report.to_csv(Pipeline.experiment_file_path, index=False, header=False, mode="a") 195 | else: 196 | experiment_report.to_csv(Pipeline.experiment_file_path, mode="w", index=False, header=True) 197 | else: 198 | print("First start 
experiment") 199 | except Exception as e: 200 | raise HousingException(e, sys) from e 201 | 202 | @classmethod 203 | def get_experiments_status(cls, limit: int = 5) -> pd.DataFrame: 204 | try: 205 | if os.path.exists(Pipeline.experiment_file_path): 206 | df = pd.read_csv(Pipeline.experiment_file_path) 207 | limit = -1 * int(limit) 208 | return df[limit:].drop(columns=["experiment_file_path", "initialization_timestamp"], axis=1) 209 | else: 210 | return pd.DataFrame() 211 | except Exception as e: 212 | raise HousingException(e, sys) from e 213 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
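To make the experiment bookkeeping in housing/pipeline/pipeline.py above concrete, here is a minimal, self-contained sketch of the same append-a-row/read-the-tail CSV pattern. The file path and function names are illustrative only, not part of the project:

    import os
    from datetime import datetime

    import pandas as pd

    EXPERIMENT_FILE_PATH = "artifact/experiment/experiment.csv"  # illustrative path, not the project's

    def append_experiment_row(record: dict) -> None:
        # One row per pipeline event; the header is written only when the file is first created.
        report = pd.DataFrame({key: [value] for key, value in record.items()})
        os.makedirs(os.path.dirname(EXPERIMENT_FILE_PATH), exist_ok=True)
        if os.path.exists(EXPERIMENT_FILE_PATH):
            report.to_csv(EXPERIMENT_FILE_PATH, index=False, header=False, mode="a")
        else:
            report.to_csv(EXPERIMENT_FILE_PATH, index=False, header=True, mode="w")

    def last_experiments(limit: int = 5) -> pd.DataFrame:
        # Tail of the log, mirroring Pipeline.get_experiments_status.
        if not os.path.exists(EXPERIMENT_FILE_PATH):
            return pd.DataFrame()
        return pd.read_csv(EXPERIMENT_FILE_PATH)[-int(limit):]

    append_experiment_row({"experiment_id": "demo-1", "running_status": True,
                           "created_time_stamp": datetime.now()})
    print(last_experiments())

Because the start and the completion of a run are two separate rows, the tail view shows at a glance whether the latest run finished and whether its model was accepted.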
202 | -------------------------------------------------------------------------------- /notebook/model_training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from housing.entity.model_factory import ModelFactory,get_sample_model_config_yaml_file" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 5, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "'d:\\\\Project\\\\machine_learning_project\\\\notebook'" 21 | ] 22 | }, 23 | "execution_count": 5, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "os.getcwd()" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 4, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "'config\\\\model.yaml'" 41 | ] 42 | }, 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | "source": [ 49 | "get_sample_model_config_yaml_file(export_dir=\"config\")" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 6, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "from sklearn.linear_model import LinearRegression" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "LinearRegression()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 1, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "model_config_file=r\"D:\\Project\\machine_learning_project\\notebook\\config\\model.yaml\"" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 2, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "from neuro_mf import ModelFactory" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 3, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "model_factory = ModelFactory(model_config_path=model_config_file)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 9, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "{'fit_intercept': True}\n", 107 | "{'n_estimators': 40, 'min_samples_leaf': 2}\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "model_list = model_factory.get_initialized_model_list()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 11, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "2" 124 | ] 125 | }, 126 | "execution_count": 11, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "len(model_list)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 13, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "InitializedModelDetail(model_serial_number='module_1', model=RandomForestRegressor(min_samples_leaf=2, n_estimators=40), param_grid_search={'min_samples_leaf': [2, 4, 6], 'n_estimators': [50, 100, 80]}, model_name='sklearn.ensemble.RandomForestRegressor')" 144 | ] 145 | }, 146 | "execution_count": 13, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "model_list[1]" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 4, 
158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "from housing.util.util import load_numpy_array_data" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 5, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "data_file_path=r\"D:\\Project\\machine_learning_project\\housing\\artifact\\data_transformation\\2022-07-03-13-23-39\\transformed_data\\train\\housing.npz\"" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 6, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "data = load_numpy_array_data(data_file_path)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 7, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "x,y = data[:,:-1],data[:,-1]" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 9, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/plain": [ 199 | "[GridSearchedBestModel(model_serial_number='module_0', model=LinearRegression(), best_model=LinearRegression(fit_intercept=False), best_parameters={'fit_intercept': False}, best_score=0.6393153733826),\n", 200 | " GridSearchedBestModel(model_serial_number='module_1', model=RandomForestRegressor(min_samples_leaf=2, n_estimators=40), best_model=RandomForestRegressor(min_samples_leaf=2, n_estimators=80), best_parameters={'min_samples_leaf': 2, 'n_estimators': 80}, best_score=0.8050101845299591)]" 201 | ] 202 | }, 203 | "execution_count": 9, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "model_factory.grid_searched_best_model_list" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 10, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "Fitting 4 folds for each of 2 candidates, totalling 8 fits\n", 222 | "[CV] END .................................fit_intercept=True; total time= 0.0s\n", 223 | "[CV] END .................................fit_intercept=True; total time= 0.0s\n", 224 | "[CV] END .................................fit_intercept=True; total time= 0.0s\n", 225 | "[CV] END .................................fit_intercept=True; total time= 0.0s\n", 226 | "[CV] END ................................fit_intercept=False; total time= 0.0s\n", 227 | "[CV] END ................................fit_intercept=False; total time= 0.0s\n", 228 | "[CV] END ................................fit_intercept=False; total time= 0.0s\n", 229 | "[CV] END ................................fit_intercept=False; total time= 0.0s\n", 230 | "Fitting 4 folds for each of 9 candidates, totalling 36 fits\n", 231 | "[CV] END ................min_samples_leaf=2, n_estimators=50; total time= 4.4s\n", 232 | "[CV] END ................min_samples_leaf=2, n_estimators=50; total time= 4.5s\n", 233 | "[CV] END ................min_samples_leaf=2, n_estimators=50; total time= 4.5s\n", 234 | "[CV] END ................min_samples_leaf=2, n_estimators=50; total time= 4.5s\n", 235 | "[CV] END ...............min_samples_leaf=2, n_estimators=100; total time= 9.2s\n", 236 | "[CV] END ...............min_samples_leaf=2, n_estimators=100; total time= 9.2s\n", 237 | "[CV] END ...............min_samples_leaf=2, n_estimators=100; total time= 9.1s\n", 238 | "[CV] END ...............min_samples_leaf=2, n_estimators=100; total time= 9.1s\n", 239 | "[CV] END ................min_samples_leaf=2, n_estimators=80; total time= 7.4s\n", 240 | "[CV] END 
................min_samples_leaf=2, n_estimators=80; total time= 7.3s\n", 241 | "[CV] END ................min_samples_leaf=2, n_estimators=80; total time= 7.4s\n", 242 | "[CV] END ................min_samples_leaf=2, n_estimators=80; total time= 7.5s\n", 243 | "[CV] END ................min_samples_leaf=4, n_estimators=50; total time= 4.1s\n", 244 | "[CV] END ................min_samples_leaf=4, n_estimators=50; total time= 4.1s\n", 245 | "[CV] END ................min_samples_leaf=4, n_estimators=50; total time= 4.0s\n", 246 | "[CV] END ................min_samples_leaf=4, n_estimators=50; total time= 4.0s\n", 247 | "[CV] END ...............min_samples_leaf=4, n_estimators=100; total time= 8.2s\n", 248 | "[CV] END ...............min_samples_leaf=4, n_estimators=100; total time= 8.1s\n", 249 | "[CV] END ...............min_samples_leaf=4, n_estimators=100; total time= 8.1s\n", 250 | "[CV] END ...............min_samples_leaf=4, n_estimators=100; total time= 8.1s\n", 251 | "[CV] END ................min_samples_leaf=4, n_estimators=80; total time= 6.5s\n", 252 | "[CV] END ................min_samples_leaf=4, n_estimators=80; total time= 6.4s\n", 253 | "[CV] END ................min_samples_leaf=4, n_estimators=80; total time= 6.5s\n", 254 | "[CV] END ................min_samples_leaf=4, n_estimators=80; total time= 6.5s\n", 255 | "[CV] END ................min_samples_leaf=6, n_estimators=50; total time= 3.8s\n", 256 | "[CV] END ................min_samples_leaf=6, n_estimators=50; total time= 3.7s\n", 257 | "[CV] END ................min_samples_leaf=6, n_estimators=50; total time= 3.7s\n", 258 | "[CV] END ................min_samples_leaf=6, n_estimators=50; total time= 3.8s\n", 259 | "[CV] END ...............min_samples_leaf=6, n_estimators=100; total time= 7.5s\n", 260 | "[CV] END ...............min_samples_leaf=6, n_estimators=100; total time= 7.7s\n", 261 | "[CV] END ...............min_samples_leaf=6, n_estimators=100; total time= 7.6s\n", 262 | "[CV] END ...............min_samples_leaf=6, n_estimators=100; total time= 7.5s\n", 263 | "[CV] END ................min_samples_leaf=6, n_estimators=80; total time= 6.2s\n", 264 | "[CV] END ................min_samples_leaf=6, n_estimators=80; total time= 6.0s\n", 265 | "[CV] END ................min_samples_leaf=6, n_estimators=80; total time= 6.0s\n", 266 | "[CV] END ................min_samples_leaf=6, n_estimators=80; total time= 6.0s\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "best_model = model_factory.get_best_model(x,y,0.79)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 11, 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/plain": [ 282 | "RandomForestRegressor(min_samples_leaf=2)" 283 | ] 284 | }, 285 | "execution_count": 11, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "best_model.best_model" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 12, 297 | "metadata": {}, 298 | "outputs": [ 299 | { 300 | "data": { 301 | "text/plain": [ 302 | "GridSearchedBestModel(model_serial_number='module_0', model=LinearRegression(), best_model=LinearRegression(fit_intercept=False), best_parameters={'fit_intercept': False}, best_score=0.6393153733826)" 303 | ] 304 | }, 305 | "execution_count": 12, 306 | "metadata": {}, 307 | "output_type": "execute_result" 308 | } 309 | ], 310 | "source": [ 311 | "model_factory.grid_searched_best_model_list[0]" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 
26, 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "data": { 321 | "text/plain": [ 322 | "[InitializedModelDetail(model_serial_number='module_0', model=LinearRegression(), param_grid_search={'fit_intercept': [True, False]}, model_name='sklearn.linear_model.LinearRegression'),\n", 323 | " InitializedModelDetail(model_serial_number='module_1', model=RandomForestRegressor(min_samples_leaf=2, n_estimators=40), param_grid_search={'min_samples_leaf': [2, 4, 6], 'n_estimators': [50, 100, 80]}, model_name='sklearn.ensemble.RandomForestRegressor')]" 324 | ] 325 | }, 326 | "execution_count": 26, 327 | "metadata": {}, 328 | "output_type": "execute_result" 329 | } 330 | ], 331 | "source": [ 332 | "model_factory.initialized_model_list" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [] 341 | } 342 | ], 343 | "metadata": { 344 | "kernelspec": { 345 | "display_name": "Python 3.7.0", 346 | "language": "python", 347 | "name": "python3" 348 | }, 349 | "language_info": { 350 | "codemirror_mode": { 351 | "name": "ipython", 352 | "version": 3 353 | }, 354 | "file_extension": ".py", 355 | "mimetype": "text/x-python", 356 | "name": "python", 357 | "nbconvert_exporter": "python", 358 | "pygments_lexer": "ipython3", 359 | "version": "3.7.0" 360 | }, 361 | "orig_nbformat": 4, 362 | "vscode": { 363 | "interpreter": { 364 | "hash": "7a29293c9d4d8b93126739266382f07a312940ff8d40640417510f0b045f4058" 365 | } 366 | } 367 | }, 368 | "nbformat": 4, 369 | "nbformat_minor": 2 370 | } 371 | -------------------------------------------------------------------------------- /notebook/EDA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "\n", 11 | "os.chdir(os.pardir)" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from housing.pipeline.pipeline import Pipeline" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 3, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "p=Pipeline()" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 4, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "" 41 | ] 42 | }, 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | "source": [ 49 | "p" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 5, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "name": "stderr", 59 | "output_type": "stream", 60 | "text": [ 61 | "Exception in thread pipeline:\n", 62 | "Traceback (most recent call last):\n", 63 | " File \"/home/avnish/iNeuron_Private_Intelligence_Limited/MachineLearningProject/machine_learning_project/housing/pipeline/pipeline.py\", line 183, in save_experiment\n", 64 | " { \"experiment_file_path\": os.path.basename(experiment_dict[\"experiment_file_path\"]),\n", 65 | " File \"/home/avnish/iNeuron_Private_Intelligence_Limited/MachineLearningProject/machine_learning_project/venv/lib/python3.7/posixpath.py\", line 146, in basename\n", 66 | " p = os.fspath(p)\n", 67 | "TypeError: expected str, bytes or os.PathLike object, not list\n", 68 | "\n", 69 | "The above exception was the direct cause of the following exception:\n", 70 | "\n", 71 | "Traceback (most recent 
call last):\n", 72 | " File \"/home/avnish/iNeuron_Private_Intelligence_Limited/MachineLearningProject/machine_learning_project/housing/pipeline/pipeline.py\", line 132, in run_pipeline\n", 73 | " self.save_experiment()\n", 74 | " File \"/home/avnish/iNeuron_Private_Intelligence_Limited/MachineLearningProject/machine_learning_project/housing/pipeline/pipeline.py\", line 196, in save_experiment\n", 75 | " raise HousingException(e,sys) from e\n", 76 | "housing.exception.HousingException: \n", 77 | " Error occured in script: \n", 78 | " [ /home/avnish/iNeuron_Private_Intelligence_Limited/MachineLearningProject/machine_learning_project/housing/pipeline/pipeline.py ] at \n", 79 | " try block line number: [183] and exception block line number: [196] \n", 80 | " error message: [expected str, bytes or os.PathLike object, not list]\n", 81 | " \n", 82 | "\n", 83 | "The above exception was the direct cause of the following exception:\n", 84 | "\n", 85 | "Traceback (most recent call last):\n", 86 | " File \"/home/avnish/iNeuron_Private_Intelligence_Limited/MachineLearningProject/machine_learning_project/venv/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n", 87 | " self.run()\n", 88 | " File \"/home/avnish/iNeuron_Private_Intelligence_Limited/MachineLearningProject/machine_learning_project/housing/pipeline/pipeline.py\", line 173, in run\n", 89 | " raise e\n", 90 | " File \"/home/avnish/iNeuron_Private_Intelligence_Limited/MachineLearningProject/machine_learning_project/housing/pipeline/pipeline.py\", line 171, in run\n", 91 | " self.run_pipeline()\n", 92 | " File \"/home/avnish/iNeuron_Private_Intelligence_Limited/MachineLearningProject/machine_learning_project/housing/pipeline/pipeline.py\", line 167, in run_pipeline\n", 93 | " raise HousingException(e, sys) from e\n", 94 | "housing.exception.HousingException: \n", 95 | " Error occured in script: \n", 96 | " [ /home/avnish/iNeuron_Private_Intelligence_Limited/MachineLearningProject/machine_learning_project/housing/pipeline/pipeline.py ] at \n", 97 | " try block line number: [132] and exception block line number: [167] \n", 98 | " error message: [\n", 99 | " Error occured in script: \n", 100 | " [ /home/avnish/iNeuron_Private_Intelligence_Limited/MachineLearningProject/machine_learning_project/housing/pipeline/pipeline.py ] at \n", 101 | " try block line number: [183] and exception block line number: [196] \n", 102 | " error message: [expected str, bytes or os.PathLike object, not list]\n", 103 | " ]\n", 104 | " \n", 105 | "\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "p.start()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 1, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "f=\"/home/avnish/iNeuron_Private_Intelligence_Limited/MachineLearningProject/machine_learning_project/config/model.yaml\"" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 2, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "from housing.util.util import read_yaml_file" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 4, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "import json" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 9, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "json.dump(read_yaml_file(f),open(\"sample.json\",\"w\"),indent=4)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 11, 152 | "metadata": {}, 153 | "outputs": [], 154 
| "source": [ 155 | "data=\"\"\"{\n", 156 | " \"grid_search\": {\n", 157 | " \"class\": \"GridSearchCV\",\n", 158 | " \"module\": \"sklearn.model_selection\",\n", 159 | " \"params\": {\n", 160 | " \"cv\": 4,\n", 161 | " \"verbose\": 2\n", 162 | " }\n", 163 | " },\n", 164 | " \"model_selection\": {\n", 165 | " \"module_0\": {\n", 166 | " \"class\": \"LinearRegression\",\n", 167 | " \"module\": \"sklearn.linear_model\",\n", 168 | " \"params\": {\n", 169 | " \"fit_intercept\": true\n", 170 | " },\n", 171 | " \"search_param_grid\": {\n", 172 | " \"fit_intercept\": [\n", 173 | " true\n", 174 | " ]\n", 175 | " }\n", 176 | " },\n", 177 | " \"module_1\": {\n", 178 | " \"class\": \"RandomForestRegressor\",\n", 179 | " \"module\": \"sklearn.ensemble\",\n", 180 | " \"params\": {\n", 181 | " \"min_samples_leaf\": 2\n", 182 | " },\n", 183 | " \"search_param_grid\": {\n", 184 | " \"min_samples_leaf\": [\n", 185 | " 2\n", 186 | " ],\n", 187 | " \"n_estimators\": [\n", 188 | " 10\n", 189 | " ]\n", 190 | " }\n", 191 | " }\n", 192 | " }\n", 193 | "}\"\"\"" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 12, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "import json" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 32, 208 | "metadata": {}, 209 | "outputs": [ 210 | { 211 | "ename": "JSONDecodeError", 212 | "evalue": "Expecting value: line 1 column 246 (char 245)", 213 | "output_type": "error", 214 | "traceback": [ 215 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 216 | "\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)", 217 | "\u001b[0;32m/tmp/ipykernel_13318/2214753276.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 218 | "\u001b[0;32m~/iNeuron_Private_Intelligence_Limited/MachineLearningProject/machine_learning_project/venv/lib/python3.7/json/__init__.py\u001b[0m in \u001b[0;36mloads\u001b[0;34m(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 346\u001b[0m \u001b[0mparse_int\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mparse_float\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 347\u001b[0m parse_constant is None and object_pairs_hook is None and not kw):\n\u001b[0;32m--> 348\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_default_decoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 349\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcls\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 350\u001b[0m \u001b[0mcls\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mJSONDecoder\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 219 | "\u001b[0;32m~/iNeuron_Private_Intelligence_Limited/MachineLearningProject/machine_learning_project/venv/lib/python3.7/json/decoder.py\u001b[0m in \u001b[0;36mdecode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m 335\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 336\u001b[0m \"\"\"\n\u001b[0;32m--> 337\u001b[0;31m 
\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw_decode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 338\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 339\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 220 | "\u001b[0;32m~/iNeuron_Private_Intelligence_Limited/MachineLearningProject/machine_learning_project/venv/lib/python3.7/json/decoder.py\u001b[0m in \u001b[0;36mraw_decode\u001b[0;34m(self, s, idx)\u001b[0m\n\u001b[1;32m 353\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscan_once\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 354\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 355\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mJSONDecodeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Expecting value\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 356\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 221 | "\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 246 (char 245)" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "json.loads(a)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 17, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "data=\"\"\"{'grid_search': {'class': 'GridSearchCV',\n", 236 | " 'module': 'sklearn.model_selection',\n", 237 | " 'params': {'cv': 4, 'verbose': 2}},\n", 238 | " 'model_selection': {'module_0': {'class': 'LinearRegression',\n", 239 | " 'module': 'sklearn.linear_model',\n", 240 | " 'params': {'fit_intercept': True},\n", 241 | " 'search_param_grid': {'fit_intercept': [True]}},\n", 242 | " 'module_1': {'class': 'RandomForestRegressor',\n", 243 | " 'module': 'sklearn.ensemble',\n", 244 | " 'params': {'min_samples_leaf': 2},\n", 245 | " 'search_param_grid': {'min_samples_leaf': [2], 'n_estimators': [10]}}}}\"\"\"" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 33, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "a=data.replace(\"'\",'\"').replace(\"\\n\",\"\")" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 
| "execution_count": 27, 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/plain": [ 265 | "1" 266 | ] 267 | }, 268 | "execution_count": 27, 269 | "metadata": {}, 270 | "output_type": "execute_result" 271 | } 272 | ], 273 | "source": [ 274 | "len(data[245])" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 37, 280 | "metadata": {}, 281 | "outputs": [ 282 | { 283 | "name": "stdout", 284 | "output_type": "stream", 285 | "text": [ 286 | "Expecting value: line 1 column 246 (char 245)\n" 287 | ] 288 | } 289 | ], 290 | "source": [] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [] 298 | } 299 | ], 300 | "metadata": { 301 | "kernelspec": { 302 | "display_name": "Python 3.7.0", 303 | "language": "python", 304 | "name": "python3" 305 | }, 306 | "language_info": { 307 | "codemirror_mode": { 308 | "name": "ipython", 309 | "version": 3 310 | }, 311 | "file_extension": ".py", 312 | "mimetype": "text/x-python", 313 | "name": "python", 314 | "nbconvert_exporter": "python", 315 | "pygments_lexer": "ipython3", 316 | "version": "3.7.0" 317 | }, 318 | "orig_nbformat": 4, 319 | "vscode": { 320 | "interpreter": { 321 | "hash": "7a29293c9d4d8b93126739266382f07a312940ff8d40640417510f0b045f4058" 322 | } 323 | } 324 | }, 325 | "nbformat": 4, 326 | "nbformat_minor": 2 327 | } 328 | -------------------------------------------------------------------------------- /housing/entity/model_factory.py: -------------------------------------------------------------------------------- 1 | from cmath import log 2 | import importlib 3 | from pyexpat import model 4 | import numpy as np 5 | import yaml 6 | from housing.exception import HousingException 7 | import os 8 | import sys 9 | 10 | from collections import namedtuple 11 | from typing import List 12 | from housing.logger import logging 13 | from sklearn.metrics import r2_score,mean_squared_error 14 | GRID_SEARCH_KEY = 'grid_search' 15 | MODULE_KEY = 'module' 16 | CLASS_KEY = 'class' 17 | PARAM_KEY = 'params' 18 | MODEL_SELECTION_KEY = 'model_selection' 19 | SEARCH_PARAM_GRID_KEY = "search_param_grid" 20 | 21 | InitializedModelDetail = namedtuple("InitializedModelDetail", 22 | ["model_serial_number", "model", "param_grid_search", "model_name"]) 23 | 24 | GridSearchedBestModel = namedtuple("GridSearchedBestModel", ["model_serial_number", 25 | "model", 26 | "best_model", 27 | "best_parameters", 28 | "best_score", 29 | ]) 30 | 31 | BestModel = namedtuple("BestModel", ["model_serial_number", 32 | "model", 33 | "best_model", 34 | "best_parameters", 35 | "best_score", ]) 36 | 37 | MetricInfoArtifact = namedtuple("MetricInfoArtifact", 38 | ["model_name", "model_object", "train_rmse", "test_rmse", "train_accuracy", 39 | "test_accuracy", "model_accuracy", "index_number"]) 40 | 41 | 42 | 43 | def evaluate_classification_model(model_list: list, X_train:np.ndarray, y_train:np.ndarray, X_test:np.ndarray, y_test:np.ndarray, base_accuracy:float=0.6)->MetricInfoArtifact: 44 | pass 45 | 46 | 47 | def evaluate_regression_model(model_list: list, X_train:np.ndarray, y_train:np.ndarray, X_test:np.ndarray, y_test:np.ndarray, base_accuracy:float=0.6) -> MetricInfoArtifact: 48 | """ 49 | Description: 50 | This function compare multiple regression model return best model 51 | 52 | Params: 53 | model_list: List of model 54 | X_train: Training dataset input feature 55 | y_train: Training dataset target feature 56 | X_test: Testing dataset input feature 57 | 
47 | def evaluate_regression_model(model_list: list, X_train: np.ndarray, y_train: np.ndarray, X_test: np.ndarray, y_test: np.ndarray, base_accuracy: float = 0.6) -> MetricInfoArtifact: 48 | """ 49 | Description: 50 | This function compares multiple regression models and returns the best one. 51 | 52 | Params: 53 | model_list: list of trained models 54 | X_train: training dataset input features 55 | y_train: training dataset target feature 56 | X_test: testing dataset input features 57 | y_test: testing dataset target feature 58 | 59 | return 60 | It returns a named tuple: 61 | 62 | MetricInfoArtifact = namedtuple("MetricInfoArtifact", 63 | ["model_name", "model_object", "train_rmse", "test_rmse", "train_accuracy", 64 | "test_accuracy", "model_accuracy", "index_number"]) 65 | 66 | """ 67 | try: 68 | 69 | 70 | index_number = 0 71 | metric_info_artifact = None 72 | for model in model_list: 73 | model_name = str(model) # getting model name based on model object 74 | logging.info(f"{'>>'*30}Started evaluating model: [{type(model).__name__}] {'<<'*30}") 75 | 76 | # Getting predictions for training and testing datasets 77 | y_train_pred = model.predict(X_train) 78 | y_test_pred = model.predict(X_test) 79 | 80 | # Calculating r squared score on training and testing datasets 81 | train_acc = r2_score(y_train, y_train_pred) 82 | test_acc = r2_score(y_test, y_test_pred) 83 | 84 | # Calculating root mean squared error on training and testing datasets 85 | train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred)) 86 | test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred)) 87 | 88 | # Calculating harmonic mean of train_accuracy and test_accuracy 89 | model_accuracy = (2 * (train_acc * test_acc)) / (train_acc + test_acc) 90 | diff_test_train_acc = abs(test_acc - train_acc) 91 | 92 | # Logging all important metrics 93 | logging.info(f"{'>>'*30} Score {'<<'*30}") 94 | logging.info("Train Score\t\t Test Score\t\t Average Score") 95 | logging.info(f"{train_acc}\t\t {test_acc}\t\t{model_accuracy}") 96 | 97 | logging.info(f"{'>>'*30} Loss {'<<'*30}") 98 | logging.info(f"Diff test train accuracy: [{diff_test_train_acc}].") 99 | logging.info(f"Train root mean squared error: [{train_rmse}].") 100 | logging.info(f"Test root mean squared error: [{test_rmse}].") 101 | 102 | 103 | # If model accuracy is greater than base accuracy and the train/test scores are within a certain threshold, 104 | # we will accept that model as the accepted model 105 | if model_accuracy >= base_accuracy and diff_test_train_acc < 0.05: 106 | base_accuracy = model_accuracy 107 | metric_info_artifact = MetricInfoArtifact(model_name=model_name, 108 | model_object=model, 109 | train_rmse=train_rmse, 110 | test_rmse=test_rmse, 111 | train_accuracy=train_acc, 112 | test_accuracy=test_acc, 113 | model_accuracy=model_accuracy, 114 | index_number=index_number) 115 | 116 | logging.info(f"Acceptable model found {metric_info_artifact}.") 117 | index_number += 1 118 | if metric_info_artifact is None: 119 | logging.info("No model found with accuracy higher than the base accuracy") 120 | return metric_info_artifact 121 | except Exception as e: 122 | raise HousingException(e, sys) from e 123 | 124 |
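# The helper below writes a template model.yaml; ModelFactory later imports the module/class names it lists dynamically.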
") 117 | index_number += 1 118 | if metric_info_artifact is None: 119 | logging.info(f"No model found with higher accuracy than base accuracy") 120 | return metric_info_artifact 121 | except Exception as e: 122 | raise HousingException(e, sys) from e 123 | 124 | 125 | def get_sample_model_config_yaml_file(export_dir: str): 126 | try: 127 | model_config = { 128 | GRID_SEARCH_KEY: { 129 | MODULE_KEY: "sklearn.model_selection", 130 | CLASS_KEY: "GridSearchCV", 131 | PARAM_KEY: { 132 | "cv": 3, 133 | "verbose": 1 134 | } 135 | 136 | }, 137 | MODEL_SELECTION_KEY: { 138 | "module_0": { 139 | MODULE_KEY: "module_of_model", 140 | CLASS_KEY: "ModelClassName", 141 | PARAM_KEY: 142 | {"param_name1": "value1", 143 | "param_name2": "value2", 144 | }, 145 | SEARCH_PARAM_GRID_KEY: { 146 | "param_name": ['param_value_1', 'param_value_2'] 147 | } 148 | 149 | }, 150 | } 151 | } 152 | os.makedirs(export_dir, exist_ok=True) 153 | export_file_path = os.path.join(export_dir, "model.yaml") 154 | with open(export_file_path, 'w') as file: 155 | yaml.dump(model_config, file) 156 | return export_file_path 157 | except Exception as e: 158 | raise HousingException(e, sys) 159 | 160 | 161 | class ModelFactory: 162 | def __init__(self, model_config_path: str = None,): 163 | try: 164 | self.config: dict = ModelFactory.read_params(model_config_path) 165 | 166 | self.grid_search_cv_module: str = self.config[GRID_SEARCH_KEY][MODULE_KEY] 167 | self.grid_search_class_name: str = self.config[GRID_SEARCH_KEY][CLASS_KEY] 168 | self.grid_search_property_data: dict = dict(self.config[GRID_SEARCH_KEY][PARAM_KEY]) 169 | 170 | self.models_initialization_config: dict = dict(self.config[MODEL_SELECTION_KEY]) 171 | 172 | self.initialized_model_list = None 173 | self.grid_searched_best_model_list = None 174 | 175 | except Exception as e: 176 | raise HousingException(e, sys) from e 177 | 178 | @staticmethod 179 | def update_property_of_class(instance_ref:object, property_data: dict): 180 | try: 181 | if not isinstance(property_data, dict): 182 | raise Exception("property_data parameter required to dictionary") 183 | print(property_data) 184 | for key, value in property_data.items(): 185 | logging.info(f"Executing:$ {str(instance_ref)}.{key}={value}") 186 | setattr(instance_ref, key, value) 187 | return instance_ref 188 | except Exception as e: 189 | raise HousingException(e, sys) from e 190 | 191 | @staticmethod 192 | def read_params(config_path: str) -> dict: 193 | try: 194 | with open(config_path) as yaml_file: 195 | config:dict = yaml.safe_load(yaml_file) 196 | return config 197 | except Exception as e: 198 | raise HousingException(e, sys) from e 199 | 200 | @staticmethod 201 | def class_for_name(module_name:str, class_name:str): 202 | try: 203 | # load the module, will raise ImportError if module cannot be loaded 204 | module = importlib.import_module(module_name) 205 | # get the class, will raise AttributeError if class cannot be found 206 | logging.info(f"Executing command: from {module} import {class_name}") 207 | class_ref = getattr(module, class_name) 208 | return class_ref 209 | except Exception as e: 210 | raise HousingException(e, sys) from e 211 | 212 | def execute_grid_search_operation(self, initialized_model: InitializedModelDetail, input_feature, 213 | output_feature) -> GridSearchedBestModel: 214 | """ 215 | excute_grid_search_operation(): function will perform paramter search operation and 216 | it will return you the best optimistic model with best paramter: 217 | estimator: Model object 218 | param_grid: dictionary of 
212 | def execute_grid_search_operation(self, initialized_model: InitializedModelDetail, input_feature, 213 | output_feature) -> GridSearchedBestModel: 214 | """ 215 | execute_grid_search_operation(): performs a parameter search and returns 216 | the best model found together with its best parameters: 217 | estimator: model object 218 | param_grid: dictionary of parameters for the search operation 219 | input_feature: all input features 220 | output_feature: target/dependent feature 221 | ================================================================================ 222 | return: Function will return a GridSearchedBestModel object 223 | """ 224 | try: 225 | # instantiating GridSearchCV class 226 | 227 | 228 | grid_search_cv_ref = ModelFactory.class_for_name(module_name=self.grid_search_cv_module, 229 | class_name=self.grid_search_class_name 230 | ) 231 | 232 | grid_search_cv = grid_search_cv_ref(estimator=initialized_model.model, 233 | param_grid=initialized_model.param_grid_search) 234 | grid_search_cv = ModelFactory.update_property_of_class(grid_search_cv, 235 | self.grid_search_property_data) 236 | 237 | 238 | message = f'{">>" * 30} Training {type(initialized_model.model).__name__} started. {"<<" * 30}' 239 | logging.info(message) 240 | grid_search_cv.fit(input_feature, output_feature) 241 | logging.info(f'{">>" * 30} Training {type(initialized_model.model).__name__} completed. {"<<" * 30}') 242 | grid_searched_best_model = GridSearchedBestModel(model_serial_number=initialized_model.model_serial_number, 243 | model=initialized_model.model, 244 | best_model=grid_search_cv.best_estimator_, 245 | best_parameters=grid_search_cv.best_params_, 246 | best_score=grid_search_cv.best_score_ 247 | ) 248 | 249 | return grid_searched_best_model 250 | except Exception as e: 251 | raise HousingException(e, sys) from e 252 | 253 | def get_initialized_model_list(self) -> List[InitializedModelDetail]: 254 | """ 255 | This function returns a list of initialized model details. 256 | return: List[InitializedModelDetail] 257 | """ 258 | try: 259 | initialized_model_list = [] 260 | for model_serial_number in self.models_initialization_config.keys(): 261 | 262 | model_initialization_config = self.models_initialization_config[model_serial_number] 263 | model_obj_ref = ModelFactory.class_for_name(module_name=model_initialization_config[MODULE_KEY], 264 | class_name=model_initialization_config[CLASS_KEY] 265 | ) 266 | model = model_obj_ref() 267 | 268 | if PARAM_KEY in model_initialization_config: 269 | model_obj_property_data = dict(model_initialization_config[PARAM_KEY]) 270 | model = ModelFactory.update_property_of_class(instance_ref=model, 271 | property_data=model_obj_property_data) 272 | 273 | param_grid_search = model_initialization_config[SEARCH_PARAM_GRID_KEY] 274 | model_name = f"{model_initialization_config[MODULE_KEY]}.{model_initialization_config[CLASS_KEY]}" 275 | 276 | model_initialization_config = InitializedModelDetail(model_serial_number=model_serial_number, 277 | model=model, 278 | param_grid_search=param_grid_search, 279 | model_name=model_name 280 | ) 281 | 282 | initialized_model_list.append(model_initialization_config) 283 | 284 | self.initialized_model_list = initialized_model_list 285 | return self.initialized_model_list 286 | except Exception as e: 287 | raise HousingException(e, sys) from e 288 |
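# The next two methods fan the grid search out over every initialized model and collect one GridSearchedBestModel per entry.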
289 | def initiate_best_parameter_search_for_initialized_model(self, initialized_model: InitializedModelDetail, 290 | input_feature, 291 | output_feature) -> GridSearchedBestModel: 292 | """ 293 | initiate_best_parameter_search_for_initialized_model(): performs a parameter search and returns 294 | the best model found together with its best parameters: 295 | estimator: model object 296 | param_grid: dictionary of parameters for the search operation 297 | input_feature: all input features 298 | output_feature: target/dependent feature 299 | ================================================================================ 300 | return: Function will return a GridSearchedBestModel object 301 | """ 302 | try: 303 | return self.execute_grid_search_operation(initialized_model=initialized_model, 304 | input_feature=input_feature, 305 | output_feature=output_feature) 306 | except Exception as e: 307 | raise HousingException(e, sys) from e 308 | 309 | def initiate_best_parameter_search_for_initialized_models(self, 310 | initialized_model_list: List[InitializedModelDetail], 311 | input_feature, 312 | output_feature) -> List[GridSearchedBestModel]: 313 | 314 | try: 315 | self.grid_searched_best_model_list = [] 316 | for initialized_model in initialized_model_list: 317 | grid_searched_best_model = self.initiate_best_parameter_search_for_initialized_model( 318 | initialized_model=initialized_model, 319 | input_feature=input_feature, 320 | output_feature=output_feature 321 | ) 322 | self.grid_searched_best_model_list.append(grid_searched_best_model) 323 | return self.grid_searched_best_model_list 324 | except Exception as e: 325 | raise HousingException(e, sys) from e 326 | 327 | @staticmethod 328 | def get_model_detail(model_details: List[InitializedModelDetail], 329 | model_serial_number: str) -> InitializedModelDetail: 330 | """ 331 | Returns the InitializedModelDetail matching the given model serial number. 332 | """ 333 | try: 334 | for model_data in model_details: 335 | if model_data.model_serial_number == model_serial_number: 336 | return model_data 337 | except Exception as e: 338 | raise HousingException(e, sys) from e 339 | 340 | @staticmethod 341 | def get_best_model_from_grid_searched_best_model_list(grid_searched_best_model_list: List[GridSearchedBestModel], 342 | base_accuracy=0.6 343 | ) -> BestModel: 344 | try: 345 | best_model = None 346 | for grid_searched_best_model in grid_searched_best_model_list: 347 | if base_accuracy < grid_searched_best_model.best_score: 348 | logging.info(f"Acceptable model found:{grid_searched_best_model}") 349 | base_accuracy = grid_searched_best_model.best_score 350 | 351 | best_model = grid_searched_best_model 352 | if not best_model: 353 | raise Exception(f"None of the models has the base accuracy: {base_accuracy}") 354 | logging.info(f"Best model: {best_model}") 355 | return best_model 356 | except Exception as e: 357 | raise HousingException(e, sys) from e 358 | 359 | def get_best_model(self, X, y, base_accuracy=0.6) -> BestModel: 360 | try: 361 | logging.info("Started initializing models from config file") 362 | initialized_model_list = self.get_initialized_model_list() 363 | logging.info(f"Initialized model: {initialized_model_list}") 364 | grid_searched_best_model_list = self.initiate_best_parameter_search_for_initialized_models( 365 | initialized_model_list=initialized_model_list, 366 | input_feature=X, 367 | output_feature=y 368 | ) 369 | return ModelFactory.get_best_model_from_grid_searched_best_model_list(grid_searched_best_model_list, 370 | base_accuracy=base_accuracy) 371 | except Exception as e: 372 | raise HousingException(e, sys) from e -------------------------------------------------------------------------------- /notebook/example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from collections import namedtuple" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "1. Download url\n", 17 | "2. 
Download folder (compressed file)\n", 18 | "3. Extract folder (extracted file))\n", 19 | "4. Train dataset folder\n", 20 | "5. Test dataset folder\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 3, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "DataIngestionConfig=namedtuple(\"DataIngestionConfig\",\n", 30 | "[\"dataset_download_url\",\"tgz_download_dir\",\"raw_data_dir\",\"ingested_train_dir\",\"ingested_test_dir\"])" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 5, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "data_ingestion_config = DataIngestionConfig(dataset_download_url=\"asfasdf\",\n", 40 | "tgz_download_dir='asdasd',\n", 41 | "raw_data_dir=\"asdas\",\n", 42 | "ingested_train_dir=\"asdbfk\",\n", 43 | "ingested_test_dir=\"sadnjk\"\n", 44 | ")" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 6, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/plain": [ 55 | "DataIngestionConfig(dataset_download_url='asfasdf', tgz_download_dir='asdasd', raw_data_dir='asdas', ingested_train_dir='asdbfk', ingested_test_dir='sadnjk')" 56 | ] 57 | }, 58 | "execution_count": 6, 59 | "metadata": {}, 60 | "output_type": "execute_result" 61 | } 62 | ], 63 | "source": [ 64 | "data_ingestion_config" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 7, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "('sdfjnksdf', 'sdwjkuf', 'asdfasd', 'wsdfbkiasd')" 76 | ] 77 | }, 78 | "execution_count": 7, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "(\"sdfjnksdf\",\"sdwjkuf\",\"asdfasd\",\"wsdfbkiasd\")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 1, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "import yaml" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 2, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "import os" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 3, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "data": { 112 | "text/plain": [ 113 | "'d:\\\\Project\\\\machine_learning_project\\\\notebook'" 114 | ] 115 | }, 116 | "execution_count": 3, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "os.getcwd()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 4, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "os.chdir(\"d:\\\\Project\\\\machine_learning_project\")" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 5, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "'d:\\\\Project\\\\machine_learning_project'" 143 | ] 144 | }, 145 | "execution_count": 5, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "os.getcwd()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 6, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "config_file_path=os.path.join(\"config\",\"config.yaml\")" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 7, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | 
"'config\\\\config.yaml'" 179 | ] 180 | }, 181 | "execution_count": 7, 182 | "metadata": {}, 183 | "output_type": "execute_result" 184 | } 185 | ], 186 | "source": [ 187 | "config_file_path" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 8, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "True" 199 | ] 200 | }, 201 | "execution_count": 8, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "os.path.exists(config_file_path)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 17, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/plain": [ 218 | "'d:\\\\Project\\\\machine_learning_project'" 219 | ] 220 | }, 221 | "execution_count": 17, 222 | "metadata": {}, 223 | "output_type": "execute_result" 224 | } 225 | ], 226 | "source": [ 227 | "os.getcwd()" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 18, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "config_info=None\n", 237 | "with open(config_file_path,\"rb\") as yaml_file:\n", 238 | " config_info=yaml.safe_load(yaml_file)\n" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 20, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "data": { 248 | "text/plain": [ 249 | "{'dataset_download_url': 'https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz',\n", 250 | " 'raw_data_dir': 'raw_data',\n", 251 | " 'tgz_download_dir': 'tgz_data',\n", 252 | " 'ingested_dir': 'ingested_data',\n", 253 | " 'ingested_train_dir': 'train',\n", 254 | " 'ingested_test_dir': 'test'}" 255 | ] 256 | }, 257 | "execution_count": 20, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "config_info[\"data_ingestion_config\"]" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 21, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "def read_yaml_file(file_path:str)->dict:\n", 273 | " \"\"\"\n", 274 | " Reads a YAML file and returns the contents as a dictionary.\n", 275 | " file_path: str\n", 276 | " \"\"\"\n", 277 | " try:\n", 278 | " with open(file_path, 'rb') as yaml_file:\n", 279 | " return yaml.safe_load(yaml_file)\n", 280 | " except Exception as e:\n", 281 | " raise e" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 23, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "config =read_yaml_file(config_file_path)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 2, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "from housing.constant import *" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 26, 305 | "metadata": {}, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/plain": [ 310 | "'training_pipeline_config'" 311 | ] 312 | }, 313 | "execution_count": 26, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "TRAINING_PIPELINE_CONFIG_KEY" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 29, 325 | "metadata": {}, 326 | "outputs": [ 327 | { 328 | "data": { 329 | "text/plain": [ 330 | "{'pipeline_name': 'housing', 'artifact_dir': 'artifact'}" 331 | ] 332 | }, 333 | "execution_count": 29, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | 
"config[TRAINING_PIPELINE_CONFIG_KEY]" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 28, 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "data": { 349 | "text/plain": [ 350 | "'housing'" 351 | ] 352 | }, 353 | "execution_count": 28, 354 | "metadata": {}, 355 | "output_type": "execute_result" 356 | } 357 | ], 358 | "source": [ 359 | "config[TRAINING_PIPELINE_CONFIG_KEY][TRAINING_PIPELINE_NAME_KEY]" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 31, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "training_pipeline_config = config[TRAINING_PIPELINE_CONFIG_KEY]\n", 369 | "artifact_dir = os.path.join(ROOT_DIR,\n", 370 | "training_pipeline_config[TRAINING_PIPELINE_NAME_KEY],\n", 371 | "training_pipeline_config[TRAINING_PIPELINE_ARTIFACT_DIR_KEY]\n", 372 | ")" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 33, 378 | "metadata": {}, 379 | "outputs": [ 380 | { 381 | "data": { 382 | "text/plain": [ 383 | "'d:\\\\Project\\\\machine_learning_project'" 384 | ] 385 | }, 386 | "execution_count": 33, 387 | "metadata": {}, 388 | "output_type": "execute_result" 389 | } 390 | ], 391 | "source": [ 392 | "ROOT_DIR" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 34, 398 | "metadata": {}, 399 | "outputs": [ 400 | { 401 | "data": { 402 | "text/plain": [ 403 | "'housing'" 404 | ] 405 | }, 406 | "execution_count": 34, 407 | "metadata": {}, 408 | "output_type": "execute_result" 409 | } 410 | ], 411 | "source": [ 412 | "training_pipeline_config[TRAINING_PIPELINE_NAME_KEY]" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": 35, 418 | "metadata": {}, 419 | "outputs": [ 420 | { 421 | "data": { 422 | "text/plain": [ 423 | "'artifact'" 424 | ] 425 | }, 426 | "execution_count": 35, 427 | "metadata": {}, 428 | "output_type": "execute_result" 429 | } 430 | ], 431 | "source": [ 432 | "training_pipeline_config[TRAINING_PIPELINE_ARTIFACT_DIR_KEY]" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 32, 438 | "metadata": {}, 439 | "outputs": [ 440 | { 441 | "data": { 442 | "text/plain": [ 443 | "'d:\\\\Project\\\\machine_learning_project\\\\housing\\\\artifact'" 444 | ] 445 | }, 446 | "execution_count": 32, 447 | "metadata": {}, 448 | "output_type": "execute_result" 449 | } 450 | ], 451 | "source": [ 452 | "artifact_dir" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 10, 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "from housing.config.configuration import Configuartion" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 13, 467 | "metadata": {}, 468 | "outputs": [ 469 | { 470 | "data": { 471 | "text/plain": [ 472 | "'d:\\\\Project\\\\machine_learning_project'" 473 | ] 474 | }, 475 | "execution_count": 13, 476 | "metadata": {}, 477 | "output_type": "execute_result" 478 | } 479 | ], 480 | "source": [ 481 | "os.getcwd()" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": 12, 487 | "metadata": {}, 488 | "outputs": [], 489 | "source": [ 490 | "config = Configuartion(config_file_path=\"d:\\\\Project\\\\machine_learning_project\\\\config\\\\config.yaml\")" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 29, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "training_pipeline_config=config.get_training_pipeline_config()" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | 
"execution_count": 31, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "artifact_dir = training_pipeline_config.artifact_dir" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 32, 514 | "metadata": {}, 515 | "outputs": [ 516 | { 517 | "data": { 518 | "text/plain": [ 519 | "'data_ingestion'" 520 | ] 521 | }, 522 | "execution_count": 32, 523 | "metadata": {}, 524 | "output_type": "execute_result" 525 | } 526 | ], 527 | "source": [ 528 | "DATA_INGESTION_ARTIFACT_DIR" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": 33, 534 | "metadata": {}, 535 | "outputs": [ 536 | { 537 | "data": { 538 | "text/plain": [ 539 | "'2022-06-25-12-58-04'" 540 | ] 541 | }, 542 | "execution_count": 33, 543 | "metadata": {}, 544 | "output_type": "execute_result" 545 | } 546 | ], 547 | "source": [ 548 | "CURRENT_TIME_STAMP" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": 11, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "from housing.constant import *" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 23, 563 | "metadata": {}, 564 | "outputs": [], 565 | "source": [ 566 | "data_ingestion_info=config.config_info[DATA_INGESTION_CONFIG_KEY]" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 24, 572 | "metadata": {}, 573 | "outputs": [ 574 | { 575 | "data": { 576 | "text/plain": [ 577 | "{'dataset_download_url': 'https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz',\n", 578 | " 'raw_data_dir': 'raw_data',\n", 579 | " 'tgz_download_dir': 'tgz_data',\n", 580 | " 'ingested_dir': 'ingested_data',\n", 581 | " 'ingested_train_dir': 'train',\n", 582 | " 'ingested_test_dir': 'test'}" 583 | ] 584 | }, 585 | "execution_count": 24, 586 | "metadata": {}, 587 | "output_type": "execute_result" 588 | } 589 | ], 590 | "source": [ 591 | "data_ingestion_info" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 26, 597 | "metadata": {}, 598 | "outputs": [ 599 | { 600 | "data": { 601 | "text/plain": [ 602 | "'dataset_download_url'" 603 | ] 604 | }, 605 | "execution_count": 26, 606 | "metadata": {}, 607 | "output_type": "execute_result" 608 | } 609 | ], 610 | "source": [] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 6, 615 | "metadata": {}, 616 | "outputs": [], 617 | "source": [ 618 | "from housing.constant import DATA_INGESTION_CONFIG_KEY" 619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": 7, 624 | "metadata": {}, 625 | "outputs": [ 626 | { 627 | "data": { 628 | "text/plain": [ 629 | "'data_ingestion_config'" 630 | ] 631 | }, 632 | "execution_count": 7, 633 | "metadata": {}, 634 | "output_type": "execute_result" 635 | } 636 | ], 637 | "source": [ 638 | "DATA_INGESTION_CONFIG_KEY" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": 27, 644 | "metadata": {}, 645 | "outputs": [ 646 | { 647 | "data": { 648 | "text/plain": [ 649 | "'https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz'" 650 | ] 651 | }, 652 | "execution_count": 27, 653 | "metadata": {}, 654 | "output_type": "execute_result" 655 | } 656 | ], 657 | "source": [ 658 | "data_ingestion_info[DATA_INGESTION_DOWNLOAD_URL_KEY]" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 28, 664 | "metadata": {}, 665 | "outputs": [ 666 | { 667 | "data": { 668 | "text/plain": [ 669 | "{'dataset_download_url': 
'https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz',\n", 670 | " 'raw_data_dir': 'raw_data',\n", 671 | " 'tgz_download_dir': 'tgz_data',\n", 672 | " 'ingested_dir': 'ingested_data',\n", 673 | " 'ingested_train_dir': 'train',\n", 674 | " 'ingested_test_dir': 'test'}" 675 | ] 676 | }, 677 | "execution_count": 28, 678 | "metadata": {}, 679 | "output_type": "execute_result" 680 | } 681 | ], 682 | "source": [ 683 | "data_ingestion_info" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": 14, 689 | "metadata": {}, 690 | "outputs": [ 691 | { 692 | "ename": "HousingException", 693 | "evalue": "Error occured in script: [d:\\Project\\machine_learning_project\\housing\\config\\configuration.py] at line number: [68] error message: ['dict' object has no attribute 'config_info']", 694 | "output_type": "error", 695 | "traceback": [ 696 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 697 | "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", 698 | "\u001b[1;32md:\\Project\\machine_learning_project\\housing\\config\\configuration.py\u001b[0m in \u001b[0;36mget_data_ingestion_config\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 32\u001b[0m )\n\u001b[1;32m---> 33\u001b[1;33m \u001b[0mdata_ingestion_info\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconfig_info\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mDATA_INGESTION_CONFIG_KEY\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 34\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 699 | "\u001b[1;31mAttributeError\u001b[0m: 'dict' object has no attribute 'config_info'", 700 | "\nThe above exception was the direct cause of the following exception:\n", 701 | "\u001b[1;31mHousingException\u001b[0m Traceback (most recent call last)", 702 | "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_14708\\2719117554.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mconfig\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_data_ingestion_config\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 703 | "\u001b[1;32md:\\Project\\machine_learning_project\\housing\\config\\configuration.py\u001b[0m in \u001b[0;36mget_data_ingestion_config\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 66\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mdata_ingestion_config\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 67\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 68\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mHousingException\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0me\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0msys\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 69\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 70\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mget_data_validation_config\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m->\u001b[0m \u001b[0mDataValidationConfig\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 704 | "\u001b[1;31mHousingException\u001b[0m: Error occured in script: [d:\\Project\\machine_learning_project\\housing\\config\\configuration.py] at line number: [68] error message: ['dict' object has no attribute 'config_info']" 705 | ] 706 | } 
707 | ], 708 | "source": [ 709 | "config.get_data_ingestion_config()" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": null, 715 | "metadata": {}, 716 | "outputs": [], 717 | "source": [] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": 2, 722 | "metadata": {}, 723 | "outputs": [], 724 | "source": [ 725 | "from housing.config.configuration import Configuartion" 726 | ] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": 3, 731 | "metadata": {}, 732 | "outputs": [], 733 | "source": [ 734 | "config = Configuartion(config_file_path=\"d:\\\\Project\\\\machine_learning_project\\\\config\\\\config.yaml\")" 735 | ] 736 | }, 737 | { 738 | "cell_type": "code", 739 | "execution_count": 4, 740 | "metadata": {}, 741 | "outputs": [ 742 | { 743 | "data": { 744 | "text/plain": [ 745 | "DataIngestionConfig(dataset_download_url='https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz', tgz_download_dir='d:\\\\Project\\\\machine_learning_project\\\\notebook\\\\housing\\\\artifact\\\\data_ingestion\\\\2022-06-25-13-25-32\\\\tgz_data', raw_data_dir='d:\\\\Project\\\\machine_learning_project\\\\notebook\\\\housing\\\\artifact\\\\data_ingestion\\\\2022-06-25-13-25-32\\\\raw_data', ingested_train_dir='d:\\\\Project\\\\machine_learning_project\\\\notebook\\\\housing\\\\artifact\\\\data_ingestion\\\\2022-06-25-13-25-32\\\\ingested_data\\\\train', ingested_test_dir='d:\\\\Project\\\\machine_learning_project\\\\notebook\\\\housing\\\\artifact\\\\data_ingestion\\\\2022-06-25-13-25-32\\\\ingested_data\\\\test')" 746 | ] 747 | }, 748 | "execution_count": 4, 749 | "metadata": {}, 750 | "output_type": "execute_result" 751 | } 752 | ], 753 | "source": [ 754 | "config.get_data_ingestion_config()" 755 | ] 756 | }, 757 | { 758 | "cell_type": "markdown", 759 | "metadata": {}, 760 | "source": [] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "execution_count": null, 765 | "metadata": {}, 766 | "outputs": [], 767 | "source": [] 768 | } 769 | ], 770 | "metadata": { 771 | "kernelspec": { 772 | "display_name": "Python 3.7.0", 773 | "language": "python", 774 | "name": "python3" 775 | }, 776 | "language_info": { 777 | "codemirror_mode": { 778 | "name": "ipython", 779 | "version": 3 780 | }, 781 | "file_extension": ".py", 782 | "mimetype": "text/x-python", 783 | "name": "python", 784 | "nbconvert_exporter": "python", 785 | "pygments_lexer": "ipython3", 786 | "version": "3.7.0" 787 | }, 788 | "orig_nbformat": 4, 789 | "vscode": { 790 | "interpreter": { 791 | "hash": "7a29293c9d4d8b93126739266382f07a312940ff8d40640417510f0b045f4058" 792 | } 793 | } 794 | }, 795 | "nbformat": 4, 796 | "nbformat_minor": 2 797 | } 798 | --------------------------------------------------------------------------------
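Taken together, the cells above prototype what Configuartion.get_data_ingestion_config later automates: read config.yaml, derive the timestamped artifact directory, and resolve the data-ingestion paths. A condensed, script-form sketch of that flow follows; the key names are exactly those printed in the notebook outputs, the 'data_ingestion' subfolder matches DATA_INGESTION_ARTIFACT_DIR, and using os.getcwd() as the root is a stand-in for the ROOT_DIR constant.

# Condensed sketch of the configuration flow exercised in example.ipynb.
import os
from datetime import datetime

import yaml


def read_yaml_file(file_path: str) -> dict:
    """Read a YAML file and return its contents as a dictionary (mirrors the notebook helper)."""
    with open(file_path, "rb") as yaml_file:
        return yaml.safe_load(yaml_file)


root_dir = os.getcwd()  # stand-in for ROOT_DIR
config = read_yaml_file(os.path.join("config", "config.yaml"))

# {'pipeline_name': 'housing', 'artifact_dir': 'artifact'} per the output above.
pipeline_cfg = config["training_pipeline_config"]
artifact_dir = os.path.join(root_dir,
                            pipeline_cfg["pipeline_name"],
                            pipeline_cfg["artifact_dir"])

# Timestamp format matches CURRENT_TIME_STAMP, e.g. '2022-06-25-12-58-04'.
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
data_ingestion_dir = os.path.join(artifact_dir, "data_ingestion", timestamp)

ingestion_cfg = config["data_ingestion_config"]
tgz_download_dir = os.path.join(data_ingestion_dir, ingestion_cfg["tgz_download_dir"])
raw_data_dir = os.path.join(data_ingestion_dir, ingestion_cfg["raw_data_dir"])
print(tgz_download_dir)
print(raw_data_dir)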