12 |
90 |
91 |
92 |
93 |
96 |
97 | {% if context['housing_data'] is not none %}
98 |
99 | California Housing Prediction
100 |
101 | Input Feature
102 | Feature Value
103 |
104 |
105 | {% for column,value in context['housing_data'].items() %}
106 |
107 |
108 |
109 |
110 | {{column}}
111 | {{value[0]}}
112 |
113 |
114 | {% endfor %}
115 |
116 |
117 | median_house_value
118 |
119 | {{ context['median_house_value'] }}
120 |
121 |
122 |
123 |
124 | {% else %}
125 |
126 |
Submit Form
127 |
Kindly provide the necessary information to estimate the housing price in California
128 |
129 |
130 |
131 | {% endif %}
132 | Go to
Home
133 |
134 |
135 |
136 |
137 | {% endblock %}
--------------------------------------------------------------------------------
/housing/component/data_ingestion.py:
--------------------------------------------------------------------------------
1 | from housing.entity.config_entity import DataIngestionConfig
2 | import sys,os
3 | from housing.exception import HousingException
4 | from housing.logger import logging
5 | from housing.entity.artifact_entity import DataIngestionArtifact
6 | import tarfile
7 | import numpy as np
8 | from six.moves import urllib
9 | import pandas as pd
10 | from sklearn.model_selection import StratifiedShuffleSplit
11 |
class DataIngestion:
    """Data-ingestion stage of the housing ML pipeline.

    Downloads the dataset archive, extracts it, and produces stratified
    train/test CSV files, returning a DataIngestionArtifact describing them.
    """

    def __init__(self, data_ingestion_config: DataIngestionConfig):
        """
        data_ingestion_config: paths/URLs controlling where data is
            downloaded, extracted, and written.
        """
        try:
            logging.info(f"{'>>'*20}Data Ingestion log started.{'<<'*20} ")
            self.data_ingestion_config = data_ingestion_config
        except Exception as e:
            # Chain the original exception (consistent with the other
            # methods in this class, which all use `from e`).
            raise HousingException(e, sys) from e

    def download_housing_data(self) -> str:
        """Download the dataset archive and return the local .tgz path."""
        try:
            # Remote URL hosting the dataset archive.
            download_url = self.data_ingestion_config.dataset_download_url

            # Local folder that will hold the downloaded archive.
            tgz_download_dir = self.data_ingestion_config.tgz_download_dir
            os.makedirs(tgz_download_dir, exist_ok=True)

            housing_file_name = os.path.basename(download_url)
            tgz_file_path = os.path.join(tgz_download_dir, housing_file_name)

            logging.info(f"Downloading file from :[{download_url}] into :[{tgz_file_path}]")
            urllib.request.urlretrieve(download_url, tgz_file_path)
            logging.info(f"File :[{tgz_file_path}] has been downloaded successfully.")
            return tgz_file_path
        except Exception as e:
            raise HousingException(e, sys) from e

    def extract_tgz_file(self, tgz_file_path: str) -> None:
        """Extract the archive at tgz_file_path into a fresh raw-data dir."""
        try:
            import shutil  # local import: only needed for directory cleanup

            raw_data_dir = self.data_ingestion_config.raw_data_dir

            # BUG FIX: the previous code called os.remove() on a directory,
            # which raises IsADirectoryError. shutil.rmtree correctly clears
            # a prior extraction before re-creating the directory.
            if os.path.exists(raw_data_dir):
                shutil.rmtree(raw_data_dir)
            os.makedirs(raw_data_dir, exist_ok=True)

            logging.info(f"Extracting tgz file: [{tgz_file_path}] into dir: [{raw_data_dir}]")
            with tarfile.open(tgz_file_path) as housing_tgz_file_obj:
                housing_tgz_file_obj.extractall(path=raw_data_dir)
            logging.info(f"Extraction completed")
        except Exception as e:
            raise HousingException(e, sys) from e

    def split_data_as_train_test(self) -> DataIngestionArtifact:
        """Split the extracted CSV into stratified train/test sets.

        Stratifies on an income category bucketed from `median_income` so
        both splits share the same income distribution, exports each split
        as CSV, and returns a DataIngestionArtifact with the file paths.
        """
        try:
            raw_data_dir = self.data_ingestion_config.raw_data_dir

            # Assumes exactly one extracted file (the dataset CSV) lives in
            # raw_data_dir. NOTE(review): an empty dir raises IndexError here.
            file_name = os.listdir(raw_data_dir)[0]
            housing_file_path = os.path.join(raw_data_dir, file_name)

            logging.info(f"Reading csv file: [{housing_file_path}]")
            housing_data_frame = pd.read_csv(housing_file_path)

            # Temporary bucketed income column used only for stratification.
            housing_data_frame["income_cat"] = pd.cut(
                housing_data_frame["median_income"],
                bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
                labels=[1, 2, 3, 4, 5]
            )

            logging.info(f"Splitting data into train and test")
            strat_train_set = None
            strat_test_set = None

            split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

            # n_splits=1, so this loop body runs exactly once; drop the
            # helper column so it does not leak into the exported datasets.
            for train_index, test_index in split.split(housing_data_frame,
                                                       housing_data_frame["income_cat"]):
                strat_train_set = housing_data_frame.loc[train_index].drop(["income_cat"], axis=1)
                strat_test_set = housing_data_frame.loc[test_index].drop(["income_cat"], axis=1)

            train_file_path = os.path.join(self.data_ingestion_config.ingested_train_dir,
                                           file_name)
            test_file_path = os.path.join(self.data_ingestion_config.ingested_test_dir,
                                          file_name)

            if strat_train_set is not None:
                os.makedirs(self.data_ingestion_config.ingested_train_dir, exist_ok=True)
                logging.info(f"Exporting training dataset to file: [{train_file_path}]")
                strat_train_set.to_csv(train_file_path, index=False)

            if strat_test_set is not None:
                os.makedirs(self.data_ingestion_config.ingested_test_dir, exist_ok=True)
                logging.info(f"Exporting test dataset to file: [{test_file_path}]")
                strat_test_set.to_csv(test_file_path, index=False)

            data_ingestion_artifact = DataIngestionArtifact(train_file_path=train_file_path,
                                                            test_file_path=test_file_path,
                                                            is_ingested=True,
                                                            message=f"Data ingestion completed successfully."
                                                            )
            logging.info(f"Data Ingestion artifact:[{data_ingestion_artifact}]")
            return data_ingestion_artifact
        except Exception as e:
            raise HousingException(e, sys) from e

    def initiate_data_ingestion(self) -> DataIngestionArtifact:
        """Run the full stage: download -> extract -> split; return artifact."""
        try:
            tgz_file_path = self.download_housing_data()
            self.extract_tgz_file(tgz_file_path=tgz_file_path)
            return self.split_data_as_train_test()
        except Exception as e:
            raise HousingException(e, sys) from e

    def __del__(self):
        # Marks stage completion in the log when the object is collected.
        logging.info(f"{'>>'*20}Data Ingestion log completed.{'<<'*20} \n\n")
--------------------------------------------------------------------------------
/housing/component/data_validation.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | from housing.logger import logging
4 | from housing.exception import HousingException
5 | from housing.entity.config_entity import DataValidationConfig
6 | from housing.entity.artifact_entity import DataIngestionArtifact,DataValidationArtifact
7 | import os,sys
8 | import pandas as pd
9 | from evidently.model_profile import Profile
10 | from evidently.model_profile.sections import DataDriftProfileSection
11 | from evidently.dashboard import Dashboard
12 | from evidently.dashboard.tabs import DataDriftTab
13 | import json
14 |
class DataValidation:
    """Data-validation stage: checks that the ingested train/test files
    exist, validates the dataset schema (currently a stub), and produces
    a data-drift report/dashboard with evidently."""

    def __init__(self, data_validation_config: DataValidationConfig,
                 data_ingestion_artifact: DataIngestionArtifact):
        """
        data_validation_config: output paths for schema/report files.
        data_ingestion_artifact: locations of the ingested train/test CSVs.
        """
        try:
            # Typo fix in log message: "Valdaition" -> "Validation".
            logging.info(f"{'>>'*30}Data Validation log started.{'<<'*30} \n\n")
            self.data_validation_config = data_validation_config
            self.data_ingestion_artifact = data_ingestion_artifact
        except Exception as e:
            raise HousingException(e, sys) from e

    def get_train_and_test_df(self):
        """Load and return (train_df, test_df) from the ingestion artifact."""
        try:
            train_df = pd.read_csv(self.data_ingestion_artifact.train_file_path)
            test_df = pd.read_csv(self.data_ingestion_artifact.test_file_path)
            return train_df, test_df
        except Exception as e:
            raise HousingException(e, sys) from e

    def is_train_test_file_exists(self) -> bool:
        """Return True if both ingested files exist; raise if either is missing."""
        try:
            logging.info("Checking if training and test file is available")
            train_file_path = self.data_ingestion_artifact.train_file_path
            test_file_path = self.data_ingestion_artifact.test_file_path

            is_train_file_exist = os.path.exists(train_file_path)
            is_test_file_exist = os.path.exists(test_file_path)
            is_available = is_train_file_exist and is_test_file_exist

            logging.info(f"Is train and test file exists?-> {is_available}")

            if not is_available:
                # BUG FIX: the two-part message previously concatenated with
                # no separator, producing "...{testing_file}is not present".
                message = f"Training file: {train_file_path} or Testing file: {test_file_path} " \
                          "is not present"
                raise Exception(message)

            return is_available
        except Exception as e:
            raise HousingException(e, sys) from e

    def validate_dataset_schema(self) -> bool:
        """Validate train/test data against the schema file.

        TODO: implement the actual checks using the schema file:
          1. number of columns
          2. allowed values of ocean_proximity
             (<1H OCEAN, INLAND, ISLAND, NEAR BAY, NEAR OCEAN)
          3. column names
        Currently always returns True (stub).
        """
        try:
            validation_status = True
            return validation_status
        except Exception as e:
            raise HousingException(e, sys) from e

    def get_and_save_data_drift_report(self):
        """Compute an evidently drift profile, persist it as JSON, return it."""
        try:
            profile = Profile(sections=[DataDriftProfileSection()])

            train_df, test_df = self.get_train_and_test_df()
            profile.calculate(train_df, test_df)

            # Round-trip through JSON to get a plain dict for serialization.
            report = json.loads(profile.json())

            report_file_path = self.data_validation_config.report_file_path
            report_dir = os.path.dirname(report_file_path)
            os.makedirs(report_dir, exist_ok=True)

            with open(report_file_path, "w") as report_file:
                json.dump(report, report_file, indent=6)
            return report
        except Exception as e:
            raise HousingException(e, sys) from e

    def save_data_drift_report_page(self):
        """Render and save the evidently drift dashboard as an HTML page."""
        try:
            dashboard = Dashboard(tabs=[DataDriftTab()])
            train_df, test_df = self.get_train_and_test_df()
            dashboard.calculate(train_df, test_df)

            report_page_file_path = self.data_validation_config.report_page_file_path
            report_page_dir = os.path.dirname(report_page_file_path)
            os.makedirs(report_page_dir, exist_ok=True)

            dashboard.save(report_page_file_path)
        except Exception as e:
            raise HousingException(e, sys) from e

    def is_data_drift_found(self) -> bool:
        """Generate the drift report and dashboard.

        NOTE(review): the computed report is never inspected -- this method
        always returns True regardless of actual drift. Verify whether the
        report's drift flag should drive the return value.
        """
        try:
            report = self.get_and_save_data_drift_report()
            self.save_data_drift_report_page()
            return True
        except Exception as e:
            raise HousingException(e, sys) from e

    def initiate_data_validation(self) -> DataValidationArtifact:
        """Run all validation steps and return a DataValidationArtifact."""
        try:
            self.is_train_test_file_exists()
            self.validate_dataset_schema()
            self.is_data_drift_found()

            data_validation_artifact = DataValidationArtifact(
                schema_file_path=self.data_validation_config.schema_file_path,
                report_file_path=self.data_validation_config.report_file_path,
                report_page_file_path=self.data_validation_config.report_page_file_path,
                is_validated=True,
                # Typo fix: "successully" -> "successfully".
                message="Data Validation performed successfully."
            )
            logging.info(f"Data validation artifact: {data_validation_artifact}")
            return data_validation_artifact
        except Exception as e:
            raise HousingException(e, sys) from e

    def __del__(self):
        # Marks stage completion in the log when the object is collected.
        logging.info(f"{'>>'*30}Data Validation log completed.{'<<'*30} \n\n")
148 |
149 |
150 |
151 |
152 |
--------------------------------------------------------------------------------
/housing/component/model_trainer.py:
--------------------------------------------------------------------------------
1 |
2 | from housing.exception import HousingException
3 | import sys
4 | from housing.logger import logging
5 | from typing import List
6 | from housing.entity.artifact_entity import DataTransformationArtifact, ModelTrainerArtifact
7 | from housing.entity.config_entity import ModelTrainerConfig
8 | from housing.util.util import load_numpy_array_data,save_object,load_object
9 | from housing.entity.model_factory import MetricInfoArtifact, ModelFactory,GridSearchedBestModel
10 | from housing.entity.model_factory import evaluate_regression_model
11 |
12 |
13 |
class HousingEstimatorModel:
    """Bundle a fitted preprocessing pipeline with a trained regressor.

    Lets callers score raw feature rows in one call: inputs are first
    transformed by the stored preprocessor, guaranteeing they arrive at
    the model in the same format it was trained on.
    """

    def __init__(self, preprocessing_object, trained_model_object):
        """
        preprocessing_object: fitted transformer applied to raw inputs.
        trained_model_object: estimator trained on transformed features.
        """
        self.preprocessing_object = preprocessing_object
        self.trained_model_object = trained_model_object

    def predict(self, X):
        """Transform raw inputs with the stored preprocessor, then predict."""
        features = self.preprocessing_object.transform(X)
        return self.trained_model_object.predict(features)

    def _model_name(self):
        # Shared by __repr__/__str__, e.g. "RandomForestRegressor()".
        return f"{type(self.trained_model_object).__name__}()"

    def __repr__(self):
        return self._model_name()

    def __str__(self):
        return self._model_name()
38 |
39 |
40 |
41 |
class ModelTrainer:
    """Model-training stage: selects the best regression model on the
    transformed data, evaluates all candidates on both splits, and saves
    the winner wrapped together with the preprocessing object."""

    def __init__(self, model_trainer_config: ModelTrainerConfig,
                 data_transformation_artifact: DataTransformationArtifact):
        """
        model_trainer_config: model config path, base accuracy threshold,
            and output path for the trained model.
        data_transformation_artifact: transformed train/test array paths
            and the preprocessing object location.
        """
        try:
            logging.info(f"{'>>' * 30}Model trainer log started.{'<<' * 30} ")
            self.model_trainer_config = model_trainer_config
            self.data_transformation_artifact = data_transformation_artifact
        except Exception as e:
            raise HousingException(e, sys) from e

    def initiate_model_trainer(self) -> ModelTrainerArtifact:
        """Run model selection, evaluation, and persistence.

        Steps: load transformed train/test arrays, search for the best
        model via ModelFactory, re-evaluate every grid-searched candidate
        on both splits, wrap the winner with the preprocessing object,
        save it, and return a ModelTrainerArtifact with the metrics.
        """
        try:
            logging.info(f"Loading transformed training dataset")
            transformed_train_file_path = self.data_transformation_artifact.transformed_train_file_path
            train_array = load_numpy_array_data(file_path=transformed_train_file_path)

            logging.info(f"Loading transformed testing dataset")
            transformed_test_file_path = self.data_transformation_artifact.transformed_test_file_path
            test_array = load_numpy_array_data(file_path=transformed_test_file_path)

            # Last column of each array is the target; the rest are inputs.
            logging.info(f"Splitting training and testing input and target feature")
            x_train, y_train = train_array[:, :-1], train_array[:, -1]
            x_test, y_test = test_array[:, :-1], test_array[:, -1]

            logging.info(f"Extracting model config file path")
            model_config_file_path = self.model_trainer_config.model_config_file_path

            logging.info(f"Initializing model factory class using above model config file: {model_config_file_path}")
            model_factory = ModelFactory(model_config_path=model_config_file_path)

            base_accuracy = self.model_trainer_config.base_accuracy
            logging.info(f"Expected accuracy: {base_accuracy}")

            # Typo fix in log message: "selecttion" -> "selection".
            logging.info(f"Initiating operation model selection")
            best_model = model_factory.get_best_model(X=x_train, y=y_train, base_accuracy=base_accuracy)
            logging.info(f"Best model found on training dataset: {best_model}")

            logging.info(f"Extracting trained model list.")
            grid_searched_best_model_list: List[GridSearchedBestModel] = model_factory.grid_searched_best_model_list
            model_list = [model.best_model for model in grid_searched_best_model_list]

            # Re-evaluate every candidate on both splits so the final pick
            # is not based on training performance alone.
            logging.info(f"Evaluating all trained models on both training and testing datasets")
            metric_info: MetricInfoArtifact = evaluate_regression_model(model_list=model_list,
                                                                        X_train=x_train, y_train=y_train,
                                                                        X_test=x_test, y_test=y_test,
                                                                        base_accuracy=base_accuracy)
            logging.info(f"Best found model on both training and testing dataset.")

            preprocessing_obj = load_object(file_path=self.data_transformation_artifact.preprocessed_object_file_path)
            model_object = metric_info.model_object

            # Bundle preprocessing + model so inference accepts raw features.
            trained_model_file_path = self.model_trainer_config.trained_model_file_path
            housing_model = HousingEstimatorModel(preprocessing_object=preprocessing_obj,
                                                  trained_model_object=model_object)
            logging.info(f"Saving model at path: {trained_model_file_path}")
            save_object(file_path=trained_model_file_path, obj=housing_model)

            model_trainer_artifact = ModelTrainerArtifact(is_trained=True,
                                                          message="Model Trained successfully",
                                                          trained_model_file_path=trained_model_file_path,
                                                          train_rmse=metric_info.train_rmse,
                                                          test_rmse=metric_info.test_rmse,
                                                          train_accuracy=metric_info.train_accuracy,
                                                          test_accuracy=metric_info.test_accuracy,
                                                          model_accuracy=metric_info.model_accuracy)
            logging.info(f"Model Trainer Artifact: {model_trainer_artifact}")
            return model_trainer_artifact
        except Exception as e:
            raise HousingException(e, sys) from e

    def __del__(self):
        # Marks stage completion in the log when the object is collected.
        logging.info(f"{'>>' * 30}Model trainer log completed.{'<<' * 30} ")
117 |
118 |
119 |
120 | #loading transformed training and testing datset
121 | #reading model config file
122 | #getting best model on training datset
123 | #evaludation models on both training & testing datset -->model object
124 | #loading preprocessing pbject
125 | #custom model object by combining both preprocessing obj and model obj
126 | #saving custom model object
127 | #return model_trainer_artifact
128 |
--------------------------------------------------------------------------------
/templates/header.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
16 |
17 |
19 | {% block head %}{% endblock %}
20 |
21 |
42 |
43 |
44 |
45 |
46 |
53 |
54 |
55 |
84 |
85 |
86 |
87 |
88 |
89 |