├── .gitignore ├── .idea ├── .gitignore ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── pyspark machine learning.iml └── vcs.xml ├── README.md ├── __pycache__ └── utility.cpython-37.pyc ├── artifacts ├── Prediction_Output_DIR │ └── Predictions.csv ├── model │ └── random_forest_regressor │ │ ├── data │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000-747a037b-1756-420a-87dd-e6e880fd80e5-c000.snappy.parquet.crc │ │ ├── _SUCCESS │ │ └── part-00000-747a037b-1756-420a-87dd-e6e880fd80e5-c000.snappy.parquet │ │ ├── metadata │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000.crc │ │ ├── _SUCCESS │ │ └── part-00000 │ │ └── treesMetadata │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000-4410d810-00a4-4a7e-af39-cb820647e48d-c000.snappy.parquet.crc │ │ ├── _SUCCESS │ │ └── part-00000-4410d810-00a4-4a7e-af39-cb820647e48d-c000.snappy.parquet ├── pipeline │ └── pipeline_model │ │ ├── metadata │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000.crc │ │ ├── _SUCCESS │ │ └── part-00000 │ │ └── stages │ │ ├── 0_StringIndexer_046a38b797e0 │ │ ├── data │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000-167b16f5-b0d5-4c5d-8ed3-b32962f50429-c000.snappy.parquet.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000-167b16f5-b0d5-4c5d-8ed3-b32962f50429-c000.snappy.parquet │ │ └── metadata │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ │ ├── 1_OneHotEncoder_ccd93b498912 │ │ ├── data │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000-624bba07-7841-4932-a362-837252099edf-c000.snappy.parquet.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000-624bba07-7841-4932-a362-837252099edf-c000.snappy.parquet │ │ └── metadata │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ │ └── 2_VectorAssembler_c52ccaa0dc60 │ │ └── metadata │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000.crc │ │ ├── _SUCCESS │ │ └── part-00000 ├── prediction_data │ ├── Prediction_FileFromDB │ │ └── master.csv │ └── Prediction_Raw_files_validated │ │ └── Good_raw │ │ └── HealthPrem_26092020_131534.csv └── training_data │ ├── Training_FileFromDB │ └── master.csv │ └── Training_Raw_files_validated │ └── Good_raw │ └── HealthPrem_26092020_131534.csv ├── config ├── params.yaml ├── schema_prediction.json └── schema_training.json ├── csv_to_kafka.py ├── data ├── Prediction_Batch_files │ └── HealthPrem_26092020_131534.csv └── training_batch_files │ ├── .gitignore │ └── HealthPrem_26092020_131534.csv ├── diagram ├── Drawing1.vsdx ├── streaming.jpg └── training and prediction.pdf ├── entry_point.py ├── insurance_exception ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── insurance_exception.cpython-37.pyc └── insurance_exception.py ├── insurance_prediction.egg-info ├── PKG-INFO ├── SOURCES.txt ├── dependency_links.txt └── top_level.txt ├── logger ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── logger.cpython-37.pyc └── logger.py ├── mongo_db ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── mongo_db_atlas.cpython-37.pyc └── mongo_db_atlas.py ├── new_data.csv ├── prediction ├── __init__.py ├── stage_00_data_loader.py ├── stage_01_data_validator.py ├── stage_02_data_transformer.py ├── stage_03_data_exporter.py └── stage_04_model_predictor.py ├── prediction_files └── HealthPrem_26092020_131534.csv ├── requirement.txt ├── setup.py ├── spark_consumer_from_kafka.py ├── streaming ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── __init__.cpython-38.pyc ├── consumer │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── 
__init__.cpython-38.pyc │ │ ├── kafka_to_spark_csv_consumer.cpython-37.pyc │ │ └── kafka_to_spark_csv_consumer.cpython-38.pyc │ └── kafka_to_spark_csv_consumer.py ├── producer │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── kafka_csv_data_producer.cpython-37.pyc │ │ └── kafka_csv_data_producer.cpython-38.pyc │ └── kafka_csv_data_producer.py ├── spark_manager │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── spark_manager.cpython-37.pyc │ │ └── spark_manager.cpython-38.pyc │ └── spark_manager.py └── transformer │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── __init__.cpython-38.pyc │ ├── spark_transformer.cpython-37.pyc │ └── spark_transformer.cpython-38.pyc │ └── spark_transformer.py ├── training ├── __init__.py ├── stage_00_data_loader.py ├── stage_01_data_validator.py ├── stage_02_data_transformer.py ├── stage_03_data_exporter.py └── stage_04_model_trainer.py └── utility.py /.gitignore: -------------------------------------------------------------------------------- 1 | insurance_prediction.egg-info -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/pyspark machine learning.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Spark Configuration on Windows 10 2 | 3 | 1. Download all required files from the URL below: 4 | ``` 5 | https://drive.google.com/drive/folders/1rBauyUVCRTbnKXgkMGh4l9MdIOVj8CQc?usp=sharing 6 | ``` 7 | 8 | 2. Install the Java .exe file 9 | > note: choose the "C:" drive as the Java installation path 10 | 11 | 3. Extract the Spark archive to the C: drive 12 | 13 | 4. Extract the Kafka archive to the C: drive 14 | 15 | 5. Add the environment variables listed below 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
| ENVIRONMENT VARIABLE NAME | VALUE |
| ------------------------- | ----- |
| HADOOP_HOME | C:\winutils |
| JAVA_HOME | C:\Java\jdk1.8.0_202 |
| SPARK_HOME | C:\spark-3.0.3-bin-hadoop2.7 |
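Once these variables and the Path entries from the next step are in place, and the conda environment described further below has been created, a quick way to confirm that Spark is usable from Python is to start a throwaway local session. This is a minimal sketch, assuming the `pyspark` package is installed in that environment:

```python
# Minimal sanity check for the local Spark setup.
# Assumption: the `pyspark` package is installed in the active conda environment.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")      # run Spark locally, using all available cores
    .appName("setup-check")
    .getOrCreate()
)

print(spark.version)         # should print the installed Spark version, e.g. 3.0.3
spark.stop()
```

If this prints a version and exits cleanly, the basic setup is sound; errors mentioning winutils.exe usually point back at the HADOOP_HOME value.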
35 | 36 | 6. Select the Path variable under environment variables and add the values below. 37 | ```buildoutcfg 38 | %SPARK_HOME%\bin 39 | ``` 40 | ```buildoutcfg 41 | %HADOOP_HOME%\bin 42 | ``` 43 | ```buildoutcfg 44 | %JAVA_HOME%\bin 45 | ``` 46 | ```buildoutcfg 47 | C:\Java\jre1.8.0_281\bin 48 | ``` 49 | ## Create conda environment 50 | 51 | 1. Open a conda terminal and execute the command below. 52 | 53 | ```buildoutcfg 54 | conda create -n <env_name> python=3.8 -y 55 | ``` 56 | 57 | 2. Select the environment created in the previous step as the project interpreter in PyCharm. 58 | 59 | 3. Install all necessary Python libraries specified in the requirement.txt file using the command below. 60 | ```buildoutcfg 61 | pip install -r requirement.txt 62 | ``` 63 | 64 | 65 | 4. To upload your code to a GitHub repo 66 | ``` 67 | git init 68 | git add . 69 | git commit -m "first commit" 70 | git branch -M main 71 | git remote add origin <github_repo_url> 72 | git push -u origin main 73 | ``` 74 | 75 | ## Train the random forest model on the insurance dataset 76 | ```buildoutcfg 77 | python training\stage_00_data_loader.py 78 | ``` 79 | ```buildoutcfg 80 | python training\stage_01_data_validator.py 81 | ``` 82 | ```buildoutcfg 83 | python training\stage_02_data_transformer.py 84 | ``` 85 | ```buildoutcfg 86 | python training\stage_03_data_exporter.py 87 | ``` 88 | ```buildoutcfg 89 | spark-submit training\stage_04_model_trainer.py 90 | ``` 91 | 92 | ## Prediction on the insurance dataset using the random forest model 93 | ```buildoutcfg 94 | python prediction\stage_00_data_loader.py 95 | ``` 96 | ```buildoutcfg 97 | python prediction\stage_01_data_validator.py 98 | ``` 99 | ```buildoutcfg 100 | python prediction\stage_02_data_transformer.py 101 | ``` 102 | ```buildoutcfg 103 | python prediction\stage_03_data_exporter.py 104 | ``` 105 | ```buildoutcfg 106 | spark-submit prediction\stage_04_model_predictor.py 107 | ``` 108 | 109 | 110 | 111 | ## Start the Zookeeper and Kafka servers 112 | 113 | 114 | 115 | ## Start the Kafka producer using the command below 116 | ```buildoutcfg 117 | spark-submit csv_to_kafka.py 118 | ``` 119 | 120 | ## Start the PySpark consumer using the command below 121 | ```buildoutcfg 122 | spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1 spark_consumer_from_kafka.py 123 | ``` -------------------------------------------------------------------------------- /__pycache__/utility.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/__pycache__/utility.cpython-37.pyc -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/data/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/data/.part-00000-747a037b-1756-420a-87dd-e6e880fd80e5-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/data/.part-00000-747a037b-1756-420a-87dd-e6e880fd80e5-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/data/_SUCCESS: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/data/_SUCCESS -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/data/part-00000-747a037b-1756-420a-87dd-e6e880fd80e5-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/data/part-00000-747a037b-1756-420a-87dd-e6e880fd80e5-c000.snappy.parquet -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/metadata/.part-00000.crc -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/metadata/_SUCCESS -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"org.apache.spark.ml.regression.RandomForestRegressionModel","timestamp":1636780450788,"sparkVersion":"3.0.3","uid":"RandomForestRegressor_ae83aa5164df","paramMap":{"labelCol":"expenses","featuresCol":"input_features"},"defaultParamMap":{"labelCol":"label","maxBins":32,"bootstrap":true,"cacheNodeIds":false,"predictionCol":"prediction","featureSubsetStrategy":"auto","featuresCol":"features","seed":469049852166159693,"leafCol":"","minInstancesPerNode":1,"checkpointInterval":10,"minInfoGain":0.0,"numTrees":20,"subsamplingRate":1.0,"maxDepth":5,"maxMemoryInMB":256,"impurity":"variance","minWeightFractionPerNode":0.0},"numFeatures":5,"numTrees":20} 2 | -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/treesMetadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/treesMetadata/.part-00000-4410d810-00a4-4a7e-af39-cb820647e48d-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/treesMetadata/.part-00000-4410d810-00a4-4a7e-af39-cb820647e48d-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/treesMetadata/_SUCCESS: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/treesMetadata/_SUCCESS -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/treesMetadata/part-00000-4410d810-00a4-4a7e-af39-cb820647e48d-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/treesMetadata/part-00000-4410d810-00a4-4a7e-af39-cb820647e48d-c000.snappy.parquet -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/metadata/.part-00000.crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/metadata/_SUCCESS -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"org.apache.spark.ml.PipelineModel","timestamp":1636780381034,"sparkVersion":"3.0.3","uid":"PipelineModel_463619223312","paramMap":{"stageUids":["StringIndexer_046a38b797e0","OneHotEncoder_ccd93b498912","VectorAssembler_c52ccaa0dc60"]},"defaultParamMap":{}} 2 | -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/.part-00000-167b16f5-b0d5-4c5d-8ed3-b32962f50429-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/.part-00000-167b16f5-b0d5-4c5d-8ed3-b32962f50429-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/_SUCCESS -------------------------------------------------------------------------------- 
/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/part-00000-167b16f5-b0d5-4c5d-8ed3-b32962f50429-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/part-00000-167b16f5-b0d5-4c5d-8ed3-b32962f50429-c000.snappy.parquet -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/metadata/.part-00000.crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/metadata/_SUCCESS -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"org.apache.spark.ml.feature.StringIndexerModel","timestamp":1636780381434,"sparkVersion":"3.0.3","uid":"StringIndexer_046a38b797e0","paramMap":{"inputCols":["sex","smoker"],"outputCols":["sex_encoder","smoker_encoder"]},"defaultParamMap":{"handleInvalid":"error","stringOrderType":"frequencyDesc","outputCol":"StringIndexer_046a38b797e0__output"}} 2 | -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/.part-00000-624bba07-7841-4932-a362-837252099edf-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/.part-00000-624bba07-7841-4932-a362-837252099edf-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/_SUCCESS 
-------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/part-00000-624bba07-7841-4932-a362-837252099edf-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/part-00000-624bba07-7841-4932-a362-837252099edf-c000.snappy.parquet -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/metadata/.part-00000.crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/metadata/_SUCCESS -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"org.apache.spark.ml.feature.OneHotEncoderModel","timestamp":1636780382829,"sparkVersion":"3.0.3","uid":"OneHotEncoder_ccd93b498912","paramMap":{"inputCols":["sex_encoder","smoker_encoder"],"outputCols":["sex_encoded","smoker_encoded"]},"defaultParamMap":{"dropLast":true,"outputCol":"OneHotEncoder_ccd93b498912__output","handleInvalid":"error"}} 2 | -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/2_VectorAssembler_c52ccaa0dc60/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/2_VectorAssembler_c52ccaa0dc60/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/2_VectorAssembler_c52ccaa0dc60/metadata/.part-00000.crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/2_VectorAssembler_c52ccaa0dc60/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/2_VectorAssembler_c52ccaa0dc60/metadata/_SUCCESS 
-------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/2_VectorAssembler_c52ccaa0dc60/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"org.apache.spark.ml.feature.VectorAssembler","timestamp":1636780383948,"sparkVersion":"3.0.3","uid":"VectorAssembler_c52ccaa0dc60","paramMap":{"inputCols":["age","bmi","children","sex_encoded","smoker_encoded"],"outputCol":"input_features"},"defaultParamMap":{"outputCol":"VectorAssembler_c52ccaa0dc60__output","handleInvalid":"error"}} 2 | -------------------------------------------------------------------------------- /artifacts/prediction_data/Prediction_FileFromDB/master.csv: -------------------------------------------------------------------------------- 1 | age,sex,bmi,children,smoker 2 | 19,female,27.9,0,yes 3 | 18,male,33.8,1,no 4 | 28,male,33.0,3,no 5 | 33,male,22.7,0,no 6 | 32,male,28.9,0,no 7 | 31,female,25.7,0,no 8 | 46,female,33.4,1,no 9 | 37,female,27.7,3,no 10 | 37,male,29.8,2,no 11 | 60,female,25.8,0,no 12 | 25,male,26.2,0,no 13 | 62,female,26.3,0,yes 14 | 23,male,34.4,0,no 15 | 56,female,39.8,0,no 16 | 27,male,42.1,0,yes 17 | 19,male,24.6,1,no 18 | 52,female,30.8,1,no 19 | 23,male,23.8,0,no 20 | 56,male,40.3,0,no 21 | 30,male,35.3,0,yes 22 | 60,female,36.0,0,no 23 | 30,female,32.4,1,no 24 | 18,male,34.1,0,no 25 | 34,female,31.9,1,yes 26 | 37,male,28.0,2,no 27 | 59,female,27.7,3,no 28 | 63,female,23.1,0,no 29 | 55,female,32.8,2,no 30 | 23,male,17.4,1,no 31 | 31,male,36.3,2,yes 32 | 22,male,35.6,0,yes 33 | 18,female,26.3,0,no 34 | 19,female,28.6,5,no 35 | 63,male,28.3,0,no 36 | 28,male,36.4,1,yes 37 | 19,male,20.4,0,no 38 | 62,female,33.0,3,no 39 | 26,male,20.8,0,no 40 | 35,male,36.7,1,yes 41 | 60,male,39.9,0,yes 42 | 24,female,26.6,0,no 43 | 31,female,36.6,2,no 44 | 41,male,21.8,1,no 45 | 37,female,30.8,2,no 46 | 38,male,37.1,1,no 47 | 55,male,37.3,0,no 48 | 18,female,38.7,2,no 49 | 28,female,34.8,0,no 50 | 60,female,24.5,0,no 51 | 36,male,35.2,1,yes 52 | 18,female,35.6,0,no 53 | 21,female,33.6,2,no 54 | 48,male,28.0,1,yes 55 | 36,male,34.4,0,yes 56 | 40,female,28.7,3,no 57 | 58,male,37.0,2,yes 58 | 58,female,31.8,2,no 59 | 18,male,31.7,2,yes 60 | 53,female,22.9,1,yes 61 | 34,female,37.3,2,no 62 | 43,male,27.4,3,no 63 | 25,male,33.7,4,no 64 | 64,male,24.7,1,no 65 | 28,female,25.9,1,no 66 | 20,female,22.4,0,yes 67 | 19,female,28.9,0,no 68 | 61,female,39.1,2,no 69 | 40,male,26.3,1,no 70 | 40,female,36.2,0,no 71 | 28,male,24.0,3,yes 72 | 27,female,24.8,0,yes 73 | 31,male,28.5,5,no 74 | 53,female,28.1,3,no 75 | 58,male,32.0,1,no 76 | 44,male,27.4,2,no 77 | 57,male,34.0,0,no 78 | 29,female,29.6,1,no 79 | 21,male,35.5,0,no 80 | 22,female,39.8,0,no 81 | 41,female,33.0,0,no 82 | 31,male,26.9,1,no 83 | 45,female,38.3,0,no 84 | 22,male,37.6,1,yes 85 | 48,female,41.2,4,no 86 | 37,female,34.8,2,yes 87 | 45,male,22.9,2,yes 88 | 57,female,31.2,0,yes 89 | 56,female,27.2,0,no 90 | 46,female,27.7,0,no 91 | 55,female,27.0,0,no 92 | 21,female,39.5,0,no 93 | 53,female,24.8,1,no 94 | 59,male,29.8,3,yes 95 | 35,male,34.8,2,no 96 | 64,female,31.3,2,yes 97 | 28,female,37.6,1,no 98 | 54,female,30.8,3,no 99 | 55,male,38.3,0,no 100 | 56,male,20.0,0,yes 101 | 38,male,19.3,0,yes 102 | 41,female,31.6,0,no 103 | 30,male,25.5,0,no 104 | 18,female,30.1,0,no 105 | 61,female,29.9,3,yes 106 | 34,female,27.5,1,no 107 | 20,male,28.0,1,yes 108 | 19,female,28.4,1,no 109 | 26,male,30.9,2,no 110 | 29,male,27.9,0,no 111 | 63,male,35.1,0,yes 112 | 
54,male,33.6,1,no 113 | 55,female,29.7,2,no 114 | 37,male,30.8,0,no 115 | 21,female,35.7,0,no 116 | 52,male,32.2,3,no 117 | 60,male,28.6,0,no 118 | 58,male,49.1,0,no 119 | 29,female,27.9,1,yes 120 | 49,female,27.2,0,no 121 | 37,female,23.4,2,no 122 | 44,male,37.1,2,no 123 | 18,male,23.8,0,no 124 | 20,female,29.0,0,no 125 | 44,male,31.4,1,yes 126 | 47,female,33.9,3,no 127 | 26,female,28.8,0,no 128 | 19,female,28.3,0,yes 129 | 52,female,37.4,0,no 130 | 32,female,17.8,2,yes 131 | 38,male,34.7,2,no 132 | 59,female,26.5,0,no 133 | 61,female,22.0,0,no 134 | 53,female,35.9,2,no 135 | 19,male,25.6,0,no 136 | 20,female,28.8,0,no 137 | 22,female,28.1,0,no 138 | 19,male,34.1,0,no 139 | 22,male,25.2,0,no 140 | 54,female,31.9,3,no 141 | 22,female,36.0,0,no 142 | 34,male,22.4,2,no 143 | 26,male,32.5,1,no 144 | 34,male,25.3,2,yes 145 | 29,male,29.7,2,no 146 | 30,male,28.7,3,yes 147 | 29,female,38.8,3,no 148 | 46,male,30.5,3,yes 149 | 51,female,37.7,1,no 150 | 53,female,37.4,1,no 151 | 19,male,28.4,1,no 152 | 35,male,24.1,1,no 153 | 48,male,29.7,0,no 154 | 32,female,37.1,3,no 155 | 42,female,23.4,0,yes 156 | 40,female,25.5,1,no 157 | 44,male,39.5,0,no 158 | 48,male,24.4,0,yes 159 | 18,male,25.2,0,yes 160 | 30,male,35.5,0,yes 161 | 50,female,27.8,3,no 162 | 42,female,26.6,0,yes 163 | 18,female,36.9,0,yes 164 | 54,male,39.6,1,no 165 | 32,female,29.8,2,no 166 | 37,male,29.6,0,no 167 | 47,male,28.2,4,no 168 | 20,female,37.0,5,no 169 | 32,female,33.2,3,no 170 | 19,female,31.8,1,no 171 | 27,male,18.9,3,no 172 | 63,male,41.5,0,no 173 | 49,male,30.3,0,no 174 | 18,male,16.0,0,no 175 | 35,female,34.8,1,no 176 | 24,female,33.3,0,no 177 | 63,female,37.7,0,yes 178 | 38,male,27.8,2,no 179 | 54,male,29.2,1,no 180 | 46,female,28.9,2,no 181 | 41,female,33.2,3,no 182 | 58,male,28.6,0,no 183 | 18,female,38.3,0,no 184 | 22,male,20.0,3,no 185 | 44,female,26.4,0,no 186 | 44,male,30.7,2,no 187 | 36,male,41.9,3,yes 188 | 26,female,29.9,2,no 189 | 30,female,30.9,3,no 190 | 41,female,32.2,1,no 191 | 29,female,32.1,2,no 192 | 61,male,31.6,0,no 193 | 36,female,26.2,0,no 194 | 25,male,25.7,0,no 195 | 56,female,26.6,1,no 196 | 18,male,34.4,0,no 197 | 19,male,30.6,0,no 198 | 39,female,32.8,0,no 199 | 45,female,28.6,2,no 200 | 51,female,18.1,0,no 201 | 64,female,39.3,0,no 202 | 19,female,32.1,0,no 203 | 48,female,32.2,1,no 204 | 60,female,24.0,0,no 205 | 27,female,36.1,0,yes 206 | 46,male,22.3,0,no 207 | 28,female,28.9,1,no 208 | 59,male,26.4,0,no 209 | 35,male,27.7,2,yes 210 | 63,female,31.8,0,no 211 | 40,male,41.2,1,no 212 | 20,male,33.0,1,no 213 | 40,male,30.9,4,no 214 | 24,male,28.5,2,no 215 | 34,female,26.7,1,no 216 | 45,female,30.9,2,no 217 | 41,female,37.1,2,no 218 | 53,female,26.6,0,no 219 | 27,male,23.1,0,no 220 | 26,female,29.9,1,no 221 | 24,female,23.2,0,no 222 | 34,female,33.7,1,no 223 | 53,female,33.3,0,no 224 | 32,male,30.8,3,no 225 | 19,male,34.8,0,yes 226 | 42,male,24.6,0,yes 227 | 55,male,33.9,3,no 228 | 28,male,38.1,0,no 229 | 58,female,41.9,0,no 230 | 41,female,31.6,1,no 231 | 47,male,25.5,2,no 232 | 42,female,36.2,1,no 233 | 59,female,27.8,3,no 234 | 19,female,17.8,0,no 235 | 59,male,27.5,1,no 236 | 39,male,24.5,2,no 237 | 40,female,22.2,2,yes 238 | 18,female,26.7,0,no 239 | 31,male,38.4,2,no 240 | 19,male,29.1,0,yes 241 | 44,male,38.1,1,no 242 | 23,female,36.7,2,yes 243 | 33,female,22.1,1,no 244 | 55,female,26.8,1,no 245 | 40,male,35.3,3,no 246 | 63,female,27.7,0,yes 247 | 54,male,30.0,0,no 248 | 60,female,38.1,0,no 249 | 24,male,35.9,0,no 250 | 19,male,20.9,1,no 251 | 29,male,29.0,1,no 252 | 18,male,17.3,2,yes 253 
| 63,female,32.2,2,yes 254 | 54,male,34.2,2,yes 255 | 27,male,30.3,3,no 256 | 50,male,31.8,0,yes 257 | 55,female,25.4,3,no 258 | 56,male,33.6,0,yes 259 | 38,female,40.2,0,no 260 | 51,male,24.4,4,no 261 | 19,male,31.9,0,yes 262 | 58,female,25.2,0,no 263 | 20,female,26.8,1,yes 264 | 52,male,24.3,3,yes 265 | 19,male,37.0,0,yes 266 | 53,female,38.1,3,no 267 | 46,male,42.4,3,yes 268 | 40,male,19.8,1,yes 269 | 59,female,32.4,3,no 270 | 45,male,30.2,1,no 271 | 49,male,25.8,1,no 272 | 18,male,29.4,1,no 273 | 50,male,34.2,2,yes 274 | 41,male,37.1,2,no 275 | 50,male,27.5,1,no 276 | 25,male,27.6,0,no 277 | 47,female,26.6,2,no 278 | 19,male,20.6,2,no 279 | 22,female,24.3,0,no 280 | 59,male,31.8,2,no 281 | 51,female,21.6,1,no 282 | 40,female,28.1,1,yes 283 | 54,male,40.6,3,yes 284 | 30,male,27.6,1,no 285 | 55,female,32.4,1,no 286 | 52,female,31.2,0,no 287 | 46,male,26.6,1,no 288 | 46,female,48.1,2,no 289 | 63,female,26.2,0,no 290 | 59,female,36.8,1,yes 291 | 52,male,26.4,3,no 292 | 28,female,33.4,0,no 293 | 29,male,29.6,1,no 294 | 25,male,45.5,2,yes 295 | 22,female,28.8,0,no 296 | 25,male,26.8,3,no 297 | 18,male,23.0,0,no 298 | 19,male,27.7,0,yes 299 | 47,male,25.4,1,yes 300 | 31,male,34.4,3,yes 301 | 48,female,28.9,1,no 302 | 36,male,27.6,3,no 303 | 53,female,22.6,3,yes 304 | 56,female,37.5,2,no 305 | 28,female,33.0,2,no 306 | 57,female,38.0,2,no 307 | 29,male,33.3,2,no 308 | 28,female,27.5,2,no 309 | 30,female,33.3,1,no 310 | 58,male,34.9,0,no 311 | 41,female,33.1,2,no 312 | 50,male,26.6,0,no 313 | 19,female,24.7,0,no 314 | 43,male,36.0,3,yes 315 | 49,male,35.9,0,no 316 | 27,female,31.4,0,yes 317 | 52,male,33.3,0,no 318 | 50,male,32.2,0,no 319 | 54,male,32.8,0,no 320 | 44,female,27.6,0,no 321 | 32,male,37.3,1,no 322 | 34,male,25.3,1,no 323 | 26,female,29.6,4,no 324 | 34,male,30.8,0,yes 325 | 57,male,40.9,0,no 326 | 29,male,27.2,0,no 327 | 40,male,34.1,1,no 328 | 27,female,23.2,1,no 329 | 45,male,36.5,2,yes 330 | 64,female,33.8,1,yes 331 | 52,male,36.7,0,no 332 | 61,female,36.4,1,yes 333 | 52,male,27.4,0,yes 334 | 61,female,31.2,0,no 335 | 56,female,28.8,0,no 336 | 43,female,35.7,2,no 337 | 64,male,34.5,0,no 338 | 60,male,25.7,0,no 339 | 62,male,27.6,1,no 340 | 50,male,32.3,1,yes 341 | 46,female,27.7,1,no 342 | 24,female,27.6,0,no 343 | 62,male,30.0,0,no 344 | 60,female,27.6,0,no 345 | 63,male,36.8,0,no 346 | 49,female,41.5,4,no 347 | 34,female,29.3,3,no 348 | 33,male,35.8,2,no 349 | 46,male,33.3,1,no 350 | 36,female,29.9,1,no 351 | 19,male,27.8,0,no 352 | 57,female,23.2,0,no 353 | 50,female,25.6,0,no 354 | 30,female,27.7,0,no 355 | 33,male,35.2,0,no 356 | 18,female,38.3,0,no 357 | 46,male,27.6,0,no 358 | 46,male,43.9,3,no 359 | 47,male,29.8,3,no 360 | 23,male,41.9,0,no 361 | 18,female,20.8,0,no 362 | 48,female,32.3,2,no 363 | 35,male,30.5,1,no 364 | 19,female,21.7,0,yes 365 | 21,female,26.4,1,no 366 | 21,female,21.9,2,no 367 | 49,female,30.8,1,no 368 | 56,female,32.3,3,no 369 | 42,female,25.0,2,no 370 | 44,male,32.0,2,no 371 | 18,male,30.4,3,no 372 | 61,female,21.1,0,no 373 | 57,female,22.2,0,no 374 | 42,female,33.2,1,no 375 | 26,male,32.9,2,yes 376 | 20,male,33.3,0,no 377 | 23,female,28.3,0,yes 378 | 39,female,24.9,3,yes 379 | 24,male,40.2,0,yes 380 | 64,female,30.1,3,no 381 | 62,male,31.5,1,no 382 | 27,female,18.0,2,yes 383 | 55,male,30.7,0,yes 384 | 55,male,33.0,0,no 385 | 35,female,43.3,2,no 386 | 44,male,22.1,2,no 387 | 19,male,34.4,0,no 388 | 58,female,39.1,0,no 389 | 50,male,25.4,2,no 390 | 26,female,22.6,0,no 391 | 24,female,30.2,3,no 392 | 48,male,35.6,4,no 393 | 19,female,37.4,0,no 394 | 
48,male,31.4,1,no 395 | 49,male,31.4,1,no 396 | 46,female,32.3,2,no 397 | 46,male,19.9,0,no 398 | 43,female,34.4,3,no 399 | 21,male,31.0,0,no 400 | 64,male,25.6,2,no 401 | 18,female,38.2,0,no 402 | 51,female,20.6,0,no 403 | 47,male,47.5,1,no 404 | 64,female,33.0,0,no 405 | 49,male,32.3,3,no 406 | 31,male,20.4,0,no 407 | 52,female,38.4,2,no 408 | 33,female,24.3,0,no 409 | 47,female,23.6,1,no 410 | 38,male,21.1,3,no 411 | 32,male,30.0,1,no 412 | 19,male,17.5,0,no 413 | 44,female,20.2,1,yes 414 | 26,female,17.2,2,yes 415 | 25,male,23.9,5,no 416 | 19,female,35.2,0,no 417 | 43,female,35.6,1,no 418 | 52,male,34.1,0,no 419 | 36,female,22.6,2,yes 420 | 64,male,39.2,1,no 421 | 63,female,27.0,0,yes 422 | 64,male,33.9,0,yes 423 | 61,male,35.9,0,yes 424 | 40,male,32.8,1,yes 425 | 25,male,30.6,0,no 426 | 48,male,30.2,2,no 427 | 45,male,24.3,5,no 428 | 38,female,27.3,1,no 429 | 18,female,29.2,0,no 430 | 21,female,16.8,1,no 431 | 27,female,30.4,3,no 432 | 19,male,33.1,0,no 433 | 29,female,20.2,2,no 434 | 42,male,26.9,0,no 435 | 60,female,30.5,0,no 436 | 31,male,28.6,1,no 437 | 60,male,33.1,3,no 438 | 22,male,31.7,0,no 439 | 35,male,28.9,3,no 440 | 52,female,46.8,5,no 441 | 26,male,29.5,0,no 442 | 31,female,32.7,1,no 443 | 33,female,33.5,0,yes 444 | 18,male,43.0,0,no 445 | 59,female,36.5,1,no 446 | 56,male,26.7,1,yes 447 | 45,female,33.1,0,no 448 | 60,male,29.6,0,no 449 | 56,female,25.7,0,no 450 | 40,female,29.6,0,no 451 | 35,male,38.6,1,no 452 | 39,male,29.6,4,no 453 | 30,male,24.1,1,no 454 | 24,male,23.4,0,no 455 | 20,male,29.7,0,no 456 | 32,male,46.5,2,no 457 | 59,male,37.4,0,no 458 | 55,female,30.1,2,no 459 | 57,female,30.5,0,no 460 | 56,male,39.6,0,no 461 | 40,female,33.0,3,no 462 | 49,female,36.6,3,no 463 | 42,male,30.0,0,yes 464 | 62,female,38.1,2,no 465 | 56,male,25.9,0,no 466 | 19,male,25.2,0,no 467 | 30,female,28.4,1,yes 468 | 60,female,28.7,1,no 469 | 56,female,33.8,2,no 470 | 28,female,24.3,1,no 471 | 18,female,24.1,1,no 472 | 27,male,32.7,0,no 473 | 18,female,30.1,0,no 474 | 19,female,29.8,0,no 475 | 47,female,33.3,0,no 476 | 54,male,25.1,3,yes 477 | 61,male,28.3,1,yes 478 | 24,male,28.5,0,yes 479 | 25,male,35.6,0,no 480 | 21,male,36.9,0,no 481 | 23,male,32.6,0,no 482 | 63,male,41.3,3,no 483 | 49,male,37.5,2,no 484 | 18,female,31.4,0,no 485 | 51,female,39.5,1,no 486 | 48,male,34.3,3,no 487 | 31,female,31.1,0,no 488 | 54,female,21.5,3,no 489 | 19,male,28.7,0,no 490 | 44,female,38.1,0,yes 491 | 53,male,31.2,1,no 492 | 19,female,32.9,0,no 493 | 61,female,25.1,0,no 494 | 18,female,25.1,0,no 495 | 61,male,43.4,0,no 496 | 21,male,25.7,4,yes 497 | 20,male,27.9,0,no 498 | 31,female,23.6,2,no 499 | 45,male,28.7,2,no 500 | 44,female,24.0,2,no 501 | 62,female,39.2,0,no 502 | 29,male,34.4,0,yes 503 | 43,male,26.0,0,no 504 | 51,male,23.2,1,yes 505 | 19,male,30.3,0,yes 506 | 38,female,28.9,1,no 507 | 37,male,30.9,3,no 508 | 22,male,31.4,1,no 509 | 21,male,23.8,2,no 510 | 24,female,25.3,0,no 511 | 57,female,28.7,0,no 512 | 56,male,32.1,1,no 513 | 27,male,33.7,0,no 514 | 51,male,22.4,0,no 515 | 19,male,30.4,0,no 516 | 39,male,28.3,1,yes 517 | 58,male,35.7,0,no 518 | 20,male,35.3,1,no 519 | 45,male,30.5,2,no 520 | 35,female,31.0,1,no 521 | 31,male,30.9,0,no 522 | 50,female,27.4,0,no 523 | 32,female,44.2,0,no 524 | 51,female,33.9,0,no 525 | 38,female,37.7,0,no 526 | 42,male,26.1,1,yes 527 | 18,female,33.9,0,no 528 | 19,female,30.6,2,no 529 | 51,female,25.8,1,no 530 | 46,male,39.4,1,no 531 | 18,male,25.5,0,no 532 | 57,male,42.1,1,yes 533 | 62,female,31.7,0,no 534 | 59,male,29.7,2,no 535 | 37,male,36.2,0,no 536 
| 64,male,40.5,0,no 537 | 38,male,28.0,1,no 538 | 33,female,38.9,3,no 539 | 46,female,30.2,2,no 540 | 46,female,28.1,1,no 541 | 53,male,31.4,0,no 542 | 34,female,38.0,3,no 543 | 20,female,31.8,2,no 544 | 63,female,36.3,0,no 545 | 54,female,47.4,0,yes 546 | 54,male,30.2,0,no 547 | 49,male,25.8,2,yes 548 | 28,male,35.4,0,no 549 | 54,female,46.7,2,no 550 | 25,female,28.6,0,no 551 | 43,female,46.2,0,yes 552 | 63,male,30.8,0,no 553 | 32,female,28.9,0,no 554 | 62,male,21.4,0,no 555 | 52,female,31.7,2,no 556 | 25,female,41.3,0,no 557 | 28,male,23.8,2,no 558 | 46,male,33.4,1,no 559 | 34,male,34.2,0,no 560 | 35,female,34.1,3,yes 561 | 19,male,35.5,0,no 562 | 46,female,20.0,2,no 563 | 54,female,32.7,0,no 564 | 27,male,30.5,0,no 565 | 50,male,44.8,1,no 566 | 18,female,32.1,2,no 567 | 19,female,30.5,0,no 568 | 38,female,40.6,1,no 569 | 41,male,30.6,2,no 570 | 49,female,31.9,5,no 571 | 48,male,40.6,2,yes 572 | 31,female,29.1,0,no 573 | 18,female,37.3,1,no 574 | 30,female,43.1,2,no 575 | 62,female,36.9,1,no 576 | 57,female,34.3,2,no 577 | 58,female,27.2,0,no 578 | 22,male,26.8,0,no 579 | 31,female,38.1,1,yes 580 | 52,male,30.2,1,no 581 | 25,female,23.5,0,no 582 | 59,male,25.5,1,no 583 | 19,male,30.6,0,no 584 | 39,male,45.4,2,no 585 | 32,female,23.7,1,no 586 | 19,male,20.7,0,no 587 | 33,female,28.3,1,no 588 | 21,male,20.2,3,no 589 | 34,female,30.2,1,yes 590 | 61,female,35.9,0,no 591 | 38,female,30.7,1,no 592 | 58,female,29.0,0,no 593 | 47,male,19.6,1,no 594 | 20,male,31.1,2,no 595 | 21,female,21.9,1,yes 596 | 41,male,40.3,0,no 597 | 46,female,33.7,1,no 598 | 42,female,29.5,2,no 599 | 34,female,33.3,1,no 600 | 43,male,32.6,2,no 601 | 52,female,37.5,2,no 602 | 18,female,39.2,0,no 603 | 51,male,31.6,0,no 604 | 56,female,25.3,0,no 605 | 64,female,39.1,3,no 606 | 19,female,28.3,0,yes 607 | 51,female,34.1,0,no 608 | 27,female,25.2,0,no 609 | 59,female,23.7,0,yes 610 | 28,male,27.0,2,no 611 | 30,male,37.8,2,yes 612 | 47,female,29.4,1,no 613 | 38,female,34.8,2,no 614 | 18,female,33.2,0,no 615 | 34,female,19.0,3,no 616 | 20,female,33.0,0,no 617 | 47,female,36.6,1,yes 618 | 56,female,28.6,0,no 619 | 49,male,25.6,2,yes 620 | 19,female,33.1,0,yes 621 | 55,female,37.1,0,no 622 | 30,male,31.4,1,no 623 | 37,male,34.1,4,yes 624 | 49,female,21.3,1,no 625 | 18,male,33.5,0,yes 626 | 59,male,28.8,0,no 627 | 29,female,26.0,0,no 628 | 36,male,28.9,3,no 629 | 33,male,42.5,1,no 630 | 58,male,38.0,0,no 631 | 44,female,39.0,0,yes 632 | 53,male,36.1,1,no 633 | 24,male,29.3,0,no 634 | 29,female,35.5,0,no 635 | 40,male,22.7,2,no 636 | 51,male,39.7,1,no 637 | 64,male,38.2,0,no 638 | 19,female,24.5,1,no 639 | 35,female,38.1,2,no 640 | 39,male,26.4,0,yes 641 | 56,male,33.7,4,no 642 | 33,male,42.4,5,no 643 | 42,male,28.3,3,yes 644 | 61,male,33.9,0,no 645 | 23,female,35.0,3,no 646 | 43,male,35.3,2,no 647 | 48,male,30.8,3,no 648 | 39,male,26.2,1,no 649 | 40,female,23.4,3,no 650 | 18,male,28.5,0,no 651 | 58,female,33.0,0,no 652 | 49,female,42.7,2,no 653 | 53,female,39.6,1,no 654 | 48,female,31.1,0,no 655 | 45,female,36.3,2,no 656 | 59,female,35.2,0,no 657 | 52,female,25.3,2,yes 658 | 26,female,42.4,1,no 659 | 27,male,33.2,2,no 660 | 48,female,35.9,1,no 661 | 57,female,28.8,4,no 662 | 37,male,46.5,3,no 663 | 57,female,24.0,1,no 664 | 32,female,31.5,1,no 665 | 18,male,33.7,0,no 666 | 64,female,23.0,0,yes 667 | 43,male,38.1,2,yes 668 | 49,male,28.7,1,no 669 | 40,female,32.8,2,yes 670 | 62,male,32.0,0,yes 671 | 40,female,29.8,1,no 672 | 30,male,31.6,3,no 673 | 29,female,31.2,0,no 674 | 36,male,29.7,0,no 675 | 41,female,31.0,0,no 676 | 
44,female,43.9,2,yes 677 | 45,male,21.4,0,no 678 | 55,female,40.8,3,no 679 | 60,male,31.4,3,yes 680 | 56,male,36.1,3,no 681 | 49,female,23.2,2,no 682 | 21,female,17.4,1,no 683 | 19,male,20.3,0,no 684 | 39,male,35.3,2,yes 685 | 53,male,24.3,0,no 686 | 33,female,18.5,1,no 687 | 53,male,26.4,2,no 688 | 42,male,26.1,2,no 689 | 40,male,41.7,0,no 690 | 47,female,24.1,1,no 691 | 27,male,31.1,1,yes 692 | 21,male,27.4,0,no 693 | 47,male,36.2,1,no 694 | 20,male,32.4,1,no 695 | 24,male,23.7,0,no 696 | 27,female,34.8,1,no 697 | 26,female,40.2,0,no 698 | 53,female,32.3,2,no 699 | 41,male,35.8,1,yes 700 | 56,male,33.7,0,no 701 | 23,female,39.3,2,no 702 | 21,female,34.9,0,no 703 | 50,female,44.7,0,no 704 | 53,male,41.5,0,no 705 | 34,female,26.4,1,no 706 | 47,female,29.5,1,no 707 | 33,female,32.9,2,no 708 | 51,female,38.1,0,yes 709 | 49,male,28.7,3,no 710 | 31,female,30.5,3,no 711 | 36,female,27.7,0,no 712 | 18,male,35.2,1,no 713 | 50,female,23.5,2,no 714 | 43,female,30.7,2,no 715 | 20,male,40.5,0,no 716 | 24,female,22.6,0,no 717 | 60,male,28.9,0,no 718 | 49,female,22.6,1,no 719 | 60,male,24.3,1,no 720 | 51,female,36.7,2,no 721 | 58,female,33.4,0,no 722 | 51,female,40.7,0,no 723 | 53,male,36.6,3,no 724 | 62,male,37.4,0,no 725 | 19,male,35.4,0,no 726 | 50,female,27.1,1,no 727 | 30,female,39.1,3,yes 728 | 41,male,28.4,1,no 729 | 29,female,21.8,1,yes 730 | 18,female,40.3,0,no 731 | 41,female,36.1,1,no 732 | 35,male,24.4,3,yes 733 | 53,male,21.4,1,no 734 | 24,female,30.1,3,no 735 | 48,female,27.3,1,no 736 | 59,female,32.1,3,no 737 | 49,female,34.8,1,no 738 | 37,female,38.4,0,yes 739 | 26,male,23.7,2,no 740 | 23,male,31.7,3,yes 741 | 29,male,35.5,2,yes 742 | 45,male,24.0,2,no 743 | 27,male,29.2,0,yes 744 | 53,male,34.1,0,yes 745 | 31,female,26.6,0,no 746 | 50,male,26.4,0,no 747 | 50,female,30.1,1,no 748 | 34,male,27.0,2,no 749 | 19,male,21.8,0,no 750 | 47,female,36.0,1,no 751 | 28,male,30.9,0,no 752 | 37,female,26.4,0,yes 753 | 21,male,29.0,0,no 754 | 64,male,37.9,0,no 755 | 58,female,22.8,0,no 756 | 24,male,33.6,4,no 757 | 31,male,27.6,2,no 758 | 39,female,22.8,3,no 759 | 47,female,27.8,0,yes 760 | 30,male,37.4,3,no 761 | 18,male,38.2,0,yes 762 | 22,female,34.6,2,no 763 | 23,male,35.2,1,no 764 | 33,male,27.1,1,yes 765 | 27,male,26.0,0,no 766 | 45,female,25.2,2,no 767 | 57,female,31.8,0,no 768 | 47,male,32.3,1,no 769 | 42,female,29.0,1,no 770 | 64,female,39.7,0,no 771 | 38,female,19.5,2,no 772 | 61,male,36.1,3,no 773 | 53,female,26.7,2,no 774 | 44,female,36.5,0,no 775 | 19,female,28.9,0,yes 776 | 41,male,34.2,2,no 777 | 51,male,33.3,3,no 778 | 40,male,32.3,2,no 779 | 45,male,39.8,0,no 780 | 35,male,34.3,3,no 781 | 53,male,28.9,0,no 782 | 30,male,24.4,3,yes 783 | 18,male,41.1,0,no 784 | 51,male,36.0,1,no 785 | 50,female,27.6,1,yes 786 | 31,female,29.3,1,no 787 | 35,female,27.7,3,no 788 | 60,male,37.0,0,no 789 | 21,male,36.9,0,no 790 | 29,male,22.5,3,no 791 | 62,female,29.9,0,no 792 | 39,female,41.8,0,no 793 | 19,male,27.6,0,no 794 | 22,female,23.2,0,no 795 | 53,male,20.9,0,yes 796 | 39,female,31.9,2,no 797 | 27,male,28.5,0,yes 798 | 30,male,44.2,2,no 799 | 30,female,22.9,1,no 800 | 58,female,33.1,0,no 801 | 33,male,24.8,0,yes 802 | 42,female,26.2,1,no 803 | 64,female,36.0,0,no 804 | 21,male,22.3,1,no 805 | 18,female,42.2,0,yes 806 | 23,male,26.5,0,no 807 | 45,female,35.8,0,no 808 | 40,female,41.4,1,no 809 | 19,female,36.6,0,no 810 | 18,male,30.1,0,no 811 | 25,male,25.8,1,no 812 | 46,female,30.8,3,no 813 | 33,female,42.9,3,no 814 | 54,male,21.0,2,no 815 | 28,male,22.5,2,no 816 | 36,male,34.4,2,no 817 | 
20,female,31.5,0,no 818 | 24,female,24.2,0,no 819 | 23,male,37.1,3,no 820 | 47,female,26.1,1,yes 821 | 33,female,35.5,0,yes 822 | 45,male,33.7,1,no 823 | 26,male,17.7,0,no 824 | 18,female,31.1,0,no 825 | 44,female,29.8,2,no 826 | 60,male,24.3,0,no 827 | 64,female,31.8,2,no 828 | 56,male,31.8,2,yes 829 | 36,male,28.0,1,yes 830 | 41,male,30.8,3,yes 831 | 39,male,21.9,1,no 832 | 63,male,33.1,0,no 833 | 36,female,25.8,0,no 834 | 28,female,23.8,2,no 835 | 58,male,34.4,0,no 836 | 36,male,33.8,1,no 837 | 42,male,36.0,2,no 838 | 36,male,31.5,0,no 839 | 56,female,28.3,0,no 840 | 35,female,23.5,2,no 841 | 59,female,31.4,0,no 842 | 21,male,31.1,0,no 843 | 59,male,24.7,0,no 844 | 23,female,32.8,2,yes 845 | 57,female,29.8,0,yes 846 | 53,male,30.5,0,no 847 | 60,female,32.5,0,yes 848 | 51,female,34.2,1,no 849 | 23,male,50.4,1,no 850 | 27,female,24.1,0,no 851 | 55,male,32.8,0,no 852 | 37,female,30.8,0,yes 853 | 61,male,32.3,2,no 854 | 46,female,35.5,0,yes 855 | 53,female,23.8,2,no 856 | 49,female,23.8,3,yes 857 | 20,female,29.6,0,no 858 | 48,female,33.1,0,yes 859 | 25,male,24.1,0,yes 860 | 25,female,32.2,1,no 861 | 57,male,28.1,0,no 862 | 37,female,47.6,2,yes 863 | 38,female,28.0,3,no 864 | 55,female,33.5,2,no 865 | 36,female,19.9,0,no 866 | 51,male,25.4,0,no 867 | 40,male,29.9,2,no 868 | 18,male,37.3,0,no 869 | 57,male,43.7,1,no 870 | 61,male,23.7,0,no 871 | 25,female,24.3,3,no 872 | 50,male,36.2,0,no 873 | 26,female,29.5,1,no 874 | 42,male,24.9,0,no 875 | 43,male,30.1,1,no 876 | 44,male,21.9,3,no 877 | 23,female,28.1,0,no 878 | 49,female,27.1,1,no 879 | 33,male,33.4,5,no 880 | 41,male,28.8,1,no 881 | 37,female,29.5,2,no 882 | 22,male,34.8,3,no 883 | 23,male,27.4,1,no 884 | 21,female,22.1,0,no 885 | 51,female,37.1,3,yes 886 | 25,male,26.7,4,no 887 | 32,male,28.9,1,yes 888 | 57,male,29.0,0,yes 889 | 36,female,30.0,0,no 890 | 22,male,39.5,0,no 891 | 57,male,33.6,1,no 892 | 64,female,26.9,0,yes 893 | 36,female,29.0,4,no 894 | 54,male,24.0,0,no 895 | 47,male,38.9,2,yes 896 | 62,male,32.1,0,no 897 | 61,female,44.0,0,no 898 | 43,female,20.0,2,yes 899 | 19,male,25.6,1,no 900 | 18,female,40.3,0,no 901 | 19,female,22.5,0,no 902 | 49,male,22.5,0,no 903 | 60,male,40.9,0,yes 904 | 26,male,27.3,3,no 905 | 49,male,36.9,0,no 906 | 60,female,35.1,0,no 907 | 26,female,29.4,2,no 908 | 27,male,32.6,3,no 909 | 44,female,32.3,1,no 910 | 63,male,39.8,3,no 911 | 32,female,24.6,0,yes 912 | 22,male,28.3,1,no 913 | 18,male,31.7,0,yes 914 | 59,female,26.7,3,no 915 | 44,female,27.5,1,no 916 | 33,male,24.6,2,no 917 | 24,female,34.0,0,no 918 | 43,female,26.9,0,yes 919 | 45,male,22.9,0,yes 920 | 61,female,28.2,0,no 921 | 35,female,34.2,1,no 922 | 62,female,25.0,0,no 923 | 62,female,33.2,0,no 924 | 38,male,31.0,1,no 925 | 34,male,35.8,0,no 926 | 43,male,23.2,0,no 927 | 50,male,32.1,2,no 928 | 19,female,23.4,2,no 929 | 57,female,20.1,1,no 930 | 62,female,39.2,0,no 931 | 41,male,34.2,1,no 932 | 26,male,46.5,1,no 933 | 39,female,32.5,1,no 934 | 46,male,25.8,5,no 935 | 45,female,35.3,0,no 936 | 32,male,37.2,2,no 937 | 59,female,27.5,0,no 938 | 44,male,29.7,2,no 939 | 39,female,24.2,5,no 940 | 18,male,26.2,2,no 941 | 53,male,29.5,0,no 942 | 18,male,23.2,0,no 943 | 50,female,46.1,1,no 944 | 18,female,40.2,0,no 945 | 19,male,22.6,0,no 946 | 62,male,39.9,0,no 947 | 56,female,35.8,1,no 948 | 42,male,35.8,2,no 949 | 37,male,34.2,1,yes 950 | 42,male,31.3,0,no 951 | 25,male,29.7,3,yes 952 | 57,male,18.3,0,no 953 | 51,male,42.9,2,yes 954 | 30,female,28.4,1,no 955 | 44,male,30.2,2,yes 956 | 34,male,27.8,1,yes 957 | 31,male,39.5,1,no 958 | 
54,male,30.8,1,yes 959 | 24,male,26.8,1,no 960 | 43,male,35.0,1,yes 961 | 48,male,36.7,1,no 962 | 19,female,39.6,1,no 963 | 29,female,25.9,0,no 964 | 63,female,35.2,1,no 965 | 46,male,24.8,3,no 966 | 52,male,36.8,2,no 967 | 35,male,27.1,1,no 968 | 51,male,24.8,2,yes 969 | 44,male,25.4,1,no 970 | 21,male,25.7,2,no 971 | 39,female,34.3,5,no 972 | 50,female,28.2,3,no 973 | 34,female,23.6,0,no 974 | 22,female,20.2,0,no 975 | 19,female,40.5,0,no 976 | 26,male,35.4,0,no 977 | 29,male,22.9,0,yes 978 | 48,male,40.2,0,no 979 | 26,male,29.2,1,no 980 | 45,female,40.0,3,no 981 | 36,female,29.9,0,no 982 | 54,male,25.5,1,no 983 | 34,male,21.4,0,no 984 | 31,male,25.9,3,yes 985 | 27,female,30.6,1,no 986 | 20,male,30.1,5,no 987 | 44,female,25.8,1,no 988 | 43,male,30.1,3,no 989 | 45,female,27.6,1,no 990 | 34,male,34.7,0,no 991 | 24,female,20.5,0,yes 992 | 26,female,19.8,1,no 993 | 38,female,27.8,2,no 994 | 50,female,31.6,2,no 995 | 38,male,28.3,1,no 996 | 27,female,20.0,3,yes 997 | 39,female,23.3,3,no 998 | 39,female,34.1,3,no 999 | 63,female,36.9,0,no 1000 | 33,female,36.3,3,no 1001 | 36,female,26.9,0,no 1002 | 30,male,23.0,2,yes 1003 | 24,male,32.7,0,yes 1004 | 24,male,25.8,0,no 1005 | 48,male,29.6,0,no 1006 | 47,male,19.2,1,no 1007 | 29,male,31.7,2,no 1008 | 28,male,29.3,2,no 1009 | 47,male,28.2,3,yes 1010 | 25,male,25.0,2,no 1011 | 51,male,27.7,1,no 1012 | 48,female,22.8,0,no 1013 | 43,male,20.1,2,yes 1014 | 61,female,33.3,4,no 1015 | 48,male,32.3,1,no 1016 | 38,female,27.6,0,no 1017 | 59,male,25.5,0,no 1018 | 19,female,24.6,1,no 1019 | 26,female,34.2,2,no 1020 | 54,female,35.8,3,no 1021 | 21,female,32.7,2,no 1022 | 51,male,37.0,0,no 1023 | 22,female,31.0,3,yes 1024 | 47,male,36.1,1,yes 1025 | 18,male,23.3,1,no 1026 | 47,female,45.3,1,no 1027 | 21,female,34.6,0,no 1028 | 19,male,26.0,1,yes 1029 | 23,male,18.7,0,no 1030 | 54,male,31.6,0,no 1031 | 37,female,17.3,2,no 1032 | 46,female,23.7,1,yes 1033 | 55,female,35.2,0,yes 1034 | 30,female,27.9,0,no 1035 | 18,male,21.6,0,yes 1036 | 61,male,38.4,0,no 1037 | 54,female,23.0,3,no 1038 | 22,male,37.1,2,yes 1039 | 45,female,30.5,1,yes 1040 | 22,male,28.9,0,no 1041 | 19,male,27.3,2,no 1042 | 35,female,28.0,0,yes 1043 | 18,male,23.1,0,no 1044 | 20,male,30.7,0,yes 1045 | 28,female,25.8,0,no 1046 | 55,male,35.2,1,no 1047 | 43,female,24.7,2,yes 1048 | 43,female,25.1,0,no 1049 | 22,male,52.6,1,yes 1050 | 25,female,22.5,1,no 1051 | 49,male,30.9,0,yes 1052 | 44,female,37.0,1,no 1053 | 64,male,26.4,0,no 1054 | 49,male,29.8,1,no 1055 | 47,male,29.8,3,yes 1056 | 27,female,21.5,0,no 1057 | 55,male,27.6,0,no 1058 | 48,female,28.9,0,no 1059 | 45,female,31.8,0,no 1060 | 24,female,39.5,0,no 1061 | 32,male,33.8,1,no 1062 | 24,male,32.0,0,no 1063 | 57,male,27.9,1,no 1064 | 59,male,41.1,1,yes 1065 | 36,male,28.6,3,no 1066 | 29,female,25.6,4,no 1067 | 42,female,25.3,1,no 1068 | 48,male,37.3,2,no 1069 | 39,male,42.7,0,no 1070 | 63,male,21.7,1,no 1071 | 54,female,31.9,1,no 1072 | 37,male,37.1,1,yes 1073 | 63,male,31.4,0,no 1074 | 21,male,31.3,0,no 1075 | 54,female,28.9,2,no 1076 | 60,female,18.3,0,no 1077 | 32,female,29.6,1,no 1078 | 47,female,32.0,1,no 1079 | 21,male,26.0,0,no 1080 | 28,male,31.7,0,yes 1081 | 63,male,33.7,3,no 1082 | 18,male,21.8,2,no 1083 | 32,male,27.8,1,no 1084 | 38,male,20.0,1,no 1085 | 32,male,31.5,1,no 1086 | 62,female,30.5,2,no 1087 | 39,female,18.3,5,yes 1088 | 55,male,29.0,0,no 1089 | 57,male,31.5,0,no 1090 | 52,male,47.7,1,no 1091 | 56,male,22.1,0,no 1092 | 47,male,36.2,0,yes 1093 | 55,female,29.8,0,no 1094 | 23,male,32.7,3,no 1095 | 22,female,30.4,0,yes 
1096 | 50,female,33.7,4,no 1097 | 18,female,31.4,4,no 1098 | 51,female,35.0,2,yes 1099 | 22,male,33.8,0,no 1100 | 52,female,30.9,0,no 1101 | 25,female,34.0,1,no 1102 | 33,female,19.1,2,yes 1103 | 53,male,28.6,3,no 1104 | 29,male,38.9,1,no 1105 | 58,male,36.1,0,no 1106 | 37,male,29.8,0,no 1107 | 54,female,31.2,0,no 1108 | 49,female,29.9,0,no 1109 | 50,female,26.2,2,no 1110 | 26,male,30.0,1,no 1111 | 45,male,20.4,3,no 1112 | 54,female,32.3,1,no 1113 | 38,male,38.4,3,yes 1114 | 48,female,25.9,3,yes 1115 | 28,female,26.3,3,no 1116 | 23,male,24.5,0,no 1117 | 55,male,32.7,1,no 1118 | 41,male,29.6,5,no 1119 | 25,male,33.3,2,yes 1120 | 33,male,35.8,1,yes 1121 | 30,female,20.0,3,no 1122 | 23,female,31.4,0,yes 1123 | 46,male,38.2,2,no 1124 | 53,female,36.9,3,yes 1125 | 27,female,32.4,1,no 1126 | 23,female,42.8,1,yes 1127 | 63,female,25.1,0,no 1128 | 55,male,29.9,0,no 1129 | 35,female,35.9,2,no 1130 | 34,male,32.8,1,no 1131 | 19,female,18.6,0,no 1132 | 39,female,23.9,5,no 1133 | 27,male,45.9,2,no 1134 | 57,male,40.3,0,no 1135 | 52,female,18.3,0,no 1136 | 28,male,33.8,0,no 1137 | 50,female,28.1,3,no 1138 | 44,female,25.0,1,no 1139 | 26,female,22.2,0,no 1140 | 33,male,30.3,0,no 1141 | 19,female,32.5,0,yes 1142 | 50,male,37.1,1,no 1143 | 41,female,32.6,3,no 1144 | 52,female,24.9,0,no 1145 | 39,male,32.3,2,no 1146 | 50,male,32.3,2,no 1147 | 52,male,32.8,3,no 1148 | 60,male,32.8,0,yes 1149 | 20,female,31.9,0,no 1150 | 55,male,21.5,1,no 1151 | 42,male,34.1,0,no 1152 | 18,female,30.3,0,no 1153 | 58,female,36.5,0,no 1154 | 43,female,32.6,3,yes 1155 | 35,female,35.8,1,no 1156 | 48,female,27.9,4,no 1157 | 36,female,22.1,3,no 1158 | 19,male,44.9,0,yes 1159 | 23,female,23.2,2,no 1160 | 20,female,30.6,0,no 1161 | 32,female,41.1,0,no 1162 | 43,female,34.6,1,no 1163 | 34,male,42.1,2,no 1164 | 30,male,38.8,1,no 1165 | 18,female,28.2,0,no 1166 | 41,female,28.3,1,no 1167 | 35,female,26.1,0,no 1168 | 57,male,40.4,0,no 1169 | 29,female,24.6,2,no 1170 | 32,male,35.2,2,no 1171 | 37,female,34.1,1,no 1172 | 18,male,27.4,1,yes 1173 | 43,female,26.7,2,yes 1174 | 56,female,41.9,0,no 1175 | 38,male,29.3,2,no 1176 | 29,male,32.1,2,no 1177 | 22,female,27.1,0,no 1178 | 52,female,24.1,1,yes 1179 | 40,female,27.4,1,no 1180 | 23,female,34.9,0,no 1181 | 31,male,29.8,0,yes 1182 | 42,female,41.3,1,no 1183 | 24,female,29.9,0,no 1184 | 25,female,30.3,0,no 1185 | 48,female,27.4,1,no 1186 | 23,female,28.5,1,yes 1187 | 45,male,23.6,2,no 1188 | 20,male,35.6,3,yes 1189 | 62,female,32.7,0,no 1190 | 43,female,25.3,1,yes 1191 | 23,female,28.0,0,no 1192 | 31,female,32.8,2,no 1193 | 41,female,21.8,1,no 1194 | 58,female,32.4,1,no 1195 | 48,female,36.6,0,no 1196 | 31,female,21.8,0,no 1197 | 19,female,27.9,3,no 1198 | 19,female,30.0,0,yes 1199 | 41,male,33.6,0,no 1200 | 40,male,29.4,1,no 1201 | 31,female,25.8,2,no 1202 | 37,male,24.3,2,no 1203 | 46,male,40.4,2,no 1204 | 22,male,32.1,0,no 1205 | 51,male,32.3,1,no 1206 | 18,female,27.3,3,yes 1207 | 35,male,17.9,1,no 1208 | 59,female,34.8,2,no 1209 | 36,male,33.4,2,yes 1210 | 37,female,25.6,1,yes 1211 | 59,male,37.1,1,no 1212 | 36,male,30.9,1,no 1213 | 39,male,34.1,2,no 1214 | 18,male,21.5,0,no 1215 | 52,female,33.3,2,no 1216 | 27,female,31.3,1,no 1217 | 18,male,39.1,0,no 1218 | 40,male,25.1,0,no 1219 | 29,male,37.3,2,no 1220 | 46,female,34.6,1,yes 1221 | 38,female,30.2,3,no 1222 | 30,female,21.9,1,no 1223 | 40,male,25.0,2,no 1224 | 50,male,25.3,0,no 1225 | 20,female,24.4,0,yes 1226 | 41,male,23.9,1,no 1227 | 33,female,39.8,1,no 1228 | 38,male,16.8,2,no 1229 | 42,male,37.2,2,no 1230 | 56,male,34.4,0,no 
1231 | 58,male,30.3,0,no 1232 | 52,male,34.5,3,yes 1233 | 20,female,21.8,0,yes 1234 | 54,female,24.6,3,no 1235 | 58,male,23.3,0,no 1236 | 45,female,27.8,2,no 1237 | 26,male,31.1,0,no 1238 | 63,female,21.7,0,no 1239 | 58,female,28.2,0,no 1240 | 37,male,22.7,3,no 1241 | 25,female,42.1,1,no 1242 | 52,male,41.8,2,yes 1243 | 64,male,37.0,2,yes 1244 | 22,female,21.3,3,no 1245 | 28,female,33.1,0,no 1246 | 18,male,33.3,0,no 1247 | 28,male,24.3,5,no 1248 | 45,female,25.7,3,no 1249 | 33,male,29.4,4,no 1250 | 18,female,39.8,0,no 1251 | 32,male,33.6,1,yes 1252 | 24,male,29.8,0,yes 1253 | 19,male,19.8,0,no 1254 | 20,male,27.3,0,yes 1255 | 40,female,29.3,4,no 1256 | 34,female,27.7,0,no 1257 | 42,female,37.9,0,no 1258 | 51,female,36.4,3,no 1259 | 54,female,27.6,1,no 1260 | 55,male,37.7,3,no 1261 | 52,female,23.2,0,no 1262 | 32,female,20.5,0,no 1263 | 28,male,37.1,1,no 1264 | 41,female,28.1,1,no 1265 | 43,female,29.9,1,no 1266 | 49,female,33.3,2,no 1267 | 64,male,23.8,0,yes 1268 | 55,female,30.5,0,no 1269 | 24,male,31.1,0,yes 1270 | 20,female,33.3,0,no 1271 | 45,male,27.5,3,no 1272 | 26,male,33.9,1,no 1273 | 25,female,34.5,0,no 1274 | 43,male,25.5,5,no 1275 | 35,male,27.6,1,no 1276 | 26,male,27.1,0,yes 1277 | 57,male,23.7,0,no 1278 | 22,female,30.4,0,no 1279 | 32,female,29.7,0,no 1280 | 39,male,29.9,1,yes 1281 | 25,female,26.8,2,no 1282 | 48,female,33.3,0,no 1283 | 47,female,27.6,2,yes 1284 | 18,female,21.7,0,yes 1285 | 18,male,30.0,1,no 1286 | 61,male,36.3,1,yes 1287 | 47,female,24.3,0,no 1288 | 28,female,17.3,0,no 1289 | 36,female,25.9,1,no 1290 | 20,male,39.4,2,yes 1291 | 44,male,34.3,1,no 1292 | 38,female,20.0,2,no 1293 | 19,male,34.9,0,yes 1294 | 21,male,23.2,0,no 1295 | 46,male,25.7,3,no 1296 | 58,male,25.2,0,no 1297 | 20,male,22.0,1,no 1298 | 18,male,26.1,0,no 1299 | 28,female,26.5,2,no 1300 | 33,male,27.5,2,no 1301 | 19,female,25.7,1,no 1302 | 45,male,30.4,0,yes 1303 | 62,male,30.9,3,yes 1304 | 25,female,20.8,1,no 1305 | 43,male,27.8,0,yes 1306 | 42,male,24.6,2,yes 1307 | 24,female,27.7,0,no 1308 | 29,female,21.9,0,yes 1309 | 32,male,28.1,4,yes 1310 | 25,female,30.2,0,yes 1311 | 41,male,32.2,2,no 1312 | 42,male,26.3,1,no 1313 | 33,female,26.7,0,no 1314 | 34,male,42.9,1,no 1315 | 19,female,34.7,2,yes 1316 | 30,female,23.7,3,yes 1317 | 18,male,28.3,1,no 1318 | 19,female,20.6,0,no 1319 | 18,male,53.1,0,no 1320 | 35,male,39.7,4,no 1321 | 39,female,26.3,2,no 1322 | 31,male,31.1,3,no 1323 | 62,male,26.7,0,yes 1324 | 62,male,38.8,0,no 1325 | 42,female,40.4,2,yes 1326 | 31,male,25.9,1,no 1327 | 61,male,33.5,0,no 1328 | 42,female,32.9,0,no 1329 | 51,male,30.0,1,no 1330 | 23,female,24.2,2,no 1331 | 52,male,38.6,2,no 1332 | 57,female,25.7,2,no 1333 | 23,female,33.4,0,no 1334 | 52,female,44.7,3,no 1335 | 50,male,31.0,3,no 1336 | 18,female,31.9,0,no 1337 | 18,female,36.9,0,no 1338 | 21,female,25.8,0,no 1339 | 61,female,29.1,0,yes 1340 | -------------------------------------------------------------------------------- /config/params.yaml: -------------------------------------------------------------------------------- 1 | base: 2 | project_id: 5 3 | project: insurance-premium 4 | random_state: 42 5 | test_size: 0.2 6 | 7 | 8 | config: 9 | params: config/params.yaml 10 | schema_prediction: config/schema_prediction.json 11 | schema_training: config/schema_training.json 12 | 13 | cloud_provider: 14 | name: microsoft 15 | 16 | data_download: 17 | cloud_training_directory_path: data/training_batch_files/ 18 | cloud_prediction_directory_path: data/Prediction_Batch_files 19 | 20 | artifacts: 21 | training_data: 22 | 
good_file_path: artifacts/training_data/Training_Raw_files_validated/Good_raw 23 | bad_file_path: artifacts/training_data/Training_Raw_files_validated/Bad_raw 24 | archive_bad_file_path: artifacts/training_data/TrainingArchiveBadData 25 | training_file_from_db: artifacts/training_data/Training_FileFromDB 26 | master_csv: master.csv 27 | null_value_info_file_path: artifacts/training_data/preprocessing/null_value 28 | plots: artifacts/training_data/plots 29 | pipeline_path: artifacts/pipeline/pipeline_model 30 | 31 | model: 32 | model_path: artifacts/model 33 | 34 | 35 | prediction_data: 36 | good_file_path: artifacts/prediction_data/Prediction_Raw_files_validated/Good_raw 37 | bad_file_path: artifacts/prediction_data/Prediction_Raw_files_validated/Bad_raw 38 | archive_bad_file_path: artifacts/prediction_data/PredictionArchiveBadData 39 | prediction_file_from_db: artifacts/prediction_data/Prediction_FileFromDB 40 | master_csv: master.csv 41 | prediction_output_file_path: artifacts/Prediction_Output_DIR/ 42 | prediction_file_name: Predictions.csv 43 | 44 | data_source: 45 | Training_Batch_Files: Training_Batch_Files 46 | Prediction_Batch_Files: Prediction_Batch_Files 47 | 48 | kafka: 49 | topic_name: insurance-premium 50 | kafka_bootstrap_server: localhost:9092 51 | 52 | 53 | log_database: 54 | training_database_name: insurance_prediction_training 55 | prediction_database_name: insurance_prediction_prediction 56 | 57 | dataset: 58 | unwanted_column: 59 | - region 60 | 61 | 62 | database_detail: 63 | training_database_name: insurance_prediction_training 64 | prediction_database_name: insurance_prediction_prediction 65 | dataset_training_collection_name: insurance_prediction_training_dataset 66 | dataset_prediction_collection_name: insurance_prediction_prediction_dataset 67 | 68 | target_columns: 69 | columns: 70 | - expenses 71 | -------------------------------------------------------------------------------- /config/schema_prediction.json: -------------------------------------------------------------------------------- 1 | { 2 | "SampleFileName": "HealthPrem_26092020_131534.csv", 3 | "LengthOfDateStampInFile": 8, 4 | "LengthOfTimeStampInFile": 6, 5 | "NumberofColumns": 6, 6 | "ColName": { 7 | "age": "Integer", 8 | "sex": "varchar", 9 | "bmi": "float", 10 | "children": "Integer", 11 | "smoker": "varchar" 12 | } 13 | } -------------------------------------------------------------------------------- /config/schema_training.json: -------------------------------------------------------------------------------- 1 | { "SampleFileName": "HealthPrem_26092020_131534.csv", 2 | "LengthOfDateStampInFile": 8, 3 | "LengthOfTimeStampInFile": 6, 4 | "NumberofColumns" : 7, 5 | "ColName": { 6 | "age" : "Integer", 7 | "sex" : "varchar", 8 | "bmi" : "float", 9 | "children" : "Integer", 10 | "smoker" : "varchar", 11 | "region": " varchar", 12 | "expenses" : "float" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /csv_to_kafka.py: -------------------------------------------------------------------------------- 1 | from streaming.producer.kafka_csv_data_producer import KafkaCSVDataProducer 2 | from streaming.spark_manager.spark_manager import SparkManager 3 | 4 | if __name__ == "__main__": 5 | try: 6 | path = "prediction_files" 7 | spark_session = SparkManager().get_spark_session_object() 8 | kfk_csv_data_producer = KafkaCSVDataProducer( 9 | spark_session=spark_session, 10 | 11 | ) 12 | kfk_csv_data_producer.send_csv_data_to_kafka_topic(directory_path=path) 13 | 
except Exception as e: 14 | print(e) 15 | -------------------------------------------------------------------------------- /data/training_batch_files/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /diagram/Drawing1.vsdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/diagram/Drawing1.vsdx -------------------------------------------------------------------------------- /diagram/streaming.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/diagram/streaming.jpg -------------------------------------------------------------------------------- /diagram/training and prediction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/diagram/training and prediction.pdf -------------------------------------------------------------------------------- /entry_point.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from utility import get_logger_object_of_training, get_logger_object_of_prediction 5 | from training.stage_00_data_loader import loader_main 6 | from training.stage_01_data_validator import validation_main 7 | from training.stage_02_data_transformer import transform_main 8 | from training.stage_03_data_exporter import export_main 9 | from training.stage_04_model_trainer import train_main 10 | 11 | from prediction.stage_00_data_loader import loader_main as pred_loader_main 12 | from prediction.stage_01_data_validator import validation_main as pred_validation_main 13 | from prediction.stage_02_data_transformer import transform_main as pred_transform_main 14 | from prediction.stage_03_data_exporter import export_main as pred_export_main 15 | from prediction.stage_04_model_predictor import predict_main 16 | from insurance_exception.insurance_exception import InsuranceException as GenericException 17 | 18 | collection_name = "main_pipeline" 19 | 20 | 21 | def begin_training(execution_id, executed_by): 22 | try: 23 | args = dict() 24 | args['config'] = os.path.join("config", "params.yaml") 25 | logger = get_logger_object_of_training(config_path=args['config'], 26 | collection_name=collection_name, 27 | execution_id=execution_id, 28 | executed_by=executed_by 29 | ) 30 | 31 | args['datasource'] = None 32 | parsed_args = args 33 | logger.log(f"dictionary created.{args}") 34 | logger.log(f"{parsed_args}") 35 | logger.log("Data loading begin..") 36 | 37 | loader_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'], execution_id=execution_id, 38 | executed_by=executed_by) 39 | logger.log("Data loading completed..") 40 | logger.log("Data validation began..") 41 | 42 | validation_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'], 43 | execution_id=execution_id, 44 | executed_by=executed_by) 45 | logger.log("Data validation completed..") 46 | logger.log("Data transformation began..") 47 | 48 | transform_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'], 49 | execution_id=execution_id, 50 | executed_by=executed_by) 51 | logger.log("Data 
transformation completed..") 52 | logger.log("Export oberation began..") 53 | 54 | export_main(config_path=parsed_args['config'], execution_id=execution_id, 55 | executed_by=executed_by) 56 | logger.log("Export oberation completed..") 57 | logger.log("Training began..") 58 | 59 | train_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'], execution_id=execution_id, 60 | executed_by=executed_by) 61 | logger.log(f"Training completed") 62 | return {'status': True, 'message': 'Training completed successfully'} 63 | except Exception as e: 64 | generic_exception = GenericException( 65 | "Error occurred in module [{0}] method [{1}]" 66 | .format(begin_training.__module__, 67 | begin_training.__name__)) 68 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 69 | 70 | 71 | def begin_prediction(execution_id, executed_by): 72 | try: 73 | args = dict() 74 | args['config'] = os.path.join("config", "params.yaml") 75 | logger = get_logger_object_of_prediction(config_path=args['config'], 76 | collection_name=collection_name, 77 | execution_id=execution_id, 78 | executed_by=executed_by 79 | ) 80 | args['datasource'] = None 81 | parsed_args = args 82 | logger.log(f"dictionary created.{args}") 83 | 84 | logger.log(f"{parsed_args}") 85 | logger.log("Data loading begin..") 86 | 87 | pred_loader_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'], 88 | execution_id=execution_id, 89 | executed_by=executed_by 90 | ) 91 | logger.log("Data loading completed..") 92 | logger.log("Data validation began..") 93 | 94 | pred_validation_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'], 95 | execution_id=execution_id, 96 | executed_by=executed_by 97 | ) 98 | logger.log("Data validation completed..") 99 | logger.log("Data transformation began..") 100 | 101 | pred_transform_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'], 102 | execution_id=execution_id, 103 | executed_by=executed_by 104 | ) 105 | logger.log("Data transformation completed..") 106 | logger.log("Export oberation began..") 107 | 108 | pred_export_main(config_path=parsed_args['config']) 109 | logger.log("Export operation completed..") 110 | logger.log("Prediction began..") 111 | 112 | predict_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'], 113 | execution_id=execution_id, 114 | executed_by=executed_by 115 | ) 116 | logger.log("Prediction completed") 117 | return {'status': True, 'message': 'Prediction completed successfully'} 118 | except Exception as e: 119 | generic_exception = GenericException( 120 | "Error occurred in module [{0}] method [{1}]" 121 | .format(begin_prediction.__module__, 122 | begin_prediction.__name__)) 123 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 124 | -------------------------------------------------------------------------------- /insurance_exception/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/insurance_exception/__init__.py -------------------------------------------------------------------------------- /insurance_exception/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/insurance_exception/__pycache__/__init__.cpython-37.pyc 
-------------------------------------------------------------------------------- /insurance_exception/__pycache__/insurance_exception.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/insurance_exception/__pycache__/insurance_exception.cpython-37.pyc -------------------------------------------------------------------------------- /insurance_exception/insurance_exception.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | class InsuranceException(Exception): 5 | 6 | def __init__(self, error_message): 7 | """ 8 | 9 | :param error_message: error message in string format 10 | """ 11 | self.error_message = error_message 12 | 13 | def __repr__(self): 14 | return InsuranceException.__name__.__str__() 15 | 16 | def error_message_detail(self, error, error_detail): 17 | exc_type, exc_obj, exc_tb = error_detail.exc_info() 18 | file_name = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] 19 | error_message = "python script name [{0}] line number [{1}] error message [{2}]".format(file_name, 20 | exc_tb.tb_lineno, 21 | str(error)) 22 | self.error_message = self.error_message + " " + error_message 23 | return self.error_message 24 | 25 | def __str__(self): 26 | return self.error_message 27 | -------------------------------------------------------------------------------- /insurance_prediction.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: insurance-prediction 3 | Version: 0.0.3 4 | Summary: insurance-prediction 5 | Home-page: UNKNOWN 6 | Author: Avnish yadav 7 | License: MIT 8 | Platform: UNKNOWN 9 | 10 | UNKNOWN 11 | 12 | -------------------------------------------------------------------------------- /insurance_prediction.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | azure_blob_storage/__init__.py 3 | azure_blob_storage/azure_blob_storage.py 4 | insurance_exception/__init__.py 5 | insurance_exception/insurance_exception.py 6 | insurance_prediction.egg-info/PKG-INFO 7 | insurance_prediction.egg-info/SOURCES.txt 8 | insurance_prediction.egg-info/dependency_links.txt 9 | insurance_prediction.egg-info/top_level.txt 10 | logger/__init__.py 11 | logger/logger.py 12 | mongo_db/__init__.py 13 | mongo_db/mongo_db_atlas.py 14 | prediction/__init__.py 15 | prediction/stage_00_data_loader.py 16 | prediction/stage_01_data_validator.py 17 | prediction/stage_02_data_transformer.py 18 | prediction/stage_03_data_exporter.py 19 | prediction/stage_04_model_predictor.py 20 | streaming/__init__.py 21 | streaming/consumer/__init__.py 22 | streaming/consumer/kafka_to_spark_csv_consumer.py 23 | streaming/producer/__init__.py 24 | streaming/producer/kafka_csv_data_producer.py 25 | streaming/spark_manager/__init__.py 26 | streaming/spark_manager/spark_manager.py 27 | streaming/transformer/__init__.py 28 | streaming/transformer/spark_transformer.py 29 | training/__init__.py 30 | training/stage_00_data_loader.py 31 | training/stage_01_data_validator.py 32 | training/stage_02_data_transformer.py 33 | training/stage_03_data_exporter.py 34 | training/stage_04_model_trainer.py -------------------------------------------------------------------------------- /insurance_prediction.egg-info/dependency_links.txt: 
-------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /insurance_prediction.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | azure_blob_storage 2 | insurance_exception 3 | logger 4 | mongo_db 5 | prediction 6 | streaming 7 | training 8 | -------------------------------------------------------------------------------- /logger/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/logger/__init__.py -------------------------------------------------------------------------------- /logger/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/logger/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /logger/__pycache__/logger.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/logger/__pycache__/logger.cpython-37.pyc -------------------------------------------------------------------------------- /logger/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | import uuid 4 | import sys 5 | from insurance_exception.insurance_exception import InsuranceException as AppLoggerException 6 | from mongo_db.mongo_db_atlas import MongoDBOperation 7 | 8 | 9 | class AppLogger: 10 | def __init__(self, project_id, log_database, log_collection_name, executed_by, 11 | execution_id, is_log_enable=True): 12 | try: 13 | 14 | self.log_database = log_database 15 | self.log_collection_name = log_collection_name 16 | self.executed_by = executed_by 17 | self.execution_id = execution_id 18 | self.mongo_db_object = MongoDBOperation() 19 | self.project_id = project_id 20 | self.is_log_enable = is_log_enable 21 | except Exception as e: 22 | app_logger_exception = AppLoggerException( 23 | "Error occurred in module [{0}] class [{1}] method [{2}]" 24 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 25 | "__init__")) 26 | raise Exception(app_logger_exception.error_message_detail(str(e), sys)) from e 27 | 28 | def log(self, log_message): 29 | try: 30 | if not self.is_log_enable: 31 | return 0 32 | log_data = { 33 | 'execution_id': self.execution_id, 34 | 'message': log_message, 35 | 'executed_by': self.executed_by, 36 | 'project_id': self.project_id, 37 | 'updated_date_and_time': datetime.now().strftime("%H:%M:%S") 38 | } 39 | 40 | self.mongo_db_object.insert_record_in_collection( 41 | self.log_database, self.log_collection_name, log_data) 42 | except Exception as e: 43 | app_logger_exception = AppLoggerException( 44 | "Error occurred in module [{0}] class [{1}] method [{2}]" 45 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 46 | self.log.__name__)) 47 | raise Exception(app_logger_exception.error_message_detail(str(e), sys)) from e 48 | -------------------------------------------------------------------------------- /mongo_db/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/mongo_db/__init__.py -------------------------------------------------------------------------------- /mongo_db/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/mongo_db/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /mongo_db/__pycache__/mongo_db_atlas.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/mongo_db/__pycache__/mongo_db_atlas.cpython-37.pyc -------------------------------------------------------------------------------- /mongo_db/mongo_db_atlas.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Feb 8 06:06:50 2021 4 | 5 | @author: AvnishYadav 6 | """ 7 | # importing mongodb file 8 | import ssl 9 | import pymongo 10 | import json 11 | import pandas as pd 12 | import sys 13 | from insurance_exception.insurance_exception import InsuranceException as MongoDbException 14 | 15 | 16 | class MongoDBOperation: 17 | def __init__(self, user_name=None, password=None): 18 | try: 19 | if user_name is None or password is None: 20 | # creating initial object to fetch mongodb credentials 21 | credentials = { 22 | "user_name": "avnyadav", 23 | "password": "Aa327030" 24 | } # get_mongo_db_credentials() # return dictionary with user name and password 25 | self.__user_name = credentials['user_name'] 26 | self.__password = credentials['password'] 27 | else: 28 | self.__user_name = user_name 29 | self.__password = password 30 | 31 | except Exception as e: 32 | mongo_db_exception = MongoDbException( 33 | "Failed to instantiate mongo_db_object in module [{0}] class [{1}] method [{2}]" 34 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 35 | "__init__")) 36 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 37 | 38 | def get_mongo_db_url(self): 39 | """ 40 | :return: mongo_db_url 41 | """ 42 | try: 43 | url = "" 44 | return url 45 | except Exception as e: 46 | mongo_db_exception = MongoDbException( 47 | "Failed to fetch mongo_db url in module [{0}] class [{1}] method [{2}]" 48 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 49 | self.get_mongo_db_url.__name__)) 50 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 51 | 52 | def get_database_client_object(self): 53 | """ 54 | Return pymongoClient object to perform action with MongoDB 55 | """ 56 | try: 57 | 58 | url = 'mongodb+srv://{0}:{1}@cluster0.wz7et.mongodb.net/myFirstDatabase?retryWrites=true&w=majority'.format( 59 | self.__user_name, self.__password) 60 | client = pymongo.MongoClient(url, ssl_cert_reqs=ssl.CERT_NONE) # creating database client object 61 | return client 62 | except Exception as e: 63 | mongo_db_exception = MongoDbException( 64 | "Failed to fetch data base client object in module [{0}] class [{1}] method [{2}]" 65 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 66 | self.get_database_client_object.__name__)) 67 | raise 
Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 68 | 69 | def close_database_client_object(self, obj_name): 70 | """ 71 | 72 | 73 | Parameters 74 | ---------- 75 | obj_name : pymongo client 76 | DESCRIPTION.pymongo client object 77 | 78 | Raises 79 | ------ 80 | Exception 81 | Failed to close database connection-->. 82 | 83 | Returns 84 | ------- 85 | bool 86 | True if connection closed. 87 | 88 | """ 89 | try: 90 | obj_name.close() 91 | return True 92 | except Exception as e: 93 | mongo_db_exception = MongoDbException( 94 | "Failed to close database client object in module [{0}] class [{1}] method [{2}]" 95 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 96 | self.close_database_client_object.__name__)) 97 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 98 | 99 | def is_database_present(self, client, db_name): 100 | """ 101 | 102 | Parameters 103 | ---------- 104 | client : pymongo client 105 | DESCRIPTION. object which will be used to communicate with MongoDB 106 | db_name : string 107 | database name. 108 | 109 | Raises 110 | ------ 111 | Exception 112 | DESCRIPTION.If any exception occurs 113 | 114 | Returns 115 | ------- 116 | bool 117 | True if database already exists. 118 | 119 | """ 120 | try: 121 | if db_name in client.list_database_names(): 122 | return True 123 | else: 124 | return False 125 | except Exception as e: 126 | mongo_db_exception = MongoDbException( 127 | "Failed during checking database in module [{0}] class [{1}] method [{2}]" 128 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 129 | self.is_database_present.__name__)) 130 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 131 | 132 | def create_database(self, client, db_name): 133 | """ 134 | client: client object of database 135 | db_name: database name 136 | """ 137 | try: 138 | return client[db_name] 139 | except Exception as e: 140 | mongo_db_exception = MongoDbException( 141 | "Failure occurred during database creation steps in module [{0}] class [{1}] method [{2}]" 142 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 143 | self.create_database.__name__)) 144 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 145 | 146 | def create_collection_in_database(self, database, collection_name): 147 | """ 148 | database:database 149 | collection_name: name of collection 150 | return: 151 | collection object 152 | """ 153 | try: 154 | return database[collection_name] 155 | except Exception as e: 156 | mongo_db_exception = MongoDbException( 157 | "Failed during creating collection in database in module [{0}] class [{1}] method [{2}]" 158 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 159 | self.create_collection_in_database.__name__)) 160 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 161 | 162 | def is_collection_present(self, collection_name, database): 163 | """ 164 | 165 | 166 | Parameters 167 | ---------- 168 | collection_name : collection_name 169 | DESCRIPTION.collection name which needs to be verified 170 | database : TYPE 171 | DESCRIPTION.database in which collection needs to be checked for existence 172 | 173 | Raises 174 | ------ 175 | Exception 176 | DESCRIPTION. 177 | 178 | Returns 179 | ------- 180 | bool 181 | true if collection present in database.
182 |  183 | """ 184 | try: 185 | """It verifies the existence of collection name in a database""" 186 | collection_list = database.list_collection_names() 187 | 188 | if collection_name in collection_list: 189 | # print("Collection:'{COLLECTION_NAME}' in Database:'{DB_NAME}' exists") 190 | return True 191 | 192 | # print(f"Collection:'{COLLECTION_NAME}' in Database:'{DB_NAME}' does not exists OR \n no documents are present in the collection") 193 | return False 194 | except Exception as e: 195 | mongo_db_exception = MongoDbException( 196 | "Failed during checking collection in module [{0}] class [{1}] method [{2}]" 197 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 198 | self.is_collection_present.__name__)) 199 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 200 | 201 | def get_collection(self, collection_name, database): 202 | """ 203 | collection_name:collection name 204 | database=database 205 | ------------------------------------------ 206 | return collection object 207 | """ 208 | try: 209 | collection = self.create_collection_in_database(database, collection_name) 210 | return collection 211 | except Exception as e: 212 | mongo_db_exception = MongoDbException( 213 | "Failed in retrieval of collection in module [{0}] class [{1}] method [{2}]" 214 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 215 | self.get_collection.__name__)) 216 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 217 | 218 | def is_record_present(self, db_name, collection_name, record): 219 | """ 220 | db_name: database name 221 | collection_name: collection name 222 | record: records to search 223 | ---------------------------------------------- 224 | return True if record exists else return false 225 | """ 226 | try: 227 | client = self.get_database_client_object() # client object 228 | database = self.create_database(client, db_name) # database object 229 | collection = self.get_collection(collection_name, database) # collection object 230 | record_found = collection.find(record) # fetching record 231 | if record_found.count() > 0: 232 | client.close() 233 | return True 234 | else: 235 | client.close() 236 | return False 237 | except Exception as e: 238 | mongo_db_exception = MongoDbException( 239 | "Failed in fetching record in module [{0}] class [{1}] method [{2}]" 240 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 241 | self.is_record_present.__name__)) 242 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 243 | 244 | def create_record(self, collection, data): 245 | """ 246 | collection: collection object 247 | data: accepts a single record to insert into the collection 248 | ------------------------------------------- 249 | return 1 if record inserted 250 | """ 251 | try: 252 | collection.insert_one(data) # insertion of record in collection 253 | return 1 254 | except Exception as e: 255 | mongo_db_exception = MongoDbException( 256 | "Failed in inserting record in module [{0}] class [{1}] method [{2}]" 257 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 258 | self.create_record.__name__)) 259 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 260 | 261 | def create_records(self, collection, data): 262 | """ 263 | collection: collection object 264 | data: data which needs to be inserted 265 | -------------------------------------------- 266 | return no of record inserted 267 | """ 268 |
try: 269 | collection.insert_many(data) 270 | return len(data) 271 | except Exception as e: 272 | mongo_db_exception = MongoDbException( 273 | "Failed in inserting records in module [{0}] class [{1}] method [{2}]" 274 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 275 | self.create_records.__name__)) 276 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 277 | 278 | def insert_record_in_collection(self, db_name, collection_name, record): 279 | """ 280 | db_name: database name 281 | collection_name: collection name 282 | record: records to insert 283 | ------------------------------ 284 | return No of record inserted(int). 285 | """ 286 | try: 287 | no_of_row_inserted = 0 288 | client = self.get_database_client_object() 289 | database = self.create_database(client, db_name) 290 | collection = self.get_collection(collection_name, database) 291 | if not self.is_record_present(db_name, collection_name, record): 292 | no_of_row_inserted = self.create_record(collection=collection, data=record) 293 | client.close() 294 | return no_of_row_inserted 295 | except Exception as e: 296 | mongo_db_exception = MongoDbException( 297 | "Failed in inserting record in collection module [{0}] class [{1}] method [{2}]" 298 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 299 | self.insert_record_in_collection.__name__)) 300 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 301 | 302 | def drop_collection(self, db_name, collection_name): 303 | """ 304 | 305 | :param db_name: database name 306 | :param collection_name: collection name 307 | :return: True if collection dropped successfully. 308 | """ 309 | try: 310 | client = self.get_database_client_object() 311 | database = self.create_database(client, db_name) 312 | if self.is_collection_present(collection_name, database): 313 | collection_name = self.get_collection(collection_name, database) 314 | collection_name.drop() 315 | return True 316 | except Exception as e: 317 | mongo_db_exception = MongoDbException( 318 | "Failed in dropping collection module [{0}] class [{1}] method [{2}]" 319 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 320 | self.drop_collection.__name__)) 321 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 322 | 323 | def insert_records_in_collection(self, db_name, collection_name, records): 324 | """ 325 | db_name: database name 326 | collection_name: collection name 327 | records: records to insert 328 | """ 329 | try: 330 | no_of_row_inserted = 0 331 | client = self.get_database_client_object() 332 | database = self.create_database(client, db_name) 333 | collection = self.get_collection(collection_name, database) 334 | for record in records: 335 | if not self.is_record_present(db_name, collection_name, record): 336 | no_of_row_inserted = no_of_row_inserted + self.create_record(collection=collection, data=record) 337 | client.close() 338 | return no_of_row_inserted 339 | except Exception as e: 340 | mongo_db_exception = MongoDbException( 341 | "Failed in inserting records in collection module [{0}] class [{1}] method [{2}]" 342 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 343 | self.insert_records_in_collection.__name__)) 344 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 345 | 346 | def insert_dataframe_into_collection(self, db_name, collection_name, data_frame): 347 | """ 348 | db_name:Database Name 349 |
collection_name: collection name 350 | data_frame: dataframe which needs to be inserted 351 | return: 352 | 353 | """ 354 | try: 355 | data_frame.reset_index(drop=True, inplace=True) 356 | records = list(json.loads(data_frame.T.to_json()).values()) 357 | client = self.get_database_client_object() 358 | database = self.create_database(client, db_name) 359 | collection = self.get_collection(collection_name, database) 360 | collection.insert_many(records) 361 | return len(records) 362 | except Exception as e: 363 | mongo_db_exception = MongoDbException( 364 | "Failed in inserting dataframe in collection module [{0}] class [{1}] method [{2}]" 365 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 366 | self.insert_dataframe_into_collection.__name__)) 367 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 368 | 369 | def get_record(self, database_name, collection_name, query=None): 370 | try: 371 | client = self.get_database_client_object() 372 | database = self.create_database(client, database_name) 373 | collection = self.get_collection(collection_name=collection_name, database=database) 374 | record = collection.find_one(query) 375 | return record 376 | except Exception as e: 377 | mongo_db_exception = MongoDbException( 378 | "Failed in retriving record in collection module [{0}] class [{1}] method [{2}]" 379 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 380 | self.get_record.__name__)) 381 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 382 | 383 | def get_min_value_of_column(self, database_name, collection_name, query, column): 384 | """ 385 | 386 | :param database_name: 387 | :param collection_name: 388 | :param query: to get all record 389 | :param column: column name 390 | :return: minimum value 391 | """ 392 | try: 393 | client = self.get_database_client_object() 394 | database = self.create_database(client, database_name) 395 | collection = self.get_collection(collection_name=collection_name, database=database) 396 | min_value = collection.find(query).sort(column, pymongo.ASCENDING).limit(1) 397 | value = [min_val for min_val in min_value] 398 | if len(value) > 0: 399 | if column in value[0]: 400 | return value[0][column] 401 | else: 402 | return None 403 | else: 404 | return None 405 | except Exception as e: 406 | mongo_db_exception = MongoDbException( 407 | "Failed in getting minimum value from column in collection module [{0}] class [{1}] method [{2}]" 408 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 409 | self.get_record.__name__)) 410 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 411 | 412 | def get_max_value_of_column(self, database_name, collection_name, query, column): 413 | """ 414 | 415 | :param database_name: database name 416 | :param collection_name: collection name 417 | :param query: query 418 | :param column: column name 419 | :return: maximum value 420 | """ 421 | try: 422 | client = self.get_database_client_object() 423 | database = self.create_database(client, database_name) 424 | collection = self.get_collection(collection_name=collection_name, database=database) 425 | max_value = collection.find(query).sort(column, pymongo.DESCENDING).limit(1) 426 | value = [max_val for max_val in max_value] 427 | if len(value) > 0: 428 | if column in value[0]: 429 | return value[0][column] 430 | else: 431 | return None 432 | else: 433 | return None 434 | 435 | except Exception as e: 436 | 
mongo_db_exception = MongoDbException( 437 | "Failed in getting maximum value from column in collection module [{0}] class [{1}] method [{2}]" 438 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 439 | self.get_record.__name__)) 440 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 441 | 442 | def get_records(self, database_name, collection_name, query=None): 443 | """ 444 | 445 | :param database_name: 446 | :param collection_name: 447 | :param query: 448 | :return: cursor object you need to iterate 449 | """ 450 | try: 451 | client = self.get_database_client_object() 452 | database = self.create_database(client, database_name) 453 | collection = self.get_collection(collection_name=collection_name, database=database) 454 | record = collection.find(query) 455 | return record 456 | except Exception as e: 457 | mongo_db_exception = MongoDbException( 458 | "Failed in retriving records in collection module [{0}] class [{1}] method [{2}]" 459 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 460 | self.get_record.__name__)) 461 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 462 | 463 | def update_record_in_collection(self, database_name, collection_name, query, new_value): 464 | """ 465 | 466 | :param database_name: database name 467 | :param collection_name: collection name 468 | :param query: search for record 469 | :param new_value: updated values 470 | :return: n_updated row 471 | """ 472 | try: 473 | client = self.get_database_client_object() 474 | database = self.create_database(client, database_name) 475 | collection = self.get_collection(collection_name=collection_name, database=database) 476 | update_query = {'$set': new_value} 477 | result = collection.update_one(query, update_query) 478 | client.close() 479 | return result.raw_result["nModified"] 480 | except Exception as e: 481 | mongo_db_exception = MongoDbException( 482 | "Failed updating record in collection module [{0}] class [{1}] method [{2}]" 483 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 484 | self.update_record_in_collection.__name__)) 485 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 486 | 487 | def get_dataframe_of_collection(self, db_name, collection_name, query=None): 488 | """ 489 | 490 | Parameters 491 | ---------- 492 | db_name : string 493 | DESCRIPTION. database name 494 | collection_name : string 495 | DESCRIPTION.collection name 496 | 497 | Returns 498 | ------- 499 | Pandas data frame of collection name present database. 
500 | 501 | """ 502 | try: 503 | client = self.get_database_client_object() 504 | database = self.create_database(client, db_name) 505 | collection = self.get_collection(collection_name=collection_name, database=database) 506 | if query is None: 507 | query = {} 508 | df = pd.DataFrame(list(collection.find(query))) 509 | if "_id" in df.columns.to_list(): 510 | df = df.drop(columns=["_id"], axis=1) 511 | return df.copy() 512 | except Exception as e: 513 | mongo_db_exception = MongoDbException( 514 | "Failed in returning dataframe of collection module [{0}] class [{1}] method [{2}]" 515 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 516 | self.get_dataframe_of_collection.__name__)) 517 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 518 | 519 | def remove_record(self, db_name, collection_name, query): 520 | try: 521 | client = self.get_database_client_object() 522 | database = self.create_database(client, db_name) 523 | collection = self.get_collection(collection_name=collection_name, database=database) 524 | collection.delete_one(query) 525 | return True 526 | except Exception as e: 527 | mongo_db_exception = MongoDbException( 528 | "Failed in collection module [{0}] class [{1}] method [{2}]" 529 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 530 | self.remove_record.__name__)) 531 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 532 | -------------------------------------------------------------------------------- /new_data.csv: -------------------------------------------------------------------------------- 1 | age,sex,bmi,children,smoker,region,timestamp,sex_encoder,smoker_encoder,sex_encoded,smoker_encoded,input_features,prediction 2 | 21,male,25.7,2,no,northeast,2021-11-13 11:57:30.400,0.0,0.0,"(1,[0],[1.0])","(1,[0],[1.0])","[21.0,25.7,2.0,1.0,1.0]",6333.757487873788 3 | 39,female,34.3,5,no,southeast,2021-11-13 11:57:31.410,1.0,0.0,"(1,[],[])","(1,[0],[1.0])","[39.0,34.3,5.0,0.0,1.0]",9860.294896652984 4 | 50,female,28.2,3,no,southeast,2021-11-13 11:57:32.416,1.0,0.0,"(1,[],[])","(1,[0],[1.0])","[50.0,28.2,3.0,0.0,1.0]",12142.963316508103 5 | 34,female,23.6,0,no,northeast,2021-11-13 11:57:33.420,1.0,0.0,"(1,[],[])","(1,[0],[1.0])","[34.0,23.6,0.0,0.0,1.0]",6761.00125859063 6 | 22,female,20.2,0,no,northwest,2021-11-13 11:57:34.425,1.0,0.0,"(1,[],[])","(1,[0],[1.0])","[22.0,20.2,0.0,0.0,1.0]",3976.837587719984 7 | -------------------------------------------------------------------------------- /prediction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/prediction/__init__.py -------------------------------------------------------------------------------- /prediction/stage_00_data_loader.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import shutil 5 | from utility import read_params, get_logger_object_of_prediction 6 | from insurance_exception.insurance_exception import InsuranceException as GenericException 7 | from utility import clean_data_source_dir 8 | 9 | log_collection_name = "data_loader" 10 | 11 | 12 | def loader_main(config_path: str, datasource: str,is_logging_enable=True,execution_id=None,executed_by=None) -> None: 13 | try: 14 | logger = get_logger_object_of_prediction(config_path=config_path, 
collection_name=log_collection_name, 15 | execution_id=execution_id, executed_by=executed_by) 16 | 17 | 18 | logger.is_log_enable = is_logging_enable 19 | logger.log("Starting data loading operation.\nReading configuration file.") 20 | 21 | config = read_params(config_path) 22 | downloader_path=config['data_download']['cloud_prediction_directory_path'] 23 | download_path=config['data_source']['Prediction_Batch_Files'] 24 | 25 | 26 | logger.log("Configuration detail has been fetched from configuration file.") 27 | # removing existing training and additional training files from local 28 | logger.log(f"Cleaning local directory [{download_path}] for training.") 29 | clean_data_source_dir(download_path,logger=logger, is_logging_enable=is_logging_enable) # removing existing file from local system 30 | 31 | logger.log(f"Cleaning completed. Directory has been cleared now [{download_path}]") 32 | # downloading training and additional training file from cloud into local system 33 | logger.log("Data will be downloaded from cloud storage into local system") 34 | 35 | 36 | for file in os.listdir(downloader_path): 37 | if '.dvc' in file or '.gitignore' in file: 38 | continue 39 | print(f"Source dir: {downloader_path} file: {file} is being copied into destination dir: {download_path}" 40 | f" file: {file}") 41 | shutil.copy(os.path.join(downloader_path,file),os.path.join(download_path,file)) 42 | logger.log("Data has been downloaded from cloud storage into local system") 43 | 44 | except Exception as e: 45 | generic_exception = GenericException( 46 | "Error occurred in module [{0}] method [{1}]" 47 | .format(loader_main.__module__, 48 | loader_main.__name__)) 49 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 50 | 51 | 52 | if __name__ == '__main__': 53 | args = argparse.ArgumentParser() 54 | args.add_argument("--config", default=os.path.join("config", "params.yaml")) 55 | args.add_argument("--datasource", default=None) 56 | parsed_args = args.parse_args() 57 | print("started") 58 | loader_main(config_path=parsed_args.config, datasource=parsed_args.datasource) 59 | -------------------------------------------------------------------------------- /prediction/stage_01_data_validator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | 5 | import pandas as pd 6 | from utility import read_params, create_directory_path, values_from_schema_function, \ 7 | get_logger_object_of_prediction, get_date, get_time 8 | import argparse 9 | import shutil 10 | 11 | from insurance_exception.insurance_exception import InsuranceException as GenericException 12 | 13 | log_collection_name = "data_validator" 14 | 15 | 16 | class DataValidator: 17 | def __init__(self, config, logger, is_logging_enable=True): 18 | try: 19 | self.logger = logger 20 | self.logger.is_log_enable = is_logging_enable 21 | self.config = config 22 | self.file_path = self.config['data_source']['Prediction_Batch_Files'] 23 | self.good_file_path = self.config['artifacts']['prediction_data']['good_file_path'] 24 | self.bad_file_path = self.config['artifacts']['prediction_data']['bad_file_path'] 25 | self.archive_bad_file_path = self.config['artifacts']['prediction_data']['archive_bad_file_path'] 26 | self.prediction_schema_file = self.config['config']['schema_prediction'] 27 | except Exception as e: 28 | generic_exception = GenericException( 29 | "Error occurred in module [{0}] class [{1}] method [{2}]" 30 | .format(self.__module__, 
DataValidator.__name__, 31 | self.__init__.__name__)) 32 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 33 | 34 | def archive_bad_files(self): 35 | try: 36 | folder_name=f"bad_files_{get_date().replace('-','_')}_{get_time().replace(':','_')}" 37 | archive_directory_path=os.path.join(self.archive_bad_file_path,folder_name) 38 | create_directory_path(archive_directory_path) 39 | for file in os.listdir(self.bad_file_path): 40 | source_file_path=os.path.join(self.bad_file_path,file) 41 | shutil.move(source_file_path,archive_directory_path) 42 | except Exception as e: 43 | generic_exception = GenericException( 44 | "Error occurred in module [{0}] class [{1}] method [{2}]" 45 | .format(self.__module__, DataValidator.__name__, 46 | self.archive_bad_files.__name__)) 47 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 48 | 49 | 50 | def create_good_bad_archive_bad_file_path(self): 51 | try: 52 | create_directory_path(self.good_file_path) 53 | create_directory_path(self.bad_file_path) 54 | create_directory_path(self.archive_bad_file_path,is_recreate=False) 55 | except Exception as e: 56 | 57 | generic_exception = GenericException( 58 | "Error occurred in module [{0}] class [{1}] method [{2}]" 59 | .format(self.__module__, DataValidator.__name__, 60 | self.create_good_bad_archive_bad_file_path.__name__)) 61 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 62 | 63 | 64 | def value_from_schema(self): 65 | """ 66 | 67 | :return: tuple (sample_file_name,column_names,number_of_column) 68 | """ 69 | try: 70 | return values_from_schema_function(self.prediction_schema_file) 71 | except Exception as e: 72 | generic_exception = GenericException( 73 | "Error occurred in module [{0}] class [{1}] method [{2}]" 74 | .format(self.__module__, DataValidator.__name__, 75 | self.value_from_schema.__name__)) 76 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 77 | 78 | 79 | def file_name_regular_expression(self): 80 | """ 81 | 82 | :return: regular expression syntax which can be used for validation of file name 83 | """ 84 | return "['HealthPrem']+['\_'']+[\d_]+[\d]+\.csv" 85 | 86 | def validate_missing_values_in_whole_column(self): 87 | try: 88 | self.logger.log("Missing Values Validation Started!!") 89 | for file in os.listdir(self.good_file_path): 90 | csv = pd.read_csv(os.path.join(self.good_file_path, file)) 91 | count = 0 92 | for columns in csv: 93 | if (len(csv[columns]) - csv[columns].count()) == len(csv[columns]): 94 | count += 1 95 | shutil.move(os.path.join(self.good_file_path, file), 96 | self.bad_file_path) 97 | self.logger.log( 98 | "Invalid Column Length for the file!! 
File moved to Bad Raw Folder :: %s" % file) 99 | break 100 | if count == 0: 101 | print(csv.columns) 102 | csv.rename(columns={"Unnamed: 0": "Premium "}, inplace=True) 103 | csv.to_csv(os.path.join(self.good_file_path, file), index=None, header=True) 104 | except Exception as e: 105 | generic_exception = GenericException( 106 | "Error occurred in module [{0}] class [{1}] method [{2}]" 107 | .format(self.__module__, DataValidator.__name__, 108 | self.validate_missing_values_in_whole_column.__name__)) 109 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 110 | 111 | 112 | def validate_file_name(self): 113 | try: 114 | self.create_good_bad_archive_bad_file_path() 115 | pattern, length_of_date_stamp_in_file, length_of_time_stamp_in_file, column_names, number_of_columns = self.value_from_schema() 116 | 117 | file_name_reg_pattern = self.file_name_regular_expression() 118 | self.logger.log(f"Validating file names.") 119 | files = os.listdir(self.file_path) 120 | for file in files: 121 | file_path = os.path.join(self.file_path, file) 122 | split_at_dot = re.split('.csv', file) 123 | split_at_dot = (re.split('_', split_at_dot[0])) 124 | if re.match(file_name_reg_pattern, file) and len(split_at_dot[1]) == length_of_date_stamp_in_file \ 125 | and len(split_at_dot[2]) == length_of_time_stamp_in_file: 126 | destination_file_path = os.path.join(self.good_file_path, file) 127 | self.logger.log(f"file name : {file} matched hence moving file to good file path {destination_file_path}") 128 | shutil.move(file_path, destination_file_path) 129 | else: 130 | destination_file_path = os.path.join(self.bad_file_path, file) 131 | self.logger.log(f"file name: {file} does not matched hence moving file to bad file path {destination_file_path}") 132 | shutil.move(file_path, destination_file_path) 133 | except Exception as e: 134 | generic_exception = GenericException( 135 | "Error occurred in module [{0}] class [{1}] method [{2}]" 136 | .format(self.__module__, DataValidator.__name__, 137 | self.validate_file_name.__name__)) 138 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 139 | 140 | 141 | def validate_no_of_column(self, no_of_column): 142 | """ 143 | Description: 144 | If number of column matches then file will be move to good file path else bad file path 145 | ===================================================================================== 146 | :param no_of_column: int Number of column must present in each file 147 | :return: Nothing 148 | """ 149 | try: 150 | self.logger.log(f"Validating number of column in input file") 151 | files = os.listdir(self.good_file_path) 152 | for file in files: 153 | file_path = os.path.join(self.good_file_path, file) 154 | df = pd.read_csv(file_path) 155 | if df.shape[1] != no_of_column: 156 | destination_file_path = os.path.join(self.bad_file_path, file) 157 | self.logger.log(f"file: {file} has incorrect number of column hence moving file to bad file path {destination_file_path}") 158 | shutil.move(file_path, destination_file_path) 159 | except Exception as e: 160 | generic_exception = GenericException( 161 | "Error occurred in module [{0}] class [{1}] method [{2}]" 162 | .format(self.__module__, DataValidator.__name__, 163 | self.validate_no_of_column.__name__)) 164 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 165 | 166 | 167 | def validation_main(config_path: str, datasource: str, is_logging_enable=True,execution_id=None,executed_by=None) -> None: 168 | try: 169 | logger = 
get_logger_object_of_prediction(config_path=config_path, collection_name=log_collection_name, 170 | execution_id=execution_id, executed_by=executed_by) 171 | 172 | logger.is_log_enable = is_logging_enable 173 | config = read_params(config_path) 174 | logger.log("data validation started") 175 | data_validator = DataValidator(config=config, logger=logger, is_logging_enable=is_logging_enable) 176 | pattern, length_of_date_stamp_in_file, length_of_time_stamp_in_file, column_names, number_of_columns = \ 177 | data_validator.value_from_schema() 178 | data_validator.validate_file_name() 179 | data_validator.validate_no_of_column(no_of_column=number_of_columns) 180 | data_validator.validate_missing_values_in_whole_column() 181 | data_validator.archive_bad_files() 182 | except Exception as e: 183 | generic_exception = GenericException( 184 | "Error occurred in module [{0}] method [{1}]" 185 | .format(validation_main.__module__, 186 | validation_main.__name__)) 187 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 188 | 189 | 190 | if __name__ == '__main__': 191 | args = argparse.ArgumentParser() 192 | args.add_argument("--config", default=os.path.join("config", "params.yaml")) 193 | args.add_argument("--datasource", default=None) 194 | parsed_args = args.parse_args() 195 | print("started") 196 | validation_main(config_path=parsed_args.config, datasource=parsed_args.datasource) 197 | -------------------------------------------------------------------------------- /prediction/stage_02_data_transformer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import pandas as pd 5 | import argparse 6 | from utility import read_params, get_logger_object_of_prediction 7 | from mongo_db.mongo_db_atlas import MongoDBOperation 8 | from insurance_exception.insurance_exception import InsuranceException as GenericException 9 | 10 | log_collection_name = "data_transformer" 11 | 12 | 13 | class DataTransformer: 14 | def __init__(self, config, logger, is_log_enable=True): 15 | try: 16 | self.config = config 17 | self.logger = logger 18 | self.logger.is_log_enable = is_log_enable 19 | self.good_file_path = self.config["artifacts"]['prediction_data']['good_file_path'] 20 | self.unwanted_column_names=self.config["dataset"]['unwanted_column'] 21 | self.mongo_db=MongoDBOperation() 22 | self.dataset_database=self.config["dataset"]["database_detail"]["prediction_database_name"] 23 | self.dataset_collection_name=self.config["dataset"]["database_detail"]["dataset_prediction_collection_name"] 24 | self.mongo_db.drop_collection(self.dataset_database,self.dataset_collection_name) 25 | except Exception as e: 26 | generic_exception = GenericException( 27 | "Error occurred in module [{0}] class [{1}] method [{2}]" 28 | .format(self.__module__, DataTransformer.__name__, 29 | self.__init__.__name__)) 30 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 31 | 32 | def unite_dataset(self): 33 | try: 34 | dataset_list=[] 35 | for file in os.listdir(self.good_file_path): 36 | dataset_list.append(pd.read_csv(os.path.join(self.good_file_path,file))) 37 | df=pd.concat(dataset_list) 38 | df=self.remove_unwanted_column(df) 39 | self.logger.log(f"Inserting dataset into database {self.dataset_database} " 40 | f"collection_name: {self.dataset_collection_name}") 41 | self.mongo_db.insert_dataframe_into_collection(self.dataset_database,self.dataset_collection_name,df) 42 | except Exception as e: 43 | generic_exception = 
GenericException( 44 | "Error occurred in module [{0}] class [{1}] method [{2}]" 45 | .format(self.__module__, DataTransformer.__name__, 46 | self.unite_dataset.__name__)) 47 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 48 | 49 | 50 | def remove_unwanted_column(self,df): 51 | try: 52 | drop_column=list(filter(lambda x: x in df.columns ,self.unwanted_column_names)) 53 | return df.drop(drop_column,axis=1) 54 | except Exception as e: 55 | generic_exception = GenericException( 56 | "Error occurred in module [{0}] class [{1}] method [{2}]" 57 | .format(self.__module__, DataTransformer.__name__, 58 | self.remove_unwanted_column.__name__)) 59 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 60 | 61 | 62 | def transform_main(config_path: str, datasource: str, is_logging_enable=True,execution_id=None,executed_by=None) -> None: 63 | try: 64 | logger = get_logger_object_of_prediction(config_path=config_path, collection_name=log_collection_name, 65 | execution_id=execution_id, executed_by=executed_by) 66 | 67 | logger.is_log_enable = is_logging_enable 68 | config = read_params(config_path) 69 | data_transformer = DataTransformer(config=config, logger=logger, is_log_enable=is_logging_enable) 70 | logger.log('Start of Data Preprocessing before DB') 71 | data_transformer.unite_dataset() 72 | logger.log('Data Preprocessing before DB Completed !!') 73 | 74 | except Exception as e: 75 | generic_exception = GenericException( 76 | "Error occurred in module [{0}] method [{1}]" 77 | .format(transform_main.__module__, 78 | transform_main.__name__)) 79 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 80 | 81 | 82 | if __name__ == '__main__': 83 | args = argparse.ArgumentParser() 84 | args.add_argument("--config", default=os.path.join("config", "params.yaml")) 85 | args.add_argument("--datasource", default=None) 86 | parsed_args = args.parse_args() 87 | print("started") 88 | transform_main(config_path=parsed_args.config, datasource=parsed_args.datasource) 89 | -------------------------------------------------------------------------------- /prediction/stage_03_data_exporter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from utility import read_params, create_directory_path 5 | from mongo_db.mongo_db_atlas import MongoDBOperation 6 | import argparse 7 | from utility import get_logger_object_of_prediction 8 | from insurance_exception.insurance_exception import InsuranceException as GenericException 9 | 10 | log_collection_name = "data_export" 11 | 12 | 13 | class DataExporter: 14 | def __init__(self, config, logger, is_log_enable): 15 | try: 16 | self.config = config 17 | self.logger = logger 18 | self.is_log_enable = is_log_enable 19 | self.mongo_db = MongoDBOperation() 20 | self.dataset_database = self.config["dataset"]["database_detail"]["prediction_database_name"] 21 | self.dataset_collection_name = self.config["dataset"]["database_detail"]["dataset_prediction_collection_name"] 22 | self.prediction_file_from_db = self.config["artifacts"]['prediction_data']['prediction_file_from_db'] 23 | self.master_csv = self.config["artifacts"]['prediction_data']['master_csv'] 24 | except Exception as e: 25 | generic_exception = GenericException( 26 | "Error occurred in module [{0}] class [{1}] method [{2}]" 27 | .format(self.__module__, DataExporter.__name__, 28 | self.__init__.__name__)) 29 | raise Exception(generic_exception.error_message_detail(str(e), sys)) 
from e 30 | 31 | def export_dataframe_from_database(self): 32 | try: 33 | create_directory_path(self.prediction_file_from_db) 34 | self.logger.log(f"Creating dataframe of data stored db" 35 | f"[{self.dataset_database}] and collection[{self.dataset_collection_name}]") 36 | df = self.mongo_db.get_dataframe_of_collection(db_name=self.dataset_database, 37 | collection_name=self.dataset_collection_name) 38 | master_csv_file_path = os.path.join(self.prediction_file_from_db, self.master_csv) 39 | self.logger.log(f"master csv file will be generated at " 40 | f"{master_csv_file_path}.") 41 | df.to_csv(master_csv_file_path, index=None,header=True) 42 | 43 | except Exception as e: 44 | generic_exception = GenericException( 45 | "Error occurred in module [{0}] class [{1}] method [{2}]" 46 | .format(self.__module__, DataExporter.__name__, 47 | self.export_dataframe_from_database.__name__)) 48 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 49 | 50 | 51 | def export_main(config_path: str, is_logging_enable=True,execution_id=None,executed_by=None) -> None: 52 | try: 53 | logger = get_logger_object_of_prediction(config_path=config_path, collection_name=log_collection_name, 54 | execution_id=execution_id, executed_by=executed_by) 55 | 56 | logger.is_log_enable = is_logging_enable 57 | config = read_params(config_path) 58 | data_exporter = DataExporter(config=config, logger=logger, is_log_enable=is_logging_enable) 59 | logger.log("Generating csv file from dataset stored in database.") 60 | data_exporter.export_dataframe_from_database() 61 | logger.log("Dataset has been successfully exported in directory and exiting export pipeline.") 62 | except Exception as e: 63 | generic_exception = GenericException( 64 | "Error occurred in module [{0}] method [{1}]" 65 | .format(export_main.__module__, 66 | export_main.__name__)) 67 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 68 | 69 | 70 | if __name__ == '__main__': 71 | args = argparse.ArgumentParser() 72 | args.add_argument("--config", default=os.path.join("config", "params.yaml")) 73 | parsed_args = args.parse_args() 74 | print("started") 75 | export_main(config_path=parsed_args.config) 76 | -------------------------------------------------------------------------------- /prediction/stage_04_model_predictor.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | 5 | from pyspark.sql.types import IntegerType, FloatType, StringType 6 | 7 | from utility import create_directory_path,get_logger_object_of_prediction,read_params 8 | 9 | from streaming.spark_manager.spark_manager import SparkManager 10 | 11 | from insurance_exception.insurance_exception import InsuranceException as GenericException 12 | from pyspark.ml import Pipeline, PipelineModel 13 | from pyspark.ml.regression import RandomForestRegressionModel, RandomForestRegressor 14 | 15 | log_collection_name = "prediction_model" 16 | 17 | 18 | class DataPreProcessing: 19 | def __init__(self, logger, is_log_enable=True, data_frame=None, pipeline_path=None): 20 | try: 21 | self.logger = logger 22 | self.logger.is_log_enable = is_log_enable 23 | self.data_frame = data_frame 24 | print(pipeline_path) 25 | self.pipeline_obj = PipelineModel.load(pipeline_path) 26 | 27 | except Exception as e: 28 | generic_exception = GenericException( 29 | "Error occurred in module [{0}] class [{1}] method [{2}]" 30 | .format(self.__module__, DataPreProcessing.__name__, 31 | 
self.__init__.__name__)) 32 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 33 | 34 | def set_dataframe(self, dataframe): 35 | try: 36 | self.data_frame = dataframe 37 | except Exception as e: 38 | generic_exception = GenericException( 39 | "Error occurred in module [{0}] class [{1}] method [{2}]" 40 | .format(self.__module__, DataPreProcessing.__name__, 41 | self.update_dataframe_scheme.__name__)) 42 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 43 | 44 | def update_dataframe_scheme(self, schema_definition: dict): 45 | """ 46 | 47 | """ 48 | try: 49 | print(self.data_frame.printSchema()) 50 | if self.data_frame is None: 51 | raise Exception("update the attribute dataframe") 52 | for column, datatype in schema_definition.items(): 53 | self.logger.log(f"Update datatype of column: {column} to {str(datatype)}") 54 | self.data_frame = self.data_frame.withColumn(column, self.data_frame[column].cast(datatype)) 55 | except Exception as e: 56 | generic_exception = GenericException( 57 | "Error occurred in module [{0}] class [{1}] method [{2}]" 58 | .format(self.__module__, DataPreProcessing.__name__, 59 | self.update_dataframe_scheme.__name__)) 60 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 61 | 62 | def get_prepared_data(self): 63 | try: 64 | schema_definition = {"age": IntegerType(), 65 | "sex": StringType(), 66 | "bmi": FloatType(), 67 | "children": IntegerType(), 68 | "smoker": StringType(), 69 | } 70 | self.update_dataframe_scheme(schema_definition=schema_definition) 71 | self.data_frame = self.pipeline_obj.transform(self.data_frame) 72 | print(self.data_frame.printSchema()) 73 | return self.data_frame 74 | except Exception as e: 75 | generic_exception = GenericException( 76 | "Error occurred in module [{0}] class [{1}] method [{2}]" 77 | .format(self.__module__, DataPreProcessing.__name__, 78 | self.get_prepared_data.__name__)) 79 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 80 | 81 | 82 | class Predictor: 83 | 84 | def __init__(self, config, logger, is_log_enable): 85 | try: 86 | self.logger = logger 87 | self.logger.is_log_enable = is_log_enable 88 | self.config = config 89 | self.prediction_file_path = self.config['artifacts']['prediction_data']['prediction_file_from_db'] 90 | self.master_csv = self.config['artifacts']['prediction_data']['master_csv'] 91 | self.model_path = self.config['artifacts']['model']['model_path'] 92 | self.prediction_output_file_path = self.config['artifacts']['prediction_data'][ 93 | 'prediction_output_file_path'] 94 | self.prediction_file_name = self.config['artifacts']['prediction_data']['prediction_file_name'] 95 | self.target_columns = self.config['target_columns']['columns'] 96 | self.null_value_file_path = config['artifacts']['training_data']['null_value_info_file_path'] 97 | self.pipeline_path = self.config['artifacts']['training_data']['pipeline_path'] 98 | """ 99 | self.spark = SparkSession.builder. \ 100 | master("local[*]"). 
\ 101 | appName("insurance-premium-reg").getOrCreate() 102 | """ 103 | self.spark = SparkManager().get_spark_session_object() 104 | except Exception as e: 105 | generic_exception = GenericException( 106 | "Error occurred in module [{0}] class [{1}] method [{2}]" 107 | .format(self.__module__, Predictor.__name__, 108 | self.__init__.__name__)) 109 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 110 | 111 | def get_dataframe(self): 112 | try: 113 | master_file_path = os.path.join(self.prediction_file_path, self.master_csv) 114 | return self.spark.read.csv(master_file_path, header=True, inferSchema=True) 115 | except Exception as e: 116 | generic_exception = GenericException( 117 | "Error occurred in module [{0}] class [{1}] method [{2}]" 118 | .format(self.__module__, Predictor.__name__, 119 | self.get_dataframe.__name__)) 120 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 121 | 122 | def data_preparation(self): 123 | try: 124 | 125 | input_features = self.get_dataframe() 126 | data_preprocess = DataPreProcessing(logger=self.logger, 127 | is_log_enable=self.logger.is_log_enable, 128 | data_frame=input_features, 129 | pipeline_path=self.pipeline_path 130 | ) 131 | return data_preprocess.get_prepared_data() 132 | 133 | except Exception as e: 134 | generic_exception = GenericException( 135 | "Error occurred in module [{0}] class [{1}] method [{2}]" 136 | .format(self.__module__, Predictor.__name__, 137 | self.data_preparation.__name__)) 138 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 139 | 140 | 141 | def load_model(self): 142 | try: 143 | model_path = self.model_path 144 | if not os.path.exists(model_path): 145 | raise Exception(f"Model directory: {model_path} is not found.") 146 | model_names = os.listdir(model_path) 147 | if len(model_names) != 1: 148 | raise Exception(f"We have expected only one model instead we found {len(model_names)}") 149 | model_name = model_names[0] 150 | model_path = os.path.join(model_path, model_name) 151 | print(f"model path: {model_path}") 152 | return RandomForestRegressionModel.load(model_path) 153 | except Exception as e: 154 | generic_exception = GenericException( 155 | "Error occurred in module [{0}] class [{1}] method [{2}]" 156 | .format(self.__module__, Predictor.__name__, 157 | self.load_model.__name__)) 158 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 159 | 160 | def predict(self): 161 | try: 162 | 163 | input_data = self.data_preparation() 164 | model = self.load_model() 165 | print(str(model)) 166 | print(input_data.printSchema()) 167 | prediction = model.transform(input_data) 168 | prediction_output = prediction.select("age", "sex", "children", "smoker", "prediction").toPandas() 169 | create_directory_path(self.prediction_output_file_path) 170 | output_file_path = os.path.join(self.prediction_output_file_path, self.prediction_file_name) 171 | if prediction_output is not None: 172 | prediction_output.to_csv(output_file_path, index=None, header=True) 173 | except Exception as e: 174 | generic_exception = GenericException( 175 | "Error occurred in module [{0}] class [{1}] method [{2}]" 176 | .format(self.__module__, Predictor.__name__, 177 | self.predict.__name__)) 178 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 179 | 180 | 181 | def predict_main(config_path: str, datasource: str, is_logging_enable=True, execution_id=None, 182 | executed_by=None) -> None: 183 | try: 184 | logger = 
get_logger_object_of_prediction(config_path=config_path, collection_name=log_collection_name, 185 | execution_id=execution_id, executed_by=executed_by) 186 | 187 | logger.is_log_enable = is_logging_enable 188 | logger.log("Prediction begin.") 189 | config = read_params(config_path) 190 | predictor = Predictor(config=config, logger=logger, is_log_enable=is_logging_enable) 191 | predictor.predict() 192 | logger.log("Prediction completed successfully.") 193 | 194 | except Exception as e: 195 | generic_exception = GenericException( 196 | "Error occurred in module [{0}] method [{1}]" 197 | .format(predict_main.__module__, 198 | predict_main.__name__)) 199 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 200 | 201 | 202 | if __name__ == '__main__': 203 | args = argparse.ArgumentParser() 204 | args.add_argument("--config", default=os.path.join("config", "params.yaml")) 205 | args.add_argument("--datasource", default=None) 206 | parsed_args = args.parse_args() 207 | print(parsed_args.config) 208 | print(parsed_args.datasource) 209 | predict_main(config_path=parsed_args.config, datasource=parsed_args.datasource) 210 | -------------------------------------------------------------------------------- /requirement.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.0.1 2 | pymongo==3.11.0 3 | dnspython==1.16.0 4 | PyYAML 5 | pandas 6 | sklearn 7 | -e . -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="insurance-prediction", 5 | version="0.0.3", 6 | description="insurance-prediction", 7 | author="Avnish yadav", 8 | packages=find_packages(), 9 | license="MIT" 10 | ) -------------------------------------------------------------------------------- /spark_consumer_from_kafka.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyspark.ml import PipelineModel 4 | from pyspark.ml.regression import RandomForestRegressionModel 5 | 6 | from streaming.spark_manager.spark_manager import SparkManager 7 | from streaming.consumer.kafka_to_spark_csv_consumer import KafkaToSparkCSVConsumer 8 | 9 | 10 | if __name__ == "__main__": 11 | spark_session = SparkManager().get_spark_session_object() 12 | 13 | schema_string = "age INT,sex STRING,bmi DOUBLE,children INT,smoker STRING,region STRING" 14 | database_name = "stream_prediction" 15 | collection_name = "insurance_prediction_output" 16 | kfk_con = KafkaToSparkCSVConsumer(spark_session=spark_session, 17 | schema_string=schema_string, 18 | database_name=database_name, 19 | collection_name=collection_name 20 | ) 21 | transformer_list = [] 22 | pipeline_model = PipelineModel.load(os.path.join("artifacts", 23 | "pipeline", 24 | "pipeline_model")) 25 | random_forest_model = RandomForestRegressionModel.load(os.path.join("artifacts", 26 | "model", 27 | "random_forest_regressor")) 28 | 29 | transformer_list.append(pipeline_model) 30 | transformer_list.append(random_forest_model) 31 | kfk_con.spark_transformer.add_machine_learning_transformer( 32 | transformer=transformer_list 33 | ) 34 | kfk_con.receive_csv_data_from_kafka_topics() 35 | -------------------------------------------------------------------------------- /streaming/__init__.py: -------------------------------------------------------------------------------- 
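# The streaming consumer entry point listed above (spark_consumer_from_kafka.py) needs the
# Kafka SQL connector on the classpath: SparkManager pins the same package through
# spark.jars.packages, and kafka_csv_data_producer.py closes with the matching hint. Assuming
# a local Kafka broker, a typical launch would look like:
#   spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1 spark_consumer_from_kafka.py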
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/__init__.py -------------------------------------------------------------------------------- /streaming/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /streaming/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /streaming/consumer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/consumer/__init__.py -------------------------------------------------------------------------------- /streaming/consumer/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/consumer/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /streaming/consumer/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/consumer/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /streaming/consumer/__pycache__/kafka_to_spark_csv_consumer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/consumer/__pycache__/kafka_to_spark_csv_consumer.cpython-37.pyc -------------------------------------------------------------------------------- /streaming/consumer/__pycache__/kafka_to_spark_csv_consumer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/consumer/__pycache__/kafka_to_spark_csv_consumer.cpython-38.pyc -------------------------------------------------------------------------------- /streaming/consumer/kafka_to_spark_csv_consumer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyspark.sql.functions import * 4 | 5 | from insurance_exception.insurance_exception import InsuranceException as KafkaToSparkCSVConsumerException 6 | import sys 7 | from utility import read_params 8 | from streaming.transformer.spark_transformer import SparkTransformer 9 | 10 | 11 | class KafkaToSparkCSVConsumer: 12 | def __init__(self, schema_string, database_name, collection_name, spark_session, processing_interval_second=5, 13 | config_path=None, ): 14 | try: 15 | # accepting default configuration file if no configuration file path has been specified during object 16 | # instantiation 17 
| path = os.path.join("config", "params.yaml") if config_path is None else os.path.join(config_path) 18 | self.config = read_params(config_path=path) 19 | self.kafka_topic_name = self.config['kafka']['topic_name'] 20 | self.kafka_bootstrap_server = self.config['kafka']['kafka_bootstrap_server'] 21 | self.spark_session = spark_session 22 | self.schema = schema_string # "age INT,sex STRING,bmi DOUBLE,children INT,smoker STRING,region STRING" 23 | self.spark_transformer = SparkTransformer(database_name=database_name, collection_name=collection_name) 24 | self.processing_interval_second = processing_interval_second 25 | self.query = None 26 | except Exception as e: 27 | kafka_to_csv_consumer_exception = KafkaToSparkCSVConsumerException( 28 | "Error occurred in module [{0}] class [{1}] method [{2}] ". 29 | format(self.__module__, KafkaToSparkCSVConsumer.__name__, 30 | self.__init__.__name__)) 31 | raise Exception(kafka_to_csv_consumer_exception.error_message_detail(str(e), sys)) from e 32 | 33 | def receive_csv_data_from_kafka_topics(self): 34 | try: 35 | dataframe = self.spark_session \ 36 | .readStream \ 37 | .format("kafka") \ 38 | .option("kafka.bootstrap.servers", self.kafka_bootstrap_server) \ 39 | .option("subscribe", self.kafka_topic_name) \ 40 | .option("startingOffsets", "latest") \ 41 | .load() 42 | dataframe_1 = dataframe.selectExpr("CAST(value as STRING) ", "timestamp") 43 | dataframe_2 = dataframe_1.select(from_csv(col("value"), self.schema).alias("records"), "timestamp") 44 | dataframe_3 = dataframe_2.select("records.*", "timestamp") 45 | transformed_df = dataframe_3 46 | for transformer in self.spark_transformer.ml_transformer: 47 | transformed_df = transformer.transform(transformed_df) 48 | self.query = transformed_df.writeStream.trigger( 49 | processingTime=f'{self.processing_interval_second} seconds').foreachBatch( 50 | self.spark_transformer.process_each_record).start() 51 | self.query.awaitTermination() 52 | except Exception as e: 53 | kafka_to_csv_consumer_exception = KafkaToSparkCSVConsumerException( 54 | "Error occurred in module [{0}] class [{1}] method [{2}] ". 55 | format(self.__module__, KafkaToSparkCSVConsumer.__name__, 56 | self.receive_csv_data_from_kafka_topics.__name__)) 57 | raise Exception(kafka_to_csv_consumer_exception.error_message_detail(str(e), sys)) from e 58 | 59 | def stop_stream(self): 60 | try: 61 | if self.query is not None: 62 | self.query.stop() 63 | 64 | except Exception as e: 65 | kafka_to_csv_consumer_exception = KafkaToSparkCSVConsumerException( 66 | "Error occurred in module [{0}] class [{1}] method [{2}] ". 
67 | format(self.__module__, KafkaToSparkCSVConsumer.__name__, 68 | self.receive_csv_data_from_kafka_topics.__name__)) 69 | raise Exception(kafka_to_csv_consumer_exception.error_message_detail(str(e), sys)) from e 70 | -------------------------------------------------------------------------------- /streaming/producer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/producer/__init__.py -------------------------------------------------------------------------------- /streaming/producer/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/producer/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /streaming/producer/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/producer/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /streaming/producer/__pycache__/kafka_csv_data_producer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/producer/__pycache__/kafka_csv_data_producer.cpython-37.pyc -------------------------------------------------------------------------------- /streaming/producer/__pycache__/kafka_csv_data_producer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/producer/__pycache__/kafka_csv_data_producer.cpython-38.pyc -------------------------------------------------------------------------------- /streaming/producer/kafka_csv_data_producer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from kafka import KafkaProducer 5 | from utility import read_params 6 | import time 7 | from insurance_exception.insurance_exception import InsuranceException as KafkaCSVDataProducerException 8 | from streaming.spark_manager.spark_manager import SparkManager 9 | 10 | 11 | class KafkaCSVDataProducer: 12 | 13 | def __init__(self, spark_session,config_path=None): 14 | """ 15 | Creator: 16 | ********************************************************************************************************** 17 | created date: 02 November 2021 18 | Organization: iNeuron 19 | author: avnish@ineuron.ai 20 | ********************************************************************************************************** 21 | Description: 22 | ********************************************************************************************************** 23 | KafkaCSVDataProducer is responsible to read a csv file and send data row by row to a kafka topic specified in 24 | configuration file: 25 | define below record 26 | kafka: 27 | topic_name: 28 | kafka_bootstrap_server: 29 | ************************************************************************************************************* 30 | Example: 31 | kafka: 32 | topic_name: 
insurance-prediction 33 | kafka_bootstrap_server: localhost:9092 34 | 35 | parameters: 36 | ============================================================================================================= 37 | param config_path: configuration file path default is config/param.yaml 38 | 39 | """ 40 | try: 41 | # accepting default configuration file if no configuration file path has been specified during object 42 | # instantiation 43 | path = os.path.join("config", "params.yaml") if config_path is None else os.path.join(config_path) 44 | self.config = read_params(config_path=path) 45 | self.kafka_topic_name = self.config['kafka']['topic_name'] 46 | self.kafka_bootstrap_server = self.config['kafka']['kafka_bootstrap_server'] 47 | # creating kafka producer object 48 | self.kafka_producer = KafkaProducer(bootstrap_servers=self.kafka_bootstrap_server, 49 | value_serializer=lambda x: x.encode('utf-8')) 50 | # obtain spark session object 51 | self.spark_session = spark_session 52 | except Exception as e: 53 | kafka_csv_data_producer_exp = KafkaCSVDataProducerException( 54 | "Error occurred in module [{0}] class [{1}] method [{2}] ". 55 | format(self.__module__, KafkaCSVDataProducer.__name__, 56 | self.__init__.__name__)) 57 | raise Exception(kafka_csv_data_producer_exp.error_message_detail(str(e), sys)) from e 58 | 59 | def send_csv_data_to_kafka_topic(self, directory_path): 60 | """ 61 | Creator: 62 | ********************************************************************************************************** 63 | created date: 02 November 2021 64 | Organization: iNeuron 65 | author: avnish@ineuron.ai 66 | ********************************************************************************************************** 67 | Description: 68 | ********************************************************************************************************** 69 | function will send all csv files content to kafka topics specified in configuration file. 70 | ========================================================================================================== 71 | param: 72 | directory_path: csv file directory 73 | 74 | ========================================================================================================== 75 | return: function will not return any thing 76 | """ 77 | try: 78 | files = os.listdir(directory_path) 79 | n_row = 0 80 | 81 | for file in files: 82 | 83 | # skip all files except csv 84 | if not file.endswith(".csv"): 85 | continue 86 | file_path = os.path.join(directory_path, file) 87 | # reading csv file using spark session 88 | # df = self.spark_session.read.csv(file_path) 89 | df = self.spark_session.read.csv(file_path,header=True,inferSchema=True) 90 | # sending dataframe to kafka topic iteratively 91 | for row in df.rdd.toLocalIterator(): 92 | message=",".join(map(str, list(row))) 93 | print(message) 94 | self.kafka_producer.send(self.kafka_topic_name,message) 95 | n_row += 1 96 | time.sleep(1) 97 | 98 | 99 | #df.foreach(lambda row: self.kafka_producer.send(self.kafka_topic_name, ",".join(map(str, list(row))))) 100 | return n_row 101 | except Exception as e: 102 | kafka_csv_data_producer_exp = KafkaCSVDataProducerException( 103 | "Error occurred in module [{0}] class [{1}] method [{2}] ". 
104 | format(self.__module__, KafkaCSVDataProducer.__name__, 105 | self.__init__.__name__)) 106 | raise Exception(kafka_csv_data_producer_exp.error_message_detail(str(e), sys)) from e 107 | 108 | 109 | """ 110 | spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1 111 | """ 112 | -------------------------------------------------------------------------------- /streaming/spark_manager/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/spark_manager/__init__.py -------------------------------------------------------------------------------- /streaming/spark_manager/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/spark_manager/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /streaming/spark_manager/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/spark_manager/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /streaming/spark_manager/__pycache__/spark_manager.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/spark_manager/__pycache__/spark_manager.cpython-37.pyc -------------------------------------------------------------------------------- /streaming/spark_manager/__pycache__/spark_manager.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/spark_manager/__pycache__/spark_manager.cpython-38.pyc -------------------------------------------------------------------------------- /streaming/spark_manager/spark_manager.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from pyspark.sql import SparkSession 4 | from insurance_exception.insurance_exception import InsuranceException as SparkManagerException 5 | 6 | 7 | class SparkManager: 8 | spark_session = None 9 | 10 | def __init__(self,app_name="ineuron-machine-learning"): 11 | """ 12 | Creator: 13 | ********************************************************************************************************** 14 | created date: 02 November 2021 15 | Organization: iNeuron 16 | author: avnish@ineuron.ai 17 | ********************************************************************************************************** 18 | Description: 19 | ********************************************************************************************************** 20 | SparkManager is responsible to return spark_session object. 21 | Any modification required should be done in SparkManager class 22 | """ 23 | try: 24 | self.app_name=app_name 25 | except Exception as e: 26 | spark_manager_exception = SparkManagerException("Error occurred in module [{0}] class [{1}] method [{2}] ". 
27 | format(self.__module__, SparkManager.__name__, 28 | self.__init__.__name__)) 29 | raise Exception(spark_manager_exception.error_message_detail(str(e), sys)) from e 30 | 31 | def get_spark_session_object(self): 32 | """ 33 | function will return spark session object 34 | """ 35 | try: 36 | if SparkManager.spark_session is None: 37 | SparkManager.spark_session = SparkSession.builder.master("local").appName(self.app_name) \ 38 | .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1")\ 39 | .config("spark.ui.port", "4041").getOrCreate() 40 | 41 | return SparkManager.spark_session 42 | except Exception as e: 43 | spark_manager_exception = SparkManagerException("Error occurred in module [{0}] class [{1}] method [{2}] ". 44 | format(self.__module__, SparkManager.__name__, 45 | self.get_spark_session_object.__name__)) 46 | raise Exception(spark_manager_exception.error_message_detail(str(e), sys)) from e 47 | -------------------------------------------------------------------------------- /streaming/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/transformer/__init__.py -------------------------------------------------------------------------------- /streaming/transformer/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/transformer/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /streaming/transformer/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/transformer/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /streaming/transformer/__pycache__/spark_transformer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/transformer/__pycache__/spark_transformer.cpython-37.pyc -------------------------------------------------------------------------------- /streaming/transformer/__pycache__/spark_transformer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/transformer/__pycache__/spark_transformer.cpython-38.pyc -------------------------------------------------------------------------------- /streaming/transformer/spark_transformer.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from insurance_exception.insurance_exception import InsuranceException as SparkTransformerException 4 | import os, sys 5 | from mongo_db.mongo_db_atlas import MongoDBOperation 6 | 7 | 8 | class SparkTransformer(): 9 | def __init__(self, database_name, collection_name): 10 | try: 11 | self.database_name = database_name 12 | self.collection_name = collection_name 13 | self.mongo_db = MongoDBOperation() 14 | self.ml_transformer = [] 15 | 16 | 17 | except Exception as e: 18 | 
spark_transformer_exception = SparkTransformerException("Error occurred in module [{0}] class [{1}] " 19 | "method [{2}] ". 20 | format(self.__module__, SparkTransformer.__name__, 21 | self.__init__.__name__)) 22 | raise Exception(spark_transformer_exception.error_message_detail(str(e), sys)) from e 23 | 24 | def add_machine_learning_transformer(self, transformer: list): 25 | try: 26 | self.ml_transformer.extend(transformer) 27 | except Exception as e: 28 | spark_transformer_exception = SparkTransformerException("Error occurred in module [{0}] class [{1}] " 29 | "method [{2}] ". 30 | format(self.__module__, SparkTransformer.__name__, 31 | self.__init__.__name__)) 32 | raise Exception(spark_transformer_exception.error_message_detail(str(e), sys)) from e 33 | 34 | def process_each_record(self, dataframe,epoch_id): 35 | try: 36 | dataframe = dataframe.toPandas() 37 | if dataframe.shape[0] > 0: 38 | dataframe['timestamp'] = pd.to_datetime(dataframe['timestamp']) 39 | self.mongo_db.insert_dataframe_into_collection(db_name=self.database_name, 40 | collection_name=self.collection_name, 41 | data_frame=dataframe) 42 | dataframe.to_csv("new_data.csv", index=None) 43 | except Exception as e: 44 | spark_transformer_exception = SparkTransformerException("Error occurred in module [{0}] class [{1}] " 45 | "method [{2}] ". 46 | format(self.__module__, SparkTransformer.__name__, 47 | self.process_each_record.__name__)) 48 | raise Exception(spark_transformer_exception.error_message_detail(str(e), sys)) from e 49 | -------------------------------------------------------------------------------- /training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/training/__init__.py -------------------------------------------------------------------------------- /training/stage_00_data_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | from utility import read_params 5 | import argparse 6 | from utility import get_logger_object_of_training 7 | from utility import clean_data_source_dir 8 | from insurance_exception.insurance_exception import InsuranceException as GenericException 9 | 10 | log_collection_name = "data_loader" 11 | 12 | 13 | def loader_main(config_path: str, datasource: str, is_logging_enable=True,execution_id=None,executed_by=None) -> None: 14 | try: 15 | logger = get_logger_object_of_training(config_path=config_path, collection_name=log_collection_name, 16 | execution_id=execution_id,executed_by=executed_by) 17 | logger.is_log_enable = is_logging_enable 18 | logger.log("Starting data loading operation.\nReading configuration file.") 19 | 20 | config = read_params(config_path) 21 | 22 | downloader_path = config['data_download']['cloud_training_directory_path'] 23 | download_path = config['data_source']['Training_Batch_Files'] 24 | 25 | logger.log("Configuration detail has been fetched from configuration file.") 26 | # removing existing training and additional training files from local 27 | logger.log(f"Cleaning local directory [{download_path}] for training.") 28 | clean_data_source_dir(download_path, logger=logger, 29 | is_logging_enable=is_logging_enable) # removing existing file from local system 30 | 31 | logger.log(f"Cleaning completed. 
Directory has been cleared now [{download_path}]") 32 | # downloading traning and additional training file from cloud into local system 33 | logger.log("Data will be downloaded from cloud storage into local system") 34 | 35 | 36 | for file in os.listdir(downloader_path): 37 | if '.dvc' in file or '.gitignore' in file: 38 | continue 39 | print(f"Source dir: {downloader_path} file: {file} is being copied into destination dir: {download_path}" 40 | f" file: {file}") 41 | shutil.copy(os.path.join(downloader_path,file),os.path.join(download_path,file)) 42 | 43 | logger.log("Data has been downloaded from cloud storage into local system") 44 | 45 | except Exception as e: 46 | generic_exception = GenericException( 47 | "Error occurred in module [{0}] method [{1}]" 48 | .format(loader_main.__module__, 49 | loader_main.__name__)) 50 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 51 | 52 | if __name__ == '__main__': 53 | args = argparse.ArgumentParser() 54 | args.add_argument("--config", default=os.path.join("config", "params.yaml")) 55 | args.add_argument("--datasource", default=None) 56 | parsed_args = args.parse_args() 57 | print("started") 58 | loader_main(config_path=parsed_args.config, datasource=parsed_args.datasource) 59 | -------------------------------------------------------------------------------- /training/stage_01_data_validator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import shutil 4 | import sys 5 | 6 | import pandas as pd 7 | from utility import read_params, create_directory_path, values_from_schema_function, get_logger_object_of_training, \ 8 | get_date, get_time 9 | from insurance_exception.insurance_exception import InsuranceException as GenericException 10 | import argparse 11 | import datetime 12 | import shutil 13 | 14 | log_collection_name = "data_validator" 15 | 16 | 17 | class DataValidator: 18 | def __init__(self, config, logger, is_logging_enable=True): 19 | try: 20 | self.logger = logger 21 | self.logger.is_log_enable = is_logging_enable 22 | self.config = config 23 | self.file_path = self.config['data_source']['Training_Batch_Files'] 24 | self.good_file_path = self.config['artifacts']['training_data']['good_file_path'] 25 | self.bad_file_path = self.config['artifacts']['training_data']['bad_file_path'] 26 | self.archive_bad_file_path = self.config['artifacts']['training_data']['archive_bad_file_path'] 27 | self.training_schema_file = self.config['config']['schema_training'] 28 | except Exception as e: 29 | generic_exception = GenericException( 30 | "Error occurred in module [{0}] class [{1}] method [{2}]" 31 | .format(self.__module__, DataValidator.__name__, 32 | self.__init__.__name__)) 33 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 34 | 35 | def archive_bad_files(self): 36 | try: 37 | folder_name = f"bad_files_{get_date().replace('-', '_')}_{get_time().replace(':', '_')}" 38 | archive_directory_path = os.path.join(self.archive_bad_file_path, folder_name) 39 | create_directory_path(archive_directory_path) 40 | for file in os.listdir(self.bad_file_path): 41 | source_file_path = os.path.join(self.bad_file_path, file) 42 | shutil.move(source_file_path, archive_directory_path) 43 | except Exception as e: 44 | generic_exception = GenericException( 45 | "Error occurred in module [{0}] class [{1}] method [{2}]" 46 | .format(self.__module__, DataValidator.__name__, 47 | self.archive_bad_files.__name__)) 48 | raise 
Exception(generic_exception.error_message_detail(str(e), sys)) from e 49 | 50 | def create_good_bad_archive_bad_file_path(self): 51 | try: 52 | create_directory_path(self.good_file_path) 53 | create_directory_path(self.bad_file_path) 54 | create_directory_path(self.archive_bad_file_path, is_recreate=False) 55 | except Exception as e: 56 | generic_exception = GenericException( 57 | "Error occurred in module [{0}] class [{1}] method [{2}]" 58 | .format(self.__module__, DataValidator.__name__, 59 | self.create_good_bad_archive_bad_file_path.__name__)) 60 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 61 | 62 | def value_from_schema(self): 63 | """ 64 | 65 | :return: tuple (sample_file_name,column_names,number_of_column) 66 | """ 67 | try: 68 | return values_from_schema_function(self.training_schema_file) 69 | except Exception as e: 70 | generic_exception = GenericException( 71 | "Error occurred in module [{0}] class [{1}] method [{2}]" 72 | .format(self.__module__, DataValidator.__name__, 73 | self.value_from_schema.__name__)) 74 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 75 | 76 | def file_name_regular_expression(self): 77 | """ 78 | 79 | :return: regular expression syntax which can be used for validation of file name 80 | """ 81 | return "['HealthPrem']+['\_'']+[\d_]+[\d]+\.csv" 82 | 83 | def validate_file_name(self): 84 | try: 85 | self.create_good_bad_archive_bad_file_path() 86 | file_name_reg_pattern = self.file_name_regular_expression() 87 | pattern, length_of_date_stamp_in_file, length_of_time_stamp_in_file, column_names, number_of_columns = self.value_from_schema() 88 | self.logger.log(f"Validating file names.") 89 | files = os.listdir(self.file_path) 90 | for file in files: 91 | file_path = os.path.join(self.file_path, file) 92 | split_at_dot = re.split('.csv', file) 93 | split_at_dot = (re.split('_', split_at_dot[0])) 94 | if re.match(file_name_reg_pattern, file) and len(split_at_dot[1]) == length_of_date_stamp_in_file \ 95 | and len(split_at_dot[2]) == length_of_time_stamp_in_file: 96 | destination_file_path = os.path.join(self.good_file_path, file) 97 | self.logger.log( 98 | f"file name : {file} matched hence moving file to good file path {destination_file_path}") 99 | shutil.move(file_path, destination_file_path) 100 | else: 101 | destination_file_path = os.path.join(self.bad_file_path, file) 102 | self.logger.log( 103 | f"file name: {file} does not matched hence moving file to bad file path {destination_file_path}") 104 | shutil.move(file_path, destination_file_path) 105 | except Exception as e: 106 | generic_exception = GenericException( 107 | "Error occurred in module [{0}] class [{1}] method [{2}]" 108 | .format(self.__module__, DataValidator.__name__, 109 | self.validate_file_name.__name__)) 110 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 111 | 112 | def validate_missing_values_in_whole_column(self): 113 | try: 114 | self.logger.log("Missing Values Validation Started!!") 115 | for file in os.listdir(self.good_file_path): 116 | csv = pd.read_csv(os.path.join(self.good_file_path, file)) 117 | count = 0 118 | for columns in csv: 119 | if (len(csv[columns]) - csv[columns].count()) == len(csv[columns]): 120 | count += 1 121 | shutil.move(os.path.join(self.good_file_path, file), 122 | self.bad_file_path) 123 | self.logger.log( 124 | "Invalid Column Length for the file!! 
File moved to Bad Raw Folder :: %s" % file) 125 | break 126 | if count == 0: 127 | print(csv.columns) 128 | csv.rename(columns={"Unnamed: 0": "Premium "}, inplace=True) 129 | csv.to_csv(os.path.join(self.good_file_path, file), index=None, header=True) 130 | except Exception as e: 131 | generic_exception = GenericException( 132 | "Error occurred in module [{0}] class [{1}] method [{2}]" 133 | .format(self.__module__, DataValidator.__name__, 134 | self.validate_missing_values_in_whole_column.__name__)) 135 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 136 | 137 | def validate_no_of_column(self, no_of_column): 138 | """ 139 | Description: 140 | If number of column matches then file will be move to good file path else bad file path 141 | ===================================================================================== 142 | :param no_of_column: int Number of column must present in each file 143 | :return: Nothing 144 | """ 145 | try: 146 | self.logger.log(f"Validating number of column in input file") 147 | files = os.listdir(self.good_file_path) 148 | for file in files: 149 | file_path = os.path.join(self.good_file_path, file) 150 | df = pd.read_csv(file_path) 151 | if df.shape[1] != no_of_column: 152 | destination_file_path = os.path.join(self.bad_file_path, file) 153 | self.logger.log( 154 | f"file: {file} has incorrect number of column hence moving file to bad file path {destination_file_path}") 155 | shutil.move(file_path, destination_file_path) 156 | except Exception as e: 157 | generic_exception = GenericException( 158 | "Error occurred in module [{0}] class [{1}] method [{2}]" 159 | .format(self.__module__, DataValidator.__name__, 160 | self.validate_no_of_column.__name__)) 161 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 162 | 163 | 164 | def validation_main(config_path: str, datasource: str, is_logging_enable=True, execution_id=None, 165 | executed_by=None) -> None: 166 | try: 167 | logger = get_logger_object_of_training(config_path=config_path, collection_name=log_collection_name, 168 | execution_id=execution_id, executed_by=executed_by) 169 | logger.is_log_enable = is_logging_enable 170 | config = read_params(config_path) 171 | logger.log("data validation started") 172 | data_validator = DataValidator(config=config, logger=logger, is_logging_enable=is_logging_enable) 173 | pattern, length_of_date_stamp_in_file, length_of_time_stamp_in_file, column_names, number_of_columns =\ 174 | data_validator.value_from_schema() 175 | data_validator.validate_file_name() 176 | data_validator.validate_no_of_column(no_of_column=number_of_columns) 177 | data_validator.validate_missing_values_in_whole_column() 178 | data_validator.archive_bad_files() 179 | except Exception as e: 180 | generic_exception = GenericException( 181 | "Error occurred in module [{0}] method [{1}]" 182 | .format(validation_main.__module__, 183 | validation_main.__name__)) 184 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 185 | 186 | 187 | if __name__ == '__main__': 188 | args = argparse.ArgumentParser() 189 | args.add_argument("--config", default=os.path.join("config", "params.yaml")) 190 | args.add_argument("--datasource", default=None) 191 | parsed_args = args.parse_args() 192 | print("started") 193 | validation_main(config_path=parsed_args.config, datasource=parsed_args.datasource) 194 | -------------------------------------------------------------------------------- /training/stage_02_data_transformer.py: 
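# The DataValidator in training/stage_01_data_validator.py above moves each raw file to the good
# or bad folder based on file_name_regular_expression() plus the date/time stamp lengths read via
# values_from_schema_function(). A minimal, self-contained sketch of that check; the sample file
# name is hypothetical and the 8-digit date / 6-digit time lengths are assumed schema values:
import re

pattern = r"['HealthPrem']+['\_'']+[\d_]+[\d]+\.csv"  # same pattern returned by file_name_regular_expression()
sample = "HealthPrem_01012020_120000.csv"             # hypothetical file name
name_parts = re.split('_', re.split('.csv', sample)[0])
print(bool(re.match(pattern, sample)),                # file name pattern matches
      len(name_parts[1]) == 8,                        # date stamp length, assumed from the schema
      len(name_parts[2]) == 6)                        # time stamp length, assumed from the schema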
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import pandas as pd 5 | import argparse 6 | from utility import read_params, get_logger_object_of_training 7 | from mongo_db.mongo_db_atlas import MongoDBOperation 8 | from insurance_exception.insurance_exception import InsuranceException as GenericException 9 | 10 | log_collection_name = "data_transformer" 11 | 12 | 13 | class DataTransformer: 14 | def __init__(self, config, logger, is_log_enable=True): 15 | try: 16 | self.config = config 17 | self.logger = logger 18 | self.logger.is_log_enable = is_log_enable 19 | self.good_file_path = self.config["artifacts"]['training_data']['good_file_path'] 20 | self.unwanted_column_names = self.config["dataset"]['unwanted_column'] 21 | self.mongo_db = MongoDBOperation() 22 | self.dataset_database = self.config["dataset"]["database_detail"]["training_database_name"] 23 | self.dataset_collection_name = self.config["dataset"]["database_detail"]["dataset_training_collection_name"] 24 | self.mongo_db.drop_collection(self.dataset_database, self.dataset_collection_name) 25 | except Exception as e: 26 | generic_exception = GenericException( 27 | "Error occurred in module [{0}] class [{1}] method [{2}]" 28 | .format(self.__module__, DataTransformer.__name__, 29 | self.__init__.__name__)) 30 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 31 | 32 | def unite_dataset(self): 33 | try: 34 | dataset_list = [] 35 | for file in os.listdir(self.good_file_path): 36 | dataset_list.append(pd.read_csv(os.path.join(self.good_file_path, file))) 37 | df = pd.concat(dataset_list) 38 | df = self.remove_unwanted_column(df) 39 | self.logger.log(f"Inserting dataset into database {self.dataset_database} " 40 | f"collection_name: {self.dataset_collection_name}") 41 | self.mongo_db.insert_dataframe_into_collection(self.dataset_database, self.dataset_collection_name, df) 42 | except Exception as e: 43 | generic_exception = GenericException( 44 | "Error occurred in module [{0}] class [{1}] method [{2}]" 45 | .format(self.__module__, DataTransformer.__name__, 46 | self.unite_dataset.__name__)) 47 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 48 | 49 | def remove_unwanted_column(self, df): 50 | try: 51 | print(self.unwanted_column_names) 52 | column_to_remove = list(filter(lambda x: x in df.columns, self.unwanted_column_names)) 53 | if len(column_to_remove) > 0: 54 | return df.drop(column_to_remove, axis=1) 55 | return df 56 | except Exception as e: 57 | generic_exception = GenericException( 58 | "Error occurred in module [{0}] class [{1}] method [{2}]" 59 | .format(self.__module__, DataTransformer.__name__, 60 | self.remove_unwanted_column.__name__)) 61 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 62 | 63 | 64 | def transform_main(config_path: str, datasource: str, is_logging_enable=True, execution_id=None, 65 | executed_by=None) -> None: 66 | try: 67 | logger = get_logger_object_of_training(config_path=config_path, collection_name=log_collection_name, 68 | execution_id=execution_id, executed_by=executed_by) 69 | logger.is_log_enable = is_logging_enable 70 | config = read_params(config_path) 71 | data_transformer = DataTransformer(config=config, logger=logger, is_log_enable=is_logging_enable) 72 | logger.log('Start of Data Preprocessing before DB') 73 | data_transformer.unite_dataset() 74 | logger.log('Data Preprocessing before DB Completed !!') 75 | 76 | except Exception as 
e: 77 | generic_exception = GenericException( 78 | "Error occurred in module [{0}] method [{1}]" 79 | .format(transform_main.__module__, 80 | transform_main.__name__)) 81 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 82 | 83 | 84 | if __name__ == '__main__': 85 | args = argparse.ArgumentParser() 86 | args.add_argument("--config", default=os.path.join("config", "params.yaml")) 87 | args.add_argument("--datasource", default=None) 88 | parsed_args = args.parse_args() 89 | print("started") 90 | transform_main(config_path=parsed_args.config, datasource=parsed_args.datasource) 91 | -------------------------------------------------------------------------------- /training/stage_03_data_exporter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from utility import read_params, create_directory_path 5 | from mongo_db.mongo_db_atlas import MongoDBOperation 6 | import argparse 7 | from utility import get_logger_object_of_training 8 | 9 | from insurance_exception.insurance_exception import InsuranceException as GenericException 10 | 11 | 12 | log_collection_name = "data_export" 13 | 14 | 15 | class DataExporter: 16 | def __init__(self, config, logger, is_log_enable): 17 | try: 18 | self.config = config 19 | self.logger = logger 20 | self.is_log_enable = is_log_enable 21 | self.mongo_db = MongoDBOperation() 22 | self.dataset_database = self.config["dataset"]["database_detail"]["training_database_name"] 23 | self.dataset_collection_name = self.config["dataset"]["database_detail"]["dataset_training_collection_name"] 24 | self.training_file_from_db = self.config["artifacts"]['training_data']['training_file_from_db'] 25 | self.master_csv = self.config["artifacts"]['training_data']['master_csv'] 26 | except Exception as e: 27 | generic_exception = GenericException( 28 | "Error occurred in module [{0}] class [{1}] method [{2}]" 29 | .format(self.__module__, DataExporter.__name__, 30 | self.__init__.__name__)) 31 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 32 | 33 | def export_dataframe_from_database(self): 34 | try: 35 | create_directory_path(self.training_file_from_db) 36 | self.logger.log(f"Creating dataframe of data stored db" 37 | f"[{self.dataset_database}] and collection[{self.dataset_collection_name}]") 38 | df = self.mongo_db.get_dataframe_of_collection(db_name=self.dataset_database, 39 | collection_name=self.dataset_collection_name) 40 | master_csv_file_path = os.path.join(self.training_file_from_db, self.master_csv) 41 | self.logger.log(f"master csv file will be generated at " 42 | f"{master_csv_file_path}.") 43 | df.to_csv(master_csv_file_path, index=None,header=True) 44 | 45 | except Exception as e: 46 | generic_exception = GenericException( 47 | "Error occurred in module [{0}] class [{1}] method [{2}]" 48 | .format(self.__module__, DataExporter.__name__, 49 | self.export_dataframe_from_database.__name__)) 50 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 51 | 52 | 53 | def export_main(config_path: str, is_logging_enable=True,execution_id=None,executed_by=None) -> None: 54 | try: 55 | logger = get_logger_object_of_training(config_path=config_path, collection_name=log_collection_name, 56 | execution_id=execution_id, executed_by=executed_by) 57 | 58 | logger.is_log_enable = is_logging_enable 59 | config = read_params(config_path) 60 | data_exporter = DataExporter(config=config, logger=logger, is_log_enable=is_logging_enable) 61 | 
logger.log("Generating csv file from dataset stored in database.") 62 | data_exporter.export_dataframe_from_database() 63 | logger.log("Dataset has been successfully exported in directory and exiting export pipeline.") 64 | except Exception as e: 65 | generic_exception = GenericException( 66 | "Error occurred in module [{0}] method [{1}]" 67 | .format(export_main.__module__, 68 | export_main.__name__)) 69 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 70 | 71 | 72 | if __name__ == '__main__': 73 | args = argparse.ArgumentParser() 74 | args.add_argument("--config", default=os.path.join("config", "params.yaml")) 75 | parsed_args = args.parse_args() 76 | print("started") 77 | export_main(config_path=parsed_args.config) 78 | -------------------------------------------------------------------------------- /training/stage_04_model_trainer.py: -------------------------------------------------------------------------------- 1 | 2 | import random 3 | import sys 4 | 5 | import os 6 | import argparse 7 | 8 | from pyspark.sql.types import IntegerType, StringType, FloatType 9 | from sklearn.metrics import r2_score, mean_squared_error 10 | 11 | from utility import create_directory_path,read_params 12 | import numpy as np 13 | from utility import get_logger_object_of_training 14 | from pyspark.ml import Pipeline 15 | from pyspark.ml.regression import RandomForestRegressor 16 | from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler 17 | 18 | from insurance_exception.insurance_exception import InsuranceException as GenericException 19 | 20 | from streaming.spark_manager.spark_manager import SparkManager 21 | 22 | log_collection_name = "training_model" 23 | 24 | 25 | class DataPreProcessing: 26 | def __init__(self, logger, is_log_enable=True, data_frame=None, pipeline_path=None): 27 | try: 28 | self.logger = logger 29 | self.logger.is_log_enable = is_log_enable 30 | self.data_frame = data_frame 31 | self.stages = [] 32 | self.pipeline_path = pipeline_path 33 | except Exception as e: 34 | generic_exception = GenericException( 35 | "Error occurred in module [{0}] class [{1}] method [{2}]" 36 | .format(self.__module__, DataPreProcessing.__name__, 37 | self.__init__.__name__)) 38 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 39 | 40 | def set_dataframe(self, dataframe): 41 | try: 42 | self.data_frame = dataframe 43 | except Exception as e: 44 | generic_exception = GenericException( 45 | "Error occurred in module [{0}] class [{1}] method [{2}]" 46 | .format(self.__module__, DataPreProcessing.__name__, 47 | self.update_dataframe_scheme.__name__)) 48 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 49 | 50 | def update_dataframe_scheme(self, schema_definition: dict): 51 | """ 52 | 53 | """ 54 | try: 55 | print(self.data_frame.printSchema()) 56 | if self.data_frame is None: 57 | raise Exception("update the attribute dataframe") 58 | for column, datatype in schema_definition.items(): 59 | self.logger.log(f"Update datatype of column: {column} to {str(datatype)}") 60 | self.data_frame = self.data_frame.withColumn(column, self.data_frame[column].cast(datatype)) 61 | except Exception as e: 62 | generic_exception = GenericException( 63 | "Error occurred in module [{0}] class [{1}] method [{2}]" 64 | .format(self.__module__, DataPreProcessing.__name__, 65 | self.update_dataframe_scheme.__name__)) 66 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 67 | 68 | def 
encode_categorical_column(self, input_columns: list): 69 | try: 70 | string_indexer = StringIndexer(inputCols=input_columns, 71 | outputCols=[f"{column}_encoder" for column in input_columns]) 72 | self.stages.append(string_indexer) 73 | one_hot_encoder = OneHotEncoder(inputCols=string_indexer.getOutputCols(), 74 | outputCols=[f"{column}_encoded" for column in input_columns]) 75 | self.stages.append(one_hot_encoder) 76 | 77 | except Exception as e: 78 | generic_exception = GenericException( 79 | "Error occurred in module [{0}] class [{1}] method [{2}]" 80 | .format(self.__module__, DataPreProcessing.__name__, 81 | self.encode_categorical_column.__name__)) 82 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 83 | 84 | def create_input_features(self, required_column: list): 85 | """ 86 | 87 | """ 88 | try: 89 | vector_assembler = VectorAssembler(inputCols=required_column, outputCol="input_features") 90 | self.stages.append(vector_assembler) 91 | except Exception as e: 92 | generic_exception = GenericException( 93 | "Error occurred in module [{0}] class [{1}] method [{2}]" 94 | .format(self.__module__, DataPreProcessing.__name__, 95 | self.create_input_features.__name__)) 96 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 97 | 98 | def get_train_test_dataframe(self, test_size=0.2): 99 | try: 100 | train_df, test_df = self.data_frame.randomSplit([1 - test_size, test_size], seed=random.randint(0, 1000)) 101 | self.logger.log(f"Training dataset count {train_df.count()}") 102 | self.logger.log(f"Test dataset count {test_df.count()}") 103 | return train_df, test_df 104 | except Exception as e: 105 | generic_exception = GenericException( 106 | "Error occurred in module [{0}] class [{1}] method [{2}]" 107 | .format(self.__module__, DataPreProcessing.__name__, 108 | self.get_train_test_dataframe.__name__)) 109 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 110 | 111 | def get_prepared_dataset(self, ): 112 | try: 113 | schema_definition = {"age": IntegerType(), 114 | "sex": StringType(), 115 | "bmi": FloatType(), 116 | "children": IntegerType(), 117 | "smoker": StringType(), 118 | "expenses": FloatType() 119 | } 120 | self.update_dataframe_scheme(schema_definition=schema_definition) 121 | self.encode_categorical_column(input_columns=["sex", "smoker"]) 122 | required_column = ['age', 'bmi', 'children', 'sex_encoded', 'smoker_encoded', ] 123 | self.create_input_features(required_column=required_column) 124 | pipeline = Pipeline(stages=self.stages) 125 | pipeline_fitted_obj = pipeline.fit(self.data_frame) 126 | self.data_frame = pipeline_fitted_obj.transform(self.data_frame) 127 | # os.remove(path=self.pipeline_path) 128 | create_directory_path(self.pipeline_path, is_recreate=True) 129 | pipeline_fitted_obj.write().overwrite().save(self.pipeline_path) 130 | return self.get_train_test_dataframe(test_size=0.2) 131 | except Exception as e: 132 | generic_exception = GenericException( 133 | "Error occurred in module [{0}] class [{1}] method [{2}]" 134 | .format(self.__module__, DataPreProcessing.__name__, 135 | self.get_prepared_dataset.__name__)) 136 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 137 | 138 | 139 | class ModelTrainer: 140 | 141 | def __init__(self, config, logger, is_log_enable): 142 | try: 143 | self.logger = logger 144 | self.logger.is_log_enable = is_log_enable 145 | self.config = config 146 | self.training_file_path = 
self.config['artifacts']['training_data']['training_file_from_db'] 147 | self.master_csv = self.config['artifacts']['training_data']['master_csv'] 148 | self.target_columns = self.config['target_columns']['columns'] 149 | self.test_size = self.config['base']['test_size'] 150 | self.random_state = self.config['base']['random_state'] 151 | self.plot = self.config['artifacts']['training_data']['plots'] 152 | self.pipeline_path = self.config['artifacts']['training_data']['pipeline_path'] 153 | self.model_path = config['artifacts']['model']['model_path'] 154 | self.null_value_file_path = config['artifacts']['training_data']['null_value_info_file_path'] 155 | """ 156 | self.spark = SparkSession.builder.\ 157 | master("local[*]").\ 158 | appName("insurance-premium-reg").getOrCreate() 159 | """ 160 | self.spark = SparkManager().get_spark_session_object() 161 | """ 162 | self.spark=SparkSession.builder.appName('app_name') \ 163 | .master('local[*]') \ 164 | .config('spark.sql.execution.arrow.pyspark.enabled', True) \ 165 | .config('spark.sql.session.timeZone', 'UTC') \ 166 | .config('spark.driver.memory', '32G') \ 167 | .config('spark.ui.showConsoleProgress', True) \ 168 | .config('spark.sql.repl.eagerEval.enabled', True) \ 169 | .getOrCreate() 170 | """ 171 | except Exception as e: 172 | generic_exception = GenericException( 173 | "Error occurred in module [{0}] class [{1}] method [{2}]" 174 | .format(self.__module__, ModelTrainer.__name__, 175 | self.__init__.__name__)) 176 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 177 | 178 | 179 | def save_regression_metric_data(self, y_true, y_pred, title): 180 | try: 181 | y_true = np.array(y_true).reshape(-1) 182 | y_pred = np.array(y_pred).reshape(-1) 183 | rmse = np.sqrt(mean_squared_error(y_true, y_pred)) 184 | r_squared_score = r2_score(y_true, y_pred) 185 | msg = f"{title} R squared score: {r_squared_score:.3%}" 186 | self.logger.log(msg) 187 | print(msg) 188 | msg = f"{title} Root mean squared error: {rmse:.3}" 189 | self.logger.log(msg) 190 | print(msg) 191 | except Exception as e: 192 | generic_exception = GenericException( 193 | "Error occurred in module [{0}] class [{1}] method [{2}]" 194 | .format(self.__module__, ModelTrainer.__name__, 195 | self.save_regression_metric_data.__name__)) 196 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 197 | 198 | def get_dataframe(self): 199 | try: 200 | master_file_path = os.path.join(self.training_file_path, self.master_csv) 201 | 202 | return self.spark.read.csv(master_file_path, header=True, inferSchema=True) 203 | except Exception as e: 204 | generic_exception = GenericException( 205 | "Error occurred in module [{0}] class [{1}] method [{2}]" 206 | .format(self.__module__, ModelTrainer.__name__, 207 | self.get_dataframe.__name__)) 208 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 209 | 210 | def data_preparation(self): 211 | try: 212 | data_frame = self.get_dataframe() 213 | preprocessing = DataPreProcessing(logger=self.logger, 214 | is_log_enable=self.logger.is_log_enable, 215 | data_frame=data_frame, 216 | pipeline_path=self.pipeline_path) 217 | return preprocessing.get_prepared_dataset() 218 | except Exception as e: 219 | generic_exception = GenericException( 220 | "Error occurred in module [{0}] class [{1}] method [{2}]" 221 | .format(self.__module__, ModelTrainer.__name__, 222 | self.data_preparation.__name__)) 223 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 224 | 225 | 
def begin_training(self): 226 | try: 227 | train_df, test_df = self.data_preparation() 228 | random_forest_regressor = RandomForestRegressor(featuresCol="input_features", labelCol="expenses") 229 | random_forest_model = random_forest_regressor.fit(train_df) 230 | train_prediction = random_forest_model.transform(train_df) 231 | testing_prediction = random_forest_model.transform(test_df) 232 | training_data = train_prediction.select("expenses", "prediction").toPandas() 233 | testing_data = testing_prediction.select("expenses", "prediction").toPandas() 234 | self.save_regression_metric_data(training_data['expenses'], training_data['prediction'], 235 | title="Training score") 236 | self.save_regression_metric_data(testing_data['expenses'], testing_data['prediction'], 237 | title="Testing score") 238 | 239 | self.save_model(model=random_forest_model, model_name="random_forest_regressor") 240 | self.spark.stop() 241 | except Exception as e: 242 | generic_exception = GenericException( 243 | "Error occurred in module [{0}] class [{1}] method [{2}]" 244 | .format(self.__module__, ModelTrainer.__name__, 245 | self.begin_training.__name__)) 246 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 247 | 248 | def save_model(self, model, model_name, intermediate_path=None): 249 | try: 250 | 251 | if intermediate_path is None: 252 | model_path = os.path.join(self.model_path) 253 | else: 254 | model_path = os.path.join(self.model_path, intermediate_path) 255 | create_directory_path(model_path) 256 | model_full_path = os.path.join(model_path, model_name) 257 | self.logger.log(f"Saving model: {model_name} at path {model_full_path}") 258 | # os.remove(path=model_full_path) 259 | model.write().overwrite().save(model_full_path) 260 | 261 | except Exception as e: 262 | generic_exception = GenericException( 263 | "Error occurred in module [{0}] class [{1}] method [{2}]" 264 | .format(self.__module__, ModelTrainer.__name__, 265 | self.save_model.__name__)) 266 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 267 | 268 | 269 | def train_main(config_path: str, datasource: str, is_logging_enable=True, execution_id=None, executed_by=None) -> None: 270 | try: 271 | logger = get_logger_object_of_training(config_path=config_path, collection_name=log_collection_name, 272 | execution_id=execution_id, executed_by=executed_by) 273 | 274 | logger.is_log_enable = is_logging_enable 275 | logger.log("Training started.") 276 | config = read_params(config_path) 277 | model_trainer = ModelTrainer(config=config, logger=logger, is_log_enable=is_logging_enable) 278 | model_trainer.begin_training() 279 | logger.log("Training completed successfully.") 280 | 281 | except Exception as e: 282 | generic_exception = GenericException( 283 | "Error occurred in module [{0}] method [{1}]" 284 | .format(train_main.__module__, 285 | train_main.__name__)) 286 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 287 | 288 | 289 | if __name__ == '__main__': 290 | args = argparse.ArgumentParser() 291 | args.add_argument("--config", default=os.path.join("config", "params.yaml")) 292 | args.add_argument("--datasource", default=None) 293 | parsed_args = args.parse_args() 294 | print(parsed_args.config) 295 | print(parsed_args.datosource if False else parsed_args.datasource) 296 | train_main(config_path=parsed_args.config, datasource=parsed_args.datasource) 297 | --------------------------------------------------------------------------------
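Note: the trainer above persists two artifacts, the fitted preprocessing pipeline (via `pipeline_fitted_obj.write().overwrite().save(...)`) and the random forest model (via `ModelTrainer.save_model`). Below is a minimal sketch of how they could be reloaded for ad-hoc scoring; the artifact paths and the `new_data.csv` input are illustrative placeholders (the real locations come from `config/params.yaml`), and it assumes the same `SparkManager` session helper the trainer uses.

```python
# Illustrative reload/score sketch -- the string paths are placeholders; in practice
# read them from config['artifacts'][...] as the training and prediction stages do.
from pyspark.ml import PipelineModel
from pyspark.ml.regression import RandomForestRegressionModel

from streaming.spark_manager.spark_manager import SparkManager

spark = SparkManager().get_spark_session_object()

# Fitted StringIndexer + OneHotEncoder + VectorAssembler stages saved by DataPreProcessing.
pipeline_model = PipelineModel.load("artifacts/pipeline/pipeline_model")
# RandomForestRegressionModel saved by ModelTrainer.save_model().
regressor = RandomForestRegressionModel.load("artifacts/model/random_forest_regressor")

new_rows = spark.read.csv("new_data.csv", header=True, inferSchema=True)
scored = regressor.transform(pipeline_model.transform(new_rows))  # adds a "prediction" column
scored.select("age", "bmi", "children", "prediction").show()
spark.stop()
```
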
/utility.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import json 3 | from datetime import datetime 4 | 5 | import yaml 6 | import uuid 7 | import os 8 | import shutil 9 | from logger.logger import AppLogger 10 | 11 | def get_time(): 12 | """ 13 | 14 | :return: current time as an HH:MM:SS string 15 | """ 16 | return datetime.now().strftime("%H:%M:%S") 17 | 18 | def get_date(): 19 | """ 20 | 21 | :return: current date as a string 22 | """ 23 | return str(datetime.now().date()) 24 | 25 | 26 | 27 | def create_directory_path(path, is_recreate=True): 28 | """ 29 | :param path: 30 | :param is_recreate: by default the existing directory is removed and recreated; pass 31 | False if you want to keep an existing directory 32 | :return: 33 | """ 34 | try: 35 | if is_recreate: 36 | if os.path.exists(path): 37 | shutil.rmtree(path, ignore_errors=False) # remove existing directory if is_recreate is true 38 | os.makedirs(path, exist_ok=True) # if directory is present it will not alter anything 39 | return True 40 | except Exception as e: 41 | raise e 42 | 43 | 44 | def clean_data_source_dir(path, logger=None, is_logging_enable=True): 45 | try: 46 | if not os.path.exists(path): 47 | os.mkdir(path) 48 | for file in os.listdir(path): 49 | if '.gitignore' in file: 50 | continue # keep the .gitignore placeholder instead of deleting it 51 | logger.log(f"{os.path.join(path, file)} file will be deleted.") 52 | os.remove(os.path.join(path, file)) 53 | logger.log(f"{os.path.join(path, file)} file has been deleted.") 54 | except Exception as e: 55 | raise e 56 | 57 | 58 | 59 | def get_logger_object_of_training(config_path: str, collection_name, execution_id=None, executed_by=None) -> AppLogger: 60 | config = read_params(config_path) 61 | database_name = config['log_database']['training_database_name'] 62 | if execution_id is None: 63 | execution_id = str(uuid.uuid4()) 64 | if executed_by is None: 65 | executed_by = "Avnish Yadav" 66 | logger = AppLogger(project_id=5, log_database=database_name, log_collection_name=collection_name, 67 | execution_id=execution_id, executed_by=executed_by) 68 | return logger 69 | 70 | 71 | def get_logger_object_of_prediction(config_path: str, collection_name, execution_id=None, 72 | executed_by=None) -> AppLogger: 73 | config = read_params(config_path) 74 | database_name = config['log_database']['prediction_database_name'] 75 | if execution_id is None: 76 | execution_id = str(uuid.uuid4()) 77 | if executed_by is None: 78 | executed_by = "Avnish Yadav" 79 | logger = AppLogger(project_id=5, log_database=database_name, log_collection_name=collection_name, 80 | execution_id=execution_id, executed_by=executed_by) 81 | return logger 82 | 83 | 84 | def read_params(config_path: str) -> dict: 85 | with open(config_path) as yaml_file: 86 | config = yaml.safe_load(yaml_file) 87 | return config 88 | 89 | 90 | def values_from_schema_function(schema_path): 91 | try: 92 | with open(schema_path, 'r') as r: 93 | dic = json.load(r) 94 | r.close() 95 | 96 | pattern = dic['SampleFileName'] 97 | length_of_date_stamp_in_file = dic['LengthOfDateStampInFile'] 98 | length_of_time_stamp_in_file = dic['LengthOfTimeStampInFile'] 99 | column_names = dic['ColName'] 100 | number_of_columns = dic['NumberofColumns'] 101 | return pattern, length_of_date_stamp_in_file, length_of_time_stamp_in_file, column_names, number_of_columns 102 | except ValueError: 103 | raise 104 | 105 | except KeyError: 106 | raise 107 | 108 | except Exception as e: 109 | raise e 110 | 111 |
--------------------------------------------------------------------------------
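For reference, a minimal sketch of how the training stages listed above might be chained from a single driver script. The `run_training.py` file name is illustrative and not part of the repository; the function signatures are taken from `training/stage_03_data_exporter.py` and `training/stage_04_model_trainer.py`.

```python
# run_training.py -- illustrative driver, assuming the package layout shown above.
import os

from training.stage_03_data_exporter import export_main
from training.stage_04_model_trainer import train_main

CONFIG_PATH = os.path.join("config", "params.yaml")

if __name__ == "__main__":
    # Stage 03: dump the validated training rows from MongoDB into the master csv.
    export_main(config_path=CONFIG_PATH)
    # Stage 04: fit the preprocessing pipeline + RandomForestRegressor and persist both.
    train_main(config_path=CONFIG_PATH, datasource=None)
```

Each stage builds its own logger through `get_logger_object_of_training`, so nothing beyond `config/params.yaml` needs to be shared between the two calls.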