├── .gitignore
├── .idea
├── .gitignore
├── inspectionProfiles
│ └── profiles_settings.xml
├── misc.xml
├── modules.xml
├── pyspark machine learning.iml
└── vcs.xml
├── README.md
├── __pycache__
└── utility.cpython-37.pyc
├── artifacts
├── Prediction_Output_DIR
│ └── Predictions.csv
├── model
│ └── random_forest_regressor
│ │ ├── data
│ │ ├── ._SUCCESS.crc
│ │ ├── .part-00000-747a037b-1756-420a-87dd-e6e880fd80e5-c000.snappy.parquet.crc
│ │ ├── _SUCCESS
│ │ └── part-00000-747a037b-1756-420a-87dd-e6e880fd80e5-c000.snappy.parquet
│ │ ├── metadata
│ │ ├── ._SUCCESS.crc
│ │ ├── .part-00000.crc
│ │ ├── _SUCCESS
│ │ └── part-00000
│ │ └── treesMetadata
│ │ ├── ._SUCCESS.crc
│ │ ├── .part-00000-4410d810-00a4-4a7e-af39-cb820647e48d-c000.snappy.parquet.crc
│ │ ├── _SUCCESS
│ │ └── part-00000-4410d810-00a4-4a7e-af39-cb820647e48d-c000.snappy.parquet
├── pipeline
│ └── pipeline_model
│ │ ├── metadata
│ │ ├── ._SUCCESS.crc
│ │ ├── .part-00000.crc
│ │ ├── _SUCCESS
│ │ └── part-00000
│ │ └── stages
│ │ ├── 0_StringIndexer_046a38b797e0
│ │ ├── data
│ │ │ ├── ._SUCCESS.crc
│ │ │ ├── .part-00000-167b16f5-b0d5-4c5d-8ed3-b32962f50429-c000.snappy.parquet.crc
│ │ │ ├── _SUCCESS
│ │ │ └── part-00000-167b16f5-b0d5-4c5d-8ed3-b32962f50429-c000.snappy.parquet
│ │ └── metadata
│ │ │ ├── ._SUCCESS.crc
│ │ │ ├── .part-00000.crc
│ │ │ ├── _SUCCESS
│ │ │ └── part-00000
│ │ ├── 1_OneHotEncoder_ccd93b498912
│ │ ├── data
│ │ │ ├── ._SUCCESS.crc
│ │ │ ├── .part-00000-624bba07-7841-4932-a362-837252099edf-c000.snappy.parquet.crc
│ │ │ ├── _SUCCESS
│ │ │ └── part-00000-624bba07-7841-4932-a362-837252099edf-c000.snappy.parquet
│ │ └── metadata
│ │ │ ├── ._SUCCESS.crc
│ │ │ ├── .part-00000.crc
│ │ │ ├── _SUCCESS
│ │ │ └── part-00000
│ │ └── 2_VectorAssembler_c52ccaa0dc60
│ │ └── metadata
│ │ ├── ._SUCCESS.crc
│ │ ├── .part-00000.crc
│ │ ├── _SUCCESS
│ │ └── part-00000
├── prediction_data
│ ├── Prediction_FileFromDB
│ │ └── master.csv
│ └── Prediction_Raw_files_validated
│ │ └── Good_raw
│ │ └── HealthPrem_26092020_131534.csv
└── training_data
│ ├── Training_FileFromDB
│ └── master.csv
│ └── Training_Raw_files_validated
│ └── Good_raw
│ └── HealthPrem_26092020_131534.csv
├── config
├── params.yaml
├── schema_prediction.json
└── schema_training.json
├── csv_to_kafka.py
├── data
├── Prediction_Batch_files
│ └── HealthPrem_26092020_131534.csv
└── training_batch_files
│ ├── .gitignore
│ └── HealthPrem_26092020_131534.csv
├── diagram
├── Drawing1.vsdx
├── streaming.jpg
└── training and prediction.pdf
├── entry_point.py
├── insurance_exception
├── __init__.py
├── __pycache__
│ ├── __init__.cpython-37.pyc
│ └── insurance_exception.cpython-37.pyc
└── insurance_exception.py
├── insurance_prediction.egg-info
├── PKG-INFO
├── SOURCES.txt
├── dependency_links.txt
└── top_level.txt
├── logger
├── __init__.py
├── __pycache__
│ ├── __init__.cpython-37.pyc
│ └── logger.cpython-37.pyc
└── logger.py
├── mongo_db
├── __init__.py
├── __pycache__
│ ├── __init__.cpython-37.pyc
│ └── mongo_db_atlas.cpython-37.pyc
└── mongo_db_atlas.py
├── new_data.csv
├── prediction
├── __init__.py
├── stage_00_data_loader.py
├── stage_01_data_validator.py
├── stage_02_data_transformer.py
├── stage_03_data_exporter.py
└── stage_04_model_predictor.py
├── prediction_files
└── HealthPrem_26092020_131534.csv
├── requirement.txt
├── setup.py
├── spark_consumer_from_kafka.py
├── streaming
├── __init__.py
├── __pycache__
│ ├── __init__.cpython-37.pyc
│ └── __init__.cpython-38.pyc
├── consumer
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-37.pyc
│ │ ├── __init__.cpython-38.pyc
│ │ ├── kafka_to_spark_csv_consumer.cpython-37.pyc
│ │ └── kafka_to_spark_csv_consumer.cpython-38.pyc
│ └── kafka_to_spark_csv_consumer.py
├── producer
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-37.pyc
│ │ ├── __init__.cpython-38.pyc
│ │ ├── kafka_csv_data_producer.cpython-37.pyc
│ │ └── kafka_csv_data_producer.cpython-38.pyc
│ └── kafka_csv_data_producer.py
├── spark_manager
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-37.pyc
│ │ ├── __init__.cpython-38.pyc
│ │ ├── spark_manager.cpython-37.pyc
│ │ └── spark_manager.cpython-38.pyc
│ └── spark_manager.py
└── transformer
│ ├── __init__.py
│ ├── __pycache__
│ ├── __init__.cpython-37.pyc
│ ├── __init__.cpython-38.pyc
│ ├── spark_transformer.cpython-37.pyc
│ └── spark_transformer.cpython-38.pyc
│ └── spark_transformer.py
├── training
├── __init__.py
├── stage_00_data_loader.py
├── stage_01_data_validator.py
├── stage_02_data_transformer.py
├── stage_03_data_exporter.py
└── stage_04_model_trainer.py
└── utility.py
/.gitignore:
--------------------------------------------------------------------------------
1 | insurance_prediction.egg-info
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/pyspark machine learning.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Spark Configuration on Windows 10
2 |
3 | 1. Download all the required files from the URL below:
4 | ```
5 | https://drive.google.com/drive/folders/1rBauyUVCRTbnKXgkMGh4l9MdIOVj8CQc?usp=sharing
6 | ```
7 |
8 | 2. Install the Java .exe file
9 | > Note: choose the "C:" drive as the Java installation path
10 |
11 | 3. Extract the Spark archive to the C: drive
12 |
13 | 4. Extract the Kafka archive to the C: drive
14 |
15 | 5. Add the following environment variables:
16 |
17 |
18 | | ENVIRONMENT VARIABLE NAME | VALUE |
19 | | ------------------------- | ----- |
20 | | HADOOP_HOME | C:\winutils |
21 | | JAVA_HOME | C:\Java\jdk1.8.0_202 |
22 | | SPARK_HOME | C:\spark-3.0.3-bin-hadoop2.7 |
23 |
36 | 6. Select the Path variable under environment variables and add the values below (a quick sanity check follows these entries).
37 | ```buildoutcfg
38 | %SPARK_HOME%\bin
39 | ```
40 | ```buildoutcfg
41 | %HADOOP_HOME%\bin
42 | ```
43 | ```buildoutcfg
44 | %JAVA_HOME%\bin
45 | ```
46 | ```buildoutcfg
47 | C:\Java\jre1.8.0_281\bin
48 | ```
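Optionally verify the setup from a new command prompt before moving on (the version strings printed should match the Java and Spark builds installed above):
```buildoutcfg
java -version
```
```buildoutcfg
spark-submit --version
```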
49 | ## Create conda environment
50 |
51 | 1. Open a conda terminal and execute the command below:
52 |
53 | ```buildoutcfg
54 | conda create -n <env_name> python=3.8 -y
55 | ```
56 |
57 | 2. Select the environment created in the previous step as the project interpreter in PyCharm.
58 |
59 | 3. Install all the necessary Python libraries specified in the requirement.txt file using the command below.
60 | ```buildoutcfg
61 | pip install -r requirement.txt
62 | ```
63 |
64 |
65 | 4. To upload your code to a GitHub repo:
66 | ```
67 | git init
68 | git add .
69 | git commit -m "first commit"
70 | git branch -M main
71 | git remote add origin <your-github-repo-url>
72 | git push -u origin main
73 | ```
74 |
75 | ## Train random forest model on insurance dataset
76 | ```buildoutcfg
77 | python training\stage_00_data_loader.py
78 | ```
79 | ```buildoutcfg
80 | python training\stage_01_data_validator.py
81 | ```
82 | ```buildoutcfg
83 | python training\stage_02_data_transformer.py
84 | ```
85 | ```buildoutcfg
86 | python training\stage_03_data_exporter.py
87 | ```
88 | ```buildoutcfg
89 | spark-submit training\stage_04_model_trainer.py
90 | ```
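The trainer stage writes the fitted preprocessing pipeline to `artifacts/pipeline/pipeline_model` and the trained regressor to `artifacts/model/random_forest_regressor` (see the artifacts tree above). A minimal sketch of reusing those saved artifacts outside the stage scripts (illustrative only, not part of this repo's code; the column names come from the saved metadata):
```python
from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel
from pyspark.ml.regression import RandomForestRegressionModel

spark = SparkSession.builder.appName("insurance-premium").getOrCreate()

# load the fitted preprocessing pipeline and the trained model exported by the stages above
pipeline = PipelineModel.load("artifacts/pipeline/pipeline_model")
model = RandomForestRegressionModel.load("artifacts/model/random_forest_regressor")

# score a batch of raw records (schema: age, sex, bmi, children, smoker)
df = spark.read.csv("artifacts/prediction_data/Prediction_FileFromDB/master.csv",
                    header=True, inferSchema=True)
features = pipeline.transform(df)        # adds the "input_features" vector column
predictions = model.transform(features)  # adds the "prediction" column
predictions.show(5)
```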
91 |
92 | ## Prediction using the random forest model on the insurance dataset
93 | ```buildoutcfg
94 | python prediction\stage_00_data_loader.py
95 | ```
96 | ```buildoutcfg
97 | python prediction\stage_01_data_validator.py
98 | ```
99 | ```buildoutcfg
100 | python prediction\stage_02_data_transformer.py
101 | ```
102 | ```buildoutcfg
103 | python prediction\stage_03_data_exporter.py
104 | ```
105 | ```buildoutcfg
106 | spark-submit prediction\stage_04_model_predictor.py
107 | ```
108 |
109 |
110 |
111 | ## Start the ZooKeeper and Kafka servers
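From the Kafka folder extracted to the C: drive in step 4 (the exact folder name depends on the Kafka version downloaded), start ZooKeeper and then the Kafka broker, each in its own terminal:
```buildoutcfg
.\bin\windows\zookeeper-server-start.bat .\config\zookeeper.properties
```
```buildoutcfg
.\bin\windows\kafka-server-start.bat .\config\server.properties
```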
112 |
113 |
114 |
115 | ## Start the Kafka producer using the command below
116 | ```buildoutcfg
117 | spark-submit csv_to_kafka.py
118 | ```
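For reference, a CSV-to-Kafka producer can be as simple as the sketch below (an illustration only, not the contents of `csv_to_kafka.py`; the `kafka-python` client, the topic name `insurance`, and the broker address are assumptions):
```python
import csv
import json
from kafka import KafkaProducer  # kafka-python client (assumed)

producer = KafkaProducer(
    bootstrap_servers="localhost:9092",
    value_serializer=lambda v: json.dumps(v).encode("utf-8"),
)

# stream each CSV row to the (assumed) "insurance" topic as a JSON record
with open("data/Prediction_Batch_files/HealthPrem_26092020_131534.csv", newline="") as f:
    for row in csv.DictReader(f):
        producer.send("insurance", row)
producer.flush()
```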
119 |
120 | ## Start the PySpark consumer using the command below
121 | ```buildoutcfg
122 | spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1 spark_consumer_from_kafka.py
123 | ```
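For reference, the core of a Kafka-to-Spark consumer built on Structured Streaming looks roughly like the sketch below (an illustration only, not the contents of `spark_consumer_from_kafka.py`; the topic name and broker address are assumptions):
```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("kafka-consumer").getOrCreate()

# subscribe to the (assumed) "insurance" topic on a local broker
stream = (spark.readStream
          .format("kafka")
          .option("kafka.bootstrap.servers", "localhost:9092")
          .option("subscribe", "insurance")
          .load())

# Kafka values arrive as bytes; cast to string and echo incoming rows to the console
query = (stream.selectExpr("CAST(value AS STRING) AS csv_row")
         .writeStream
         .format("console")
         .start())
query.awaitTermination()
```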
--------------------------------------------------------------------------------
/__pycache__/utility.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/__pycache__/utility.cpython-37.pyc
--------------------------------------------------------------------------------
/artifacts/model/random_forest_regressor/data/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc
--------------------------------------------------------------------------------
/artifacts/model/random_forest_regressor/data/.part-00000-747a037b-1756-420a-87dd-e6e880fd80e5-c000.snappy.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/data/.part-00000-747a037b-1756-420a-87dd-e6e880fd80e5-c000.snappy.parquet.crc
--------------------------------------------------------------------------------
/artifacts/model/random_forest_regressor/data/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/data/_SUCCESS
--------------------------------------------------------------------------------
/artifacts/model/random_forest_regressor/data/part-00000-747a037b-1756-420a-87dd-e6e880fd80e5-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/data/part-00000-747a037b-1756-420a-87dd-e6e880fd80e5-c000.snappy.parquet
--------------------------------------------------------------------------------
/artifacts/model/random_forest_regressor/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc
--------------------------------------------------------------------------------
/artifacts/model/random_forest_regressor/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/artifacts/model/random_forest_regressor/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/metadata/_SUCCESS
--------------------------------------------------------------------------------
/artifacts/model/random_forest_regressor/metadata/part-00000:
--------------------------------------------------------------------------------
1 | {"class":"org.apache.spark.ml.regression.RandomForestRegressionModel","timestamp":1636780450788,"sparkVersion":"3.0.3","uid":"RandomForestRegressor_ae83aa5164df","paramMap":{"labelCol":"expenses","featuresCol":"input_features"},"defaultParamMap":{"labelCol":"label","maxBins":32,"bootstrap":true,"cacheNodeIds":false,"predictionCol":"prediction","featureSubsetStrategy":"auto","featuresCol":"features","seed":469049852166159693,"leafCol":"","minInstancesPerNode":1,"checkpointInterval":10,"minInfoGain":0.0,"numTrees":20,"subsamplingRate":1.0,"maxDepth":5,"maxMemoryInMB":256,"impurity":"variance","minWeightFractionPerNode":0.0},"numFeatures":5,"numTrees":20}
2 |
--------------------------------------------------------------------------------
/artifacts/model/random_forest_regressor/treesMetadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc
--------------------------------------------------------------------------------
/artifacts/model/random_forest_regressor/treesMetadata/.part-00000-4410d810-00a4-4a7e-af39-cb820647e48d-c000.snappy.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/treesMetadata/.part-00000-4410d810-00a4-4a7e-af39-cb820647e48d-c000.snappy.parquet.crc
--------------------------------------------------------------------------------
/artifacts/model/random_forest_regressor/treesMetadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/treesMetadata/_SUCCESS
--------------------------------------------------------------------------------
/artifacts/model/random_forest_regressor/treesMetadata/part-00000-4410d810-00a4-4a7e-af39-cb820647e48d-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/treesMetadata/part-00000-4410d810-00a4-4a7e-af39-cb820647e48d-c000.snappy.parquet
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/metadata/_SUCCESS
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/metadata/part-00000:
--------------------------------------------------------------------------------
1 | {"class":"org.apache.spark.ml.PipelineModel","timestamp":1636780381034,"sparkVersion":"3.0.3","uid":"PipelineModel_463619223312","paramMap":{"stageUids":["StringIndexer_046a38b797e0","OneHotEncoder_ccd93b498912","VectorAssembler_c52ccaa0dc60"]},"defaultParamMap":{}}
2 |
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/.part-00000-167b16f5-b0d5-4c5d-8ed3-b32962f50429-c000.snappy.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/.part-00000-167b16f5-b0d5-4c5d-8ed3-b32962f50429-c000.snappy.parquet.crc
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/_SUCCESS
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/part-00000-167b16f5-b0d5-4c5d-8ed3-b32962f50429-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/part-00000-167b16f5-b0d5-4c5d-8ed3-b32962f50429-c000.snappy.parquet
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/metadata/_SUCCESS
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/metadata/part-00000:
--------------------------------------------------------------------------------
1 | {"class":"org.apache.spark.ml.feature.StringIndexerModel","timestamp":1636780381434,"sparkVersion":"3.0.3","uid":"StringIndexer_046a38b797e0","paramMap":{"inputCols":["sex","smoker"],"outputCols":["sex_encoder","smoker_encoder"]},"defaultParamMap":{"handleInvalid":"error","stringOrderType":"frequencyDesc","outputCol":"StringIndexer_046a38b797e0__output"}}
2 |
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/.part-00000-624bba07-7841-4932-a362-837252099edf-c000.snappy.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/.part-00000-624bba07-7841-4932-a362-837252099edf-c000.snappy.parquet.crc
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/_SUCCESS
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/part-00000-624bba07-7841-4932-a362-837252099edf-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/part-00000-624bba07-7841-4932-a362-837252099edf-c000.snappy.parquet
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/metadata/_SUCCESS
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/metadata/part-00000:
--------------------------------------------------------------------------------
1 | {"class":"org.apache.spark.ml.feature.OneHotEncoderModel","timestamp":1636780382829,"sparkVersion":"3.0.3","uid":"OneHotEncoder_ccd93b498912","paramMap":{"inputCols":["sex_encoder","smoker_encoder"],"outputCols":["sex_encoded","smoker_encoded"]},"defaultParamMap":{"dropLast":true,"outputCol":"OneHotEncoder_ccd93b498912__output","handleInvalid":"error"}}
2 |
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/stages/2_VectorAssembler_c52ccaa0dc60/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/stages/2_VectorAssembler_c52ccaa0dc60/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/2_VectorAssembler_c52ccaa0dc60/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/stages/2_VectorAssembler_c52ccaa0dc60/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/2_VectorAssembler_c52ccaa0dc60/metadata/_SUCCESS
--------------------------------------------------------------------------------
/artifacts/pipeline/pipeline_model/stages/2_VectorAssembler_c52ccaa0dc60/metadata/part-00000:
--------------------------------------------------------------------------------
1 | {"class":"org.apache.spark.ml.feature.VectorAssembler","timestamp":1636780383948,"sparkVersion":"3.0.3","uid":"VectorAssembler_c52ccaa0dc60","paramMap":{"inputCols":["age","bmi","children","sex_encoded","smoker_encoded"],"outputCol":"input_features"},"defaultParamMap":{"outputCol":"VectorAssembler_c52ccaa0dc60__output","handleInvalid":"error"}}
2 |
--------------------------------------------------------------------------------
/artifacts/prediction_data/Prediction_FileFromDB/master.csv:
--------------------------------------------------------------------------------
1 | age,sex,bmi,children,smoker
2 | 19,female,27.9,0,yes
3 | 18,male,33.8,1,no
4 | 28,male,33.0,3,no
5 | 33,male,22.7,0,no
6 | 32,male,28.9,0,no
7 | 31,female,25.7,0,no
8 | 46,female,33.4,1,no
9 | 37,female,27.7,3,no
10 | 37,male,29.8,2,no
11 | 60,female,25.8,0,no
12 | 25,male,26.2,0,no
13 | 62,female,26.3,0,yes
14 | 23,male,34.4,0,no
15 | 56,female,39.8,0,no
16 | 27,male,42.1,0,yes
17 | 19,male,24.6,1,no
18 | 52,female,30.8,1,no
19 | 23,male,23.8,0,no
20 | 56,male,40.3,0,no
21 | 30,male,35.3,0,yes
22 | 60,female,36.0,0,no
23 | 30,female,32.4,1,no
24 | 18,male,34.1,0,no
25 | 34,female,31.9,1,yes
26 | 37,male,28.0,2,no
27 | 59,female,27.7,3,no
28 | 63,female,23.1,0,no
29 | 55,female,32.8,2,no
30 | 23,male,17.4,1,no
31 | 31,male,36.3,2,yes
32 | 22,male,35.6,0,yes
33 | 18,female,26.3,0,no
34 | 19,female,28.6,5,no
35 | 63,male,28.3,0,no
36 | 28,male,36.4,1,yes
37 | 19,male,20.4,0,no
38 | 62,female,33.0,3,no
39 | 26,male,20.8,0,no
40 | 35,male,36.7,1,yes
41 | 60,male,39.9,0,yes
42 | 24,female,26.6,0,no
43 | 31,female,36.6,2,no
44 | 41,male,21.8,1,no
45 | 37,female,30.8,2,no
46 | 38,male,37.1,1,no
47 | 55,male,37.3,0,no
48 | 18,female,38.7,2,no
49 | 28,female,34.8,0,no
50 | 60,female,24.5,0,no
51 | 36,male,35.2,1,yes
52 | 18,female,35.6,0,no
53 | 21,female,33.6,2,no
54 | 48,male,28.0,1,yes
55 | 36,male,34.4,0,yes
56 | 40,female,28.7,3,no
57 | 58,male,37.0,2,yes
58 | 58,female,31.8,2,no
59 | 18,male,31.7,2,yes
60 | 53,female,22.9,1,yes
61 | 34,female,37.3,2,no
62 | 43,male,27.4,3,no
63 | 25,male,33.7,4,no
64 | 64,male,24.7,1,no
65 | 28,female,25.9,1,no
66 | 20,female,22.4,0,yes
67 | 19,female,28.9,0,no
68 | 61,female,39.1,2,no
69 | 40,male,26.3,1,no
70 | 40,female,36.2,0,no
71 | 28,male,24.0,3,yes
72 | 27,female,24.8,0,yes
73 | 31,male,28.5,5,no
74 | 53,female,28.1,3,no
75 | 58,male,32.0,1,no
76 | 44,male,27.4,2,no
77 | 57,male,34.0,0,no
78 | 29,female,29.6,1,no
79 | 21,male,35.5,0,no
80 | 22,female,39.8,0,no
81 | 41,female,33.0,0,no
82 | 31,male,26.9,1,no
83 | 45,female,38.3,0,no
84 | 22,male,37.6,1,yes
85 | 48,female,41.2,4,no
86 | 37,female,34.8,2,yes
87 | 45,male,22.9,2,yes
88 | 57,female,31.2,0,yes
89 | 56,female,27.2,0,no
90 | 46,female,27.7,0,no
91 | 55,female,27.0,0,no
92 | 21,female,39.5,0,no
93 | 53,female,24.8,1,no
94 | 59,male,29.8,3,yes
95 | 35,male,34.8,2,no
96 | 64,female,31.3,2,yes
97 | 28,female,37.6,1,no
98 | 54,female,30.8,3,no
99 | 55,male,38.3,0,no
100 | 56,male,20.0,0,yes
101 | 38,male,19.3,0,yes
102 | 41,female,31.6,0,no
103 | 30,male,25.5,0,no
104 | 18,female,30.1,0,no
105 | 61,female,29.9,3,yes
106 | 34,female,27.5,1,no
107 | 20,male,28.0,1,yes
108 | 19,female,28.4,1,no
109 | 26,male,30.9,2,no
110 | 29,male,27.9,0,no
111 | 63,male,35.1,0,yes
112 | 54,male,33.6,1,no
113 | 55,female,29.7,2,no
114 | 37,male,30.8,0,no
115 | 21,female,35.7,0,no
116 | 52,male,32.2,3,no
117 | 60,male,28.6,0,no
118 | 58,male,49.1,0,no
119 | 29,female,27.9,1,yes
120 | 49,female,27.2,0,no
121 | 37,female,23.4,2,no
122 | 44,male,37.1,2,no
123 | 18,male,23.8,0,no
124 | 20,female,29.0,0,no
125 | 44,male,31.4,1,yes
126 | 47,female,33.9,3,no
127 | 26,female,28.8,0,no
128 | 19,female,28.3,0,yes
129 | 52,female,37.4,0,no
130 | 32,female,17.8,2,yes
131 | 38,male,34.7,2,no
132 | 59,female,26.5,0,no
133 | 61,female,22.0,0,no
134 | 53,female,35.9,2,no
135 | 19,male,25.6,0,no
136 | 20,female,28.8,0,no
137 | 22,female,28.1,0,no
138 | 19,male,34.1,0,no
139 | 22,male,25.2,0,no
140 | 54,female,31.9,3,no
141 | 22,female,36.0,0,no
142 | 34,male,22.4,2,no
143 | 26,male,32.5,1,no
144 | 34,male,25.3,2,yes
145 | 29,male,29.7,2,no
146 | 30,male,28.7,3,yes
147 | 29,female,38.8,3,no
148 | 46,male,30.5,3,yes
149 | 51,female,37.7,1,no
150 | 53,female,37.4,1,no
151 | 19,male,28.4,1,no
152 | 35,male,24.1,1,no
153 | 48,male,29.7,0,no
154 | 32,female,37.1,3,no
155 | 42,female,23.4,0,yes
156 | 40,female,25.5,1,no
157 | 44,male,39.5,0,no
158 | 48,male,24.4,0,yes
159 | 18,male,25.2,0,yes
160 | 30,male,35.5,0,yes
161 | 50,female,27.8,3,no
162 | 42,female,26.6,0,yes
163 | 18,female,36.9,0,yes
164 | 54,male,39.6,1,no
165 | 32,female,29.8,2,no
166 | 37,male,29.6,0,no
167 | 47,male,28.2,4,no
168 | 20,female,37.0,5,no
169 | 32,female,33.2,3,no
170 | 19,female,31.8,1,no
171 | 27,male,18.9,3,no
172 | 63,male,41.5,0,no
173 | 49,male,30.3,0,no
174 | 18,male,16.0,0,no
175 | 35,female,34.8,1,no
176 | 24,female,33.3,0,no
177 | 63,female,37.7,0,yes
178 | 38,male,27.8,2,no
179 | 54,male,29.2,1,no
180 | 46,female,28.9,2,no
181 | 41,female,33.2,3,no
182 | 58,male,28.6,0,no
183 | 18,female,38.3,0,no
184 | 22,male,20.0,3,no
185 | 44,female,26.4,0,no
186 | 44,male,30.7,2,no
187 | 36,male,41.9,3,yes
188 | 26,female,29.9,2,no
189 | 30,female,30.9,3,no
190 | 41,female,32.2,1,no
191 | 29,female,32.1,2,no
192 | 61,male,31.6,0,no
193 | 36,female,26.2,0,no
194 | 25,male,25.7,0,no
195 | 56,female,26.6,1,no
196 | 18,male,34.4,0,no
197 | 19,male,30.6,0,no
198 | 39,female,32.8,0,no
199 | 45,female,28.6,2,no
200 | 51,female,18.1,0,no
201 | 64,female,39.3,0,no
202 | 19,female,32.1,0,no
203 | 48,female,32.2,1,no
204 | 60,female,24.0,0,no
205 | 27,female,36.1,0,yes
206 | 46,male,22.3,0,no
207 | 28,female,28.9,1,no
208 | 59,male,26.4,0,no
209 | 35,male,27.7,2,yes
210 | 63,female,31.8,0,no
211 | 40,male,41.2,1,no
212 | 20,male,33.0,1,no
213 | 40,male,30.9,4,no
214 | 24,male,28.5,2,no
215 | 34,female,26.7,1,no
216 | 45,female,30.9,2,no
217 | 41,female,37.1,2,no
218 | 53,female,26.6,0,no
219 | 27,male,23.1,0,no
220 | 26,female,29.9,1,no
221 | 24,female,23.2,0,no
222 | 34,female,33.7,1,no
223 | 53,female,33.3,0,no
224 | 32,male,30.8,3,no
225 | 19,male,34.8,0,yes
226 | 42,male,24.6,0,yes
227 | 55,male,33.9,3,no
228 | 28,male,38.1,0,no
229 | 58,female,41.9,0,no
230 | 41,female,31.6,1,no
231 | 47,male,25.5,2,no
232 | 42,female,36.2,1,no
233 | 59,female,27.8,3,no
234 | 19,female,17.8,0,no
235 | 59,male,27.5,1,no
236 | 39,male,24.5,2,no
237 | 40,female,22.2,2,yes
238 | 18,female,26.7,0,no
239 | 31,male,38.4,2,no
240 | 19,male,29.1,0,yes
241 | 44,male,38.1,1,no
242 | 23,female,36.7,2,yes
243 | 33,female,22.1,1,no
244 | 55,female,26.8,1,no
245 | 40,male,35.3,3,no
246 | 63,female,27.7,0,yes
247 | 54,male,30.0,0,no
248 | 60,female,38.1,0,no
249 | 24,male,35.9,0,no
250 | 19,male,20.9,1,no
251 | 29,male,29.0,1,no
252 | 18,male,17.3,2,yes
253 | 63,female,32.2,2,yes
254 | 54,male,34.2,2,yes
255 | 27,male,30.3,3,no
256 | 50,male,31.8,0,yes
257 | 55,female,25.4,3,no
258 | 56,male,33.6,0,yes
259 | 38,female,40.2,0,no
260 | 51,male,24.4,4,no
261 | 19,male,31.9,0,yes
262 | 58,female,25.2,0,no
263 | 20,female,26.8,1,yes
264 | 52,male,24.3,3,yes
265 | 19,male,37.0,0,yes
266 | 53,female,38.1,3,no
267 | 46,male,42.4,3,yes
268 | 40,male,19.8,1,yes
269 | 59,female,32.4,3,no
270 | 45,male,30.2,1,no
271 | 49,male,25.8,1,no
272 | 18,male,29.4,1,no
273 | 50,male,34.2,2,yes
274 | 41,male,37.1,2,no
275 | 50,male,27.5,1,no
276 | 25,male,27.6,0,no
277 | 47,female,26.6,2,no
278 | 19,male,20.6,2,no
279 | 22,female,24.3,0,no
280 | 59,male,31.8,2,no
281 | 51,female,21.6,1,no
282 | 40,female,28.1,1,yes
283 | 54,male,40.6,3,yes
284 | 30,male,27.6,1,no
285 | 55,female,32.4,1,no
286 | 52,female,31.2,0,no
287 | 46,male,26.6,1,no
288 | 46,female,48.1,2,no
289 | 63,female,26.2,0,no
290 | 59,female,36.8,1,yes
291 | 52,male,26.4,3,no
292 | 28,female,33.4,0,no
293 | 29,male,29.6,1,no
294 | 25,male,45.5,2,yes
295 | 22,female,28.8,0,no
296 | 25,male,26.8,3,no
297 | 18,male,23.0,0,no
298 | 19,male,27.7,0,yes
299 | 47,male,25.4,1,yes
300 | 31,male,34.4,3,yes
301 | 48,female,28.9,1,no
302 | 36,male,27.6,3,no
303 | 53,female,22.6,3,yes
304 | 56,female,37.5,2,no
305 | 28,female,33.0,2,no
306 | 57,female,38.0,2,no
307 | 29,male,33.3,2,no
308 | 28,female,27.5,2,no
309 | 30,female,33.3,1,no
310 | 58,male,34.9,0,no
311 | 41,female,33.1,2,no
312 | 50,male,26.6,0,no
313 | 19,female,24.7,0,no
314 | 43,male,36.0,3,yes
315 | 49,male,35.9,0,no
316 | 27,female,31.4,0,yes
317 | 52,male,33.3,0,no
318 | 50,male,32.2,0,no
319 | 54,male,32.8,0,no
320 | 44,female,27.6,0,no
321 | 32,male,37.3,1,no
322 | 34,male,25.3,1,no
323 | 26,female,29.6,4,no
324 | 34,male,30.8,0,yes
325 | 57,male,40.9,0,no
326 | 29,male,27.2,0,no
327 | 40,male,34.1,1,no
328 | 27,female,23.2,1,no
329 | 45,male,36.5,2,yes
330 | 64,female,33.8,1,yes
331 | 52,male,36.7,0,no
332 | 61,female,36.4,1,yes
333 | 52,male,27.4,0,yes
334 | 61,female,31.2,0,no
335 | 56,female,28.8,0,no
336 | 43,female,35.7,2,no
337 | 64,male,34.5,0,no
338 | 60,male,25.7,0,no
339 | 62,male,27.6,1,no
340 | 50,male,32.3,1,yes
341 | 46,female,27.7,1,no
342 | 24,female,27.6,0,no
343 | 62,male,30.0,0,no
344 | 60,female,27.6,0,no
345 | 63,male,36.8,0,no
346 | 49,female,41.5,4,no
347 | 34,female,29.3,3,no
348 | 33,male,35.8,2,no
349 | 46,male,33.3,1,no
350 | 36,female,29.9,1,no
351 | 19,male,27.8,0,no
352 | 57,female,23.2,0,no
353 | 50,female,25.6,0,no
354 | 30,female,27.7,0,no
355 | 33,male,35.2,0,no
356 | 18,female,38.3,0,no
357 | 46,male,27.6,0,no
358 | 46,male,43.9,3,no
359 | 47,male,29.8,3,no
360 | 23,male,41.9,0,no
361 | 18,female,20.8,0,no
362 | 48,female,32.3,2,no
363 | 35,male,30.5,1,no
364 | 19,female,21.7,0,yes
365 | 21,female,26.4,1,no
366 | 21,female,21.9,2,no
367 | 49,female,30.8,1,no
368 | 56,female,32.3,3,no
369 | 42,female,25.0,2,no
370 | 44,male,32.0,2,no
371 | 18,male,30.4,3,no
372 | 61,female,21.1,0,no
373 | 57,female,22.2,0,no
374 | 42,female,33.2,1,no
375 | 26,male,32.9,2,yes
376 | 20,male,33.3,0,no
377 | 23,female,28.3,0,yes
378 | 39,female,24.9,3,yes
379 | 24,male,40.2,0,yes
380 | 64,female,30.1,3,no
381 | 62,male,31.5,1,no
382 | 27,female,18.0,2,yes
383 | 55,male,30.7,0,yes
384 | 55,male,33.0,0,no
385 | 35,female,43.3,2,no
386 | 44,male,22.1,2,no
387 | 19,male,34.4,0,no
388 | 58,female,39.1,0,no
389 | 50,male,25.4,2,no
390 | 26,female,22.6,0,no
391 | 24,female,30.2,3,no
392 | 48,male,35.6,4,no
393 | 19,female,37.4,0,no
394 | 48,male,31.4,1,no
395 | 49,male,31.4,1,no
396 | 46,female,32.3,2,no
397 | 46,male,19.9,0,no
398 | 43,female,34.4,3,no
399 | 21,male,31.0,0,no
400 | 64,male,25.6,2,no
401 | 18,female,38.2,0,no
402 | 51,female,20.6,0,no
403 | 47,male,47.5,1,no
404 | 64,female,33.0,0,no
405 | 49,male,32.3,3,no
406 | 31,male,20.4,0,no
407 | 52,female,38.4,2,no
408 | 33,female,24.3,0,no
409 | 47,female,23.6,1,no
410 | 38,male,21.1,3,no
411 | 32,male,30.0,1,no
412 | 19,male,17.5,0,no
413 | 44,female,20.2,1,yes
414 | 26,female,17.2,2,yes
415 | 25,male,23.9,5,no
416 | 19,female,35.2,0,no
417 | 43,female,35.6,1,no
418 | 52,male,34.1,0,no
419 | 36,female,22.6,2,yes
420 | 64,male,39.2,1,no
421 | 63,female,27.0,0,yes
422 | 64,male,33.9,0,yes
423 | 61,male,35.9,0,yes
424 | 40,male,32.8,1,yes
425 | 25,male,30.6,0,no
426 | 48,male,30.2,2,no
427 | 45,male,24.3,5,no
428 | 38,female,27.3,1,no
429 | 18,female,29.2,0,no
430 | 21,female,16.8,1,no
431 | 27,female,30.4,3,no
432 | 19,male,33.1,0,no
433 | 29,female,20.2,2,no
434 | 42,male,26.9,0,no
435 | 60,female,30.5,0,no
436 | 31,male,28.6,1,no
437 | 60,male,33.1,3,no
438 | 22,male,31.7,0,no
439 | 35,male,28.9,3,no
440 | 52,female,46.8,5,no
441 | 26,male,29.5,0,no
442 | 31,female,32.7,1,no
443 | 33,female,33.5,0,yes
444 | 18,male,43.0,0,no
445 | 59,female,36.5,1,no
446 | 56,male,26.7,1,yes
447 | 45,female,33.1,0,no
448 | 60,male,29.6,0,no
449 | 56,female,25.7,0,no
450 | 40,female,29.6,0,no
451 | 35,male,38.6,1,no
452 | 39,male,29.6,4,no
453 | 30,male,24.1,1,no
454 | 24,male,23.4,0,no
455 | 20,male,29.7,0,no
456 | 32,male,46.5,2,no
457 | 59,male,37.4,0,no
458 | 55,female,30.1,2,no
459 | 57,female,30.5,0,no
460 | 56,male,39.6,0,no
461 | 40,female,33.0,3,no
462 | 49,female,36.6,3,no
463 | 42,male,30.0,0,yes
464 | 62,female,38.1,2,no
465 | 56,male,25.9,0,no
466 | 19,male,25.2,0,no
467 | 30,female,28.4,1,yes
468 | 60,female,28.7,1,no
469 | 56,female,33.8,2,no
470 | 28,female,24.3,1,no
471 | 18,female,24.1,1,no
472 | 27,male,32.7,0,no
473 | 18,female,30.1,0,no
474 | 19,female,29.8,0,no
475 | 47,female,33.3,0,no
476 | 54,male,25.1,3,yes
477 | 61,male,28.3,1,yes
478 | 24,male,28.5,0,yes
479 | 25,male,35.6,0,no
480 | 21,male,36.9,0,no
481 | 23,male,32.6,0,no
482 | 63,male,41.3,3,no
483 | 49,male,37.5,2,no
484 | 18,female,31.4,0,no
485 | 51,female,39.5,1,no
486 | 48,male,34.3,3,no
487 | 31,female,31.1,0,no
488 | 54,female,21.5,3,no
489 | 19,male,28.7,0,no
490 | 44,female,38.1,0,yes
491 | 53,male,31.2,1,no
492 | 19,female,32.9,0,no
493 | 61,female,25.1,0,no
494 | 18,female,25.1,0,no
495 | 61,male,43.4,0,no
496 | 21,male,25.7,4,yes
497 | 20,male,27.9,0,no
498 | 31,female,23.6,2,no
499 | 45,male,28.7,2,no
500 | 44,female,24.0,2,no
501 | 62,female,39.2,0,no
502 | 29,male,34.4,0,yes
503 | 43,male,26.0,0,no
504 | 51,male,23.2,1,yes
505 | 19,male,30.3,0,yes
506 | 38,female,28.9,1,no
507 | 37,male,30.9,3,no
508 | 22,male,31.4,1,no
509 | 21,male,23.8,2,no
510 | 24,female,25.3,0,no
511 | 57,female,28.7,0,no
512 | 56,male,32.1,1,no
513 | 27,male,33.7,0,no
514 | 51,male,22.4,0,no
515 | 19,male,30.4,0,no
516 | 39,male,28.3,1,yes
517 | 58,male,35.7,0,no
518 | 20,male,35.3,1,no
519 | 45,male,30.5,2,no
520 | 35,female,31.0,1,no
521 | 31,male,30.9,0,no
522 | 50,female,27.4,0,no
523 | 32,female,44.2,0,no
524 | 51,female,33.9,0,no
525 | 38,female,37.7,0,no
526 | 42,male,26.1,1,yes
527 | 18,female,33.9,0,no
528 | 19,female,30.6,2,no
529 | 51,female,25.8,1,no
530 | 46,male,39.4,1,no
531 | 18,male,25.5,0,no
532 | 57,male,42.1,1,yes
533 | 62,female,31.7,0,no
534 | 59,male,29.7,2,no
535 | 37,male,36.2,0,no
536 | 64,male,40.5,0,no
537 | 38,male,28.0,1,no
538 | 33,female,38.9,3,no
539 | 46,female,30.2,2,no
540 | 46,female,28.1,1,no
541 | 53,male,31.4,0,no
542 | 34,female,38.0,3,no
543 | 20,female,31.8,2,no
544 | 63,female,36.3,0,no
545 | 54,female,47.4,0,yes
546 | 54,male,30.2,0,no
547 | 49,male,25.8,2,yes
548 | 28,male,35.4,0,no
549 | 54,female,46.7,2,no
550 | 25,female,28.6,0,no
551 | 43,female,46.2,0,yes
552 | 63,male,30.8,0,no
553 | 32,female,28.9,0,no
554 | 62,male,21.4,0,no
555 | 52,female,31.7,2,no
556 | 25,female,41.3,0,no
557 | 28,male,23.8,2,no
558 | 46,male,33.4,1,no
559 | 34,male,34.2,0,no
560 | 35,female,34.1,3,yes
561 | 19,male,35.5,0,no
562 | 46,female,20.0,2,no
563 | 54,female,32.7,0,no
564 | 27,male,30.5,0,no
565 | 50,male,44.8,1,no
566 | 18,female,32.1,2,no
567 | 19,female,30.5,0,no
568 | 38,female,40.6,1,no
569 | 41,male,30.6,2,no
570 | 49,female,31.9,5,no
571 | 48,male,40.6,2,yes
572 | 31,female,29.1,0,no
573 | 18,female,37.3,1,no
574 | 30,female,43.1,2,no
575 | 62,female,36.9,1,no
576 | 57,female,34.3,2,no
577 | 58,female,27.2,0,no
578 | 22,male,26.8,0,no
579 | 31,female,38.1,1,yes
580 | 52,male,30.2,1,no
581 | 25,female,23.5,0,no
582 | 59,male,25.5,1,no
583 | 19,male,30.6,0,no
584 | 39,male,45.4,2,no
585 | 32,female,23.7,1,no
586 | 19,male,20.7,0,no
587 | 33,female,28.3,1,no
588 | 21,male,20.2,3,no
589 | 34,female,30.2,1,yes
590 | 61,female,35.9,0,no
591 | 38,female,30.7,1,no
592 | 58,female,29.0,0,no
593 | 47,male,19.6,1,no
594 | 20,male,31.1,2,no
595 | 21,female,21.9,1,yes
596 | 41,male,40.3,0,no
597 | 46,female,33.7,1,no
598 | 42,female,29.5,2,no
599 | 34,female,33.3,1,no
600 | 43,male,32.6,2,no
601 | 52,female,37.5,2,no
602 | 18,female,39.2,0,no
603 | 51,male,31.6,0,no
604 | 56,female,25.3,0,no
605 | 64,female,39.1,3,no
606 | 19,female,28.3,0,yes
607 | 51,female,34.1,0,no
608 | 27,female,25.2,0,no
609 | 59,female,23.7,0,yes
610 | 28,male,27.0,2,no
611 | 30,male,37.8,2,yes
612 | 47,female,29.4,1,no
613 | 38,female,34.8,2,no
614 | 18,female,33.2,0,no
615 | 34,female,19.0,3,no
616 | 20,female,33.0,0,no
617 | 47,female,36.6,1,yes
618 | 56,female,28.6,0,no
619 | 49,male,25.6,2,yes
620 | 19,female,33.1,0,yes
621 | 55,female,37.1,0,no
622 | 30,male,31.4,1,no
623 | 37,male,34.1,4,yes
624 | 49,female,21.3,1,no
625 | 18,male,33.5,0,yes
626 | 59,male,28.8,0,no
627 | 29,female,26.0,0,no
628 | 36,male,28.9,3,no
629 | 33,male,42.5,1,no
630 | 58,male,38.0,0,no
631 | 44,female,39.0,0,yes
632 | 53,male,36.1,1,no
633 | 24,male,29.3,0,no
634 | 29,female,35.5,0,no
635 | 40,male,22.7,2,no
636 | 51,male,39.7,1,no
637 | 64,male,38.2,0,no
638 | 19,female,24.5,1,no
639 | 35,female,38.1,2,no
640 | 39,male,26.4,0,yes
641 | 56,male,33.7,4,no
642 | 33,male,42.4,5,no
643 | 42,male,28.3,3,yes
644 | 61,male,33.9,0,no
645 | 23,female,35.0,3,no
646 | 43,male,35.3,2,no
647 | 48,male,30.8,3,no
648 | 39,male,26.2,1,no
649 | 40,female,23.4,3,no
650 | 18,male,28.5,0,no
651 | 58,female,33.0,0,no
652 | 49,female,42.7,2,no
653 | 53,female,39.6,1,no
654 | 48,female,31.1,0,no
655 | 45,female,36.3,2,no
656 | 59,female,35.2,0,no
657 | 52,female,25.3,2,yes
658 | 26,female,42.4,1,no
659 | 27,male,33.2,2,no
660 | 48,female,35.9,1,no
661 | 57,female,28.8,4,no
662 | 37,male,46.5,3,no
663 | 57,female,24.0,1,no
664 | 32,female,31.5,1,no
665 | 18,male,33.7,0,no
666 | 64,female,23.0,0,yes
667 | 43,male,38.1,2,yes
668 | 49,male,28.7,1,no
669 | 40,female,32.8,2,yes
670 | 62,male,32.0,0,yes
671 | 40,female,29.8,1,no
672 | 30,male,31.6,3,no
673 | 29,female,31.2,0,no
674 | 36,male,29.7,0,no
675 | 41,female,31.0,0,no
676 | 44,female,43.9,2,yes
677 | 45,male,21.4,0,no
678 | 55,female,40.8,3,no
679 | 60,male,31.4,3,yes
680 | 56,male,36.1,3,no
681 | 49,female,23.2,2,no
682 | 21,female,17.4,1,no
683 | 19,male,20.3,0,no
684 | 39,male,35.3,2,yes
685 | 53,male,24.3,0,no
686 | 33,female,18.5,1,no
687 | 53,male,26.4,2,no
688 | 42,male,26.1,2,no
689 | 40,male,41.7,0,no
690 | 47,female,24.1,1,no
691 | 27,male,31.1,1,yes
692 | 21,male,27.4,0,no
693 | 47,male,36.2,1,no
694 | 20,male,32.4,1,no
695 | 24,male,23.7,0,no
696 | 27,female,34.8,1,no
697 | 26,female,40.2,0,no
698 | 53,female,32.3,2,no
699 | 41,male,35.8,1,yes
700 | 56,male,33.7,0,no
701 | 23,female,39.3,2,no
702 | 21,female,34.9,0,no
703 | 50,female,44.7,0,no
704 | 53,male,41.5,0,no
705 | 34,female,26.4,1,no
706 | 47,female,29.5,1,no
707 | 33,female,32.9,2,no
708 | 51,female,38.1,0,yes
709 | 49,male,28.7,3,no
710 | 31,female,30.5,3,no
711 | 36,female,27.7,0,no
712 | 18,male,35.2,1,no
713 | 50,female,23.5,2,no
714 | 43,female,30.7,2,no
715 | 20,male,40.5,0,no
716 | 24,female,22.6,0,no
717 | 60,male,28.9,0,no
718 | 49,female,22.6,1,no
719 | 60,male,24.3,1,no
720 | 51,female,36.7,2,no
721 | 58,female,33.4,0,no
722 | 51,female,40.7,0,no
723 | 53,male,36.6,3,no
724 | 62,male,37.4,0,no
725 | 19,male,35.4,0,no
726 | 50,female,27.1,1,no
727 | 30,female,39.1,3,yes
728 | 41,male,28.4,1,no
729 | 29,female,21.8,1,yes
730 | 18,female,40.3,0,no
731 | 41,female,36.1,1,no
732 | 35,male,24.4,3,yes
733 | 53,male,21.4,1,no
734 | 24,female,30.1,3,no
735 | 48,female,27.3,1,no
736 | 59,female,32.1,3,no
737 | 49,female,34.8,1,no
738 | 37,female,38.4,0,yes
739 | 26,male,23.7,2,no
740 | 23,male,31.7,3,yes
741 | 29,male,35.5,2,yes
742 | 45,male,24.0,2,no
743 | 27,male,29.2,0,yes
744 | 53,male,34.1,0,yes
745 | 31,female,26.6,0,no
746 | 50,male,26.4,0,no
747 | 50,female,30.1,1,no
748 | 34,male,27.0,2,no
749 | 19,male,21.8,0,no
750 | 47,female,36.0,1,no
751 | 28,male,30.9,0,no
752 | 37,female,26.4,0,yes
753 | 21,male,29.0,0,no
754 | 64,male,37.9,0,no
755 | 58,female,22.8,0,no
756 | 24,male,33.6,4,no
757 | 31,male,27.6,2,no
758 | 39,female,22.8,3,no
759 | 47,female,27.8,0,yes
760 | 30,male,37.4,3,no
761 | 18,male,38.2,0,yes
762 | 22,female,34.6,2,no
763 | 23,male,35.2,1,no
764 | 33,male,27.1,1,yes
765 | 27,male,26.0,0,no
766 | 45,female,25.2,2,no
767 | 57,female,31.8,0,no
768 | 47,male,32.3,1,no
769 | 42,female,29.0,1,no
770 | 64,female,39.7,0,no
771 | 38,female,19.5,2,no
772 | 61,male,36.1,3,no
773 | 53,female,26.7,2,no
774 | 44,female,36.5,0,no
775 | 19,female,28.9,0,yes
776 | 41,male,34.2,2,no
777 | 51,male,33.3,3,no
778 | 40,male,32.3,2,no
779 | 45,male,39.8,0,no
780 | 35,male,34.3,3,no
781 | 53,male,28.9,0,no
782 | 30,male,24.4,3,yes
783 | 18,male,41.1,0,no
784 | 51,male,36.0,1,no
785 | 50,female,27.6,1,yes
786 | 31,female,29.3,1,no
787 | 35,female,27.7,3,no
788 | 60,male,37.0,0,no
789 | 21,male,36.9,0,no
790 | 29,male,22.5,3,no
791 | 62,female,29.9,0,no
792 | 39,female,41.8,0,no
793 | 19,male,27.6,0,no
794 | 22,female,23.2,0,no
795 | 53,male,20.9,0,yes
796 | 39,female,31.9,2,no
797 | 27,male,28.5,0,yes
798 | 30,male,44.2,2,no
799 | 30,female,22.9,1,no
800 | 58,female,33.1,0,no
801 | 33,male,24.8,0,yes
802 | 42,female,26.2,1,no
803 | 64,female,36.0,0,no
804 | 21,male,22.3,1,no
805 | 18,female,42.2,0,yes
806 | 23,male,26.5,0,no
807 | 45,female,35.8,0,no
808 | 40,female,41.4,1,no
809 | 19,female,36.6,0,no
810 | 18,male,30.1,0,no
811 | 25,male,25.8,1,no
812 | 46,female,30.8,3,no
813 | 33,female,42.9,3,no
814 | 54,male,21.0,2,no
815 | 28,male,22.5,2,no
816 | 36,male,34.4,2,no
817 | 20,female,31.5,0,no
818 | 24,female,24.2,0,no
819 | 23,male,37.1,3,no
820 | 47,female,26.1,1,yes
821 | 33,female,35.5,0,yes
822 | 45,male,33.7,1,no
823 | 26,male,17.7,0,no
824 | 18,female,31.1,0,no
825 | 44,female,29.8,2,no
826 | 60,male,24.3,0,no
827 | 64,female,31.8,2,no
828 | 56,male,31.8,2,yes
829 | 36,male,28.0,1,yes
830 | 41,male,30.8,3,yes
831 | 39,male,21.9,1,no
832 | 63,male,33.1,0,no
833 | 36,female,25.8,0,no
834 | 28,female,23.8,2,no
835 | 58,male,34.4,0,no
836 | 36,male,33.8,1,no
837 | 42,male,36.0,2,no
838 | 36,male,31.5,0,no
839 | 56,female,28.3,0,no
840 | 35,female,23.5,2,no
841 | 59,female,31.4,0,no
842 | 21,male,31.1,0,no
843 | 59,male,24.7,0,no
844 | 23,female,32.8,2,yes
845 | 57,female,29.8,0,yes
846 | 53,male,30.5,0,no
847 | 60,female,32.5,0,yes
848 | 51,female,34.2,1,no
849 | 23,male,50.4,1,no
850 | 27,female,24.1,0,no
851 | 55,male,32.8,0,no
852 | 37,female,30.8,0,yes
853 | 61,male,32.3,2,no
854 | 46,female,35.5,0,yes
855 | 53,female,23.8,2,no
856 | 49,female,23.8,3,yes
857 | 20,female,29.6,0,no
858 | 48,female,33.1,0,yes
859 | 25,male,24.1,0,yes
860 | 25,female,32.2,1,no
861 | 57,male,28.1,0,no
862 | 37,female,47.6,2,yes
863 | 38,female,28.0,3,no
864 | 55,female,33.5,2,no
865 | 36,female,19.9,0,no
866 | 51,male,25.4,0,no
867 | 40,male,29.9,2,no
868 | 18,male,37.3,0,no
869 | 57,male,43.7,1,no
870 | 61,male,23.7,0,no
871 | 25,female,24.3,3,no
872 | 50,male,36.2,0,no
873 | 26,female,29.5,1,no
874 | 42,male,24.9,0,no
875 | 43,male,30.1,1,no
876 | 44,male,21.9,3,no
877 | 23,female,28.1,0,no
878 | 49,female,27.1,1,no
879 | 33,male,33.4,5,no
880 | 41,male,28.8,1,no
881 | 37,female,29.5,2,no
882 | 22,male,34.8,3,no
883 | 23,male,27.4,1,no
884 | 21,female,22.1,0,no
885 | 51,female,37.1,3,yes
886 | 25,male,26.7,4,no
887 | 32,male,28.9,1,yes
888 | 57,male,29.0,0,yes
889 | 36,female,30.0,0,no
890 | 22,male,39.5,0,no
891 | 57,male,33.6,1,no
892 | 64,female,26.9,0,yes
893 | 36,female,29.0,4,no
894 | 54,male,24.0,0,no
895 | 47,male,38.9,2,yes
896 | 62,male,32.1,0,no
897 | 61,female,44.0,0,no
898 | 43,female,20.0,2,yes
899 | 19,male,25.6,1,no
900 | 18,female,40.3,0,no
901 | 19,female,22.5,0,no
902 | 49,male,22.5,0,no
903 | 60,male,40.9,0,yes
904 | 26,male,27.3,3,no
905 | 49,male,36.9,0,no
906 | 60,female,35.1,0,no
907 | 26,female,29.4,2,no
908 | 27,male,32.6,3,no
909 | 44,female,32.3,1,no
910 | 63,male,39.8,3,no
911 | 32,female,24.6,0,yes
912 | 22,male,28.3,1,no
913 | 18,male,31.7,0,yes
914 | 59,female,26.7,3,no
915 | 44,female,27.5,1,no
916 | 33,male,24.6,2,no
917 | 24,female,34.0,0,no
918 | 43,female,26.9,0,yes
919 | 45,male,22.9,0,yes
920 | 61,female,28.2,0,no
921 | 35,female,34.2,1,no
922 | 62,female,25.0,0,no
923 | 62,female,33.2,0,no
924 | 38,male,31.0,1,no
925 | 34,male,35.8,0,no
926 | 43,male,23.2,0,no
927 | 50,male,32.1,2,no
928 | 19,female,23.4,2,no
929 | 57,female,20.1,1,no
930 | 62,female,39.2,0,no
931 | 41,male,34.2,1,no
932 | 26,male,46.5,1,no
933 | 39,female,32.5,1,no
934 | 46,male,25.8,5,no
935 | 45,female,35.3,0,no
936 | 32,male,37.2,2,no
937 | 59,female,27.5,0,no
938 | 44,male,29.7,2,no
939 | 39,female,24.2,5,no
940 | 18,male,26.2,2,no
941 | 53,male,29.5,0,no
942 | 18,male,23.2,0,no
943 | 50,female,46.1,1,no
944 | 18,female,40.2,0,no
945 | 19,male,22.6,0,no
946 | 62,male,39.9,0,no
947 | 56,female,35.8,1,no
948 | 42,male,35.8,2,no
949 | 37,male,34.2,1,yes
950 | 42,male,31.3,0,no
951 | 25,male,29.7,3,yes
952 | 57,male,18.3,0,no
953 | 51,male,42.9,2,yes
954 | 30,female,28.4,1,no
955 | 44,male,30.2,2,yes
956 | 34,male,27.8,1,yes
957 | 31,male,39.5,1,no
958 | 54,male,30.8,1,yes
959 | 24,male,26.8,1,no
960 | 43,male,35.0,1,yes
961 | 48,male,36.7,1,no
962 | 19,female,39.6,1,no
963 | 29,female,25.9,0,no
964 | 63,female,35.2,1,no
965 | 46,male,24.8,3,no
966 | 52,male,36.8,2,no
967 | 35,male,27.1,1,no
968 | 51,male,24.8,2,yes
969 | 44,male,25.4,1,no
970 | 21,male,25.7,2,no
971 | 39,female,34.3,5,no
972 | 50,female,28.2,3,no
973 | 34,female,23.6,0,no
974 | 22,female,20.2,0,no
975 | 19,female,40.5,0,no
976 | 26,male,35.4,0,no
977 | 29,male,22.9,0,yes
978 | 48,male,40.2,0,no
979 | 26,male,29.2,1,no
980 | 45,female,40.0,3,no
981 | 36,female,29.9,0,no
982 | 54,male,25.5,1,no
983 | 34,male,21.4,0,no
984 | 31,male,25.9,3,yes
985 | 27,female,30.6,1,no
986 | 20,male,30.1,5,no
987 | 44,female,25.8,1,no
988 | 43,male,30.1,3,no
989 | 45,female,27.6,1,no
990 | 34,male,34.7,0,no
991 | 24,female,20.5,0,yes
992 | 26,female,19.8,1,no
993 | 38,female,27.8,2,no
994 | 50,female,31.6,2,no
995 | 38,male,28.3,1,no
996 | 27,female,20.0,3,yes
997 | 39,female,23.3,3,no
998 | 39,female,34.1,3,no
999 | 63,female,36.9,0,no
1000 | 33,female,36.3,3,no
1001 | 36,female,26.9,0,no
1002 | 30,male,23.0,2,yes
1003 | 24,male,32.7,0,yes
1004 | 24,male,25.8,0,no
1005 | 48,male,29.6,0,no
1006 | 47,male,19.2,1,no
1007 | 29,male,31.7,2,no
1008 | 28,male,29.3,2,no
1009 | 47,male,28.2,3,yes
1010 | 25,male,25.0,2,no
1011 | 51,male,27.7,1,no
1012 | 48,female,22.8,0,no
1013 | 43,male,20.1,2,yes
1014 | 61,female,33.3,4,no
1015 | 48,male,32.3,1,no
1016 | 38,female,27.6,0,no
1017 | 59,male,25.5,0,no
1018 | 19,female,24.6,1,no
1019 | 26,female,34.2,2,no
1020 | 54,female,35.8,3,no
1021 | 21,female,32.7,2,no
1022 | 51,male,37.0,0,no
1023 | 22,female,31.0,3,yes
1024 | 47,male,36.1,1,yes
1025 | 18,male,23.3,1,no
1026 | 47,female,45.3,1,no
1027 | 21,female,34.6,0,no
1028 | 19,male,26.0,1,yes
1029 | 23,male,18.7,0,no
1030 | 54,male,31.6,0,no
1031 | 37,female,17.3,2,no
1032 | 46,female,23.7,1,yes
1033 | 55,female,35.2,0,yes
1034 | 30,female,27.9,0,no
1035 | 18,male,21.6,0,yes
1036 | 61,male,38.4,0,no
1037 | 54,female,23.0,3,no
1038 | 22,male,37.1,2,yes
1039 | 45,female,30.5,1,yes
1040 | 22,male,28.9,0,no
1041 | 19,male,27.3,2,no
1042 | 35,female,28.0,0,yes
1043 | 18,male,23.1,0,no
1044 | 20,male,30.7,0,yes
1045 | 28,female,25.8,0,no
1046 | 55,male,35.2,1,no
1047 | 43,female,24.7,2,yes
1048 | 43,female,25.1,0,no
1049 | 22,male,52.6,1,yes
1050 | 25,female,22.5,1,no
1051 | 49,male,30.9,0,yes
1052 | 44,female,37.0,1,no
1053 | 64,male,26.4,0,no
1054 | 49,male,29.8,1,no
1055 | 47,male,29.8,3,yes
1056 | 27,female,21.5,0,no
1057 | 55,male,27.6,0,no
1058 | 48,female,28.9,0,no
1059 | 45,female,31.8,0,no
1060 | 24,female,39.5,0,no
1061 | 32,male,33.8,1,no
1062 | 24,male,32.0,0,no
1063 | 57,male,27.9,1,no
1064 | 59,male,41.1,1,yes
1065 | 36,male,28.6,3,no
1066 | 29,female,25.6,4,no
1067 | 42,female,25.3,1,no
1068 | 48,male,37.3,2,no
1069 | 39,male,42.7,0,no
1070 | 63,male,21.7,1,no
1071 | 54,female,31.9,1,no
1072 | 37,male,37.1,1,yes
1073 | 63,male,31.4,0,no
1074 | 21,male,31.3,0,no
1075 | 54,female,28.9,2,no
1076 | 60,female,18.3,0,no
1077 | 32,female,29.6,1,no
1078 | 47,female,32.0,1,no
1079 | 21,male,26.0,0,no
1080 | 28,male,31.7,0,yes
1081 | 63,male,33.7,3,no
1082 | 18,male,21.8,2,no
1083 | 32,male,27.8,1,no
1084 | 38,male,20.0,1,no
1085 | 32,male,31.5,1,no
1086 | 62,female,30.5,2,no
1087 | 39,female,18.3,5,yes
1088 | 55,male,29.0,0,no
1089 | 57,male,31.5,0,no
1090 | 52,male,47.7,1,no
1091 | 56,male,22.1,0,no
1092 | 47,male,36.2,0,yes
1093 | 55,female,29.8,0,no
1094 | 23,male,32.7,3,no
1095 | 22,female,30.4,0,yes
1096 | 50,female,33.7,4,no
1097 | 18,female,31.4,4,no
1098 | 51,female,35.0,2,yes
1099 | 22,male,33.8,0,no
1100 | 52,female,30.9,0,no
1101 | 25,female,34.0,1,no
1102 | 33,female,19.1,2,yes
1103 | 53,male,28.6,3,no
1104 | 29,male,38.9,1,no
1105 | 58,male,36.1,0,no
1106 | 37,male,29.8,0,no
1107 | 54,female,31.2,0,no
1108 | 49,female,29.9,0,no
1109 | 50,female,26.2,2,no
1110 | 26,male,30.0,1,no
1111 | 45,male,20.4,3,no
1112 | 54,female,32.3,1,no
1113 | 38,male,38.4,3,yes
1114 | 48,female,25.9,3,yes
1115 | 28,female,26.3,3,no
1116 | 23,male,24.5,0,no
1117 | 55,male,32.7,1,no
1118 | 41,male,29.6,5,no
1119 | 25,male,33.3,2,yes
1120 | 33,male,35.8,1,yes
1121 | 30,female,20.0,3,no
1122 | 23,female,31.4,0,yes
1123 | 46,male,38.2,2,no
1124 | 53,female,36.9,3,yes
1125 | 27,female,32.4,1,no
1126 | 23,female,42.8,1,yes
1127 | 63,female,25.1,0,no
1128 | 55,male,29.9,0,no
1129 | 35,female,35.9,2,no
1130 | 34,male,32.8,1,no
1131 | 19,female,18.6,0,no
1132 | 39,female,23.9,5,no
1133 | 27,male,45.9,2,no
1134 | 57,male,40.3,0,no
1135 | 52,female,18.3,0,no
1136 | 28,male,33.8,0,no
1137 | 50,female,28.1,3,no
1138 | 44,female,25.0,1,no
1139 | 26,female,22.2,0,no
1140 | 33,male,30.3,0,no
1141 | 19,female,32.5,0,yes
1142 | 50,male,37.1,1,no
1143 | 41,female,32.6,3,no
1144 | 52,female,24.9,0,no
1145 | 39,male,32.3,2,no
1146 | 50,male,32.3,2,no
1147 | 52,male,32.8,3,no
1148 | 60,male,32.8,0,yes
1149 | 20,female,31.9,0,no
1150 | 55,male,21.5,1,no
1151 | 42,male,34.1,0,no
1152 | 18,female,30.3,0,no
1153 | 58,female,36.5,0,no
1154 | 43,female,32.6,3,yes
1155 | 35,female,35.8,1,no
1156 | 48,female,27.9,4,no
1157 | 36,female,22.1,3,no
1158 | 19,male,44.9,0,yes
1159 | 23,female,23.2,2,no
1160 | 20,female,30.6,0,no
1161 | 32,female,41.1,0,no
1162 | 43,female,34.6,1,no
1163 | 34,male,42.1,2,no
1164 | 30,male,38.8,1,no
1165 | 18,female,28.2,0,no
1166 | 41,female,28.3,1,no
1167 | 35,female,26.1,0,no
1168 | 57,male,40.4,0,no
1169 | 29,female,24.6,2,no
1170 | 32,male,35.2,2,no
1171 | 37,female,34.1,1,no
1172 | 18,male,27.4,1,yes
1173 | 43,female,26.7,2,yes
1174 | 56,female,41.9,0,no
1175 | 38,male,29.3,2,no
1176 | 29,male,32.1,2,no
1177 | 22,female,27.1,0,no
1178 | 52,female,24.1,1,yes
1179 | 40,female,27.4,1,no
1180 | 23,female,34.9,0,no
1181 | 31,male,29.8,0,yes
1182 | 42,female,41.3,1,no
1183 | 24,female,29.9,0,no
1184 | 25,female,30.3,0,no
1185 | 48,female,27.4,1,no
1186 | 23,female,28.5,1,yes
1187 | 45,male,23.6,2,no
1188 | 20,male,35.6,3,yes
1189 | 62,female,32.7,0,no
1190 | 43,female,25.3,1,yes
1191 | 23,female,28.0,0,no
1192 | 31,female,32.8,2,no
1193 | 41,female,21.8,1,no
1194 | 58,female,32.4,1,no
1195 | 48,female,36.6,0,no
1196 | 31,female,21.8,0,no
1197 | 19,female,27.9,3,no
1198 | 19,female,30.0,0,yes
1199 | 41,male,33.6,0,no
1200 | 40,male,29.4,1,no
1201 | 31,female,25.8,2,no
1202 | 37,male,24.3,2,no
1203 | 46,male,40.4,2,no
1204 | 22,male,32.1,0,no
1205 | 51,male,32.3,1,no
1206 | 18,female,27.3,3,yes
1207 | 35,male,17.9,1,no
1208 | 59,female,34.8,2,no
1209 | 36,male,33.4,2,yes
1210 | 37,female,25.6,1,yes
1211 | 59,male,37.1,1,no
1212 | 36,male,30.9,1,no
1213 | 39,male,34.1,2,no
1214 | 18,male,21.5,0,no
1215 | 52,female,33.3,2,no
1216 | 27,female,31.3,1,no
1217 | 18,male,39.1,0,no
1218 | 40,male,25.1,0,no
1219 | 29,male,37.3,2,no
1220 | 46,female,34.6,1,yes
1221 | 38,female,30.2,3,no
1222 | 30,female,21.9,1,no
1223 | 40,male,25.0,2,no
1224 | 50,male,25.3,0,no
1225 | 20,female,24.4,0,yes
1226 | 41,male,23.9,1,no
1227 | 33,female,39.8,1,no
1228 | 38,male,16.8,2,no
1229 | 42,male,37.2,2,no
1230 | 56,male,34.4,0,no
1231 | 58,male,30.3,0,no
1232 | 52,male,34.5,3,yes
1233 | 20,female,21.8,0,yes
1234 | 54,female,24.6,3,no
1235 | 58,male,23.3,0,no
1236 | 45,female,27.8,2,no
1237 | 26,male,31.1,0,no
1238 | 63,female,21.7,0,no
1239 | 58,female,28.2,0,no
1240 | 37,male,22.7,3,no
1241 | 25,female,42.1,1,no
1242 | 52,male,41.8,2,yes
1243 | 64,male,37.0,2,yes
1244 | 22,female,21.3,3,no
1245 | 28,female,33.1,0,no
1246 | 18,male,33.3,0,no
1247 | 28,male,24.3,5,no
1248 | 45,female,25.7,3,no
1249 | 33,male,29.4,4,no
1250 | 18,female,39.8,0,no
1251 | 32,male,33.6,1,yes
1252 | 24,male,29.8,0,yes
1253 | 19,male,19.8,0,no
1254 | 20,male,27.3,0,yes
1255 | 40,female,29.3,4,no
1256 | 34,female,27.7,0,no
1257 | 42,female,37.9,0,no
1258 | 51,female,36.4,3,no
1259 | 54,female,27.6,1,no
1260 | 55,male,37.7,3,no
1261 | 52,female,23.2,0,no
1262 | 32,female,20.5,0,no
1263 | 28,male,37.1,1,no
1264 | 41,female,28.1,1,no
1265 | 43,female,29.9,1,no
1266 | 49,female,33.3,2,no
1267 | 64,male,23.8,0,yes
1268 | 55,female,30.5,0,no
1269 | 24,male,31.1,0,yes
1270 | 20,female,33.3,0,no
1271 | 45,male,27.5,3,no
1272 | 26,male,33.9,1,no
1273 | 25,female,34.5,0,no
1274 | 43,male,25.5,5,no
1275 | 35,male,27.6,1,no
1276 | 26,male,27.1,0,yes
1277 | 57,male,23.7,0,no
1278 | 22,female,30.4,0,no
1279 | 32,female,29.7,0,no
1280 | 39,male,29.9,1,yes
1281 | 25,female,26.8,2,no
1282 | 48,female,33.3,0,no
1283 | 47,female,27.6,2,yes
1284 | 18,female,21.7,0,yes
1285 | 18,male,30.0,1,no
1286 | 61,male,36.3,1,yes
1287 | 47,female,24.3,0,no
1288 | 28,female,17.3,0,no
1289 | 36,female,25.9,1,no
1290 | 20,male,39.4,2,yes
1291 | 44,male,34.3,1,no
1292 | 38,female,20.0,2,no
1293 | 19,male,34.9,0,yes
1294 | 21,male,23.2,0,no
1295 | 46,male,25.7,3,no
1296 | 58,male,25.2,0,no
1297 | 20,male,22.0,1,no
1298 | 18,male,26.1,0,no
1299 | 28,female,26.5,2,no
1300 | 33,male,27.5,2,no
1301 | 19,female,25.7,1,no
1302 | 45,male,30.4,0,yes
1303 | 62,male,30.9,3,yes
1304 | 25,female,20.8,1,no
1305 | 43,male,27.8,0,yes
1306 | 42,male,24.6,2,yes
1307 | 24,female,27.7,0,no
1308 | 29,female,21.9,0,yes
1309 | 32,male,28.1,4,yes
1310 | 25,female,30.2,0,yes
1311 | 41,male,32.2,2,no
1312 | 42,male,26.3,1,no
1313 | 33,female,26.7,0,no
1314 | 34,male,42.9,1,no
1315 | 19,female,34.7,2,yes
1316 | 30,female,23.7,3,yes
1317 | 18,male,28.3,1,no
1318 | 19,female,20.6,0,no
1319 | 18,male,53.1,0,no
1320 | 35,male,39.7,4,no
1321 | 39,female,26.3,2,no
1322 | 31,male,31.1,3,no
1323 | 62,male,26.7,0,yes
1324 | 62,male,38.8,0,no
1325 | 42,female,40.4,2,yes
1326 | 31,male,25.9,1,no
1327 | 61,male,33.5,0,no
1328 | 42,female,32.9,0,no
1329 | 51,male,30.0,1,no
1330 | 23,female,24.2,2,no
1331 | 52,male,38.6,2,no
1332 | 57,female,25.7,2,no
1333 | 23,female,33.4,0,no
1334 | 52,female,44.7,3,no
1335 | 50,male,31.0,3,no
1336 | 18,female,31.9,0,no
1337 | 18,female,36.9,0,no
1338 | 21,female,25.8,0,no
1339 | 61,female,29.1,0,yes
1340 |
--------------------------------------------------------------------------------
/config/params.yaml:
--------------------------------------------------------------------------------
1 | base:
2 | project_id: 5
3 | project: insurance-premium
4 | random_state: 42
5 | test_size: 0.2
6 |
7 |
8 | config:
9 | params: config/params.yaml
10 | schema_prediction: config/schema_prediction.json
11 | schema_training: config/schema_training.json
12 |
13 | cloud_provider:
14 | name: microsoft
15 |
16 | data_download:
17 | cloud_training_directory_path: data/training_batch_files/
18 | cloud_prediction_directory_path: data/Prediction_Batch_files
19 |
20 | artifacts:
21 | training_data:
22 | good_file_path: artifacts/training_data/Training_Raw_files_validated/Good_raw
23 | bad_file_path: artifacts/training_data/Training_Raw_files_validated/Bad_raw
24 | archive_bad_file_path: artifacts/training_data/TrainingArchiveBadData
25 | training_file_from_db: artifacts/training_data/Training_FileFromDB
26 | master_csv: master.csv
27 | null_value_info_file_path: artifacts/training_data/preprocessing/null_value
28 | plots: artifacts/training_data/plots
29 | pipeline_path: artifacts/pipeline/pipeline_model
30 |
31 | model:
32 | model_path: artifacts/model
33 |
34 |
35 | prediction_data:
36 | good_file_path: artifacts/prediction_data/Prediction_Raw_files_validated/Good_raw
37 | bad_file_path: artifacts/prediction_data/Prediction_Raw_files_validated/Bad_raw
38 | archive_bad_file_path: artifacts/prediction_data/PredictionArchiveBadData
39 | prediction_file_from_db: artifacts/prediction_data/Prediction_FileFromDB
40 | master_csv: master.csv
41 | prediction_output_file_path: artifacts/Prediction_Output_DIR/
42 | prediction_file_name: Predictions.csv
43 |
44 | data_source:
45 | Training_Batch_Files: Training_Batch_Files
46 | Prediction_Batch_Files: Prediction_Batch_Files
47 |
48 | kafka:
49 | topic_name: insurance-premium
50 | kafka_bootstrap_server: localhost:9092
51 |
52 |
53 | log_database:
54 | training_database_name: insurance_prediction_training
55 | prediction_database_name: insurance_prediction_prediction
56 |
57 | dataset:
58 | unwanted_column:
59 | - region
60 |
61 |
62 | database_detail:
63 | training_database_name: insurance_prediction_training
64 | prediction_database_name: insurance_prediction_prediction
65 | dataset_training_collection_name: insurance_prediction_training_dataset
66 | dataset_prediction_collection_name: insurance_prediction_prediction_dataset
67 |
68 | target_columns:
69 | columns:
70 | - expenses
71 |
--------------------------------------------------------------------------------
/config/schema_prediction.json:
--------------------------------------------------------------------------------
1 | {
2 | "SampleFileName": "HealthPrem_26092020_131534.csv",
3 | "LengthOfDateStampInFile": 8,
4 | "LengthOfTimeStampInFile": 6,
5 | "NumberofColumns": 6,
6 | "ColName": {
7 | "age": "Integer",
8 | "sex": "varchar",
9 | "bmi": "float",
10 | "children": "Integer",
11 | "smoker": "varchar"
12 | }
13 | }
--------------------------------------------------------------------------------
/config/schema_training.json:
--------------------------------------------------------------------------------
1 | { "SampleFileName": "HealthPrem_26092020_131534.csv",
2 | "LengthOfDateStampInFile": 8,
3 | "LengthOfTimeStampInFile": 6,
4 | "NumberofColumns" : 7,
5 | "ColName": {
6 | "age" : "Integer",
7 | "sex" : "varchar",
8 | "bmi" : "float",
9 | "children" : "Integer",
10 | "smoker" : "varchar",
11 | "region": "varchar",
12 | "expenses" : "float"
13 | }
14 | }
15 |
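
The validators unpack these fields through values_from_schema_function in utility.py, which is not shown in this excerpt. An illustrative reader, returning the tuple in the order the validation stages expect, might look like this:

    import json

    def read_schema(schema_path: str = "config/schema_training.json"):
        """Illustrative schema reader; the repo's own helper lives in utility.py."""
        with open(schema_path) as schema_file:
            schema = json.load(schema_file)
        sample_file_name = schema["SampleFileName"]               # e.g. HealthPrem_26092020_131534.csv
        length_of_date_stamp = schema["LengthOfDateStampInFile"]  # 8 -> DDMMYYYY
        length_of_time_stamp = schema["LengthOfTimeStampInFile"]  # 6 -> HHMMSS
        column_names = schema["ColName"]
        number_of_columns = schema["NumberofColumns"]
        return sample_file_name, length_of_date_stamp, length_of_time_stamp, column_names, number_of_columns

    print(read_schema())
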
--------------------------------------------------------------------------------
/csv_to_kafka.py:
--------------------------------------------------------------------------------
1 | from streaming.producer.kafka_csv_data_producer import KafkaCSVDataProducer
2 | from streaming.spark_manager.spark_manager import SparkManager
3 |
4 | if __name__ == "__main__":
5 | try:
6 | path = "prediction_files"
7 | spark_session = SparkManager().get_spark_session_object()
8 | kfk_csv_data_producer = KafkaCSVDataProducer(
9 | spark_session=spark_session,
10 |
11 | )
12 | kfk_csv_data_producer.send_csv_data_to_kafka_topic(directory_path=path)
13 | except Exception as e:
14 | print(e)
15 |
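
This script covers only the producer side; spark_consumer_from_kafka.py is not reproduced in this excerpt. A generic Spark Structured Streaming read from the topic configured in params.yaml could look like the sketch below; it assumes the spark-sql-kafka connector package is on the classpath and a broker at localhost:9092, and is not the repo's own consumer:

    from pyspark.sql import SparkSession

    # Hypothetical consumer sketch, not the repo's spark_consumer_from_kafka.py.
    spark = SparkSession.builder.appName("insurance-premium-consumer").getOrCreate()

    stream_df = (spark.readStream
                 .format("kafka")
                 .option("kafka.bootstrap.servers", "localhost:9092")  # kafka.kafka_bootstrap_server
                 .option("subscribe", "insurance-premium")             # kafka.topic_name
                 .load())

    # Kafka delivers bytes; cast the payload to a string so each CSV row is readable.
    query = (stream_df.selectExpr("CAST(value AS STRING) AS csv_row")
             .writeStream.format("console").start())
    query.awaitTermination()
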
--------------------------------------------------------------------------------
/data/training_batch_files/.gitignore:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/diagram/Drawing1.vsdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/diagram/Drawing1.vsdx
--------------------------------------------------------------------------------
/diagram/streaming.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/diagram/streaming.jpg
--------------------------------------------------------------------------------
/diagram/training and prediction.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/diagram/training and prediction.pdf
--------------------------------------------------------------------------------
/entry_point.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | from utility import get_logger_object_of_training, get_logger_object_of_prediction
5 | from training.stage_00_data_loader import loader_main
6 | from training.stage_01_data_validator import validation_main
7 | from training.stage_02_data_transformer import transform_main
8 | from training.stage_03_data_exporter import export_main
9 | from training.stage_04_model_trainer import train_main
10 |
11 | from prediction.stage_00_data_loader import loader_main as pred_loader_main
12 | from prediction.stage_01_data_validator import validation_main as pred_validation_main
13 | from prediction.stage_02_data_transformer import transform_main as pred_transform_main
14 | from prediction.stage_03_data_exporter import export_main as pred_export_main
15 | from prediction.stage_04_model_predictor import predict_main
16 | from insurance_exception.insurance_exception import InsuranceException as GenericException
17 |
18 | collection_name = "main_pipeline"
19 |
20 |
21 | def begin_training(execution_id, executed_by):
22 | try:
23 | args = dict()
24 | args['config'] = os.path.join("config", "params.yaml")
25 | logger = get_logger_object_of_training(config_path=args['config'],
26 | collection_name=collection_name,
27 | execution_id=execution_id,
28 | executed_by=executed_by
29 | )
30 |
31 | args['datasource'] = None
32 | parsed_args = args
33 | logger.log(f"dictionary created.{args}")
34 | logger.log(f"{parsed_args}")
35 | logger.log("Data loading began..")
36 |
37 | loader_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'], execution_id=execution_id,
38 | executed_by=executed_by)
39 | logger.log("Data loading completed..")
40 | logger.log("Data validation began..")
41 |
42 | validation_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'],
43 | execution_id=execution_id,
44 | executed_by=executed_by)
45 | logger.log("Data validation completed..")
46 | logger.log("Data transformation began..")
47 |
48 | transform_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'],
49 | execution_id=execution_id,
50 | executed_by=executed_by)
51 | logger.log("Data transformation completed..")
52 | logger.log("Export operation began..")
53 |
54 | export_main(config_path=parsed_args['config'], execution_id=execution_id,
55 | executed_by=executed_by)
56 | logger.log("Export operation completed..")
57 | logger.log("Training began..")
58 |
59 | train_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'], execution_id=execution_id,
60 | executed_by=executed_by)
61 | logger.log(f"Training completed")
62 | return {'status': True, 'message': 'Training completed successfully'}
63 | except Exception as e:
64 | generic_exception = GenericException(
65 | "Error occurred in module [{0}] method [{1}]"
66 | .format(begin_training.__module__,
67 | begin_training.__name__))
68 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
69 |
70 |
71 | def begin_prediction(execution_id, executed_by):
72 | try:
73 | args = dict()
74 | args['config'] = os.path.join("config", "params.yaml")
75 | logger = get_logger_object_of_prediction(config_path=args['config'],
76 | collection_name=collection_name,
77 | execution_id=execution_id,
78 | executed_by=executed_by
79 | )
80 | args['datasource'] = None
81 | parsed_args = args
82 | logger.log(f"dictionary created.{args}")
83 |
84 | logger.log(f"{parsed_args}")
85 | logger.log("Data loading began..")
86 |
87 | pred_loader_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'],
88 | execution_id=execution_id,
89 | executed_by=executed_by
90 | )
91 | logger.log("Data loading completed..")
92 | logger.log("Data validation began..")
93 |
94 | pred_validation_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'],
95 | execution_id=execution_id,
96 | executed_by=executed_by
97 | )
98 | logger.log("Data validation completed..")
99 | logger.log("Data transformation began..")
100 |
101 | pred_transform_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'],
102 | execution_id=execution_id,
103 | executed_by=executed_by
104 | )
105 | logger.log("Data transformation completed..")
106 | logger.log("Export operation began..")
107 |
108 | pred_export_main(config_path=parsed_args['config'])
109 | logger.log("Export operation completed..")
110 | logger.log("Prediction began..")
111 |
112 | predict_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'],
113 | execution_id=execution_id,
114 | executed_by=executed_by
115 | )
116 | logger.log("Prediction completed")
117 | return {'status': True, 'message': 'Prediction completed successfully'}
118 | except Exception as e:
119 | generic_exception = GenericException(
120 | "Error occurred in module [{0}] method [{1}]"
121 | .format(begin_prediction.__module__,
122 | begin_prediction.__name__))
123 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
124 |
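
entry_point.py only defines begin_training and begin_prediction; there is no __main__ block in this file. A minimal way to drive both pipelines locally, assuming the configuration, MongoDB connection and batch files above are in place (the execution id and user name here are invented for illustration):

    import uuid
    from entry_point import begin_training, begin_prediction

    if __name__ == "__main__":
        print(begin_training(execution_id=str(uuid.uuid4()), executed_by="local-user"))
        print(begin_prediction(execution_id=str(uuid.uuid4()), executed_by="local-user"))
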
--------------------------------------------------------------------------------
/insurance_exception/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/insurance_exception/__init__.py
--------------------------------------------------------------------------------
/insurance_exception/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/insurance_exception/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/insurance_exception/__pycache__/insurance_exception.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/insurance_exception/__pycache__/insurance_exception.cpython-37.pyc
--------------------------------------------------------------------------------
/insurance_exception/insurance_exception.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
4 | class InsuranceException(Exception):
5 |
6 | def __init__(self, error_message):
7 | """
8 |
9 | :param error_message: error message in string format
10 | """
11 | self.error_message = error_message
12 |
13 | def __repr__(self):
14 | return InsuranceException.__name__.__str__()
15 |
16 | def error_message_detail(self, error, error_detail):
17 | exc_type, exc_obj, exc_tb = error_detail.exc_info()
18 | file_name = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
19 | error_message = "python script name [{0}] line number [{1}] error message [{2}]".format(file_name,
20 | exc_tb.tb_lineno,
21 | str(error))
22 | self.error_message = self.error_message + " " + error_message
23 | return self.error_message
24 |
25 | def __str__(self):
26 | return self.error_message
27 |
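
Every module in this project wraps its errors with this class in the same way: build an InsuranceException carrying the module and method name, then re-raise a plain Exception with error_message_detail. A standalone illustration of that convention (the divide function below is invented for the example):

    import sys
    from insurance_exception.insurance_exception import InsuranceException

    def divide(numerator: float, denominator: float) -> float:
        try:
            return numerator / denominator
        except Exception as e:
            generic_exception = InsuranceException(
                "Error occurred in module [{0}] method [{1}]".format(divide.__module__, divide.__name__))
            # error_message_detail appends the failing script name, line number and original error text
            raise Exception(generic_exception.error_message_detail(str(e), sys)) from e

    if __name__ == "__main__":
        try:
            divide(1, 0)
        except Exception as err:
            print(err)
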
--------------------------------------------------------------------------------
/insurance_prediction.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 2.1
2 | Name: insurance-prediction
3 | Version: 0.0.3
4 | Summary: insurance-prediction
5 | Home-page: UNKNOWN
6 | Author: Avnish yadav
7 | License: MIT
8 | Platform: UNKNOWN
9 |
10 | UNKNOWN
11 |
12 |
--------------------------------------------------------------------------------
/insurance_prediction.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | setup.py
2 | azure_blob_storage/__init__.py
3 | azure_blob_storage/azure_blob_storage.py
4 | insurance_exception/__init__.py
5 | insurance_exception/insurance_exception.py
6 | insurance_prediction.egg-info/PKG-INFO
7 | insurance_prediction.egg-info/SOURCES.txt
8 | insurance_prediction.egg-info/dependency_links.txt
9 | insurance_prediction.egg-info/top_level.txt
10 | logger/__init__.py
11 | logger/logger.py
12 | mongo_db/__init__.py
13 | mongo_db/mongo_db_atlas.py
14 | prediction/__init__.py
15 | prediction/stage_00_data_loader.py
16 | prediction/stage_01_data_validator.py
17 | prediction/stage_02_data_transformer.py
18 | prediction/stage_03_data_exporter.py
19 | prediction/stage_04_model_predictor.py
20 | streaming/__init__.py
21 | streaming/consumer/__init__.py
22 | streaming/consumer/kafka_to_spark_csv_consumer.py
23 | streaming/producer/__init__.py
24 | streaming/producer/kafka_csv_data_producer.py
25 | streaming/spark_manager/__init__.py
26 | streaming/spark_manager/spark_manager.py
27 | streaming/transformer/__init__.py
28 | streaming/transformer/spark_transformer.py
29 | training/__init__.py
30 | training/stage_00_data_loader.py
31 | training/stage_01_data_validator.py
32 | training/stage_02_data_transformer.py
33 | training/stage_03_data_exporter.py
34 | training/stage_04_model_trainer.py
--------------------------------------------------------------------------------
/insurance_prediction.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/insurance_prediction.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | azure_blob_storage
2 | insurance_exception
3 | logger
4 | mongo_db
5 | prediction
6 | streaming
7 | training
8 |
--------------------------------------------------------------------------------
/logger/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/logger/__init__.py
--------------------------------------------------------------------------------
/logger/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/logger/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/logger/__pycache__/logger.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/logger/__pycache__/logger.cpython-37.pyc
--------------------------------------------------------------------------------
/logger/logger.py:
--------------------------------------------------------------------------------
1 | import os
2 | from datetime import datetime
3 | import uuid
4 | import sys
5 | from insurance_exception.insurance_exception import InsuranceException as AppLoggerException
6 | from mongo_db.mongo_db_atlas import MongoDBOperation
7 |
8 |
9 | class AppLogger:
10 | def __init__(self, project_id, log_database, log_collection_name, executed_by,
11 | execution_id, is_log_enable=True):
12 | try:
13 |
14 | self.log_database = log_database
15 | self.log_collection_name = log_collection_name
16 | self.executed_by = executed_by
17 | self.execution_id = execution_id
18 | self.mongo_db_object = MongoDBOperation()
19 | self.project_id = project_id
20 | self.is_log_enable = is_log_enable
21 | except Exception as e:
22 | app_logger_exception = AppLoggerException(
23 | "Error occurred in module [{0}] class [{1}] method [{2}]"
24 | .format(AppLogger.__module__.__str__(), AppLogger.__name__,
25 | "__init__"))
26 | raise Exception(app_logger_exception.error_message_detail(str(e), sys)) from e
27 |
28 | def log(self, log_message):
29 | try:
30 | if not self.is_log_enable:
31 | return 0
32 | log_data = {
33 | 'execution_id': self.execution_id,
34 | 'message': log_message,
35 | 'executed_by': self.executed_by,
36 | 'project_id': self.project_id,
37 | 'updated_date_and_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
38 | }
39 |
40 | self.mongo_db_object.insert_record_in_collection(
41 | self.log_database, self.log_collection_name, log_data)
42 | except Exception as e:
43 | app_logger_exception = AppLoggerException(
44 | "Error occurred in module [{0}] class [{1}] method [{2}]"
45 | .format(AppLogger.__module__.__str__(), AppLogger.__name__,
46 | self.log.__name__))
47 | raise Exception(app_logger_exception.error_message_detail(str(e), sys)) from e
48 |
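
AppLogger writes every log call as a document into MongoDB rather than to a file. A usage sketch, assuming the Atlas connection configured in mongo_db/mongo_db_atlas.py is reachable; the database and collection names follow config/params.yaml, and the user name is illustrative:

    import uuid
    from logger.logger import AppLogger

    logger = AppLogger(project_id=5,
                       log_database="insurance_prediction_training",
                       log_collection_name="main_pipeline",
                       executed_by="local-user",
                       execution_id=str(uuid.uuid4()))
    logger.log("Pipeline started")         # inserted as a document in MongoDB
    logger.is_log_enable = False           # stages toggle this flag to silence logging
    logger.log("This message is skipped")
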
--------------------------------------------------------------------------------
/mongo_db/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/mongo_db/__init__.py
--------------------------------------------------------------------------------
/mongo_db/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/mongo_db/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/mongo_db/__pycache__/mongo_db_atlas.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/mongo_db/__pycache__/mongo_db_atlas.cpython-37.pyc
--------------------------------------------------------------------------------
/mongo_db/mongo_db_atlas.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Feb 8 06:06:50 2021
4 |
5 | @author: AvnishYadav
6 | """
7 | # importing mongodb file
8 | import ssl
9 | import pymongo
10 | import json
11 | import pandas as pd
12 | import sys
13 | from insurance_exception.insurance_exception import InsuranceException as MongoDbException
14 |
15 |
16 | class MongoDBOperation:
17 | def __init__(self, user_name=None, password=None):
18 | try:
19 | if user_name is None or password is None:
20 | # creating initial object to fetch mongodb credentials
21 | credentials = {
22 | "user_name": "avnyadav",
23 | "password": "Aa327030"
24 | } # get_mongo_db_credentials() # return dictionary with user name and password
25 | self.__user_name = credentials['user_name']
26 | self.__password = credentials['password']
27 | else:
28 | self.__user_name = user_name
29 | self.__password = password
30 |
31 | except Exception as e:
32 | mongo_db_exception = MongoDbException(
33 | "Failed to instantiate mongo_db_object in module [{0}] class [{1}] method [{2}]"
34 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
35 | "__init__"))
36 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
37 |
38 | def get_mongo_db_url(self):
39 | """
40 | :return: mongo_db_url
41 | """
42 | try:
43 | url = ""
44 | return url
45 | except Exception as e:
46 | mongo_db_exception = MongoDbException(
47 | "Failed to fetch mongo_db url in module [{0}] class [{1}] method [{2}]"
48 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
49 | self.get_mongo_db_url.__name__))
50 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
51 |
52 | def get_database_client_object(self):
53 | """
54 | Return pymongoClient object to perform action with MongoDB
55 | """
56 | try:
57 |
58 | url = 'mongodb+srv://{0}:{1}@cluster0.wz7et.mongodb.net/myFirstDatabase?retryWrites=true&w=majority'.format(
59 | self.__user_name, self.__password)
60 | client = pymongo.MongoClient(url, ssl_cert_reqs=ssl.CERT_NONE) # creating database client object
61 | return client
62 | except Exception as e:
63 | mongo_db_exception = MongoDbException(
64 | "Failed to fetch database client object in module [{0}] class [{1}] method [{2}]"
65 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
66 | self.get_database_client_object.__name__))
67 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
68 |
69 | def close_database_client_object(self, obj_name):
70 | """
71 |
72 |
73 | Parameters
74 | ----------
75 | obj_name : pymongo client
76 | DESCRIPTION.pymongo client object
77 |
78 | Raises
79 | ------
80 | Exception
81 | Failed to close database connection-->.
82 |
83 | Returns
84 | -------
85 | bool
86 | True if connection closed.
87 |
88 | """
89 | try:
90 | obj_name.close()
91 | return True
92 | except Exception as e:
93 | mongo_db_exception = MongoDbException(
94 | "Failed to close database client object in module [{0}] class [{1}] method [{2}]"
95 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
96 | self.close_database_client_object.__name__))
97 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
98 |
99 | def is_database_present(self, client, db_name):
100 | """
101 |
102 | Parameters
103 | ----------
104 | client : pymongo client
105 | DESCRIPTION. pymongo client object used to communicate with MongoDB
106 | db_name : string
107 | database name.
108 |
109 | Raises
110 | ------
111 | Exception
112 | DESCRIPTION.If any exception occurs
113 |
114 | Returns
115 | -------
116 | bool
117 | True if database already exists.
118 |
119 | """
120 | try:
121 | if db_name in client.list_database_names():
122 | return True
123 | else:
124 | return False
125 | except Exception as e:
126 | mongo_db_exception = MongoDbException(
127 | "Failed during checking database in module [{0}] class [{1}] method [{2}]"
128 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
129 | self.is_database_present.__name__))
130 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
131 |
132 | def create_database(self, client, db_name):
133 | """
134 | client: client object of database
135 | db_name: database name
136 | """
137 | try:
138 | return client[db_name]
139 | except Exception as e:
140 | mongo_db_exception = MongoDbException(
141 | "Failure occurred during database creation steps in module [{0}] class [{1}] method [{2}]"
142 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
143 | self.create_database.__name__))
144 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
145 |
146 | def create_collection_in_database(self, database, collection_name):
147 | """
148 | database:database
149 | collection_name: name of collection
150 | return:
151 | collection object
152 | """
153 | try:
154 | return database[collection_name]
155 | except Exception as e:
156 | mongo_db_exception = MongoDbException(
157 | "Failed during creating collection in database in module [{0}] class [{1}] method [{2}]"
158 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
159 | self.create_collection_in_database.__name__))
160 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
161 |
162 | def is_collection_present(self, collection_name, database):
163 | """
164 |
165 |
166 | Parameters
167 | ----------
168 | collection_name : collection_name
169 | DESCRIPTION.collection name which needs to verify
170 | database : TYPE
171 | DESCRIPTION.database in which collection needs to check for existence
172 |
173 | Raises
174 | ------
175 | Exception
176 | DESCRIPTION.
177 |
178 | Returns
179 | -------
180 | bool
181 | true if collection present in database.
182 |
183 | """
184 | try:
185 | """It verifies the existence of collection name in a database"""
186 | collection_list = database.list_collection_names()
187 |
188 | if collection_name in collection_list:
189 | # print("Collection:'{COLLECTION_NAME}' in Database:'{DB_NAME}' exists")
190 | return True
191 |
192 | # print(f"Collection:'{COLLECTION_NAME}' in Database:'{DB_NAME}' does not exists OR \n no documents are present in the collection")
193 | return False
194 | except Exception as e:
195 | mongo_db_exception = MongoDbException(
196 | "Failed during checking collection in module [{0}] class [{1}] method [{2}]"
197 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
198 | self.is_collection_present.__name__))
199 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
200 |
201 | def get_collection(self, collection_name, database):
202 | """
203 | collection_name:collection name
204 | database=database
205 | ------------------------------------------
206 | return collection object
207 | """
208 | try:
209 | collection = self.create_collection_in_database(database, collection_name)
210 | return collection
211 | except Exception as e:
212 | mongo_db_exception = MongoDbException(
213 | "Failed in retrieval of collection in module [{0}] class [{1}] method [{2}]"
214 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
215 | self.get_collection.__name__))
216 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
217 |
218 | def is_record_present(self, db_name, collection_name, record):
219 | """
220 | db_name: database name
221 | collection_name: collection name
222 | record: records to search
223 | ----------------------------------------------
224 | return True if record exists else return false
225 | """
226 | try:
227 | client = self.get_database_client_object() # client object
228 | database = self.create_database(client, db_name) # database object
229 | collection = self.get_collection(collection_name, database) # collection object
230 | record_found = collection.find(record) # fetching record
231 | if record_found.count() > 0:
232 | client.close()
233 | return True
234 | else:
235 | client.close()
236 | return False
237 | except Exception as e:
238 | mongo_db_exception = MongoDbException(
239 | "Failed in fetching record in module [{0}] class [{1}] method [{2}]"
240 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
241 | self.is_record_present.__name__))
242 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
243 |
244 | def create_record(self, collection, data):
245 | """
246 | collection: Accept collection name
247 | data: a single record to insert into the collection
248 | -------------------------------------------
249 | return 1 if record inserted
250 | """
251 | try:
252 | collection.insert_one(data) # insertion of record in collection
253 | return 1
254 | except Exception as e:
255 | mongo_db_exception = MongoDbException(
256 | "Failed in inserting record in module [{0}] class [{1}] method [{2}]"
257 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
258 | self.create_record.__name__))
259 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
260 |
261 | def create_records(self, collection, data):
262 | """
263 | collection: collection object
264 | data: data which needs to be inserted
265 | --------------------------------------------
266 | return no of record inserted
267 | """
268 | try:
269 | collection.insert_many(data)
270 | return len(data)
271 | except Exception as e:
272 | mongo_db_exception = MongoDbException(
273 | "Failed in inserting records in module [{0}] class [{1}] method [{2}]"
274 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
275 | self.create_records.__name__))
276 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
277 |
278 | def insert_record_in_collection(self, db_name, collection_name, record):
279 | """
280 | db_name: database name
281 | collection_name: collection name
282 | record: records to insert
283 | ------------------------------
284 | return No of record inserted(int).
285 | """
286 | try:
287 | no_of_row_inserted = 0
288 | client = self.get_database_client_object()
289 | database = self.create_database(client, db_name)
290 | collection = self.get_collection(collection_name, database)
291 | if not self.is_record_present(db_name, collection_name, record):
292 | no_of_row_inserted = self.create_record(collection=collection, data=record)
293 | client.close()
294 | return no_of_row_inserted
295 | except Exception as e:
296 | mongo_db_exception = MongoDbException(
297 | "Failed in inserting record in collection module [{0}] class [{1}] method [{2}]"
298 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
299 | self.insert_record_in_collection.__name__))
300 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
301 |
302 | def drop_collection(self, db_name, collection_name):
303 | """
304 |
305 | :param db_name: database name
306 | :param collection_name: collection name
307 | :return: True if collection dropped successfully.
308 | """
309 | try:
310 | client = self.get_database_client_object()
311 | database = self.create_database(client, db_name)
312 | if self.is_collection_present(collection_name, database):
313 | collection_name = self.get_collection(collection_name, database)
314 | collection_name.drop()
315 | return True
316 | except Exception as e:
317 | mongo_db_exception = MongoDbException(
318 | "Failed in dropping collection in module [{0}] class [{1}] method [{2}]"
319 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
320 | self.drop_collection.__name__))
321 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
322 |
323 | def insert_records_in_collection(self, db_name, collection_name, records):
324 | """
325 | db_name: database name
326 | collection_name: collection name
327 | records: records to insert
328 | """
329 | try:
330 | no_of_row_inserted = 0
331 | client = self.get_database_client_object()
332 | database = self.create_database(client, db_name)
333 | collection = self.get_collection(collection_name, database)
334 | for record in records:
335 | if not self.is_record_present(db_name, collection_name, record):
336 | no_of_row_inserted = no_of_row_inserted + self.create_record(collection=collection, data=record)
337 | client.close()
338 | return no_of_row_inserted
339 | except Exception as e:
340 | mongo_db_exception = MongoDbException(
341 | "Failed in inserting records in collection module [{0}] class [{1}] method [{2}]"
342 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
343 | self.insert_records_in_collection.__name__))
344 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
345 |
346 | def insert_dataframe_into_collection(self, db_name, collection_name, data_frame):
347 | """
348 | db_name:Database Name
349 | collection_name: collection name
350 | data_frame: dataframe which needs to be inserted
351 | return:
352 |
353 | """
354 | try:
355 | data_frame.reset_index(drop=True, inplace=True)
356 | records = list(json.loads(data_frame.T.to_json()).values())
357 | client = self.get_database_client_object()
358 | database = self.create_database(client, db_name)
359 | collection = self.get_collection(collection_name, database)
360 | collection.insert_many(records)
361 | return len(records)
362 | except Exception as e:
363 | mongo_db_exception = MongoDbException(
364 | "Failed in inserting dataframe in collection module [{0}] class [{1}] method [{2}]"
365 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
366 | self.insert_dataframe_into_collection.__name__))
367 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
368 |
369 | def get_record(self, database_name, collection_name, query=None):
370 | try:
371 | client = self.get_database_client_object()
372 | database = self.create_database(client, database_name)
373 | collection = self.get_collection(collection_name=collection_name, database=database)
374 | record = collection.find_one(query)
375 | return record
376 | except Exception as e:
377 | mongo_db_exception = MongoDbException(
378 | "Failed in retrieving record in collection module [{0}] class [{1}] method [{2}]"
379 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
380 | self.get_record.__name__))
381 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
382 |
383 | def get_min_value_of_column(self, database_name, collection_name, query, column):
384 | """
385 |
386 | :param database_name:
387 | :param collection_name:
388 | :param query: to get all record
389 | :param column: column name
390 | :return: minimum value
391 | """
392 | try:
393 | client = self.get_database_client_object()
394 | database = self.create_database(client, database_name)
395 | collection = self.get_collection(collection_name=collection_name, database=database)
396 | min_value = collection.find(query).sort(column, pymongo.ASCENDING).limit(1)
397 | value = [min_val for min_val in min_value]
398 | if len(value) > 0:
399 | if column in value[0]:
400 | return value[0][column]
401 | else:
402 | return None
403 | else:
404 | return None
405 | except Exception as e:
406 | mongo_db_exception = MongoDbException(
407 | "Failed in getting minimum value from column in collection module [{0}] class [{1}] method [{2}]"
408 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
409 | self.get_min_value_of_column.__name__))
410 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
411 |
412 | def get_max_value_of_column(self, database_name, collection_name, query, column):
413 | """
414 |
415 | :param database_name: database name
416 | :param collection_name: collection name
417 | :param query: query
418 | :param column: column name
419 | :return: maximum value
420 | """
421 | try:
422 | client = self.get_database_client_object()
423 | database = self.create_database(client, database_name)
424 | collection = self.get_collection(collection_name=collection_name, database=database)
425 | max_value = collection.find(query).sort(column, pymongo.DESCENDING).limit(1)
426 | value = [max_val for max_val in max_value]
427 | if len(value) > 0:
428 | if column in value[0]:
429 | return value[0][column]
430 | else:
431 | return None
432 | else:
433 | return None
434 |
435 | except Exception as e:
436 | mongo_db_exception = MongoDbException(
437 | "Failed in getting maximum value from column in collection module [{0}] class [{1}] method [{2}]"
438 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
439 | self.get_max_value_of_column.__name__))
440 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
441 |
442 | def get_records(self, database_name, collection_name, query=None):
443 | """
444 |
445 | :param database_name:
446 | :param collection_name:
447 | :param query:
448 | :return: cursor object you need to iterate
449 | """
450 | try:
451 | client = self.get_database_client_object()
452 | database = self.create_database(client, database_name)
453 | collection = self.get_collection(collection_name=collection_name, database=database)
454 | record = collection.find(query)
455 | return record
456 | except Exception as e:
457 | mongo_db_exception = MongoDbException(
458 | "Failed in retrieving records in collection module [{0}] class [{1}] method [{2}]"
459 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
460 | self.get_records.__name__))
461 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
462 |
463 | def update_record_in_collection(self, database_name, collection_name, query, new_value):
464 | """
465 |
466 | :param database_name: database name
467 | :param collection_name: collection name
468 | :param query: search for record
469 | :param new_value: updated values
470 | :return: n_updated row
471 | """
472 | try:
473 | client = self.get_database_client_object()
474 | database = self.create_database(client, database_name)
475 | collection = self.get_collection(collection_name=collection_name, database=database)
476 | update_query = {'$set': new_value}
477 | result = collection.update_one(query, update_query)
478 | client.close()
479 | return result.raw_result["nModified"]
480 | except Exception as e:
481 | mongo_db_exception = MongoDbException(
482 | "Failed updating record in collection module [{0}] class [{1}] method [{2}]"
483 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
484 | self.update_record_in_collection.__name__))
485 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
486 |
487 | def get_dataframe_of_collection(self, db_name, collection_name, query=None):
488 | """
489 |
490 | Parameters
491 | ----------
492 | db_name : string
493 | DESCRIPTION. database name
494 | collection_name : string
495 | DESCRIPTION.collection name
496 |
497 | Returns
498 | -------
499 | Pandas data frame of collection name present database.
500 |
501 | """
502 | try:
503 | client = self.get_database_client_object()
504 | database = self.create_database(client, db_name)
505 | collection = self.get_collection(collection_name=collection_name, database=database)
506 | if query is None:
507 | query = {}
508 | df = pd.DataFrame(list(collection.find(query)))
509 | if "_id" in df.columns.to_list():
510 | df = df.drop(columns=["_id"], axis=1)
511 | return df.copy()
512 | except Exception as e:
513 | mongo_db_exception = MongoDbException(
514 | "Failed in returning dataframe of collection module [{0}] class [{1}] method [{2}]"
515 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
516 | self.get_dataframe_of_collection.__name__))
517 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
518 |
519 | def remove_record(self, db_name, collection_name, query):
520 | try:
521 | client = self.get_database_client_object()
522 | database = self.create_database(client, db_name)
523 | collection = self.get_collection(collection_name=collection_name, database=database)
524 | collection.delete_one(query)
525 | return True
526 | except Exception as e:
527 | mongo_db_exception = MongoDbException(
528 | "Failed in removing record in collection module [{0}] class [{1}] method [{2}]"
529 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__,
530 | self.remove_record.__name__))
531 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e
532 |
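
A short round trip with MongoDBOperation, assuming the Atlas credentials embedded above are valid; the database and collection names are the ones defined in config/params.yaml:

    import pandas as pd
    from mongo_db.mongo_db_atlas import MongoDBOperation

    mongo_db = MongoDBOperation()
    df = pd.DataFrame({"age": [21, 39], "bmi": [25.7, 34.3], "smoker": ["no", "no"]})

    # write the frame as documents, then read it back (the _id column is dropped on the way out)
    inserted = mongo_db.insert_dataframe_into_collection("insurance_prediction_training",
                                                         "insurance_prediction_training_dataset", df)
    print(f"{inserted} records inserted")
    print(mongo_db.get_dataframe_of_collection("insurance_prediction_training",
                                               "insurance_prediction_training_dataset"))
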
--------------------------------------------------------------------------------
/new_data.csv:
--------------------------------------------------------------------------------
1 | age,sex,bmi,children,smoker,region,timestamp,sex_encoder,smoker_encoder,sex_encoded,smoker_encoded,input_features,prediction
2 | 21,male,25.7,2,no,northeast,2021-11-13 11:57:30.400,0.0,0.0,"(1,[0],[1.0])","(1,[0],[1.0])","[21.0,25.7,2.0,1.0,1.0]",6333.757487873788
3 | 39,female,34.3,5,no,southeast,2021-11-13 11:57:31.410,1.0,0.0,"(1,[],[])","(1,[0],[1.0])","[39.0,34.3,5.0,0.0,1.0]",9860.294896652984
4 | 50,female,28.2,3,no,southeast,2021-11-13 11:57:32.416,1.0,0.0,"(1,[],[])","(1,[0],[1.0])","[50.0,28.2,3.0,0.0,1.0]",12142.963316508103
5 | 34,female,23.6,0,no,northeast,2021-11-13 11:57:33.420,1.0,0.0,"(1,[],[])","(1,[0],[1.0])","[34.0,23.6,0.0,0.0,1.0]",6761.00125859063
6 | 22,female,20.2,0,no,northwest,2021-11-13 11:57:34.425,1.0,0.0,"(1,[],[])","(1,[0],[1.0])","[22.0,20.2,0.0,0.0,1.0]",3976.837587719984
7 |
--------------------------------------------------------------------------------
/prediction/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/prediction/__init__.py
--------------------------------------------------------------------------------
/prediction/stage_00_data_loader.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import sys
4 | import shutil
5 | from utility import read_params, get_logger_object_of_prediction
6 | from insurance_exception.insurance_exception import InsuranceException as GenericException
7 | from utility import clean_data_source_dir
8 |
9 | log_collection_name = "data_loader"
10 |
11 |
12 | def loader_main(config_path: str, datasource: str,is_logging_enable=True,execution_id=None,executed_by=None) -> None:
13 | try:
14 | logger = get_logger_object_of_prediction(config_path=config_path, collection_name=log_collection_name,
15 | execution_id=execution_id, executed_by=executed_by)
16 |
17 |
18 | logger.is_log_enable = is_logging_enable
19 | logger.log("Starting data loading operation.\nReading configuration file.")
20 |
21 | config = read_params(config_path)
22 | downloader_path=config['data_download']['cloud_prediction_directory_path']
23 | download_path=config['data_source']['Prediction_Batch_Files']
24 |
25 |
26 | logger.log("Configuration detail has been fetched from configuration file.")
27 | # removing existing prediction batch files from local
28 | logger.log(f"Cleaning local directory [{download_path}] for prediction.")
29 | clean_data_source_dir(download_path,logger=logger, is_logging_enable=is_logging_enable) # removing existing file from local system
30 |
31 | logger.log(f"Cleaning completed. Directory has been cleared now [{download_path}]")
32 | # downloading prediction batch files from cloud into local system
33 | logger.log("Data will be downloaded from cloud storage into local system")
34 |
35 |
36 | for file in os.listdir(downloader_path):
37 | if '.dvc' in file or '.gitignore' in file:
38 | continue
39 | print(f"Source dir: {downloader_path} file: {file} is being copied into destination dir: {download_path}"
40 | f" file: {file}")
41 | shutil.copy(os.path.join(downloader_path,file),os.path.join(download_path,file))
42 | logger.log("Data has been downloaded from cloud storage into local system")
43 |
44 | except Exception as e:
45 | generic_exception = GenericException(
46 | "Error occurred in module [{0}] method [{1}]"
47 | .format(loader_main.__module__,
48 | loader_main.__name__))
49 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
50 |
51 |
52 | if __name__ == '__main__':
53 | args = argparse.ArgumentParser()
54 | args.add_argument("--config", default=os.path.join("config", "params.yaml"))
55 | args.add_argument("--datasource", default=None)
56 | parsed_args = args.parse_args()
57 | print("started")
58 | loader_main(config_path=parsed_args.config, datasource=parsed_args.datasource)
59 |
--------------------------------------------------------------------------------
/prediction/stage_01_data_validator.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import sys
4 |
5 | import pandas as pd
6 | from utility import read_params, create_directory_path, values_from_schema_function, \
7 | get_logger_object_of_prediction, get_date, get_time
8 | import argparse
9 | import shutil
10 |
11 | from insurance_exception.insurance_exception import InsuranceException as GenericException
12 |
13 | log_collection_name = "data_validator"
14 |
15 |
16 | class DataValidator:
17 | def __init__(self, config, logger, is_logging_enable=True):
18 | try:
19 | self.logger = logger
20 | self.logger.is_log_enable = is_logging_enable
21 | self.config = config
22 | self.file_path = self.config['data_source']['Prediction_Batch_Files']
23 | self.good_file_path = self.config['artifacts']['prediction_data']['good_file_path']
24 | self.bad_file_path = self.config['artifacts']['prediction_data']['bad_file_path']
25 | self.archive_bad_file_path = self.config['artifacts']['prediction_data']['archive_bad_file_path']
26 | self.prediction_schema_file = self.config['config']['schema_prediction']
27 | except Exception as e:
28 | generic_exception = GenericException(
29 | "Error occurred in module [{0}] class [{1}] method [{2}]"
30 | .format(self.__module__, DataValidator.__name__,
31 | self.__init__.__name__))
32 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
33 |
34 | def archive_bad_files(self):
35 | try:
36 | folder_name=f"bad_files_{get_date().replace('-','_')}_{get_time().replace(':','_')}"
37 | archive_directory_path=os.path.join(self.archive_bad_file_path,folder_name)
38 | create_directory_path(archive_directory_path)
39 | for file in os.listdir(self.bad_file_path):
40 | source_file_path=os.path.join(self.bad_file_path,file)
41 | shutil.move(source_file_path,archive_directory_path)
42 | except Exception as e:
43 | generic_exception = GenericException(
44 | "Error occurred in module [{0}] class [{1}] method [{2}]"
45 | .format(self.__module__, DataValidator.__name__,
46 | self.archive_bad_files.__name__))
47 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
48 |
49 |
50 | def create_good_bad_archive_bad_file_path(self):
51 | try:
52 | create_directory_path(self.good_file_path)
53 | create_directory_path(self.bad_file_path)
54 | create_directory_path(self.archive_bad_file_path,is_recreate=False)
55 | except Exception as e:
56 |
57 | generic_exception = GenericException(
58 | "Error occurred in module [{0}] class [{1}] method [{2}]"
59 | .format(self.__module__, DataValidator.__name__,
60 | self.create_good_bad_archive_bad_file_path.__name__))
61 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
62 |
63 |
64 | def value_from_schema(self):
65 | """
66 |
67 | :return: tuple (sample_file_name,column_names,number_of_column)
68 | """
69 | try:
70 | return values_from_schema_function(self.prediction_schema_file)
71 | except Exception as e:
72 | generic_exception = GenericException(
73 | "Error occurred in module [{0}] class [{1}] method [{2}]"
74 | .format(self.__module__, DataValidator.__name__,
75 | self.value_from_schema.__name__))
76 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
77 |
78 |
79 | def file_name_regular_expression(self):
80 | """
81 |
82 | :return: regular expression syntax which can be used for validation of file name
83 | """
84 | return "['HealthPrem']+['\_'']+[\d_]+[\d]+\.csv"
85 |
86 | def validate_missing_values_in_whole_column(self):
87 | try:
88 | self.logger.log("Missing Values Validation Started!!")
89 | for file in os.listdir(self.good_file_path):
90 | csv = pd.read_csv(os.path.join(self.good_file_path, file))
91 | count = 0
92 | for columns in csv:
93 | if (len(csv[columns]) - csv[columns].count()) == len(csv[columns]):
94 | count += 1
95 | shutil.move(os.path.join(self.good_file_path, file),
96 | self.bad_file_path)
97 | self.logger.log(
98 | "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s" % file)
99 | break
100 | if count == 0:
101 | print(csv.columns)
102 | csv.rename(columns={"Unnamed: 0": "Premium "}, inplace=True)
103 | csv.to_csv(os.path.join(self.good_file_path, file), index=None, header=True)
104 | except Exception as e:
105 | generic_exception = GenericException(
106 | "Error occurred in module [{0}] class [{1}] method [{2}]"
107 | .format(self.__module__, DataValidator.__name__,
108 | self.validate_missing_values_in_whole_column.__name__))
109 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
110 |
111 |
112 | def validate_file_name(self):
113 | try:
114 | self.create_good_bad_archive_bad_file_path()
115 | pattern, length_of_date_stamp_in_file, length_of_time_stamp_in_file, column_names, number_of_columns = self.value_from_schema()
116 |
117 | file_name_reg_pattern = self.file_name_regular_expression()
118 | self.logger.log(f"Validating file names.")
119 | files = os.listdir(self.file_path)
120 | for file in files:
121 | file_path = os.path.join(self.file_path, file)
122 | split_at_dot = re.split('.csv', file)
123 | split_at_dot = (re.split('_', split_at_dot[0]))
124 | if re.match(file_name_reg_pattern, file) and len(split_at_dot[1]) == length_of_date_stamp_in_file \
125 | and len(split_at_dot[2]) == length_of_time_stamp_in_file:
126 | destination_file_path = os.path.join(self.good_file_path, file)
127 | self.logger.log(f"file name: {file} matched, hence moving file to good file path {destination_file_path}")
128 | shutil.move(file_path, destination_file_path)
129 | else:
130 | destination_file_path = os.path.join(self.bad_file_path, file)
131 | self.logger.log(f"file name: {file} does not match, hence moving file to bad file path {destination_file_path}")
132 | shutil.move(file_path, destination_file_path)
133 | except Exception as e:
134 | generic_exception = GenericException(
135 | "Error occurred in module [{0}] class [{1}] method [{2}]"
136 | .format(self.__module__, DataValidator.__name__,
137 | self.validate_file_name.__name__))
138 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
139 |
140 |
141 | def validate_no_of_column(self, no_of_column):
142 | """
143 | Description:
144 | If the number of columns does not match, the file is moved to the bad file path
145 | =====================================================================================
146 | :param no_of_column: int Number of column must present in each file
147 | :return: Nothing
148 | """
149 | try:
150 | self.logger.log(f"Validating number of columns in input file")
151 | files = os.listdir(self.good_file_path)
152 | for file in files:
153 | file_path = os.path.join(self.good_file_path, file)
154 | df = pd.read_csv(file_path)
155 | if df.shape[1] != no_of_column:
156 | destination_file_path = os.path.join(self.bad_file_path, file)
157 | self.logger.log(f"file: {file} has an incorrect number of columns, hence moving file to bad file path {destination_file_path}")
158 | shutil.move(file_path, destination_file_path)
159 | except Exception as e:
160 | generic_exception = GenericException(
161 | "Error occurred in module [{0}] class [{1}] method [{2}]"
162 | .format(self.__module__, DataValidator.__name__,
163 | self.validate_no_of_column.__name__))
164 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
165 |
166 |
167 | def validation_main(config_path: str, datasource: str, is_logging_enable=True,execution_id=None,executed_by=None) -> None:
168 | try:
169 | logger = get_logger_object_of_prediction(config_path=config_path, collection_name=log_collection_name,
170 | execution_id=execution_id, executed_by=executed_by)
171 |
172 | logger.is_log_enable = is_logging_enable
173 | config = read_params(config_path)
174 | logger.log("data validation started")
175 | data_validator = DataValidator(config=config, logger=logger, is_logging_enable=is_logging_enable)
176 | pattern, length_of_date_stamp_in_file, length_of_time_stamp_in_file, column_names, number_of_columns = \
177 | data_validator.value_from_schema()
178 | data_validator.validate_file_name()
179 | data_validator.validate_no_of_column(no_of_column=number_of_columns)
180 | data_validator.validate_missing_values_in_whole_column()
181 | data_validator.archive_bad_files()
182 | except Exception as e:
183 | generic_exception = GenericException(
184 | "Error occurred in module [{0}] method [{1}]"
185 | .format(validation_main.__module__,
186 | validation_main.__name__))
187 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
188 |
189 |
190 | if __name__ == '__main__':
191 | args = argparse.ArgumentParser()
192 | args.add_argument("--config", default=os.path.join("config", "params.yaml"))
193 | args.add_argument("--datasource", default=None)
194 | parsed_args = args.parse_args()
195 | print("started")
196 | validation_main(config_path=parsed_args.config, datasource=parsed_args.datasource)
197 |
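
validate_file_name relies on the loose character-class pattern returned by file_name_regular_expression plus the date/time-stamp lengths taken from the schema. For reference, the naming convention it is meant to enforce (HealthPrem_<8-digit date>_<6-digit time>.csv) can be expressed more strictly; the pattern below is only an illustration, not the repo's regex:

    import re

    # HealthPrem_<DDMMYYYY>_<HHMMSS>.csv per config/schema_prediction.json
    FILE_NAME_PATTERN = re.compile(r"^HealthPrem_\d{8}_\d{6}\.csv$")

    for name in ["HealthPrem_26092020_131534.csv", "healthprem_2020_13.csv"]:
        print(name, "->", "good" if FILE_NAME_PATTERN.match(name) else "bad")
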
--------------------------------------------------------------------------------
/prediction/stage_02_data_transformer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | import pandas as pd
5 | import argparse
6 | from utility import read_params, get_logger_object_of_prediction
7 | from mongo_db.mongo_db_atlas import MongoDBOperation
8 | from insurance_exception.insurance_exception import InsuranceException as GenericException
9 |
10 | log_collection_name = "data_transformer"
11 |
12 |
13 | class DataTransformer:
14 | def __init__(self, config, logger, is_log_enable=True):
15 | try:
16 | self.config = config
17 | self.logger = logger
18 | self.logger.is_log_enable = is_log_enable
19 | self.good_file_path = self.config["artifacts"]['prediction_data']['good_file_path']
20 | self.unwanted_column_names=self.config["dataset"]['unwanted_column']
21 | self.mongo_db=MongoDBOperation()
22 | self.dataset_database=self.config["dataset"]["database_detail"]["prediction_database_name"]
23 | self.dataset_collection_name=self.config["dataset"]["database_detail"]["dataset_prediction_collection_name"]
24 | self.mongo_db.drop_collection(self.dataset_database,self.dataset_collection_name)
25 | except Exception as e:
26 | generic_exception = GenericException(
27 | "Error occurred in module [{0}] class [{1}] method [{2}]"
28 | .format(self.__module__, DataTransformer.__name__,
29 | self.__init__.__name__))
30 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
31 |
32 | def unite_dataset(self):
33 | try:
 34 |             dataset_list = []
 35 |             for file in os.listdir(self.good_file_path):
 36 |                 dataset_list.append(pd.read_csv(os.path.join(self.good_file_path, file)))
 37 |             df = pd.concat(dataset_list)
 38 |             df = self.remove_unwanted_column(df)
 39 |             self.logger.log(f"Inserting dataset into database {self.dataset_database} "
 40 |                             f"collection_name: {self.dataset_collection_name}")
 41 |             self.mongo_db.insert_dataframe_into_collection(self.dataset_database, self.dataset_collection_name, df)
42 | except Exception as e:
43 | generic_exception = GenericException(
44 | "Error occurred in module [{0}] class [{1}] method [{2}]"
45 | .format(self.__module__, DataTransformer.__name__,
46 | self.unite_dataset.__name__))
47 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
48 |
49 |
 50 |     def remove_unwanted_column(self, df):
 51 |         try:
 52 |             drop_column = list(filter(lambda x: x in df.columns, self.unwanted_column_names))
 53 |             return df.drop(drop_column, axis=1)
54 | except Exception as e:
55 | generic_exception = GenericException(
56 | "Error occurred in module [{0}] class [{1}] method [{2}]"
57 | .format(self.__module__, DataTransformer.__name__,
58 | self.remove_unwanted_column.__name__))
59 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
60 |
61 |
62 | def transform_main(config_path: str, datasource: str, is_logging_enable=True, execution_id=None, executed_by=None) -> None:
63 | try:
64 | logger = get_logger_object_of_prediction(config_path=config_path, collection_name=log_collection_name,
65 | execution_id=execution_id, executed_by=executed_by)
66 |
67 | logger.is_log_enable = is_logging_enable
68 | config = read_params(config_path)
69 | data_transformer = DataTransformer(config=config, logger=logger, is_log_enable=is_logging_enable)
70 | logger.log('Start of Data Preprocessing before DB')
71 | data_transformer.unite_dataset()
72 | logger.log('Data Preprocessing before DB Completed !!')
73 |
74 | except Exception as e:
75 | generic_exception = GenericException(
76 | "Error occurred in module [{0}] method [{1}]"
77 | .format(transform_main.__module__,
78 | transform_main.__name__))
79 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
80 |
81 |
82 | if __name__ == '__main__':
83 | args = argparse.ArgumentParser()
84 | args.add_argument("--config", default=os.path.join("config", "params.yaml"))
85 | args.add_argument("--datasource", default=None)
86 | parsed_args = args.parse_args()
87 | print("started")
88 | transform_main(config_path=parsed_args.config, datasource=parsed_args.datasource)
89 |
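Note: remove_unwanted_column only drops the configured columns that are actually present, so a batch file lacking one of them does not raise a KeyError. A minimal sketch of that behaviour, assuming a hypothetical unwanted-column list:

    import pandas as pd

    unwanted_column_names = ["id", "policy_number"]   # hypothetical stand-in for config["dataset"]["unwanted_column"]

    df = pd.DataFrame({"id": [1, 2], "age": [23, 45], "bmi": [22.1, 30.5]})

    # keep only the unwanted columns that really exist before dropping them
    drop_column = list(filter(lambda x: x in df.columns, unwanted_column_names))
    df = df.drop(drop_column, axis=1)
    print(df.columns.tolist())   # ['age', 'bmi']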
--------------------------------------------------------------------------------
/prediction/stage_03_data_exporter.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | from utility import read_params, create_directory_path
5 | from mongo_db.mongo_db_atlas import MongoDBOperation
6 | import argparse
7 | from utility import get_logger_object_of_prediction
8 | from insurance_exception.insurance_exception import InsuranceException as GenericException
9 |
10 | log_collection_name = "data_export"
11 |
12 |
13 | class DataExporter:
14 | def __init__(self, config, logger, is_log_enable):
15 | try:
16 | self.config = config
17 | self.logger = logger
18 | self.is_log_enable = is_log_enable
19 | self.mongo_db = MongoDBOperation()
20 | self.dataset_database = self.config["dataset"]["database_detail"]["prediction_database_name"]
21 | self.dataset_collection_name = self.config["dataset"]["database_detail"]["dataset_prediction_collection_name"]
22 | self.prediction_file_from_db = self.config["artifacts"]['prediction_data']['prediction_file_from_db']
23 | self.master_csv = self.config["artifacts"]['prediction_data']['master_csv']
24 | except Exception as e:
25 | generic_exception = GenericException(
26 | "Error occurred in module [{0}] class [{1}] method [{2}]"
27 | .format(self.__module__, DataExporter.__name__,
28 | self.__init__.__name__))
29 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
30 |
31 | def export_dataframe_from_database(self):
32 | try:
33 | create_directory_path(self.prediction_file_from_db)
 34 |             self.logger.log(f"Creating dataframe from data stored in db "
 35 |                             f"[{self.dataset_database}] and collection [{self.dataset_collection_name}]")
36 | df = self.mongo_db.get_dataframe_of_collection(db_name=self.dataset_database,
37 | collection_name=self.dataset_collection_name)
38 | master_csv_file_path = os.path.join(self.prediction_file_from_db, self.master_csv)
39 | self.logger.log(f"master csv file will be generated at "
40 | f"{master_csv_file_path}.")
41 | df.to_csv(master_csv_file_path, index=None,header=True)
42 |
43 | except Exception as e:
44 | generic_exception = GenericException(
45 | "Error occurred in module [{0}] class [{1}] method [{2}]"
46 | .format(self.__module__, DataExporter.__name__,
47 | self.export_dataframe_from_database.__name__))
48 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
49 |
50 |
51 | def export_main(config_path: str, is_logging_enable=True, execution_id=None, executed_by=None) -> None:
52 | try:
53 | logger = get_logger_object_of_prediction(config_path=config_path, collection_name=log_collection_name,
54 | execution_id=execution_id, executed_by=executed_by)
55 |
56 | logger.is_log_enable = is_logging_enable
57 | config = read_params(config_path)
58 | data_exporter = DataExporter(config=config, logger=logger, is_log_enable=is_logging_enable)
59 | logger.log("Generating csv file from dataset stored in database.")
60 | data_exporter.export_dataframe_from_database()
61 |         logger.log("Dataset has been successfully exported to the directory; exiting export pipeline.")
62 | except Exception as e:
63 | generic_exception = GenericException(
64 | "Error occurred in module [{0}] method [{1}]"
65 | .format(export_main.__module__,
66 | export_main.__name__))
67 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
68 |
69 |
70 | if __name__ == '__main__':
71 | args = argparse.ArgumentParser()
72 | args.add_argument("--config", default=os.path.join("config", "params.yaml"))
73 | parsed_args = args.parse_args()
74 | print("started")
75 | export_main(config_path=parsed_args.config)
76 |
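For orientation only, a rough sketch of what this export amounts to when done with plain pymongo and pandas. The URI, database, and collection names below are placeholders; the project's actual connection handling lives inside MongoDBOperation:

    import pandas as pd
    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")   # placeholder URI, not the project's Atlas connection string
    records = list(client["prediction_database"]["dataset_prediction_collection"].find({}, {"_id": 0}))

    df = pd.DataFrame(records)
    df.to_csv("master.csv", index=None, header=True)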
--------------------------------------------------------------------------------
/prediction/stage_04_model_predictor.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import argparse
4 |
5 | from pyspark.sql.types import IntegerType, FloatType, StringType
6 |
7 | from utility import create_directory_path,get_logger_object_of_prediction,read_params
8 |
9 | from streaming.spark_manager.spark_manager import SparkManager
10 |
11 | from insurance_exception.insurance_exception import InsuranceException as GenericException
12 | from pyspark.ml import Pipeline, PipelineModel
13 | from pyspark.ml.regression import RandomForestRegressionModel, RandomForestRegressor
14 |
15 | log_collection_name = "prediction_model"
16 |
17 |
18 | class DataPreProcessing:
19 | def __init__(self, logger, is_log_enable=True, data_frame=None, pipeline_path=None):
20 | try:
21 | self.logger = logger
22 | self.logger.is_log_enable = is_log_enable
23 | self.data_frame = data_frame
24 | print(pipeline_path)
25 | self.pipeline_obj = PipelineModel.load(pipeline_path)
26 |
27 | except Exception as e:
28 | generic_exception = GenericException(
29 | "Error occurred in module [{0}] class [{1}] method [{2}]"
30 | .format(self.__module__, DataPreProcessing.__name__,
31 | self.__init__.__name__))
32 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
33 |
34 | def set_dataframe(self, dataframe):
35 | try:
36 | self.data_frame = dataframe
37 | except Exception as e:
38 | generic_exception = GenericException(
39 | "Error occurred in module [{0}] class [{1}] method [{2}]"
40 | .format(self.__module__, DataPreProcessing.__name__,
 41 |                         self.set_dataframe.__name__))
42 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
43 |
44 | def update_dataframe_scheme(self, schema_definition: dict):
 45 |         """
 46 |         Cast each dataframe column listed in schema_definition to the given spark datatype.
 47 |         """
 48 |         try:
 49 |             if self.data_frame is None:
 50 |                 raise Exception("data_frame is not set; call set_dataframe before updating the schema")
 51 |             self.data_frame.printSchema()
52 | for column, datatype in schema_definition.items():
53 | self.logger.log(f"Update datatype of column: {column} to {str(datatype)}")
54 | self.data_frame = self.data_frame.withColumn(column, self.data_frame[column].cast(datatype))
55 | except Exception as e:
56 | generic_exception = GenericException(
57 | "Error occurred in module [{0}] class [{1}] method [{2}]"
58 | .format(self.__module__, DataPreProcessing.__name__,
59 | self.update_dataframe_scheme.__name__))
60 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
61 |
62 | def get_prepared_data(self):
63 | try:
64 | schema_definition = {"age": IntegerType(),
65 | "sex": StringType(),
66 | "bmi": FloatType(),
67 | "children": IntegerType(),
68 | "smoker": StringType(),
69 | }
70 | self.update_dataframe_scheme(schema_definition=schema_definition)
71 | self.data_frame = self.pipeline_obj.transform(self.data_frame)
 72 |             self.data_frame.printSchema()
73 | return self.data_frame
74 | except Exception as e:
75 | generic_exception = GenericException(
76 | "Error occurred in module [{0}] class [{1}] method [{2}]"
77 | .format(self.__module__, DataPreProcessing.__name__,
78 | self.get_prepared_data.__name__))
79 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
80 |
81 |
82 | class Predictor:
83 |
84 | def __init__(self, config, logger, is_log_enable):
85 | try:
86 | self.logger = logger
87 | self.logger.is_log_enable = is_log_enable
88 | self.config = config
89 | self.prediction_file_path = self.config['artifacts']['prediction_data']['prediction_file_from_db']
90 | self.master_csv = self.config['artifacts']['prediction_data']['master_csv']
91 | self.model_path = self.config['artifacts']['model']['model_path']
92 | self.prediction_output_file_path = self.config['artifacts']['prediction_data'][
93 | 'prediction_output_file_path']
94 | self.prediction_file_name = self.config['artifacts']['prediction_data']['prediction_file_name']
95 | self.target_columns = self.config['target_columns']['columns']
96 | self.null_value_file_path = config['artifacts']['training_data']['null_value_info_file_path']
97 | self.pipeline_path = self.config['artifacts']['training_data']['pipeline_path']
98 | """
99 | self.spark = SparkSession.builder. \
100 | master("local[*]"). \
101 | appName("insurance-premium-reg").getOrCreate()
102 | """
103 | self.spark = SparkManager().get_spark_session_object()
104 | except Exception as e:
105 | generic_exception = GenericException(
106 | "Error occurred in module [{0}] class [{1}] method [{2}]"
107 | .format(self.__module__, Predictor.__name__,
108 | self.__init__.__name__))
109 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
110 |
111 | def get_dataframe(self):
112 | try:
113 | master_file_path = os.path.join(self.prediction_file_path, self.master_csv)
114 | return self.spark.read.csv(master_file_path, header=True, inferSchema=True)
115 | except Exception as e:
116 | generic_exception = GenericException(
117 | "Error occurred in module [{0}] class [{1}] method [{2}]"
118 | .format(self.__module__, Predictor.__name__,
119 | self.get_dataframe.__name__))
120 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
121 |
122 | def data_preparation(self):
123 | try:
124 |
125 | input_features = self.get_dataframe()
126 | data_preprocess = DataPreProcessing(logger=self.logger,
127 | is_log_enable=self.logger.is_log_enable,
128 | data_frame=input_features,
129 | pipeline_path=self.pipeline_path
130 | )
131 | return data_preprocess.get_prepared_data()
132 |
133 | except Exception as e:
134 | generic_exception = GenericException(
135 | "Error occurred in module [{0}] class [{1}] method [{2}]"
136 | .format(self.__module__, Predictor.__name__,
137 | self.data_preparation.__name__))
138 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
139 |
140 |
141 | def load_model(self):
142 | try:
143 | model_path = self.model_path
144 | if not os.path.exists(model_path):
145 | raise Exception(f"Model directory: {model_path} is not found.")
146 | model_names = os.listdir(model_path)
147 | if len(model_names) != 1:
148 |                 raise Exception(f"Expected exactly one model directory but found {len(model_names)}.")
149 | model_name = model_names[0]
150 | model_path = os.path.join(model_path, model_name)
151 | print(f"model path: {model_path}")
152 | return RandomForestRegressionModel.load(model_path)
153 | except Exception as e:
154 | generic_exception = GenericException(
155 | "Error occurred in module [{0}] class [{1}] method [{2}]"
156 | .format(self.__module__, Predictor.__name__,
157 | self.load_model.__name__))
158 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
159 |
160 | def predict(self):
161 | try:
162 |
163 | input_data = self.data_preparation()
164 | model = self.load_model()
165 | print(str(model))
166 |             input_data.printSchema()
167 | prediction = model.transform(input_data)
168 | prediction_output = prediction.select("age", "sex", "children", "smoker", "prediction").toPandas()
169 | create_directory_path(self.prediction_output_file_path)
170 | output_file_path = os.path.join(self.prediction_output_file_path, self.prediction_file_name)
171 | if prediction_output is not None:
172 | prediction_output.to_csv(output_file_path, index=None, header=True)
173 | except Exception as e:
174 | generic_exception = GenericException(
175 | "Error occurred in module [{0}] class [{1}] method [{2}]"
176 | .format(self.__module__, Predictor.__name__,
177 | self.predict.__name__))
178 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
179 |
180 |
181 | def predict_main(config_path: str, datasource: str, is_logging_enable=True, execution_id=None,
182 | executed_by=None) -> None:
183 | try:
184 | logger = get_logger_object_of_prediction(config_path=config_path, collection_name=log_collection_name,
185 | execution_id=execution_id, executed_by=executed_by)
186 |
187 | logger.is_log_enable = is_logging_enable
188 | logger.log("Prediction begin.")
189 | config = read_params(config_path)
190 | predictor = Predictor(config=config, logger=logger, is_log_enable=is_logging_enable)
191 | predictor.predict()
192 | logger.log("Prediction completed successfully.")
193 |
194 | except Exception as e:
195 | generic_exception = GenericException(
196 | "Error occurred in module [{0}] method [{1}]"
197 | .format(predict_main.__module__,
198 | predict_main.__name__))
199 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
200 |
201 |
202 | if __name__ == '__main__':
203 | args = argparse.ArgumentParser()
204 | args.add_argument("--config", default=os.path.join("config", "params.yaml"))
205 | args.add_argument("--datasource", default=None)
206 | parsed_args = args.parse_args()
207 | print(parsed_args.config)
208 | print(parsed_args.datasource)
209 | predict_main(config_path=parsed_args.config, datasource=parsed_args.datasource)
210 |
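The casting step above matters because the persisted feature pipeline expects the numeric columns to arrive with the right types before transform is applied. A minimal, self-contained sketch of the same cast pattern (the pipeline and model loading are omitted here because they need the saved artifacts):

    from pyspark.sql import SparkSession
    from pyspark.sql.types import IntegerType, FloatType, StringType

    spark = SparkSession.builder.master("local[1]").appName("cast-sketch").getOrCreate()

    df = spark.createDataFrame([("19", "female", "27.9", "0", "yes")],
                               ["age", "sex", "bmi", "children", "smoker"])

    schema_definition = {"age": IntegerType(), "sex": StringType(), "bmi": FloatType(),
                         "children": IntegerType(), "smoker": StringType()}

    # same pattern as DataPreProcessing.update_dataframe_scheme
    for column, datatype in schema_definition.items():
        df = df.withColumn(column, df[column].cast(datatype))

    df.printSchema()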
--------------------------------------------------------------------------------
/requirement.txt:
--------------------------------------------------------------------------------
1 | pyspark==3.0.1
2 | pymongo==3.11.0
3 | dnspython==1.16.0
4 | PyYAML
5 | pandas
6 | sklearn
7 | -e .
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | setup(
4 | name="insurance-prediction",
5 | version="0.0.3",
6 | description="insurance-prediction",
7 | author="Avnish yadav",
8 | packages=find_packages(),
9 | license="MIT"
10 | )
--------------------------------------------------------------------------------
/spark_consumer_from_kafka.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from pyspark.ml import PipelineModel
4 | from pyspark.ml.regression import RandomForestRegressionModel
5 |
6 | from streaming.spark_manager.spark_manager import SparkManager
7 | from streaming.consumer.kafka_to_spark_csv_consumer import KafkaToSparkCSVConsumer
8 |
9 |
10 | if __name__ == "__main__":
11 | spark_session = SparkManager().get_spark_session_object()
12 |
13 | schema_string = "age INT,sex STRING,bmi DOUBLE,children INT,smoker STRING,region STRING"
14 | database_name = "stream_prediction"
15 | collection_name = "insurance_prediction_output"
16 | kfk_con = KafkaToSparkCSVConsumer(spark_session=spark_session,
17 | schema_string=schema_string,
18 | database_name=database_name,
19 | collection_name=collection_name
20 | )
21 | transformer_list = []
22 | pipeline_model = PipelineModel.load(os.path.join("artifacts",
23 | "pipeline",
24 | "pipeline_model"))
25 | random_forest_model = RandomForestRegressionModel.load(os.path.join("artifacts",
26 | "model",
27 | "random_forest_regressor"))
28 |
29 | transformer_list.append(pipeline_model)
30 | transformer_list.append(random_forest_model)
31 | kfk_con.spark_transformer.add_machine_learning_transformer(
32 | transformer=transformer_list
33 | )
34 | kfk_con.receive_csv_data_from_kafka_topics()
35 |
--------------------------------------------------------------------------------
/streaming/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/__init__.py
--------------------------------------------------------------------------------
/streaming/consumer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/consumer/__init__.py
--------------------------------------------------------------------------------
/streaming/consumer/kafka_to_spark_csv_consumer.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from pyspark.sql.functions import *
4 |
5 | from insurance_exception.insurance_exception import InsuranceException as KafkaToSparkCSVConsumerException
6 | import sys
7 | from utility import read_params
8 | from streaming.transformer.spark_transformer import SparkTransformer
9 |
10 |
11 | class KafkaToSparkCSVConsumer:
12 | def __init__(self, schema_string, database_name, collection_name, spark_session, processing_interval_second=5,
13 |                  config_path=None):
14 | try:
15 | # accepting default configuration file if no configuration file path has been specified during object
16 | # instantiation
17 | path = os.path.join("config", "params.yaml") if config_path is None else os.path.join(config_path)
18 | self.config = read_params(config_path=path)
19 | self.kafka_topic_name = self.config['kafka']['topic_name']
20 | self.kafka_bootstrap_server = self.config['kafka']['kafka_bootstrap_server']
21 | self.spark_session = spark_session
22 | self.schema = schema_string # "age INT,sex STRING,bmi DOUBLE,children INT,smoker STRING,region STRING"
23 | self.spark_transformer = SparkTransformer(database_name=database_name, collection_name=collection_name)
24 | self.processing_interval_second = processing_interval_second
25 | self.query = None
26 | except Exception as e:
27 | kafka_to_csv_consumer_exception = KafkaToSparkCSVConsumerException(
28 | "Error occurred in module [{0}] class [{1}] method [{2}] ".
29 | format(self.__module__, KafkaToSparkCSVConsumer.__name__,
30 | self.__init__.__name__))
31 | raise Exception(kafka_to_csv_consumer_exception.error_message_detail(str(e), sys)) from e
32 |
33 | def receive_csv_data_from_kafka_topics(self):
34 | try:
35 | dataframe = self.spark_session \
36 | .readStream \
37 | .format("kafka") \
38 | .option("kafka.bootstrap.servers", self.kafka_bootstrap_server) \
39 | .option("subscribe", self.kafka_topic_name) \
40 | .option("startingOffsets", "latest") \
41 | .load()
42 | dataframe_1 = dataframe.selectExpr("CAST(value as STRING) ", "timestamp")
43 | dataframe_2 = dataframe_1.select(from_csv(col("value"), self.schema).alias("records"), "timestamp")
44 | dataframe_3 = dataframe_2.select("records.*", "timestamp")
45 | transformed_df = dataframe_3
46 | for transformer in self.spark_transformer.ml_transformer:
47 | transformed_df = transformer.transform(transformed_df)
48 | self.query = transformed_df.writeStream.trigger(
49 | processingTime=f'{self.processing_interval_second} seconds').foreachBatch(
50 | self.spark_transformer.process_each_record).start()
51 | self.query.awaitTermination()
52 | except Exception as e:
53 | kafka_to_csv_consumer_exception = KafkaToSparkCSVConsumerException(
54 | "Error occurred in module [{0}] class [{1}] method [{2}] ".
55 | format(self.__module__, KafkaToSparkCSVConsumer.__name__,
56 | self.receive_csv_data_from_kafka_topics.__name__))
57 | raise Exception(kafka_to_csv_consumer_exception.error_message_detail(str(e), sys)) from e
58 |
59 | def stop_stream(self):
60 | try:
61 | if self.query is not None:
62 | self.query.stop()
63 |
64 | except Exception as e:
65 | kafka_to_csv_consumer_exception = KafkaToSparkCSVConsumerException(
66 | "Error occurred in module [{0}] class [{1}] method [{2}] ".
67 | format(self.__module__, KafkaToSparkCSVConsumer.__name__,
68 |                        self.stop_stream.__name__))
69 | raise Exception(kafka_to_csv_consumer_exception.error_message_detail(str(e), sys)) from e
70 |
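Each Kafka message value is parsed as one CSV line with from_csv and the supplied schema string. A small non-streaming sketch of that parse on a static dataframe (the sample row is illustrative):

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col, from_csv

    spark = SparkSession.builder.master("local[1]").appName("from-csv-sketch").getOrCreate()

    schema = "age INT,sex STRING,bmi DOUBLE,children INT,smoker STRING,region STRING"
    raw = spark.createDataFrame([("19,female,27.9,0,yes,southwest",)], ["value"])

    # same parse that receive_csv_data_from_kafka_topics applies to the Kafka value column
    parsed = raw.select(from_csv(col("value"), schema).alias("records")).select("records.*")
    parsed.show()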
--------------------------------------------------------------------------------
/streaming/producer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/producer/__init__.py
--------------------------------------------------------------------------------
/streaming/producer/kafka_csv_data_producer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | from kafka import KafkaProducer
5 | from utility import read_params
6 | import time
7 | from insurance_exception.insurance_exception import InsuranceException as KafkaCSVDataProducerException
8 | from streaming.spark_manager.spark_manager import SparkManager
9 |
10 |
11 | class KafkaCSVDataProducer:
12 |
13 |     def __init__(self, spark_session, config_path=None):
14 | """
15 | Creator:
16 | **********************************************************************************************************
17 | created date: 02 November 2021
18 | Organization: iNeuron
19 | author: avnish@ineuron.ai
20 | **********************************************************************************************************
21 | Description:
22 | **********************************************************************************************************
 23 |         KafkaCSVDataProducer is responsible for reading csv files and sending their data row by row to the kafka
 24 |         topic specified in the configuration file.
 25 |         Define the record below:
26 | kafka:
27 | topic_name:
28 | kafka_bootstrap_server:
29 | *************************************************************************************************************
30 | Example:
31 | kafka:
32 | topic_name: insurance-prediction
33 | kafka_bootstrap_server: localhost:9092
34 |
35 | parameters:
36 | =============================================================================================================
 37 |         param config_path: configuration file path, default is config/params.yaml
38 |
39 | """
40 | try:
41 | # accepting default configuration file if no configuration file path has been specified during object
42 | # instantiation
43 | path = os.path.join("config", "params.yaml") if config_path is None else os.path.join(config_path)
44 | self.config = read_params(config_path=path)
45 | self.kafka_topic_name = self.config['kafka']['topic_name']
46 | self.kafka_bootstrap_server = self.config['kafka']['kafka_bootstrap_server']
47 | # creating kafka producer object
48 | self.kafka_producer = KafkaProducer(bootstrap_servers=self.kafka_bootstrap_server,
49 | value_serializer=lambda x: x.encode('utf-8'))
50 | # obtain spark session object
51 | self.spark_session = spark_session
52 | except Exception as e:
53 | kafka_csv_data_producer_exp = KafkaCSVDataProducerException(
54 | "Error occurred in module [{0}] class [{1}] method [{2}] ".
55 | format(self.__module__, KafkaCSVDataProducer.__name__,
56 | self.__init__.__name__))
57 | raise Exception(kafka_csv_data_producer_exp.error_message_detail(str(e), sys)) from e
58 |
59 | def send_csv_data_to_kafka_topic(self, directory_path):
60 | """
61 | Creator:
62 | **********************************************************************************************************
63 | created date: 02 November 2021
64 | Organization: iNeuron
65 | author: avnish@ineuron.ai
66 | **********************************************************************************************************
67 | Description:
68 | **********************************************************************************************************
 69 |         The function sends the content of every csv file in the directory to the kafka topic specified in the configuration file.
70 | ==========================================================================================================
71 | param:
72 | directory_path: csv file directory
73 |
74 | ==========================================================================================================
 75 |         return: number of rows sent to the kafka topic
76 | """
77 | try:
78 | files = os.listdir(directory_path)
79 | n_row = 0
80 |
81 | for file in files:
82 |
83 | # skip all files except csv
84 | if not file.endswith(".csv"):
85 | continue
86 | file_path = os.path.join(directory_path, file)
87 | # reading csv file using spark session
88 | # df = self.spark_session.read.csv(file_path)
89 | df = self.spark_session.read.csv(file_path,header=True,inferSchema=True)
90 | # sending dataframe to kafka topic iteratively
91 | for row in df.rdd.toLocalIterator():
92 | message=",".join(map(str, list(row)))
93 | print(message)
94 | self.kafka_producer.send(self.kafka_topic_name,message)
95 | n_row += 1
96 | time.sleep(1)
97 |
98 |
99 | #df.foreach(lambda row: self.kafka_producer.send(self.kafka_topic_name, ",".join(map(str, list(row)))))
100 | return n_row
101 | except Exception as e:
102 | kafka_csv_data_producer_exp = KafkaCSVDataProducerException(
103 | "Error occurred in module [{0}] class [{1}] method [{2}] ".
104 | format(self.__module__, KafkaCSVDataProducer.__name__,
105 | self.__init__.__name__))
106 |                        self.send_csv_data_to_kafka_topic.__name__))
107 |
108 |
109 | """
110 | spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1
111 | """
112 |
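A hedged usage sketch of the producer: the topic and bootstrap server come from config/params.yaml, one Kafka message is sent per csv row, and the directory path below is illustrative rather than prescribed by the code:

    import os

    from streaming.spark_manager.spark_manager import SparkManager
    from streaming.producer.kafka_csv_data_producer import KafkaCSVDataProducer

    spark_session = SparkManager().get_spark_session_object()
    producer = KafkaCSVDataProducer(spark_session=spark_session)

    # send every row of every csv file in the directory to the configured kafka topic
    rows_sent = producer.send_csv_data_to_kafka_topic(os.path.join("data", "Prediction_Batch_files"))
    print(f"rows sent: {rows_sent}")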
--------------------------------------------------------------------------------
/streaming/spark_manager/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/spark_manager/__init__.py
--------------------------------------------------------------------------------
/streaming/spark_manager/spark_manager.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | from pyspark.sql import SparkSession
4 | from insurance_exception.insurance_exception import InsuranceException as SparkManagerException
5 |
6 |
7 | class SparkManager:
8 | spark_session = None
9 |
10 | def __init__(self,app_name="ineuron-machine-learning"):
11 | """
12 | Creator:
13 | **********************************************************************************************************
14 | created date: 02 November 2021
15 | Organization: iNeuron
16 | author: avnish@ineuron.ai
17 | **********************************************************************************************************
18 | Description:
19 | **********************************************************************************************************
 20 |         SparkManager is responsible for returning a shared spark_session object.
 21 |         Any modification required to the spark session should be made in the SparkManager class.
22 | """
23 | try:
24 | self.app_name=app_name
25 | except Exception as e:
26 | spark_manager_exception = SparkManagerException("Error occurred in module [{0}] class [{1}] method [{2}] ".
27 | format(self.__module__, SparkManager.__name__,
28 | self.__init__.__name__))
29 | raise Exception(spark_manager_exception.error_message_detail(str(e), sys)) from e
30 |
31 | def get_spark_session_object(self):
32 | """
33 | function will return spark session object
34 | """
35 | try:
36 | if SparkManager.spark_session is None:
37 | SparkManager.spark_session = SparkSession.builder.master("local").appName(self.app_name) \
38 | .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1")\
39 | .config("spark.ui.port", "4041").getOrCreate()
40 |
41 | return SparkManager.spark_session
42 | except Exception as e:
43 | spark_manager_exception = SparkManagerException("Error occurred in module [{0}] class [{1}] method [{2}] ".
44 | format(self.__module__, SparkManager.__name__,
45 | self.get_spark_session_object.__name__))
46 | raise Exception(spark_manager_exception.error_message_detail(str(e), sys)) from e
47 |
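Because the session is cached on the class attribute, every caller shares a single SparkSession; a quick sketch:

    from streaming.spark_manager.spark_manager import SparkManager

    first = SparkManager().get_spark_session_object()
    second = SparkManager(app_name="another-app").get_spark_session_object()

    # the second call reuses the cached session, so the app_name passed above has no effect
    print(first is second)   # True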
--------------------------------------------------------------------------------
/streaming/transformer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/transformer/__init__.py
--------------------------------------------------------------------------------
/streaming/transformer/spark_transformer.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | from insurance_exception.insurance_exception import InsuranceException as SparkTransformerException
4 | import os, sys
5 | from mongo_db.mongo_db_atlas import MongoDBOperation
6 |
7 |
8 | class SparkTransformer():
9 | def __init__(self, database_name, collection_name):
10 | try:
11 | self.database_name = database_name
12 | self.collection_name = collection_name
13 | self.mongo_db = MongoDBOperation()
14 | self.ml_transformer = []
15 |
16 |
17 | except Exception as e:
18 | spark_transformer_exception = SparkTransformerException("Error occurred in module [{0}] class [{1}] "
19 | "method [{2}] ".
20 | format(self.__module__, SparkTransformer.__name__,
21 | self.__init__.__name__))
22 | raise Exception(spark_transformer_exception.error_message_detail(str(e), sys)) from e
23 |
24 | def add_machine_learning_transformer(self, transformer: list):
25 | try:
26 | self.ml_transformer.extend(transformer)
27 | except Exception as e:
28 | spark_transformer_exception = SparkTransformerException("Error occurred in module [{0}] class [{1}] "
29 | "method [{2}] ".
30 | format(self.__module__, SparkTransformer.__name__,
 31 |                                                                    self.add_machine_learning_transformer.__name__))
32 | raise Exception(spark_transformer_exception.error_message_detail(str(e), sys)) from e
33 |
34 | def process_each_record(self, dataframe,epoch_id):
35 | try:
36 | dataframe = dataframe.toPandas()
37 | if dataframe.shape[0] > 0:
38 | dataframe['timestamp'] = pd.to_datetime(dataframe['timestamp'])
39 | self.mongo_db.insert_dataframe_into_collection(db_name=self.database_name,
40 | collection_name=self.collection_name,
41 | data_frame=dataframe)
42 | dataframe.to_csv("new_data.csv", index=None)
43 | except Exception as e:
44 | spark_transformer_exception = SparkTransformerException("Error occurred in module [{0}] class [{1}] "
45 | "method [{2}] ".
46 | format(self.__module__, SparkTransformer.__name__,
47 | self.process_each_record.__name__))
48 | raise Exception(spark_transformer_exception.error_message_detail(str(e), sys)) from e
49 |
--------------------------------------------------------------------------------
/training/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/training/__init__.py
--------------------------------------------------------------------------------
/training/stage_00_data_loader.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import shutil
4 | from utility import read_params
5 | import argparse
6 | from utility import get_logger_object_of_training
7 | from utility import clean_data_source_dir
8 | from insurance_exception.insurance_exception import InsuranceException as GenericException
9 |
10 | log_collection_name = "data_loader"
11 |
12 |
13 | def loader_main(config_path: str, datasource: str, is_logging_enable=True, execution_id=None, executed_by=None) -> None:
14 | try:
15 | logger = get_logger_object_of_training(config_path=config_path, collection_name=log_collection_name,
16 | execution_id=execution_id,executed_by=executed_by)
17 | logger.is_log_enable = is_logging_enable
18 | logger.log("Starting data loading operation.\nReading configuration file.")
19 |
20 | config = read_params(config_path)
21 |
22 | downloader_path = config['data_download']['cloud_training_directory_path']
23 | download_path = config['data_source']['Training_Batch_Files']
24 |
25 | logger.log("Configuration detail has been fetched from configuration file.")
26 | # removing existing training and additional training files from local
27 | logger.log(f"Cleaning local directory [{download_path}] for training.")
28 | clean_data_source_dir(download_path, logger=logger,
29 | is_logging_enable=is_logging_enable) # removing existing file from local system
30 |
31 | logger.log(f"Cleaning completed. Directory has been cleared now [{download_path}]")
 32 |         # downloading training and additional training files from cloud into local system
33 | logger.log("Data will be downloaded from cloud storage into local system")
34 |
35 |
36 | for file in os.listdir(downloader_path):
37 | if '.dvc' in file or '.gitignore' in file:
38 | continue
39 | print(f"Source dir: {downloader_path} file: {file} is being copied into destination dir: {download_path}"
40 | f" file: {file}")
41 | shutil.copy(os.path.join(downloader_path,file),os.path.join(download_path,file))
42 |
43 | logger.log("Data has been downloaded from cloud storage into local system")
44 |
45 | except Exception as e:
46 | generic_exception = GenericException(
47 | "Error occurred in module [{0}] method [{1}]"
48 | .format(loader_main.__module__,
49 | loader_main.__name__))
50 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
51 |
52 | if __name__ == '__main__':
53 | args = argparse.ArgumentParser()
54 | args.add_argument("--config", default=os.path.join("config", "params.yaml"))
55 | args.add_argument("--datasource", default=None)
56 | parsed_args = args.parse_args()
57 | print("started")
58 | loader_main(config_path=parsed_args.config, datasource=parsed_args.datasource)
59 |
--------------------------------------------------------------------------------
/training/stage_01_data_validator.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import shutil
4 | import sys
5 |
6 | import pandas as pd
7 | from utility import read_params, create_directory_path, values_from_schema_function, get_logger_object_of_training, \
8 | get_date, get_time
9 | from insurance_exception.insurance_exception import InsuranceException as GenericException
10 | import argparse
11 | import datetime
12 | import shutil
13 |
14 | log_collection_name = "data_validator"
15 |
16 |
17 | class DataValidator:
18 | def __init__(self, config, logger, is_logging_enable=True):
19 | try:
20 | self.logger = logger
21 | self.logger.is_log_enable = is_logging_enable
22 | self.config = config
23 | self.file_path = self.config['data_source']['Training_Batch_Files']
24 | self.good_file_path = self.config['artifacts']['training_data']['good_file_path']
25 | self.bad_file_path = self.config['artifacts']['training_data']['bad_file_path']
26 | self.archive_bad_file_path = self.config['artifacts']['training_data']['archive_bad_file_path']
27 | self.training_schema_file = self.config['config']['schema_training']
28 | except Exception as e:
29 | generic_exception = GenericException(
30 | "Error occurred in module [{0}] class [{1}] method [{2}]"
31 | .format(self.__module__, DataValidator.__name__,
32 | self.__init__.__name__))
33 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
34 |
35 | def archive_bad_files(self):
36 | try:
37 | folder_name = f"bad_files_{get_date().replace('-', '_')}_{get_time().replace(':', '_')}"
38 | archive_directory_path = os.path.join(self.archive_bad_file_path, folder_name)
39 | create_directory_path(archive_directory_path)
40 | for file in os.listdir(self.bad_file_path):
41 | source_file_path = os.path.join(self.bad_file_path, file)
42 | shutil.move(source_file_path, archive_directory_path)
43 | except Exception as e:
44 | generic_exception = GenericException(
45 | "Error occurred in module [{0}] class [{1}] method [{2}]"
46 | .format(self.__module__, DataValidator.__name__,
47 | self.archive_bad_files.__name__))
48 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
49 |
50 | def create_good_bad_archive_bad_file_path(self):
51 | try:
52 | create_directory_path(self.good_file_path)
53 | create_directory_path(self.bad_file_path)
54 | create_directory_path(self.archive_bad_file_path, is_recreate=False)
55 | except Exception as e:
56 | generic_exception = GenericException(
57 | "Error occurred in module [{0}] class [{1}] method [{2}]"
58 | .format(self.__module__, DataValidator.__name__,
59 | self.create_good_bad_archive_bad_file_path.__name__))
60 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
61 |
62 | def value_from_schema(self):
63 | """
64 |
 65 |         :return: tuple (pattern, length_of_date_stamp_in_file, length_of_time_stamp_in_file, column_names, number_of_columns)
66 | """
67 | try:
68 | return values_from_schema_function(self.training_schema_file)
69 | except Exception as e:
70 | generic_exception = GenericException(
71 | "Error occurred in module [{0}] class [{1}] method [{2}]"
72 | .format(self.__module__, DataValidator.__name__,
73 | self.value_from_schema.__name__))
74 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
75 |
76 | def file_name_regular_expression(self):
77 | """
78 |
79 | :return: regular expression syntax which can be used for validation of file name
80 | """
81 | return "['HealthPrem']+['\_'']+[\d_]+[\d]+\.csv"
82 |
83 | def validate_file_name(self):
84 | try:
85 | self.create_good_bad_archive_bad_file_path()
86 | file_name_reg_pattern = self.file_name_regular_expression()
87 | pattern, length_of_date_stamp_in_file, length_of_time_stamp_in_file, column_names, number_of_columns = self.value_from_schema()
88 | self.logger.log(f"Validating file names.")
89 | files = os.listdir(self.file_path)
90 | for file in files:
91 | file_path = os.path.join(self.file_path, file)
92 | split_at_dot = re.split('.csv', file)
93 | split_at_dot = (re.split('_', split_at_dot[0]))
94 | if re.match(file_name_reg_pattern, file) and len(split_at_dot[1]) == length_of_date_stamp_in_file \
95 | and len(split_at_dot[2]) == length_of_time_stamp_in_file:
96 | destination_file_path = os.path.join(self.good_file_path, file)
97 | self.logger.log(
98 | f"file name : {file} matched hence moving file to good file path {destination_file_path}")
99 | shutil.move(file_path, destination_file_path)
100 | else:
101 | destination_file_path = os.path.join(self.bad_file_path, file)
102 | self.logger.log(
103 | f"file name: {file} does not matched hence moving file to bad file path {destination_file_path}")
104 | shutil.move(file_path, destination_file_path)
105 | except Exception as e:
106 | generic_exception = GenericException(
107 | "Error occurred in module [{0}] class [{1}] method [{2}]"
108 | .format(self.__module__, DataValidator.__name__,
109 | self.validate_file_name.__name__))
110 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
111 |
112 | def validate_missing_values_in_whole_column(self):
113 | try:
114 | self.logger.log("Missing Values Validation Started!!")
115 | for file in os.listdir(self.good_file_path):
116 | csv = pd.read_csv(os.path.join(self.good_file_path, file))
117 | count = 0
118 | for columns in csv:
119 | if (len(csv[columns]) - csv[columns].count()) == len(csv[columns]):
120 | count += 1
121 | shutil.move(os.path.join(self.good_file_path, file),
122 | self.bad_file_path)
123 | self.logger.log(
124 |                             "Column with all values missing found!! File moved to Bad Raw Folder :: %s" % file)
125 | break
126 | if count == 0:
127 | print(csv.columns)
128 | csv.rename(columns={"Unnamed: 0": "Premium "}, inplace=True)
129 | csv.to_csv(os.path.join(self.good_file_path, file), index=None, header=True)
130 | except Exception as e:
131 | generic_exception = GenericException(
132 | "Error occurred in module [{0}] class [{1}] method [{2}]"
133 | .format(self.__module__, DataValidator.__name__,
134 | self.validate_missing_values_in_whole_column.__name__))
135 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
136 |
137 | def validate_no_of_column(self, no_of_column):
138 | """
139 | Description:
140 | If number of column matches then file will be move to good file path else bad file path
141 | =====================================================================================
142 | :param no_of_column: int Number of column must present in each file
143 | :return: Nothing
144 | """
145 | try:
146 | self.logger.log(f"Validating number of column in input file")
147 | files = os.listdir(self.good_file_path)
148 | for file in files:
149 | file_path = os.path.join(self.good_file_path, file)
150 | df = pd.read_csv(file_path)
151 | if df.shape[1] != no_of_column:
152 | destination_file_path = os.path.join(self.bad_file_path, file)
153 | self.logger.log(
154 |                             f"file: {file} has an incorrect number of columns, hence moving it to bad file path {destination_file_path}")
155 | shutil.move(file_path, destination_file_path)
156 | except Exception as e:
157 | generic_exception = GenericException(
158 | "Error occurred in module [{0}] class [{1}] method [{2}]"
159 | .format(self.__module__, DataValidator.__name__,
160 | self.validate_no_of_column.__name__))
161 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
162 |
163 |
164 | def validation_main(config_path: str, datasource: str, is_logging_enable=True, execution_id=None,
165 | executed_by=None) -> None:
166 | try:
167 | logger = get_logger_object_of_training(config_path=config_path, collection_name=log_collection_name,
168 | execution_id=execution_id, executed_by=executed_by)
169 | logger.is_log_enable = is_logging_enable
170 | config = read_params(config_path)
171 | logger.log("data validation started")
172 | data_validator = DataValidator(config=config, logger=logger, is_logging_enable=is_logging_enable)
173 | pattern, length_of_date_stamp_in_file, length_of_time_stamp_in_file, column_names, number_of_columns =\
174 | data_validator.value_from_schema()
175 | data_validator.validate_file_name()
176 | data_validator.validate_no_of_column(no_of_column=number_of_columns)
177 | data_validator.validate_missing_values_in_whole_column()
178 | data_validator.archive_bad_files()
179 | except Exception as e:
180 | generic_exception = GenericException(
181 | "Error occurred in module [{0}] method [{1}]"
182 | .format(validation_main.__module__,
183 | validation_main.__name__))
184 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
185 |
186 |
187 | if __name__ == '__main__':
188 | args = argparse.ArgumentParser()
189 | args.add_argument("--config", default=os.path.join("config", "params.yaml"))
190 | args.add_argument("--datasource", default=None)
191 | parsed_args = args.parse_args()
192 | print("started")
193 | validation_main(config_path=parsed_args.config, datasource=parsed_args.datasource)
194 |
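A quick, self-contained check of the file-name rule: the regular expression together with the date/time stamp lengths decides whether a batch file lands in the good or bad path. The stamp lengths below are assumed values; in the real pipeline they come from the training schema file.

    import re

    file_name_reg_pattern = "['HealthPrem']+['\_'']+[\d_]+[\d]+\.csv"   # same pattern as file_name_regular_expression()
    length_of_date_stamp_in_file, length_of_time_stamp_in_file = 8, 6   # assumed schema values

    for file in ["HealthPrem_01012021_120000.csv", "Premium_2020_01.csv"]:
        split_at_dot = re.split('_', re.split('.csv', file)[0])
        is_good = (re.match(file_name_reg_pattern, file) is not None
                   and len(split_at_dot[1]) == length_of_date_stamp_in_file
                   and len(split_at_dot[2]) == length_of_time_stamp_in_file)
        print(file, "->", "good" if is_good else "bad")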
--------------------------------------------------------------------------------
/training/stage_02_data_transformer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | import pandas as pd
5 | import argparse
6 | from utility import read_params, get_logger_object_of_training
7 | from mongo_db.mongo_db_atlas import MongoDBOperation
8 | from insurance_exception.insurance_exception import InsuranceException as GenericException
9 |
10 | log_collection_name = "data_transformer"
11 |
12 |
13 | class DataTransformer:
14 | def __init__(self, config, logger, is_log_enable=True):
15 | try:
16 | self.config = config
17 | self.logger = logger
18 | self.logger.is_log_enable = is_log_enable
19 | self.good_file_path = self.config["artifacts"]['training_data']['good_file_path']
20 | self.unwanted_column_names = self.config["dataset"]['unwanted_column']
21 | self.mongo_db = MongoDBOperation()
22 | self.dataset_database = self.config["dataset"]["database_detail"]["training_database_name"]
23 | self.dataset_collection_name = self.config["dataset"]["database_detail"]["dataset_training_collection_name"]
24 | self.mongo_db.drop_collection(self.dataset_database, self.dataset_collection_name)
25 | except Exception as e:
26 | generic_exception = GenericException(
27 | "Error occurred in module [{0}] class [{1}] method [{2}]"
28 | .format(self.__module__, DataTransformer.__name__,
29 | self.__init__.__name__))
30 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
31 |
32 | def unite_dataset(self):
33 | try:
34 | dataset_list = []
35 | for file in os.listdir(self.good_file_path):
36 | dataset_list.append(pd.read_csv(os.path.join(self.good_file_path, file)))
37 | df = pd.concat(dataset_list)
38 | df = self.remove_unwanted_column(df)
39 | self.logger.log(f"Inserting dataset into database {self.dataset_database} "
40 | f"collection_name: {self.dataset_collection_name}")
41 | self.mongo_db.insert_dataframe_into_collection(self.dataset_database, self.dataset_collection_name, df)
42 | except Exception as e:
43 | generic_exception = GenericException(
44 | "Error occurred in module [{0}] class [{1}] method [{2}]"
45 | .format(self.__module__, DataTransformer.__name__,
46 | self.unite_dataset.__name__))
47 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
48 |
49 | def remove_unwanted_column(self, df):
50 | try:
51 | print(self.unwanted_column_names)
52 | column_to_remove = list(filter(lambda x: x in df.columns, self.unwanted_column_names))
53 | if len(column_to_remove) > 0:
54 | return df.drop(column_to_remove, axis=1)
55 | return df
56 | except Exception as e:
57 | generic_exception = GenericException(
58 | "Error occurred in module [{0}] class [{1}] method [{2}]"
59 | .format(self.__module__, DataTransformer.__name__,
60 | self.remove_unwanted_column.__name__))
61 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
62 |
63 |
64 | def transform_main(config_path: str, datasource: str, is_logging_enable=True, execution_id=None,
65 | executed_by=None) -> None:
66 | try:
67 | logger = get_logger_object_of_training(config_path=config_path, collection_name=log_collection_name,
68 | execution_id=execution_id, executed_by=executed_by)
69 | logger.is_log_enable = is_logging_enable
70 | config = read_params(config_path)
71 | data_transformer = DataTransformer(config=config, logger=logger, is_log_enable=is_logging_enable)
72 | logger.log('Start of Data Preprocessing before DB')
73 | data_transformer.unite_dataset()
74 | logger.log('Data Preprocessing before DB Completed !!')
75 |
76 | except Exception as e:
77 | generic_exception = GenericException(
78 | "Error occurred in module [{0}] method [{1}]"
79 | .format(transform_main.__module__,
80 | transform_main.__name__))
81 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
82 |
83 |
84 | if __name__ == '__main__':
85 | args = argparse.ArgumentParser()
86 | args.add_argument("--config", default=os.path.join("config", "params.yaml"))
87 | args.add_argument("--datasource", default=None)
88 | parsed_args = args.parse_args()
89 | print("started")
90 | transform_main(config_path=parsed_args.config, datasource=parsed_args.datasource)
91 |
--------------------------------------------------------------------------------
/training/stage_03_data_exporter.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | from utility import read_params, create_directory_path
5 | from mongo_db.mongo_db_atlas import MongoDBOperation
6 | import argparse
7 | from utility import get_logger_object_of_training
8 |
9 | from insurance_exception.insurance_exception import InsuranceException as GenericException
10 |
11 |
12 | log_collection_name = "data_export"
13 |
14 |
15 | class DataExporter:
16 | def __init__(self, config, logger, is_log_enable):
17 | try:
18 | self.config = config
19 | self.logger = logger
20 | self.is_log_enable = is_log_enable
21 | self.mongo_db = MongoDBOperation()
22 | self.dataset_database = self.config["dataset"]["database_detail"]["training_database_name"]
23 | self.dataset_collection_name = self.config["dataset"]["database_detail"]["dataset_training_collection_name"]
24 | self.training_file_from_db = self.config["artifacts"]['training_data']['training_file_from_db']
25 | self.master_csv = self.config["artifacts"]['training_data']['master_csv']
26 | except Exception as e:
27 | generic_exception = GenericException(
28 | "Error occurred in module [{0}] class [{1}] method [{2}]"
29 | .format(self.__module__, DataExporter.__name__,
30 | self.__init__.__name__))
31 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
32 |
33 | def export_dataframe_from_database(self):
34 | try:
35 | create_directory_path(self.training_file_from_db)
36 | self.logger.log(f"Creating dataframe from data stored in db "
37 | f"[{self.dataset_database}] and collection [{self.dataset_collection_name}]")
38 | df = self.mongo_db.get_dataframe_of_collection(db_name=self.dataset_database,
39 | collection_name=self.dataset_collection_name)
40 | master_csv_file_path = os.path.join(self.training_file_from_db, self.master_csv)
41 | self.logger.log(f"master csv file will be generated at "
42 | f"{master_csv_file_path}.")
43 | df.to_csv(master_csv_file_path, index=False, header=True)
44 |
45 | except Exception as e:
46 | generic_exception = GenericException(
47 | "Error occurred in module [{0}] class [{1}] method [{2}]"
48 | .format(self.__module__, DataExporter.__name__,
49 | self.export_dataframe_from_database.__name__))
50 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
51 |
52 |
53 | def export_main(config_path: str, is_logging_enable=True, execution_id=None, executed_by=None) -> None:
54 | try:
55 | logger = get_logger_object_of_training(config_path=config_path, collection_name=log_collection_name,
56 | execution_id=execution_id, executed_by=executed_by)
57 |
58 | logger.is_log_enable = is_logging_enable
59 | config = read_params(config_path)
60 | data_exporter = DataExporter(config=config, logger=logger, is_log_enable=is_logging_enable)
61 | logger.log("Generating csv file from dataset stored in database.")
62 | data_exporter.export_dataframe_from_database()
63 | logger.log("Dataset has been successfully exported to the output directory; exiting export pipeline.")
64 | except Exception as e:
65 | generic_exception = GenericException(
66 | "Error occurred in module [{0}] method [{1}]"
67 | .format(export_main.__module__,
68 | export_main.__name__))
69 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
70 |
71 |
72 | if __name__ == '__main__':
73 | args = argparse.ArgumentParser()
74 | args.add_argument("--config", default=os.path.join("config", "params.yaml"))
75 | parsed_args = args.parse_args()
76 | print("started")
77 | export_main(config_path=parsed_args.config)
78 |
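For orientation, a rough, hedged sketch of what export_dataframe_from_database boils down to if the project's MongoDBOperation helper were swapped for raw pymongo and pandas; the connection string, database name, collection name, and output file are placeholders for the values the class reads from params.yaml.

import pandas as pd
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")                          # placeholder connection string
cursor = client["training_db"]["insurance_dataset"].find({}, {"_id": 0})   # exclude Mongo's internal _id
df = pd.DataFrame(list(cursor))
df.to_csv("master.csv", index=False, header=True)                          # real path is built from the config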
--------------------------------------------------------------------------------
/training/stage_04_model_trainer.py:
--------------------------------------------------------------------------------
1 |
2 | import random
3 | import sys
4 |
5 | import os
6 | import argparse
7 |
8 | from pyspark.sql.types import IntegerType, StringType, FloatType
9 | from sklearn.metrics import r2_score, mean_squared_error
10 |
11 | from utility import create_directory_path, read_params
12 | import numpy as np
13 | from utility import get_logger_object_of_training
14 | from pyspark.ml import Pipeline
15 | from pyspark.ml.regression import RandomForestRegressor
16 | from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
17 |
18 | from insurance_exception.insurance_exception import InsuranceException as GenericException
19 |
20 | from streaming.spark_manager.spark_manager import SparkManager
21 |
22 | log_collection_name = "training_model"
23 |
24 |
25 | class DataPreProcessing:
26 | def __init__(self, logger, is_log_enable=True, data_frame=None, pipeline_path=None):
27 | try:
28 | self.logger = logger
29 | self.logger.is_log_enable = is_log_enable
30 | self.data_frame = data_frame
31 | self.stages = []
32 | self.pipeline_path = pipeline_path
33 | except Exception as e:
34 | generic_exception = GenericException(
35 | "Error occurred in module [{0}] class [{1}] method [{2}]"
36 | .format(self.__module__, DataPreProcessing.__name__,
37 | self.__init__.__name__))
38 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
39 |
40 | def set_dataframe(self, dataframe):
41 | try:
42 | self.data_frame = dataframe
43 | except Exception as e:
44 | generic_exception = GenericException(
45 | "Error occurred in module [{0}] class [{1}] method [{2}]"
46 | .format(self.__module__, DataPreProcessing.__name__,
47 | self.set_dataframe.__name__))
48 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
49 |
50 | def update_dataframe_scheme(self, schema_definition: dict):
51 | """
52 | Cast each column named in schema_definition (column name -> pyspark DataType) on self.data_frame to the requested Spark type.
53 | """
54 | try:
55 | if self.data_frame is None:
56 | raise Exception("set the data_frame attribute before updating the schema")
57 | self.data_frame.printSchema()
58 | for column, datatype in schema_definition.items():
59 | self.logger.log(f"Update datatype of column: {column} to {str(datatype)}")
60 | self.data_frame = self.data_frame.withColumn(column, self.data_frame[column].cast(datatype))
61 | except Exception as e:
62 | generic_exception = GenericException(
63 | "Error occurred in module [{0}] class [{1}] method [{2}]"
64 | .format(self.__module__, DataPreProcessing.__name__,
65 | self.update_dataframe_scheme.__name__))
66 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
67 |
68 | def encode_categorical_column(self, input_columns: list):
69 | try:
70 | string_indexer = StringIndexer(inputCols=input_columns,
71 | outputCols=[f"{column}_encoder" for column in input_columns])
72 | self.stages.append(string_indexer)
73 | one_hot_encoder = OneHotEncoder(inputCols=string_indexer.getOutputCols(),
74 | outputCols=[f"{column}_encoded" for column in input_columns])
75 | self.stages.append(one_hot_encoder)
76 |
77 | except Exception as e:
78 | generic_exception = GenericException(
79 | "Error occurred in module [{0}] class [{1}] method [{2}]"
80 | .format(self.__module__, DataPreProcessing.__name__,
81 | self.encode_categorical_column.__name__))
82 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
83 |
84 | def create_input_features(self, required_column: list):
85 | """
86 | Append a VectorAssembler stage that combines required_column into a single "input_features" vector column.
87 | """
88 | try:
89 | vector_assembler = VectorAssembler(inputCols=required_column, outputCol="input_features")
90 | self.stages.append(vector_assembler)
91 | except Exception as e:
92 | generic_exception = GenericException(
93 | "Error occurred in module [{0}] class [{1}] method [{2}]"
94 | .format(self.__module__, DataPreProcessing.__name__,
95 | self.create_input_features.__name__))
96 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
97 |
98 | def get_train_test_dataframe(self, test_size=0.2):
99 | try:
100 | train_df, test_df = self.data_frame.randomSplit([1 - test_size, test_size], seed=random.randint(0, 1000))
101 | self.logger.log(f"Training dataset count {train_df.count()}")
102 | self.logger.log(f"Test dataset count {test_df.count()}")
103 | return train_df, test_df
104 | except Exception as e:
105 | generic_exception = GenericException(
106 | "Error occurred in module [{0}] class [{1}] method [{2}]"
107 | .format(self.__module__, DataPreProcessing.__name__,
108 | self.get_train_test_dataframe.__name__))
109 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
110 |
111 | def get_prepared_dataset(self):
112 | try:
113 | schema_definition = {"age": IntegerType(),
114 | "sex": StringType(),
115 | "bmi": FloatType(),
116 | "children": IntegerType(),
117 | "smoker": StringType(),
118 | "expenses": FloatType()
119 | }
120 | self.update_dataframe_scheme(schema_definition=schema_definition)
121 | self.encode_categorical_column(input_columns=["sex", "smoker"])
122 | required_column = ['age', 'bmi', 'children', 'sex_encoded', 'smoker_encoded']
123 | self.create_input_features(required_column=required_column)
124 | pipeline = Pipeline(stages=self.stages)
125 | pipeline_fitted_obj = pipeline.fit(self.data_frame)
126 | self.data_frame = pipeline_fitted_obj.transform(self.data_frame)
127 | # os.remove(path=self.pipeline_path)
128 | create_directory_path(self.pipeline_path, is_recreate=True)
129 | pipeline_fitted_obj.write().overwrite().save(self.pipeline_path)
130 | return self.get_train_test_dataframe(test_size=0.2)
131 | except Exception as e:
132 | generic_exception = GenericException(
133 | "Error occurred in module [{0}] class [{1}] method [{2}]"
134 | .format(self.__module__, DataPreProcessing.__name__,
135 | self.get_prepared_dataset.__name__))
136 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
137 |
138 |
139 | class ModelTrainer:
140 |
141 | def __init__(self, config, logger, is_log_enable):
142 | try:
143 | self.logger = logger
144 | self.logger.is_log_enable = is_log_enable
145 | self.config = config
146 | self.training_file_path = self.config['artifacts']['training_data']['training_file_from_db']
147 | self.master_csv = self.config['artifacts']['training_data']['master_csv']
148 | self.target_columns = self.config['target_columns']['columns']
149 | self.test_size = self.config['base']['test_size']
150 | self.random_state = self.config['base']['random_state']
151 | self.plot = self.config['artifacts']['training_data']['plots']
152 | self.pipeline_path = self.config['artifacts']['training_data']['pipeline_path']
153 | self.model_path = config['artifacts']['model']['model_path']
154 | self.null_value_file_path = config['artifacts']['training_data']['null_value_info_file_path']
155 | """
156 | self.spark = SparkSession.builder.\
157 | master("local[*]").\
158 | appName("insurance-premium-reg").getOrCreate()
159 | """
160 | self.spark = SparkManager().get_spark_session_object()
161 | """
162 | self.spark=SparkSession.builder.appName('app_name') \
163 | .master('local[*]') \
164 | .config('spark.sql.execution.arrow.pyspark.enabled', True) \
165 | .config('spark.sql.session.timeZone', 'UTC') \
166 | .config('spark.driver.memory', '32G') \
167 | .config('spark.ui.showConsoleProgress', True) \
168 | .config('spark.sql.repl.eagerEval.enabled', True) \
169 | .getOrCreate()
170 | """
171 | except Exception as e:
172 | generic_exception = GenericException(
173 | "Error occurred in module [{0}] class [{1}] method [{2}]"
174 | .format(self.__module__, ModelTrainer.__name__,
175 | self.__init__.__name__))
176 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
177 |
178 |
179 | def save_regression_metric_data(self, y_true, y_pred, title):
180 | try:
181 | y_true = np.array(y_true).reshape(-1)
182 | y_pred = np.array(y_pred).reshape(-1)
183 | rmse = np.sqrt(mean_squared_error(y_true, y_pred))
184 | r_squared_score = r2_score(y_true, y_pred)
185 | msg = f"{title} R squared score: {r_squared_score:.3%}"
186 | self.logger.log(msg)
187 | print(msg)
188 | msg = f"{title} Root mean squared error: {rmse:.3}"
189 | self.logger.log(msg)
190 | print(msg)
191 | except Exception as e:
192 | generic_exception = GenericException(
193 | "Error occurred in module [{0}] class [{1}] method [{2}]"
194 | .format(self.__module__, ModelTrainer.__name__,
195 | self.save_regression_metric_data.__name__))
196 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
197 |
198 | def get_dataframe(self):
199 | try:
200 | master_file_path = os.path.join(self.training_file_path, self.master_csv)
201 |
202 | return self.spark.read.csv(master_file_path, header=True, inferSchema=True)
203 | except Exception as e:
204 | generic_exception = GenericException(
205 | "Error occurred in module [{0}] class [{1}] method [{2}]"
206 | .format(self.__module__, ModelTrainer.__name__,
207 | self.get_dataframe.__name__))
208 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
209 |
210 | def data_preparation(self):
211 | try:
212 | data_frame = self.get_dataframe()
213 | preprocessing = DataPreProcessing(logger=self.logger,
214 | is_log_enable=self.logger.is_log_enable,
215 | data_frame=data_frame,
216 | pipeline_path=self.pipeline_path)
217 | return preprocessing.get_prepared_dataset()
218 | except Exception as e:
219 | generic_exception = GenericException(
220 | "Error occurred in module [{0}] class [{1}] method [{2}]"
221 | .format(self.__module__, ModelTrainer.__name__,
222 | self.data_preparation.__name__))
223 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
224 |
225 | def begin_training(self):
226 | try:
227 | train_df, test_df = self.data_preparation()
228 | random_forest_regressor = RandomForestRegressor(featuresCol="input_features", labelCol="expenses")
229 | random_forest_model = random_forest_regressor.fit(train_df)
230 | train_prediction = random_forest_model.transform(train_df)
231 | testing_prediction = random_forest_model.transform(test_df)
232 | training_data = train_prediction.select("expenses", "prediction").toPandas()
233 | testing_data = testing_prediction.select("expenses", "prediction").toPandas()
234 | self.save_regression_metric_data(training_data['expenses'], training_data['prediction'],
235 | title="Training score")
236 | self.save_regression_metric_data(testing_data['expenses'], testing_data['prediction'],
237 | title="Testing score")
238 |
239 | self.save_model(model=random_forest_model, model_name="random_forest_regressor")
240 | self.spark.stop()
241 | except Exception as e:
242 | generic_exception = GenericException(
243 | "Error occurred in module [{0}] class [{1}] method [{2}]"
244 | .format(self.__module__, ModelTrainer.__name__,
245 | self.begin_training.__name__))
246 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
247 |
248 | def save_model(self, model, model_name, intermediate_path=None):
249 | try:
250 |
251 | if intermediate_path is None:
252 | model_path = self.model_path
253 | else:
254 | model_path = os.path.join(self.model_path, intermediate_path)
255 | create_directory_path(model_path)
256 | model_full_path = os.path.join(model_path, f"{model_name}")
257 | self.logger.log(f"Saving model: {model_name} at path {model_full_path}")
258 | # os.remove(path=model_full_path)
259 | model.write().overwrite().save(model_full_path)
260 |
261 | except Exception as e:
262 | generic_exception = GenericException(
263 | "Error occurred in module [{0}] class [{1}] method [{2}]"
264 | .format(self.__module__, ModelTrainer.__name__,
265 | self.save_model.__name__))
266 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
267 |
268 |
269 | def train_main(config_path: str, datasource: str, is_logging_enable=True, execution_id=None, executed_by=None) -> None:
270 | try:
271 | logger = get_logger_object_of_training(config_path=config_path, collection_name=log_collection_name,
272 | execution_id=execution_id, executed_by=executed_by)
273 |
274 | logger.is_log_enable = is_logging_enable
275 | logger.log("Training begins.")
276 | config = read_params(config_path)
277 | model_trainer = ModelTrainer(config=config, logger=logger, is_log_enable=is_logging_enable)
278 | model_trainer.begin_training()
279 | logger.log("Training completed successfully.")
280 |
281 | except Exception as e:
282 | generic_exception = GenericException(
283 | "Error occurred in module [{0}] method [{1}]"
284 | .format(train_main.__module__,
285 | train_main.__name__))
286 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e
287 |
288 |
289 | if __name__ == '__main__':
290 | args = argparse.ArgumentParser()
291 | args.add_argument("--config", default=os.path.join("config", "params.yaml"))
292 | args.add_argument("--datasource", default=None)
293 | parsed_args = args.parse_args()
294 | print(parsed_args.config)
295 | print(parsed_args.datasource)
296 | train_main(config_path=parsed_args.config, datasource=parsed_args.datasource)
297 |
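A hedged inference sketch showing how the artifacts written by get_prepared_dataset (the fitted preprocessing PipelineModel) and save_model (the random forest) could be loaded back for scoring; the artifact paths and the input CSV are placeholders for the pipeline_path / model_path values configured in params.yaml.

from pyspark.ml import PipelineModel
from pyspark.ml.regression import RandomForestRegressionModel
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("premium-scoring").getOrCreate()

pipeline_model = PipelineModel.load("artifacts/pipeline/pipeline_model")                 # placeholder path
regressor = RandomForestRegressionModel.load("artifacts/model/random_forest_regressor")  # placeholder path

raw_df = spark.read.csv("new_data.csv", header=True, inferSchema=True)                   # placeholder input
features_df = pipeline_model.transform(raw_df)    # adds the "input_features" vector column
predictions = regressor.transform(features_df)    # adds a "prediction" column
predictions.select("prediction").show()
spark.stop()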
--------------------------------------------------------------------------------
/utility.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import json
3 | from datetime import datetime
4 |
5 | import yaml
6 | import uuid
7 | import os
8 | import shutil
9 | from logger.logger import AppLogger
10 |
11 | def get_time():
12 | """
13 |
14 | :return: current time as an HH:MM:SS string
15 | """
16 | return datetime.now().strftime("%H:%M:%S")
17 |
18 | def get_date():
19 | """
20 |
21 | :return: current date as a YYYY-MM-DD string
22 | """
23 | return str(datetime.now().date())
24 |
25 |
26 |
27 | def create_directory_path(path, is_recreate=True):
28 | """
29 | :param path: directory path to create
30 | :param is_recreate: if True (default), an existing directory at path is removed and
31 | recreated; pass False to leave an existing directory untouched
32 | :return: True on success
33 | """
34 | try:
35 | if is_recreate:
36 | if os.path.exists(path):
37 | shutil.rmtree(path, ignore_errors=False) # remove existing directory if is_recreate is true
38 | os.makedirs(path, exist_ok=True) # if directory is present it will not alter anything
39 | return True
40 | except Exception as e:
41 | raise e
42 |
43 |
44 | def clean_data_source_dir(path, logger=None, is_logging_enable=True):
45 | try:
46 | if not os.path.exists(path):
47 | os.mkdir(path)
48 | for file in os.listdir(path):
49 | if '.gitignore' in file:
50 | continue  # keep .gitignore so the directory itself stays tracked
51 | logger.log(f"{os.path.join(path, file)} file will be deleted.")
52 | os.remove(os.path.join(path, file))
53 | logger.log(f"{os.path.join(path, file)} file has been deleted.")
54 | except Exception as e:
55 | raise e
56 |
57 |
58 |
59 | def get_logger_object_of_training(config_path: str, collection_name, execution_id=None, executed_by=None) -> AppLogger:
60 | config = read_params(config_path)
61 | database_name = config['log_database']['training_database_name']
62 | if execution_id is None:
63 | execution_id = str(uuid.uuid4())
64 | if executed_by is None:
65 | executed_by = "Avnish Yadav"
66 | logger = AppLogger(project_id=5, log_database=database_name, log_collection_name=collection_name,
67 | execution_id=execution_id, executed_by=executed_by)
68 | return logger
69 |
70 |
71 | def get_logger_object_of_prediction(config_path: str, collection_name, execution_id=None,
72 | executed_by=None) -> AppLogger:
73 | config = read_params(config_path)
74 | database_name = config['log_database']['prediction_database_name']
75 | if execution_id is None:
76 | execution_id = str(uuid.uuid4())
77 | if executed_by is None:
78 | executed_by = "Avnish Yadav"
79 | logger = AppLogger(project_id=5, log_database=database_name, log_collection_name=collection_name,
80 | execution_id=execution_id, executed_by=executed_by)
81 | return logger
82 |
83 |
84 | def read_params(config_path: str) -> dict:
85 | with open(config_path) as yaml_file:
86 | config = yaml.safe_load(yaml_file)
87 | return config
88 |
89 |
90 | def values_from_schema_function(schema_path):
91 | try:
92 | with open(schema_path, 'r') as r:
93 | dic = json.load(r)
94 | # file handle closes automatically when the with-block exits
95 |
96 | pattern = dic['SampleFileName']
97 | length_of_date_stamp_in_file = dic['LengthOfDateStampInFile']
98 | length_of_time_stamp_in_file = dic['LengthOfTimeStampInFile']
99 | column_names = dic['ColName']
100 | number_of_columns = dic['NumberofColumns']
101 | return pattern, length_of_date_stamp_in_file, length_of_time_stamp_in_file, column_names, number_of_columns
102 | except ValueError:
103 | raise
104 |
105 | except KeyError:
106 | raise
107 |
108 | except Exception as e:
109 | raise e
110 |
111 |
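A short usage sketch of the helpers defined above; the config and schema paths shown are assumptions (they match the defaults used by the training stages), and the scratch directory is a hypothetical example.

import os

from utility import create_directory_path, read_params, values_from_schema_function

config = read_params(os.path.join("config", "params.yaml"))
(pattern, date_stamp_len, time_stamp_len,
 column_names, number_of_columns) = values_from_schema_function(os.path.join("config", "schema_training.json"))
print(pattern, number_of_columns)

# Existing contents are removed because is_recreate defaults to True.
create_directory_path(os.path.join("artifacts", "scratch"))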
--------------------------------------------------------------------------------