├── .gitignore ├── .idea ├── .gitignore ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── pyspark machine learning.iml └── vcs.xml ├── README.md ├── __pycache__ └── utility.cpython-37.pyc ├── artifacts ├── Prediction_Output_DIR │ └── Predictions.csv ├── model │ └── random_forest_regressor │ │ ├── data │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000-747a037b-1756-420a-87dd-e6e880fd80e5-c000.snappy.parquet.crc │ │ ├── _SUCCESS │ │ └── part-00000-747a037b-1756-420a-87dd-e6e880fd80e5-c000.snappy.parquet │ │ ├── metadata │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000.crc │ │ ├── _SUCCESS │ │ └── part-00000 │ │ └── treesMetadata │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000-4410d810-00a4-4a7e-af39-cb820647e48d-c000.snappy.parquet.crc │ │ ├── _SUCCESS │ │ └── part-00000-4410d810-00a4-4a7e-af39-cb820647e48d-c000.snappy.parquet ├── pipeline │ └── pipeline_model │ │ ├── metadata │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000.crc │ │ ├── _SUCCESS │ │ └── part-00000 │ │ └── stages │ │ ├── 0_StringIndexer_046a38b797e0 │ │ ├── data │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000-167b16f5-b0d5-4c5d-8ed3-b32962f50429-c000.snappy.parquet.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000-167b16f5-b0d5-4c5d-8ed3-b32962f50429-c000.snappy.parquet │ │ └── metadata │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ │ ├── 1_OneHotEncoder_ccd93b498912 │ │ ├── data │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000-624bba07-7841-4932-a362-837252099edf-c000.snappy.parquet.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000-624bba07-7841-4932-a362-837252099edf-c000.snappy.parquet │ │ └── metadata │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ │ └── 2_VectorAssembler_c52ccaa0dc60 │ │ └── metadata │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000.crc │ │ ├── _SUCCESS │ │ └── part-00000 ├── prediction_data │ ├── Prediction_FileFromDB │ │ └── master.csv │ └── Prediction_Raw_files_validated │ │ └── Good_raw │ │ └── HealthPrem_26092020_131534.csv └── training_data │ ├── Training_FileFromDB │ └── master.csv │ └── Training_Raw_files_validated │ └── Good_raw │ └── HealthPrem_26092020_131534.csv ├── config ├── params.yaml ├── schema_prediction.json └── schema_training.json ├── csv_to_kafka.py ├── data ├── Prediction_Batch_files │ └── HealthPrem_26092020_131534.csv └── training_batch_files │ ├── .gitignore │ └── HealthPrem_26092020_131534.csv ├── diagram ├── Drawing1.vsdx ├── streaming.jpg └── training and prediction.pdf ├── entry_point.py ├── insurance_exception ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── insurance_exception.cpython-37.pyc └── insurance_exception.py ├── insurance_prediction.egg-info ├── PKG-INFO ├── SOURCES.txt ├── dependency_links.txt └── top_level.txt ├── logger ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── logger.cpython-37.pyc └── logger.py ├── mongo_db ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── mongo_db_atlas.cpython-37.pyc └── mongo_db_atlas.py ├── new_data.csv ├── prediction ├── __init__.py ├── stage_00_data_loader.py ├── stage_01_data_validator.py ├── stage_02_data_transformer.py ├── stage_03_data_exporter.py └── stage_04_model_predictor.py ├── prediction_files └── HealthPrem_26092020_131534.csv ├── requirement.txt ├── setup.py ├── spark_consumer_from_kafka.py ├── streaming ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── __init__.cpython-38.pyc ├── consumer │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── 
__init__.cpython-38.pyc │ │ ├── kafka_to_spark_csv_consumer.cpython-37.pyc │ │ └── kafka_to_spark_csv_consumer.cpython-38.pyc │ └── kafka_to_spark_csv_consumer.py ├── producer │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── kafka_csv_data_producer.cpython-37.pyc │ │ └── kafka_csv_data_producer.cpython-38.pyc │ └── kafka_csv_data_producer.py ├── spark_manager │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── spark_manager.cpython-37.pyc │ │ └── spark_manager.cpython-38.pyc │ └── spark_manager.py └── transformer │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── __init__.cpython-38.pyc │ ├── spark_transformer.cpython-37.pyc │ └── spark_transformer.cpython-38.pyc │ └── spark_transformer.py ├── training ├── __init__.py ├── stage_00_data_loader.py ├── stage_01_data_validator.py ├── stage_02_data_transformer.py ├── stage_03_data_exporter.py └── stage_04_model_trainer.py └── utility.py /.gitignore: -------------------------------------------------------------------------------- 1 | insurance_prediction.egg-info -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/pyspark machine learning.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Spark Configuration on Windows 10 2 | 3 | 1. Download all required files from the URL below: 4 | ``` 5 | https://drive.google.com/drive/folders/1rBauyUVCRTbnKXgkMGh4l9MdIOVj8CQc?usp=sharing 6 | ``` 7 | 8 | 2. Install the Java .exe file 9 | > note: choose the "C:" drive as the Java installation path 10 | 11 | 3. Extract the Spark archive to the C: drive 12 | 13 | 4. Extract the Kafka archive to the C: drive 14 | 15 | 5. Add the environment variables listed below 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
| ENVIRONMENT VARIABLE NAME | VALUE |
| ------------------------- | ----- |
| HADOOP_HOME | C:\winutils |
| JAVA_HOME | C:\Java\jdk1.8.0_202 |
| SPARK_HOME | C:\spark-3.0.3-bin-hadoop2.7 |
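Once these variables and the Path entries from the next step are in place, and the conda environment described further below has been created, a quick way to confirm that Spark is usable from Python is to start a throwaway local session. This is a minimal sketch, assuming the `pyspark` package is installed in that environment:

```python
# Minimal sanity check for the local Spark setup.
# Assumption: the `pyspark` package is installed in the active conda environment.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")      # run Spark locally, using all available cores
    .appName("setup-check")
    .getOrCreate()
)

print(spark.version)         # should print the installed Spark version, e.g. 3.0.3
spark.stop()
```

If this prints a version and exits cleanly, the basic setup is sound; errors mentioning winutils.exe usually point back at the HADOOP_HOME value.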
35 | 36 | 6. Select the Path variable under environment variables and add the values below. 37 | ```buildoutcfg 38 | %SPARK_HOME%\bin 39 | ``` 40 | ```buildoutcfg 41 | %HADOOP_HOME%\bin 42 | ``` 43 | ```buildoutcfg 44 | %JAVA_HOME%\bin 45 | ``` 46 | ```buildoutcfg 47 | C:\Java\jre1.8.0_281\bin 48 | ``` 49 | ## Create conda environment 50 | 51 | 1. Open a conda terminal and execute the command below. 52 | 53 | ```buildoutcfg 54 | conda create -n <env_name> python=3.8 -y 55 | ``` 56 | 57 | 2. Select the environment created in the previous step as the project interpreter in PyCharm. 58 | 59 | 3. Install all necessary Python libraries specified in the requirement.txt file using the command below. 60 | ```buildoutcfg 61 | pip install -r requirement.txt 62 | ``` 63 | 64 | 65 | 4. To upload your code to a GitHub repo 66 | ``` 67 | git init 68 | git add . 69 | git commit -m "first commit" 70 | git branch -M main 71 | git remote add origin <github_repo_url> 72 | git push -u origin main 73 | ``` 74 | 75 | ## Train the random forest model on the insurance dataset 76 | ```buildoutcfg 77 | python training\stage_00_data_loader.py 78 | ``` 79 | ```buildoutcfg 80 | python training\stage_01_data_validator.py 81 | ``` 82 | ```buildoutcfg 83 | python training\stage_02_data_transformer.py 84 | ``` 85 | ```buildoutcfg 86 | python training\stage_03_data_exporter.py 87 | ``` 88 | ```buildoutcfg 89 | spark-submit training\stage_04_model_trainer.py 90 | ``` 91 | 92 | ## Prediction on the insurance dataset using the random forest model 93 | ```buildoutcfg 94 | python prediction\stage_00_data_loader.py 95 | ``` 96 | ```buildoutcfg 97 | python prediction\stage_01_data_validator.py 98 | ``` 99 | ```buildoutcfg 100 | python prediction\stage_02_data_transformer.py 101 | ``` 102 | ```buildoutcfg 103 | python prediction\stage_03_data_exporter.py 104 | ``` 105 | ```buildoutcfg 106 | spark-submit prediction\stage_04_model_predictor.py 107 | ``` 108 | 109 | 110 | 111 | ## Start the Zookeeper and Kafka servers 112 | 113 | 114 | 115 | ## Start the Kafka producer using the command below 116 | ```buildoutcfg 117 | spark-submit csv_to_kafka.py 118 | ``` 119 | 120 | ## Start the PySpark consumer using the command below 121 | ```buildoutcfg 122 | spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1 spark_consumer_from_kafka.py 123 | ``` -------------------------------------------------------------------------------- /__pycache__/utility.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/__pycache__/utility.cpython-37.pyc -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/data/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/data/.part-00000-747a037b-1756-420a-87dd-e6e880fd80e5-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/data/.part-00000-747a037b-1756-420a-87dd-e6e880fd80e5-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/data/_SUCCESS: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/data/_SUCCESS -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/data/part-00000-747a037b-1756-420a-87dd-e6e880fd80e5-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/data/part-00000-747a037b-1756-420a-87dd-e6e880fd80e5-c000.snappy.parquet -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/metadata/.part-00000.crc -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/metadata/_SUCCESS -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"org.apache.spark.ml.regression.RandomForestRegressionModel","timestamp":1636780450788,"sparkVersion":"3.0.3","uid":"RandomForestRegressor_ae83aa5164df","paramMap":{"labelCol":"expenses","featuresCol":"input_features"},"defaultParamMap":{"labelCol":"label","maxBins":32,"bootstrap":true,"cacheNodeIds":false,"predictionCol":"prediction","featureSubsetStrategy":"auto","featuresCol":"features","seed":469049852166159693,"leafCol":"","minInstancesPerNode":1,"checkpointInterval":10,"minInfoGain":0.0,"numTrees":20,"subsamplingRate":1.0,"maxDepth":5,"maxMemoryInMB":256,"impurity":"variance","minWeightFractionPerNode":0.0},"numFeatures":5,"numTrees":20} 2 | -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/treesMetadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/treesMetadata/.part-00000-4410d810-00a4-4a7e-af39-cb820647e48d-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/treesMetadata/.part-00000-4410d810-00a4-4a7e-af39-cb820647e48d-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/treesMetadata/_SUCCESS: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/treesMetadata/_SUCCESS -------------------------------------------------------------------------------- /artifacts/model/random_forest_regressor/treesMetadata/part-00000-4410d810-00a4-4a7e-af39-cb820647e48d-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/model/random_forest_regressor/treesMetadata/part-00000-4410d810-00a4-4a7e-af39-cb820647e48d-c000.snappy.parquet -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/metadata/.part-00000.crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/metadata/_SUCCESS -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"org.apache.spark.ml.PipelineModel","timestamp":1636780381034,"sparkVersion":"3.0.3","uid":"PipelineModel_463619223312","paramMap":{"stageUids":["StringIndexer_046a38b797e0","OneHotEncoder_ccd93b498912","VectorAssembler_c52ccaa0dc60"]},"defaultParamMap":{}} 2 | -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/.part-00000-167b16f5-b0d5-4c5d-8ed3-b32962f50429-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/.part-00000-167b16f5-b0d5-4c5d-8ed3-b32962f50429-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/_SUCCESS -------------------------------------------------------------------------------- 
/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/part-00000-167b16f5-b0d5-4c5d-8ed3-b32962f50429-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/data/part-00000-167b16f5-b0d5-4c5d-8ed3-b32962f50429-c000.snappy.parquet -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/metadata/.part-00000.crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/metadata/_SUCCESS -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/0_StringIndexer_046a38b797e0/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"org.apache.spark.ml.feature.StringIndexerModel","timestamp":1636780381434,"sparkVersion":"3.0.3","uid":"StringIndexer_046a38b797e0","paramMap":{"inputCols":["sex","smoker"],"outputCols":["sex_encoder","smoker_encoder"]},"defaultParamMap":{"handleInvalid":"error","stringOrderType":"frequencyDesc","outputCol":"StringIndexer_046a38b797e0__output"}} 2 | -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/.part-00000-624bba07-7841-4932-a362-837252099edf-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/.part-00000-624bba07-7841-4932-a362-837252099edf-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/_SUCCESS 
-------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/part-00000-624bba07-7841-4932-a362-837252099edf-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/data/part-00000-624bba07-7841-4932-a362-837252099edf-c000.snappy.parquet -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/metadata/.part-00000.crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/metadata/_SUCCESS -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/1_OneHotEncoder_ccd93b498912/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"org.apache.spark.ml.feature.OneHotEncoderModel","timestamp":1636780382829,"sparkVersion":"3.0.3","uid":"OneHotEncoder_ccd93b498912","paramMap":{"inputCols":["sex_encoder","smoker_encoder"],"outputCols":["sex_encoded","smoker_encoded"]},"defaultParamMap":{"dropLast":true,"outputCol":"OneHotEncoder_ccd93b498912__output","handleInvalid":"error"}} 2 | -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/2_VectorAssembler_c52ccaa0dc60/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/2_VectorAssembler_c52ccaa0dc60/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/2_VectorAssembler_c52ccaa0dc60/metadata/.part-00000.crc -------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/2_VectorAssembler_c52ccaa0dc60/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/artifacts/pipeline/pipeline_model/stages/2_VectorAssembler_c52ccaa0dc60/metadata/_SUCCESS 
-------------------------------------------------------------------------------- /artifacts/pipeline/pipeline_model/stages/2_VectorAssembler_c52ccaa0dc60/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"org.apache.spark.ml.feature.VectorAssembler","timestamp":1636780383948,"sparkVersion":"3.0.3","uid":"VectorAssembler_c52ccaa0dc60","paramMap":{"inputCols":["age","bmi","children","sex_encoded","smoker_encoded"],"outputCol":"input_features"},"defaultParamMap":{"outputCol":"VectorAssembler_c52ccaa0dc60__output","handleInvalid":"error"}} 2 | -------------------------------------------------------------------------------- /artifacts/prediction_data/Prediction_FileFromDB/master.csv: -------------------------------------------------------------------------------- 1 | age,sex,bmi,children,smoker 2 | 19,female,27.9,0,yes 3 | 18,male,33.8,1,no 4 | 28,male,33.0,3,no 5 | 33,male,22.7,0,no 6 | 32,male,28.9,0,no 7 | 31,female,25.7,0,no 8 | 46,female,33.4,1,no 9 | 37,female,27.7,3,no 10 | 37,male,29.8,2,no 11 | 60,female,25.8,0,no 12 | 25,male,26.2,0,no 13 | 62,female,26.3,0,yes 14 | 23,male,34.4,0,no 15 | 56,female,39.8,0,no 16 | 27,male,42.1,0,yes 17 | 19,male,24.6,1,no 18 | 52,female,30.8,1,no 19 | 23,male,23.8,0,no 20 | 56,male,40.3,0,no 21 | 30,male,35.3,0,yes 22 | 60,female,36.0,0,no 23 | 30,female,32.4,1,no 24 | 18,male,34.1,0,no 25 | 34,female,31.9,1,yes 26 | 37,male,28.0,2,no 27 | 59,female,27.7,3,no 28 | 63,female,23.1,0,no 29 | 55,female,32.8,2,no 30 | 23,male,17.4,1,no 31 | 31,male,36.3,2,yes 32 | 22,male,35.6,0,yes 33 | 18,female,26.3,0,no 34 | 19,female,28.6,5,no 35 | 63,male,28.3,0,no 36 | 28,male,36.4,1,yes 37 | 19,male,20.4,0,no 38 | 62,female,33.0,3,no 39 | 26,male,20.8,0,no 40 | 35,male,36.7,1,yes 41 | 60,male,39.9,0,yes 42 | 24,female,26.6,0,no 43 | 31,female,36.6,2,no 44 | 41,male,21.8,1,no 45 | 37,female,30.8,2,no 46 | 38,male,37.1,1,no 47 | 55,male,37.3,0,no 48 | 18,female,38.7,2,no 49 | 28,female,34.8,0,no 50 | 60,female,24.5,0,no 51 | 36,male,35.2,1,yes 52 | 18,female,35.6,0,no 53 | 21,female,33.6,2,no 54 | 48,male,28.0,1,yes 55 | 36,male,34.4,0,yes 56 | 40,female,28.7,3,no 57 | 58,male,37.0,2,yes 58 | 58,female,31.8,2,no 59 | 18,male,31.7,2,yes 60 | 53,female,22.9,1,yes 61 | 34,female,37.3,2,no 62 | 43,male,27.4,3,no 63 | 25,male,33.7,4,no 64 | 64,male,24.7,1,no 65 | 28,female,25.9,1,no 66 | 20,female,22.4,0,yes 67 | 19,female,28.9,0,no 68 | 61,female,39.1,2,no 69 | 40,male,26.3,1,no 70 | 40,female,36.2,0,no 71 | 28,male,24.0,3,yes 72 | 27,female,24.8,0,yes 73 | 31,male,28.5,5,no 74 | 53,female,28.1,3,no 75 | 58,male,32.0,1,no 76 | 44,male,27.4,2,no 77 | 57,male,34.0,0,no 78 | 29,female,29.6,1,no 79 | 21,male,35.5,0,no 80 | 22,female,39.8,0,no 81 | 41,female,33.0,0,no 82 | 31,male,26.9,1,no 83 | 45,female,38.3,0,no 84 | 22,male,37.6,1,yes 85 | 48,female,41.2,4,no 86 | 37,female,34.8,2,yes 87 | 45,male,22.9,2,yes 88 | 57,female,31.2,0,yes 89 | 56,female,27.2,0,no 90 | 46,female,27.7,0,no 91 | 55,female,27.0,0,no 92 | 21,female,39.5,0,no 93 | 53,female,24.8,1,no 94 | 59,male,29.8,3,yes 95 | 35,male,34.8,2,no 96 | 64,female,31.3,2,yes 97 | 28,female,37.6,1,no 98 | 54,female,30.8,3,no 99 | 55,male,38.3,0,no 100 | 56,male,20.0,0,yes 101 | 38,male,19.3,0,yes 102 | 41,female,31.6,0,no 103 | 30,male,25.5,0,no 104 | 18,female,30.1,0,no 105 | 61,female,29.9,3,yes 106 | 34,female,27.5,1,no 107 | 20,male,28.0,1,yes 108 | 19,female,28.4,1,no 109 | 26,male,30.9,2,no 110 | 29,male,27.9,0,no 111 | 63,male,35.1,0,yes 112 | 
54,male,33.6,1,no 113 | 55,female,29.7,2,no 114 | 37,male,30.8,0,no 115 | 21,female,35.7,0,no 116 | 52,male,32.2,3,no 117 | 60,male,28.6,0,no 118 | 58,male,49.1,0,no 119 | 29,female,27.9,1,yes 120 | 49,female,27.2,0,no 121 | 37,female,23.4,2,no 122 | 44,male,37.1,2,no 123 | 18,male,23.8,0,no 124 | 20,female,29.0,0,no 125 | 44,male,31.4,1,yes 126 | 47,female,33.9,3,no 127 | 26,female,28.8,0,no 128 | 19,female,28.3,0,yes 129 | 52,female,37.4,0,no 130 | 32,female,17.8,2,yes 131 | 38,male,34.7,2,no 132 | 59,female,26.5,0,no 133 | 61,female,22.0,0,no 134 | 53,female,35.9,2,no 135 | 19,male,25.6,0,no 136 | 20,female,28.8,0,no 137 | 22,female,28.1,0,no 138 | 19,male,34.1,0,no 139 | 22,male,25.2,0,no 140 | 54,female,31.9,3,no 141 | 22,female,36.0,0,no 142 | 34,male,22.4,2,no 143 | 26,male,32.5,1,no 144 | 34,male,25.3,2,yes 145 | 29,male,29.7,2,no 146 | 30,male,28.7,3,yes 147 | 29,female,38.8,3,no 148 | 46,male,30.5,3,yes 149 | 51,female,37.7,1,no 150 | 53,female,37.4,1,no 151 | 19,male,28.4,1,no 152 | 35,male,24.1,1,no 153 | 48,male,29.7,0,no 154 | 32,female,37.1,3,no 155 | 42,female,23.4,0,yes 156 | 40,female,25.5,1,no 157 | 44,male,39.5,0,no 158 | 48,male,24.4,0,yes 159 | 18,male,25.2,0,yes 160 | 30,male,35.5,0,yes 161 | 50,female,27.8,3,no 162 | 42,female,26.6,0,yes 163 | 18,female,36.9,0,yes 164 | 54,male,39.6,1,no 165 | 32,female,29.8,2,no 166 | 37,male,29.6,0,no 167 | 47,male,28.2,4,no 168 | 20,female,37.0,5,no 169 | 32,female,33.2,3,no 170 | 19,female,31.8,1,no 171 | 27,male,18.9,3,no 172 | 63,male,41.5,0,no 173 | 49,male,30.3,0,no 174 | 18,male,16.0,0,no 175 | 35,female,34.8,1,no 176 | 24,female,33.3,0,no 177 | 63,female,37.7,0,yes 178 | 38,male,27.8,2,no 179 | 54,male,29.2,1,no 180 | 46,female,28.9,2,no 181 | 41,female,33.2,3,no 182 | 58,male,28.6,0,no 183 | 18,female,38.3,0,no 184 | 22,male,20.0,3,no 185 | 44,female,26.4,0,no 186 | 44,male,30.7,2,no 187 | 36,male,41.9,3,yes 188 | 26,female,29.9,2,no 189 | 30,female,30.9,3,no 190 | 41,female,32.2,1,no 191 | 29,female,32.1,2,no 192 | 61,male,31.6,0,no 193 | 36,female,26.2,0,no 194 | 25,male,25.7,0,no 195 | 56,female,26.6,1,no 196 | 18,male,34.4,0,no 197 | 19,male,30.6,0,no 198 | 39,female,32.8,0,no 199 | 45,female,28.6,2,no 200 | 51,female,18.1,0,no 201 | 64,female,39.3,0,no 202 | 19,female,32.1,0,no 203 | 48,female,32.2,1,no 204 | 60,female,24.0,0,no 205 | 27,female,36.1,0,yes 206 | 46,male,22.3,0,no 207 | 28,female,28.9,1,no 208 | 59,male,26.4,0,no 209 | 35,male,27.7,2,yes 210 | 63,female,31.8,0,no 211 | 40,male,41.2,1,no 212 | 20,male,33.0,1,no 213 | 40,male,30.9,4,no 214 | 24,male,28.5,2,no 215 | 34,female,26.7,1,no 216 | 45,female,30.9,2,no 217 | 41,female,37.1,2,no 218 | 53,female,26.6,0,no 219 | 27,male,23.1,0,no 220 | 26,female,29.9,1,no 221 | 24,female,23.2,0,no 222 | 34,female,33.7,1,no 223 | 53,female,33.3,0,no 224 | 32,male,30.8,3,no 225 | 19,male,34.8,0,yes 226 | 42,male,24.6,0,yes 227 | 55,male,33.9,3,no 228 | 28,male,38.1,0,no 229 | 58,female,41.9,0,no 230 | 41,female,31.6,1,no 231 | 47,male,25.5,2,no 232 | 42,female,36.2,1,no 233 | 59,female,27.8,3,no 234 | 19,female,17.8,0,no 235 | 59,male,27.5,1,no 236 | 39,male,24.5,2,no 237 | 40,female,22.2,2,yes 238 | 18,female,26.7,0,no 239 | 31,male,38.4,2,no 240 | 19,male,29.1,0,yes 241 | 44,male,38.1,1,no 242 | 23,female,36.7,2,yes 243 | 33,female,22.1,1,no 244 | 55,female,26.8,1,no 245 | 40,male,35.3,3,no 246 | 63,female,27.7,0,yes 247 | 54,male,30.0,0,no 248 | 60,female,38.1,0,no 249 | 24,male,35.9,0,no 250 | 19,male,20.9,1,no 251 | 29,male,29.0,1,no 252 | 18,male,17.3,2,yes 253 
| 63,female,32.2,2,yes 254 | 54,male,34.2,2,yes 255 | 27,male,30.3,3,no 256 | 50,male,31.8,0,yes 257 | 55,female,25.4,3,no 258 | 56,male,33.6,0,yes 259 | 38,female,40.2,0,no 260 | 51,male,24.4,4,no 261 | 19,male,31.9,0,yes 262 | 58,female,25.2,0,no 263 | 20,female,26.8,1,yes 264 | 52,male,24.3,3,yes 265 | 19,male,37.0,0,yes 266 | 53,female,38.1,3,no 267 | 46,male,42.4,3,yes 268 | 40,male,19.8,1,yes 269 | 59,female,32.4,3,no 270 | 45,male,30.2,1,no 271 | 49,male,25.8,1,no 272 | 18,male,29.4,1,no 273 | 50,male,34.2,2,yes 274 | 41,male,37.1,2,no 275 | 50,male,27.5,1,no 276 | 25,male,27.6,0,no 277 | 47,female,26.6,2,no 278 | 19,male,20.6,2,no 279 | 22,female,24.3,0,no 280 | 59,male,31.8,2,no 281 | 51,female,21.6,1,no 282 | 40,female,28.1,1,yes 283 | 54,male,40.6,3,yes 284 | 30,male,27.6,1,no 285 | 55,female,32.4,1,no 286 | 52,female,31.2,0,no 287 | 46,male,26.6,1,no 288 | 46,female,48.1,2,no 289 | 63,female,26.2,0,no 290 | 59,female,36.8,1,yes 291 | 52,male,26.4,3,no 292 | 28,female,33.4,0,no 293 | 29,male,29.6,1,no 294 | 25,male,45.5,2,yes 295 | 22,female,28.8,0,no 296 | 25,male,26.8,3,no 297 | 18,male,23.0,0,no 298 | 19,male,27.7,0,yes 299 | 47,male,25.4,1,yes 300 | 31,male,34.4,3,yes 301 | 48,female,28.9,1,no 302 | 36,male,27.6,3,no 303 | 53,female,22.6,3,yes 304 | 56,female,37.5,2,no 305 | 28,female,33.0,2,no 306 | 57,female,38.0,2,no 307 | 29,male,33.3,2,no 308 | 28,female,27.5,2,no 309 | 30,female,33.3,1,no 310 | 58,male,34.9,0,no 311 | 41,female,33.1,2,no 312 | 50,male,26.6,0,no 313 | 19,female,24.7,0,no 314 | 43,male,36.0,3,yes 315 | 49,male,35.9,0,no 316 | 27,female,31.4,0,yes 317 | 52,male,33.3,0,no 318 | 50,male,32.2,0,no 319 | 54,male,32.8,0,no 320 | 44,female,27.6,0,no 321 | 32,male,37.3,1,no 322 | 34,male,25.3,1,no 323 | 26,female,29.6,4,no 324 | 34,male,30.8,0,yes 325 | 57,male,40.9,0,no 326 | 29,male,27.2,0,no 327 | 40,male,34.1,1,no 328 | 27,female,23.2,1,no 329 | 45,male,36.5,2,yes 330 | 64,female,33.8,1,yes 331 | 52,male,36.7,0,no 332 | 61,female,36.4,1,yes 333 | 52,male,27.4,0,yes 334 | 61,female,31.2,0,no 335 | 56,female,28.8,0,no 336 | 43,female,35.7,2,no 337 | 64,male,34.5,0,no 338 | 60,male,25.7,0,no 339 | 62,male,27.6,1,no 340 | 50,male,32.3,1,yes 341 | 46,female,27.7,1,no 342 | 24,female,27.6,0,no 343 | 62,male,30.0,0,no 344 | 60,female,27.6,0,no 345 | 63,male,36.8,0,no 346 | 49,female,41.5,4,no 347 | 34,female,29.3,3,no 348 | 33,male,35.8,2,no 349 | 46,male,33.3,1,no 350 | 36,female,29.9,1,no 351 | 19,male,27.8,0,no 352 | 57,female,23.2,0,no 353 | 50,female,25.6,0,no 354 | 30,female,27.7,0,no 355 | 33,male,35.2,0,no 356 | 18,female,38.3,0,no 357 | 46,male,27.6,0,no 358 | 46,male,43.9,3,no 359 | 47,male,29.8,3,no 360 | 23,male,41.9,0,no 361 | 18,female,20.8,0,no 362 | 48,female,32.3,2,no 363 | 35,male,30.5,1,no 364 | 19,female,21.7,0,yes 365 | 21,female,26.4,1,no 366 | 21,female,21.9,2,no 367 | 49,female,30.8,1,no 368 | 56,female,32.3,3,no 369 | 42,female,25.0,2,no 370 | 44,male,32.0,2,no 371 | 18,male,30.4,3,no 372 | 61,female,21.1,0,no 373 | 57,female,22.2,0,no 374 | 42,female,33.2,1,no 375 | 26,male,32.9,2,yes 376 | 20,male,33.3,0,no 377 | 23,female,28.3,0,yes 378 | 39,female,24.9,3,yes 379 | 24,male,40.2,0,yes 380 | 64,female,30.1,3,no 381 | 62,male,31.5,1,no 382 | 27,female,18.0,2,yes 383 | 55,male,30.7,0,yes 384 | 55,male,33.0,0,no 385 | 35,female,43.3,2,no 386 | 44,male,22.1,2,no 387 | 19,male,34.4,0,no 388 | 58,female,39.1,0,no 389 | 50,male,25.4,2,no 390 | 26,female,22.6,0,no 391 | 24,female,30.2,3,no 392 | 48,male,35.6,4,no 393 | 19,female,37.4,0,no 394 | 
48,male,31.4,1,no 395 | 49,male,31.4,1,no 396 | 46,female,32.3,2,no 397 | 46,male,19.9,0,no 398 | 43,female,34.4,3,no 399 | 21,male,31.0,0,no 400 | 64,male,25.6,2,no 401 | 18,female,38.2,0,no 402 | 51,female,20.6,0,no 403 | 47,male,47.5,1,no 404 | 64,female,33.0,0,no 405 | 49,male,32.3,3,no 406 | 31,male,20.4,0,no 407 | 52,female,38.4,2,no 408 | 33,female,24.3,0,no 409 | 47,female,23.6,1,no 410 | 38,male,21.1,3,no 411 | 32,male,30.0,1,no 412 | 19,male,17.5,0,no 413 | 44,female,20.2,1,yes 414 | 26,female,17.2,2,yes 415 | 25,male,23.9,5,no 416 | 19,female,35.2,0,no 417 | 43,female,35.6,1,no 418 | 52,male,34.1,0,no 419 | 36,female,22.6,2,yes 420 | 64,male,39.2,1,no 421 | 63,female,27.0,0,yes 422 | 64,male,33.9,0,yes 423 | 61,male,35.9,0,yes 424 | 40,male,32.8,1,yes 425 | 25,male,30.6,0,no 426 | 48,male,30.2,2,no 427 | 45,male,24.3,5,no 428 | 38,female,27.3,1,no 429 | 18,female,29.2,0,no 430 | 21,female,16.8,1,no 431 | 27,female,30.4,3,no 432 | 19,male,33.1,0,no 433 | 29,female,20.2,2,no 434 | 42,male,26.9,0,no 435 | 60,female,30.5,0,no 436 | 31,male,28.6,1,no 437 | 60,male,33.1,3,no 438 | 22,male,31.7,0,no 439 | 35,male,28.9,3,no 440 | 52,female,46.8,5,no 441 | 26,male,29.5,0,no 442 | 31,female,32.7,1,no 443 | 33,female,33.5,0,yes 444 | 18,male,43.0,0,no 445 | 59,female,36.5,1,no 446 | 56,male,26.7,1,yes 447 | 45,female,33.1,0,no 448 | 60,male,29.6,0,no 449 | 56,female,25.7,0,no 450 | 40,female,29.6,0,no 451 | 35,male,38.6,1,no 452 | 39,male,29.6,4,no 453 | 30,male,24.1,1,no 454 | 24,male,23.4,0,no 455 | 20,male,29.7,0,no 456 | 32,male,46.5,2,no 457 | 59,male,37.4,0,no 458 | 55,female,30.1,2,no 459 | 57,female,30.5,0,no 460 | 56,male,39.6,0,no 461 | 40,female,33.0,3,no 462 | 49,female,36.6,3,no 463 | 42,male,30.0,0,yes 464 | 62,female,38.1,2,no 465 | 56,male,25.9,0,no 466 | 19,male,25.2,0,no 467 | 30,female,28.4,1,yes 468 | 60,female,28.7,1,no 469 | 56,female,33.8,2,no 470 | 28,female,24.3,1,no 471 | 18,female,24.1,1,no 472 | 27,male,32.7,0,no 473 | 18,female,30.1,0,no 474 | 19,female,29.8,0,no 475 | 47,female,33.3,0,no 476 | 54,male,25.1,3,yes 477 | 61,male,28.3,1,yes 478 | 24,male,28.5,0,yes 479 | 25,male,35.6,0,no 480 | 21,male,36.9,0,no 481 | 23,male,32.6,0,no 482 | 63,male,41.3,3,no 483 | 49,male,37.5,2,no 484 | 18,female,31.4,0,no 485 | 51,female,39.5,1,no 486 | 48,male,34.3,3,no 487 | 31,female,31.1,0,no 488 | 54,female,21.5,3,no 489 | 19,male,28.7,0,no 490 | 44,female,38.1,0,yes 491 | 53,male,31.2,1,no 492 | 19,female,32.9,0,no 493 | 61,female,25.1,0,no 494 | 18,female,25.1,0,no 495 | 61,male,43.4,0,no 496 | 21,male,25.7,4,yes 497 | 20,male,27.9,0,no 498 | 31,female,23.6,2,no 499 | 45,male,28.7,2,no 500 | 44,female,24.0,2,no 501 | 62,female,39.2,0,no 502 | 29,male,34.4,0,yes 503 | 43,male,26.0,0,no 504 | 51,male,23.2,1,yes 505 | 19,male,30.3,0,yes 506 | 38,female,28.9,1,no 507 | 37,male,30.9,3,no 508 | 22,male,31.4,1,no 509 | 21,male,23.8,2,no 510 | 24,female,25.3,0,no 511 | 57,female,28.7,0,no 512 | 56,male,32.1,1,no 513 | 27,male,33.7,0,no 514 | 51,male,22.4,0,no 515 | 19,male,30.4,0,no 516 | 39,male,28.3,1,yes 517 | 58,male,35.7,0,no 518 | 20,male,35.3,1,no 519 | 45,male,30.5,2,no 520 | 35,female,31.0,1,no 521 | 31,male,30.9,0,no 522 | 50,female,27.4,0,no 523 | 32,female,44.2,0,no 524 | 51,female,33.9,0,no 525 | 38,female,37.7,0,no 526 | 42,male,26.1,1,yes 527 | 18,female,33.9,0,no 528 | 19,female,30.6,2,no 529 | 51,female,25.8,1,no 530 | 46,male,39.4,1,no 531 | 18,male,25.5,0,no 532 | 57,male,42.1,1,yes 533 | 62,female,31.7,0,no 534 | 59,male,29.7,2,no 535 | 37,male,36.2,0,no 536 
| 64,male,40.5,0,no 537 | 38,male,28.0,1,no 538 | 33,female,38.9,3,no 539 | 46,female,30.2,2,no 540 | 46,female,28.1,1,no 541 | 53,male,31.4,0,no 542 | 34,female,38.0,3,no 543 | 20,female,31.8,2,no 544 | 63,female,36.3,0,no 545 | 54,female,47.4,0,yes 546 | 54,male,30.2,0,no 547 | 49,male,25.8,2,yes 548 | 28,male,35.4,0,no 549 | 54,female,46.7,2,no 550 | 25,female,28.6,0,no 551 | 43,female,46.2,0,yes 552 | 63,male,30.8,0,no 553 | 32,female,28.9,0,no 554 | 62,male,21.4,0,no 555 | 52,female,31.7,2,no 556 | 25,female,41.3,0,no 557 | 28,male,23.8,2,no 558 | 46,male,33.4,1,no 559 | 34,male,34.2,0,no 560 | 35,female,34.1,3,yes 561 | 19,male,35.5,0,no 562 | 46,female,20.0,2,no 563 | 54,female,32.7,0,no 564 | 27,male,30.5,0,no 565 | 50,male,44.8,1,no 566 | 18,female,32.1,2,no 567 | 19,female,30.5,0,no 568 | 38,female,40.6,1,no 569 | 41,male,30.6,2,no 570 | 49,female,31.9,5,no 571 | 48,male,40.6,2,yes 572 | 31,female,29.1,0,no 573 | 18,female,37.3,1,no 574 | 30,female,43.1,2,no 575 | 62,female,36.9,1,no 576 | 57,female,34.3,2,no 577 | 58,female,27.2,0,no 578 | 22,male,26.8,0,no 579 | 31,female,38.1,1,yes 580 | 52,male,30.2,1,no 581 | 25,female,23.5,0,no 582 | 59,male,25.5,1,no 583 | 19,male,30.6,0,no 584 | 39,male,45.4,2,no 585 | 32,female,23.7,1,no 586 | 19,male,20.7,0,no 587 | 33,female,28.3,1,no 588 | 21,male,20.2,3,no 589 | 34,female,30.2,1,yes 590 | 61,female,35.9,0,no 591 | 38,female,30.7,1,no 592 | 58,female,29.0,0,no 593 | 47,male,19.6,1,no 594 | 20,male,31.1,2,no 595 | 21,female,21.9,1,yes 596 | 41,male,40.3,0,no 597 | 46,female,33.7,1,no 598 | 42,female,29.5,2,no 599 | 34,female,33.3,1,no 600 | 43,male,32.6,2,no 601 | 52,female,37.5,2,no 602 | 18,female,39.2,0,no 603 | 51,male,31.6,0,no 604 | 56,female,25.3,0,no 605 | 64,female,39.1,3,no 606 | 19,female,28.3,0,yes 607 | 51,female,34.1,0,no 608 | 27,female,25.2,0,no 609 | 59,female,23.7,0,yes 610 | 28,male,27.0,2,no 611 | 30,male,37.8,2,yes 612 | 47,female,29.4,1,no 613 | 38,female,34.8,2,no 614 | 18,female,33.2,0,no 615 | 34,female,19.0,3,no 616 | 20,female,33.0,0,no 617 | 47,female,36.6,1,yes 618 | 56,female,28.6,0,no 619 | 49,male,25.6,2,yes 620 | 19,female,33.1,0,yes 621 | 55,female,37.1,0,no 622 | 30,male,31.4,1,no 623 | 37,male,34.1,4,yes 624 | 49,female,21.3,1,no 625 | 18,male,33.5,0,yes 626 | 59,male,28.8,0,no 627 | 29,female,26.0,0,no 628 | 36,male,28.9,3,no 629 | 33,male,42.5,1,no 630 | 58,male,38.0,0,no 631 | 44,female,39.0,0,yes 632 | 53,male,36.1,1,no 633 | 24,male,29.3,0,no 634 | 29,female,35.5,0,no 635 | 40,male,22.7,2,no 636 | 51,male,39.7,1,no 637 | 64,male,38.2,0,no 638 | 19,female,24.5,1,no 639 | 35,female,38.1,2,no 640 | 39,male,26.4,0,yes 641 | 56,male,33.7,4,no 642 | 33,male,42.4,5,no 643 | 42,male,28.3,3,yes 644 | 61,male,33.9,0,no 645 | 23,female,35.0,3,no 646 | 43,male,35.3,2,no 647 | 48,male,30.8,3,no 648 | 39,male,26.2,1,no 649 | 40,female,23.4,3,no 650 | 18,male,28.5,0,no 651 | 58,female,33.0,0,no 652 | 49,female,42.7,2,no 653 | 53,female,39.6,1,no 654 | 48,female,31.1,0,no 655 | 45,female,36.3,2,no 656 | 59,female,35.2,0,no 657 | 52,female,25.3,2,yes 658 | 26,female,42.4,1,no 659 | 27,male,33.2,2,no 660 | 48,female,35.9,1,no 661 | 57,female,28.8,4,no 662 | 37,male,46.5,3,no 663 | 57,female,24.0,1,no 664 | 32,female,31.5,1,no 665 | 18,male,33.7,0,no 666 | 64,female,23.0,0,yes 667 | 43,male,38.1,2,yes 668 | 49,male,28.7,1,no 669 | 40,female,32.8,2,yes 670 | 62,male,32.0,0,yes 671 | 40,female,29.8,1,no 672 | 30,male,31.6,3,no 673 | 29,female,31.2,0,no 674 | 36,male,29.7,0,no 675 | 41,female,31.0,0,no 676 | 
44,female,43.9,2,yes 677 | 45,male,21.4,0,no 678 | 55,female,40.8,3,no 679 | 60,male,31.4,3,yes 680 | 56,male,36.1,3,no 681 | 49,female,23.2,2,no 682 | 21,female,17.4,1,no 683 | 19,male,20.3,0,no 684 | 39,male,35.3,2,yes 685 | 53,male,24.3,0,no 686 | 33,female,18.5,1,no 687 | 53,male,26.4,2,no 688 | 42,male,26.1,2,no 689 | 40,male,41.7,0,no 690 | 47,female,24.1,1,no 691 | 27,male,31.1,1,yes 692 | 21,male,27.4,0,no 693 | 47,male,36.2,1,no 694 | 20,male,32.4,1,no 695 | 24,male,23.7,0,no 696 | 27,female,34.8,1,no 697 | 26,female,40.2,0,no 698 | 53,female,32.3,2,no 699 | 41,male,35.8,1,yes 700 | 56,male,33.7,0,no 701 | 23,female,39.3,2,no 702 | 21,female,34.9,0,no 703 | 50,female,44.7,0,no 704 | 53,male,41.5,0,no 705 | 34,female,26.4,1,no 706 | 47,female,29.5,1,no 707 | 33,female,32.9,2,no 708 | 51,female,38.1,0,yes 709 | 49,male,28.7,3,no 710 | 31,female,30.5,3,no 711 | 36,female,27.7,0,no 712 | 18,male,35.2,1,no 713 | 50,female,23.5,2,no 714 | 43,female,30.7,2,no 715 | 20,male,40.5,0,no 716 | 24,female,22.6,0,no 717 | 60,male,28.9,0,no 718 | 49,female,22.6,1,no 719 | 60,male,24.3,1,no 720 | 51,female,36.7,2,no 721 | 58,female,33.4,0,no 722 | 51,female,40.7,0,no 723 | 53,male,36.6,3,no 724 | 62,male,37.4,0,no 725 | 19,male,35.4,0,no 726 | 50,female,27.1,1,no 727 | 30,female,39.1,3,yes 728 | 41,male,28.4,1,no 729 | 29,female,21.8,1,yes 730 | 18,female,40.3,0,no 731 | 41,female,36.1,1,no 732 | 35,male,24.4,3,yes 733 | 53,male,21.4,1,no 734 | 24,female,30.1,3,no 735 | 48,female,27.3,1,no 736 | 59,female,32.1,3,no 737 | 49,female,34.8,1,no 738 | 37,female,38.4,0,yes 739 | 26,male,23.7,2,no 740 | 23,male,31.7,3,yes 741 | 29,male,35.5,2,yes 742 | 45,male,24.0,2,no 743 | 27,male,29.2,0,yes 744 | 53,male,34.1,0,yes 745 | 31,female,26.6,0,no 746 | 50,male,26.4,0,no 747 | 50,female,30.1,1,no 748 | 34,male,27.0,2,no 749 | 19,male,21.8,0,no 750 | 47,female,36.0,1,no 751 | 28,male,30.9,0,no 752 | 37,female,26.4,0,yes 753 | 21,male,29.0,0,no 754 | 64,male,37.9,0,no 755 | 58,female,22.8,0,no 756 | 24,male,33.6,4,no 757 | 31,male,27.6,2,no 758 | 39,female,22.8,3,no 759 | 47,female,27.8,0,yes 760 | 30,male,37.4,3,no 761 | 18,male,38.2,0,yes 762 | 22,female,34.6,2,no 763 | 23,male,35.2,1,no 764 | 33,male,27.1,1,yes 765 | 27,male,26.0,0,no 766 | 45,female,25.2,2,no 767 | 57,female,31.8,0,no 768 | 47,male,32.3,1,no 769 | 42,female,29.0,1,no 770 | 64,female,39.7,0,no 771 | 38,female,19.5,2,no 772 | 61,male,36.1,3,no 773 | 53,female,26.7,2,no 774 | 44,female,36.5,0,no 775 | 19,female,28.9,0,yes 776 | 41,male,34.2,2,no 777 | 51,male,33.3,3,no 778 | 40,male,32.3,2,no 779 | 45,male,39.8,0,no 780 | 35,male,34.3,3,no 781 | 53,male,28.9,0,no 782 | 30,male,24.4,3,yes 783 | 18,male,41.1,0,no 784 | 51,male,36.0,1,no 785 | 50,female,27.6,1,yes 786 | 31,female,29.3,1,no 787 | 35,female,27.7,3,no 788 | 60,male,37.0,0,no 789 | 21,male,36.9,0,no 790 | 29,male,22.5,3,no 791 | 62,female,29.9,0,no 792 | 39,female,41.8,0,no 793 | 19,male,27.6,0,no 794 | 22,female,23.2,0,no 795 | 53,male,20.9,0,yes 796 | 39,female,31.9,2,no 797 | 27,male,28.5,0,yes 798 | 30,male,44.2,2,no 799 | 30,female,22.9,1,no 800 | 58,female,33.1,0,no 801 | 33,male,24.8,0,yes 802 | 42,female,26.2,1,no 803 | 64,female,36.0,0,no 804 | 21,male,22.3,1,no 805 | 18,female,42.2,0,yes 806 | 23,male,26.5,0,no 807 | 45,female,35.8,0,no 808 | 40,female,41.4,1,no 809 | 19,female,36.6,0,no 810 | 18,male,30.1,0,no 811 | 25,male,25.8,1,no 812 | 46,female,30.8,3,no 813 | 33,female,42.9,3,no 814 | 54,male,21.0,2,no 815 | 28,male,22.5,2,no 816 | 36,male,34.4,2,no 817 | 
20,female,31.5,0,no 818 | 24,female,24.2,0,no 819 | 23,male,37.1,3,no 820 | 47,female,26.1,1,yes 821 | 33,female,35.5,0,yes 822 | 45,male,33.7,1,no 823 | 26,male,17.7,0,no 824 | 18,female,31.1,0,no 825 | 44,female,29.8,2,no 826 | 60,male,24.3,0,no 827 | 64,female,31.8,2,no 828 | 56,male,31.8,2,yes 829 | 36,male,28.0,1,yes 830 | 41,male,30.8,3,yes 831 | 39,male,21.9,1,no 832 | 63,male,33.1,0,no 833 | 36,female,25.8,0,no 834 | 28,female,23.8,2,no 835 | 58,male,34.4,0,no 836 | 36,male,33.8,1,no 837 | 42,male,36.0,2,no 838 | 36,male,31.5,0,no 839 | 56,female,28.3,0,no 840 | 35,female,23.5,2,no 841 | 59,female,31.4,0,no 842 | 21,male,31.1,0,no 843 | 59,male,24.7,0,no 844 | 23,female,32.8,2,yes 845 | 57,female,29.8,0,yes 846 | 53,male,30.5,0,no 847 | 60,female,32.5,0,yes 848 | 51,female,34.2,1,no 849 | 23,male,50.4,1,no 850 | 27,female,24.1,0,no 851 | 55,male,32.8,0,no 852 | 37,female,30.8,0,yes 853 | 61,male,32.3,2,no 854 | 46,female,35.5,0,yes 855 | 53,female,23.8,2,no 856 | 49,female,23.8,3,yes 857 | 20,female,29.6,0,no 858 | 48,female,33.1,0,yes 859 | 25,male,24.1,0,yes 860 | 25,female,32.2,1,no 861 | 57,male,28.1,0,no 862 | 37,female,47.6,2,yes 863 | 38,female,28.0,3,no 864 | 55,female,33.5,2,no 865 | 36,female,19.9,0,no 866 | 51,male,25.4,0,no 867 | 40,male,29.9,2,no 868 | 18,male,37.3,0,no 869 | 57,male,43.7,1,no 870 | 61,male,23.7,0,no 871 | 25,female,24.3,3,no 872 | 50,male,36.2,0,no 873 | 26,female,29.5,1,no 874 | 42,male,24.9,0,no 875 | 43,male,30.1,1,no 876 | 44,male,21.9,3,no 877 | 23,female,28.1,0,no 878 | 49,female,27.1,1,no 879 | 33,male,33.4,5,no 880 | 41,male,28.8,1,no 881 | 37,female,29.5,2,no 882 | 22,male,34.8,3,no 883 | 23,male,27.4,1,no 884 | 21,female,22.1,0,no 885 | 51,female,37.1,3,yes 886 | 25,male,26.7,4,no 887 | 32,male,28.9,1,yes 888 | 57,male,29.0,0,yes 889 | 36,female,30.0,0,no 890 | 22,male,39.5,0,no 891 | 57,male,33.6,1,no 892 | 64,female,26.9,0,yes 893 | 36,female,29.0,4,no 894 | 54,male,24.0,0,no 895 | 47,male,38.9,2,yes 896 | 62,male,32.1,0,no 897 | 61,female,44.0,0,no 898 | 43,female,20.0,2,yes 899 | 19,male,25.6,1,no 900 | 18,female,40.3,0,no 901 | 19,female,22.5,0,no 902 | 49,male,22.5,0,no 903 | 60,male,40.9,0,yes 904 | 26,male,27.3,3,no 905 | 49,male,36.9,0,no 906 | 60,female,35.1,0,no 907 | 26,female,29.4,2,no 908 | 27,male,32.6,3,no 909 | 44,female,32.3,1,no 910 | 63,male,39.8,3,no 911 | 32,female,24.6,0,yes 912 | 22,male,28.3,1,no 913 | 18,male,31.7,0,yes 914 | 59,female,26.7,3,no 915 | 44,female,27.5,1,no 916 | 33,male,24.6,2,no 917 | 24,female,34.0,0,no 918 | 43,female,26.9,0,yes 919 | 45,male,22.9,0,yes 920 | 61,female,28.2,0,no 921 | 35,female,34.2,1,no 922 | 62,female,25.0,0,no 923 | 62,female,33.2,0,no 924 | 38,male,31.0,1,no 925 | 34,male,35.8,0,no 926 | 43,male,23.2,0,no 927 | 50,male,32.1,2,no 928 | 19,female,23.4,2,no 929 | 57,female,20.1,1,no 930 | 62,female,39.2,0,no 931 | 41,male,34.2,1,no 932 | 26,male,46.5,1,no 933 | 39,female,32.5,1,no 934 | 46,male,25.8,5,no 935 | 45,female,35.3,0,no 936 | 32,male,37.2,2,no 937 | 59,female,27.5,0,no 938 | 44,male,29.7,2,no 939 | 39,female,24.2,5,no 940 | 18,male,26.2,2,no 941 | 53,male,29.5,0,no 942 | 18,male,23.2,0,no 943 | 50,female,46.1,1,no 944 | 18,female,40.2,0,no 945 | 19,male,22.6,0,no 946 | 62,male,39.9,0,no 947 | 56,female,35.8,1,no 948 | 42,male,35.8,2,no 949 | 37,male,34.2,1,yes 950 | 42,male,31.3,0,no 951 | 25,male,29.7,3,yes 952 | 57,male,18.3,0,no 953 | 51,male,42.9,2,yes 954 | 30,female,28.4,1,no 955 | 44,male,30.2,2,yes 956 | 34,male,27.8,1,yes 957 | 31,male,39.5,1,no 958 | 
54,male,30.8,1,yes 959 | 24,male,26.8,1,no 960 | 43,male,35.0,1,yes 961 | 48,male,36.7,1,no 962 | 19,female,39.6,1,no 963 | 29,female,25.9,0,no 964 | 63,female,35.2,1,no 965 | 46,male,24.8,3,no 966 | 52,male,36.8,2,no 967 | 35,male,27.1,1,no 968 | 51,male,24.8,2,yes 969 | 44,male,25.4,1,no 970 | 21,male,25.7,2,no 971 | 39,female,34.3,5,no 972 | 50,female,28.2,3,no 973 | 34,female,23.6,0,no 974 | 22,female,20.2,0,no 975 | 19,female,40.5,0,no 976 | 26,male,35.4,0,no 977 | 29,male,22.9,0,yes 978 | 48,male,40.2,0,no 979 | 26,male,29.2,1,no 980 | 45,female,40.0,3,no 981 | 36,female,29.9,0,no 982 | 54,male,25.5,1,no 983 | 34,male,21.4,0,no 984 | 31,male,25.9,3,yes 985 | 27,female,30.6,1,no 986 | 20,male,30.1,5,no 987 | 44,female,25.8,1,no 988 | 43,male,30.1,3,no 989 | 45,female,27.6,1,no 990 | 34,male,34.7,0,no 991 | 24,female,20.5,0,yes 992 | 26,female,19.8,1,no 993 | 38,female,27.8,2,no 994 | 50,female,31.6,2,no 995 | 38,male,28.3,1,no 996 | 27,female,20.0,3,yes 997 | 39,female,23.3,3,no 998 | 39,female,34.1,3,no 999 | 63,female,36.9,0,no 1000 | 33,female,36.3,3,no 1001 | 36,female,26.9,0,no 1002 | 30,male,23.0,2,yes 1003 | 24,male,32.7,0,yes 1004 | 24,male,25.8,0,no 1005 | 48,male,29.6,0,no 1006 | 47,male,19.2,1,no 1007 | 29,male,31.7,2,no 1008 | 28,male,29.3,2,no 1009 | 47,male,28.2,3,yes 1010 | 25,male,25.0,2,no 1011 | 51,male,27.7,1,no 1012 | 48,female,22.8,0,no 1013 | 43,male,20.1,2,yes 1014 | 61,female,33.3,4,no 1015 | 48,male,32.3,1,no 1016 | 38,female,27.6,0,no 1017 | 59,male,25.5,0,no 1018 | 19,female,24.6,1,no 1019 | 26,female,34.2,2,no 1020 | 54,female,35.8,3,no 1021 | 21,female,32.7,2,no 1022 | 51,male,37.0,0,no 1023 | 22,female,31.0,3,yes 1024 | 47,male,36.1,1,yes 1025 | 18,male,23.3,1,no 1026 | 47,female,45.3,1,no 1027 | 21,female,34.6,0,no 1028 | 19,male,26.0,1,yes 1029 | 23,male,18.7,0,no 1030 | 54,male,31.6,0,no 1031 | 37,female,17.3,2,no 1032 | 46,female,23.7,1,yes 1033 | 55,female,35.2,0,yes 1034 | 30,female,27.9,0,no 1035 | 18,male,21.6,0,yes 1036 | 61,male,38.4,0,no 1037 | 54,female,23.0,3,no 1038 | 22,male,37.1,2,yes 1039 | 45,female,30.5,1,yes 1040 | 22,male,28.9,0,no 1041 | 19,male,27.3,2,no 1042 | 35,female,28.0,0,yes 1043 | 18,male,23.1,0,no 1044 | 20,male,30.7,0,yes 1045 | 28,female,25.8,0,no 1046 | 55,male,35.2,1,no 1047 | 43,female,24.7,2,yes 1048 | 43,female,25.1,0,no 1049 | 22,male,52.6,1,yes 1050 | 25,female,22.5,1,no 1051 | 49,male,30.9,0,yes 1052 | 44,female,37.0,1,no 1053 | 64,male,26.4,0,no 1054 | 49,male,29.8,1,no 1055 | 47,male,29.8,3,yes 1056 | 27,female,21.5,0,no 1057 | 55,male,27.6,0,no 1058 | 48,female,28.9,0,no 1059 | 45,female,31.8,0,no 1060 | 24,female,39.5,0,no 1061 | 32,male,33.8,1,no 1062 | 24,male,32.0,0,no 1063 | 57,male,27.9,1,no 1064 | 59,male,41.1,1,yes 1065 | 36,male,28.6,3,no 1066 | 29,female,25.6,4,no 1067 | 42,female,25.3,1,no 1068 | 48,male,37.3,2,no 1069 | 39,male,42.7,0,no 1070 | 63,male,21.7,1,no 1071 | 54,female,31.9,1,no 1072 | 37,male,37.1,1,yes 1073 | 63,male,31.4,0,no 1074 | 21,male,31.3,0,no 1075 | 54,female,28.9,2,no 1076 | 60,female,18.3,0,no 1077 | 32,female,29.6,1,no 1078 | 47,female,32.0,1,no 1079 | 21,male,26.0,0,no 1080 | 28,male,31.7,0,yes 1081 | 63,male,33.7,3,no 1082 | 18,male,21.8,2,no 1083 | 32,male,27.8,1,no 1084 | 38,male,20.0,1,no 1085 | 32,male,31.5,1,no 1086 | 62,female,30.5,2,no 1087 | 39,female,18.3,5,yes 1088 | 55,male,29.0,0,no 1089 | 57,male,31.5,0,no 1090 | 52,male,47.7,1,no 1091 | 56,male,22.1,0,no 1092 | 47,male,36.2,0,yes 1093 | 55,female,29.8,0,no 1094 | 23,male,32.7,3,no 1095 | 22,female,30.4,0,yes 
1096 | 50,female,33.7,4,no 1097 | 18,female,31.4,4,no 1098 | 51,female,35.0,2,yes 1099 | 22,male,33.8,0,no 1100 | 52,female,30.9,0,no 1101 | 25,female,34.0,1,no 1102 | 33,female,19.1,2,yes 1103 | 53,male,28.6,3,no 1104 | 29,male,38.9,1,no 1105 | 58,male,36.1,0,no 1106 | 37,male,29.8,0,no 1107 | 54,female,31.2,0,no 1108 | 49,female,29.9,0,no 1109 | 50,female,26.2,2,no 1110 | 26,male,30.0,1,no 1111 | 45,male,20.4,3,no 1112 | 54,female,32.3,1,no 1113 | 38,male,38.4,3,yes 1114 | 48,female,25.9,3,yes 1115 | 28,female,26.3,3,no 1116 | 23,male,24.5,0,no 1117 | 55,male,32.7,1,no 1118 | 41,male,29.6,5,no 1119 | 25,male,33.3,2,yes 1120 | 33,male,35.8,1,yes 1121 | 30,female,20.0,3,no 1122 | 23,female,31.4,0,yes 1123 | 46,male,38.2,2,no 1124 | 53,female,36.9,3,yes 1125 | 27,female,32.4,1,no 1126 | 23,female,42.8,1,yes 1127 | 63,female,25.1,0,no 1128 | 55,male,29.9,0,no 1129 | 35,female,35.9,2,no 1130 | 34,male,32.8,1,no 1131 | 19,female,18.6,0,no 1132 | 39,female,23.9,5,no 1133 | 27,male,45.9,2,no 1134 | 57,male,40.3,0,no 1135 | 52,female,18.3,0,no 1136 | 28,male,33.8,0,no 1137 | 50,female,28.1,3,no 1138 | 44,female,25.0,1,no 1139 | 26,female,22.2,0,no 1140 | 33,male,30.3,0,no 1141 | 19,female,32.5,0,yes 1142 | 50,male,37.1,1,no 1143 | 41,female,32.6,3,no 1144 | 52,female,24.9,0,no 1145 | 39,male,32.3,2,no 1146 | 50,male,32.3,2,no 1147 | 52,male,32.8,3,no 1148 | 60,male,32.8,0,yes 1149 | 20,female,31.9,0,no 1150 | 55,male,21.5,1,no 1151 | 42,male,34.1,0,no 1152 | 18,female,30.3,0,no 1153 | 58,female,36.5,0,no 1154 | 43,female,32.6,3,yes 1155 | 35,female,35.8,1,no 1156 | 48,female,27.9,4,no 1157 | 36,female,22.1,3,no 1158 | 19,male,44.9,0,yes 1159 | 23,female,23.2,2,no 1160 | 20,female,30.6,0,no 1161 | 32,female,41.1,0,no 1162 | 43,female,34.6,1,no 1163 | 34,male,42.1,2,no 1164 | 30,male,38.8,1,no 1165 | 18,female,28.2,0,no 1166 | 41,female,28.3,1,no 1167 | 35,female,26.1,0,no 1168 | 57,male,40.4,0,no 1169 | 29,female,24.6,2,no 1170 | 32,male,35.2,2,no 1171 | 37,female,34.1,1,no 1172 | 18,male,27.4,1,yes 1173 | 43,female,26.7,2,yes 1174 | 56,female,41.9,0,no 1175 | 38,male,29.3,2,no 1176 | 29,male,32.1,2,no 1177 | 22,female,27.1,0,no 1178 | 52,female,24.1,1,yes 1179 | 40,female,27.4,1,no 1180 | 23,female,34.9,0,no 1181 | 31,male,29.8,0,yes 1182 | 42,female,41.3,1,no 1183 | 24,female,29.9,0,no 1184 | 25,female,30.3,0,no 1185 | 48,female,27.4,1,no 1186 | 23,female,28.5,1,yes 1187 | 45,male,23.6,2,no 1188 | 20,male,35.6,3,yes 1189 | 62,female,32.7,0,no 1190 | 43,female,25.3,1,yes 1191 | 23,female,28.0,0,no 1192 | 31,female,32.8,2,no 1193 | 41,female,21.8,1,no 1194 | 58,female,32.4,1,no 1195 | 48,female,36.6,0,no 1196 | 31,female,21.8,0,no 1197 | 19,female,27.9,3,no 1198 | 19,female,30.0,0,yes 1199 | 41,male,33.6,0,no 1200 | 40,male,29.4,1,no 1201 | 31,female,25.8,2,no 1202 | 37,male,24.3,2,no 1203 | 46,male,40.4,2,no 1204 | 22,male,32.1,0,no 1205 | 51,male,32.3,1,no 1206 | 18,female,27.3,3,yes 1207 | 35,male,17.9,1,no 1208 | 59,female,34.8,2,no 1209 | 36,male,33.4,2,yes 1210 | 37,female,25.6,1,yes 1211 | 59,male,37.1,1,no 1212 | 36,male,30.9,1,no 1213 | 39,male,34.1,2,no 1214 | 18,male,21.5,0,no 1215 | 52,female,33.3,2,no 1216 | 27,female,31.3,1,no 1217 | 18,male,39.1,0,no 1218 | 40,male,25.1,0,no 1219 | 29,male,37.3,2,no 1220 | 46,female,34.6,1,yes 1221 | 38,female,30.2,3,no 1222 | 30,female,21.9,1,no 1223 | 40,male,25.0,2,no 1224 | 50,male,25.3,0,no 1225 | 20,female,24.4,0,yes 1226 | 41,male,23.9,1,no 1227 | 33,female,39.8,1,no 1228 | 38,male,16.8,2,no 1229 | 42,male,37.2,2,no 1230 | 56,male,34.4,0,no 
1231 | 58,male,30.3,0,no 1232 | 52,male,34.5,3,yes 1233 | 20,female,21.8,0,yes 1234 | 54,female,24.6,3,no 1235 | 58,male,23.3,0,no 1236 | 45,female,27.8,2,no 1237 | 26,male,31.1,0,no 1238 | 63,female,21.7,0,no 1239 | 58,female,28.2,0,no 1240 | 37,male,22.7,3,no 1241 | 25,female,42.1,1,no 1242 | 52,male,41.8,2,yes 1243 | 64,male,37.0,2,yes 1244 | 22,female,21.3,3,no 1245 | 28,female,33.1,0,no 1246 | 18,male,33.3,0,no 1247 | 28,male,24.3,5,no 1248 | 45,female,25.7,3,no 1249 | 33,male,29.4,4,no 1250 | 18,female,39.8,0,no 1251 | 32,male,33.6,1,yes 1252 | 24,male,29.8,0,yes 1253 | 19,male,19.8,0,no 1254 | 20,male,27.3,0,yes 1255 | 40,female,29.3,4,no 1256 | 34,female,27.7,0,no 1257 | 42,female,37.9,0,no 1258 | 51,female,36.4,3,no 1259 | 54,female,27.6,1,no 1260 | 55,male,37.7,3,no 1261 | 52,female,23.2,0,no 1262 | 32,female,20.5,0,no 1263 | 28,male,37.1,1,no 1264 | 41,female,28.1,1,no 1265 | 43,female,29.9,1,no 1266 | 49,female,33.3,2,no 1267 | 64,male,23.8,0,yes 1268 | 55,female,30.5,0,no 1269 | 24,male,31.1,0,yes 1270 | 20,female,33.3,0,no 1271 | 45,male,27.5,3,no 1272 | 26,male,33.9,1,no 1273 | 25,female,34.5,0,no 1274 | 43,male,25.5,5,no 1275 | 35,male,27.6,1,no 1276 | 26,male,27.1,0,yes 1277 | 57,male,23.7,0,no 1278 | 22,female,30.4,0,no 1279 | 32,female,29.7,0,no 1280 | 39,male,29.9,1,yes 1281 | 25,female,26.8,2,no 1282 | 48,female,33.3,0,no 1283 | 47,female,27.6,2,yes 1284 | 18,female,21.7,0,yes 1285 | 18,male,30.0,1,no 1286 | 61,male,36.3,1,yes 1287 | 47,female,24.3,0,no 1288 | 28,female,17.3,0,no 1289 | 36,female,25.9,1,no 1290 | 20,male,39.4,2,yes 1291 | 44,male,34.3,1,no 1292 | 38,female,20.0,2,no 1293 | 19,male,34.9,0,yes 1294 | 21,male,23.2,0,no 1295 | 46,male,25.7,3,no 1296 | 58,male,25.2,0,no 1297 | 20,male,22.0,1,no 1298 | 18,male,26.1,0,no 1299 | 28,female,26.5,2,no 1300 | 33,male,27.5,2,no 1301 | 19,female,25.7,1,no 1302 | 45,male,30.4,0,yes 1303 | 62,male,30.9,3,yes 1304 | 25,female,20.8,1,no 1305 | 43,male,27.8,0,yes 1306 | 42,male,24.6,2,yes 1307 | 24,female,27.7,0,no 1308 | 29,female,21.9,0,yes 1309 | 32,male,28.1,4,yes 1310 | 25,female,30.2,0,yes 1311 | 41,male,32.2,2,no 1312 | 42,male,26.3,1,no 1313 | 33,female,26.7,0,no 1314 | 34,male,42.9,1,no 1315 | 19,female,34.7,2,yes 1316 | 30,female,23.7,3,yes 1317 | 18,male,28.3,1,no 1318 | 19,female,20.6,0,no 1319 | 18,male,53.1,0,no 1320 | 35,male,39.7,4,no 1321 | 39,female,26.3,2,no 1322 | 31,male,31.1,3,no 1323 | 62,male,26.7,0,yes 1324 | 62,male,38.8,0,no 1325 | 42,female,40.4,2,yes 1326 | 31,male,25.9,1,no 1327 | 61,male,33.5,0,no 1328 | 42,female,32.9,0,no 1329 | 51,male,30.0,1,no 1330 | 23,female,24.2,2,no 1331 | 52,male,38.6,2,no 1332 | 57,female,25.7,2,no 1333 | 23,female,33.4,0,no 1334 | 52,female,44.7,3,no 1335 | 50,male,31.0,3,no 1336 | 18,female,31.9,0,no 1337 | 18,female,36.9,0,no 1338 | 21,female,25.8,0,no 1339 | 61,female,29.1,0,yes 1340 | -------------------------------------------------------------------------------- /config/params.yaml: -------------------------------------------------------------------------------- 1 | base: 2 | project_id: 5 3 | project: insurance-premium 4 | random_state: 42 5 | test_size: 0.2 6 | 7 | 8 | config: 9 | params: config/params.yaml 10 | schema_prediction: config/schema_prediction.json 11 | schema_training: config/schema_training.json 12 | 13 | cloud_provider: 14 | name: microsoft 15 | 16 | data_download: 17 | cloud_training_directory_path: data/training_batch_files/ 18 | cloud_prediction_directory_path: data/Prediction_Batch_files 19 | 20 | artifacts: 21 | training_data: 22 | 
good_file_path: artifacts/training_data/Training_Raw_files_validated/Good_raw 23 | bad_file_path: artifacts/training_data/Training_Raw_files_validated/Bad_raw 24 | archive_bad_file_path: artifacts/training_data/TrainingArchiveBadData 25 | training_file_from_db: artifacts/training_data/Training_FileFromDB 26 | master_csv: master.csv 27 | null_value_info_file_path: artifacts/training_data/preprocessing/null_value 28 | plots: artifacts/training_data/plots 29 | pipeline_path: artifacts/pipeline/pipeline_model 30 | 31 | model: 32 | model_path: artifacts/model 33 | 34 | 35 | prediction_data: 36 | good_file_path: artifacts/prediction_data/Prediction_Raw_files_validated/Good_raw 37 | bad_file_path: artifacts/prediction_data/Prediction_Raw_files_validated/Bad_raw 38 | archive_bad_file_path: artifacts/prediction_data/PredictionArchiveBadData 39 | prediction_file_from_db: artifacts/prediction_data/Prediction_FileFromDB 40 | master_csv: master.csv 41 | prediction_output_file_path: artifacts/Prediction_Output_DIR/ 42 | prediction_file_name: Predictions.csv 43 | 44 | data_source: 45 | Training_Batch_Files: Training_Batch_Files 46 | Prediction_Batch_Files: Prediction_Batch_Files 47 | 48 | kafka: 49 | topic_name: insurance-premium 50 | kafka_bootstrap_server: localhost:9092 51 | 52 | 53 | log_database: 54 | training_database_name: insurance_prediction_training 55 | prediction_database_name: insurance_prediction_prediction 56 | 57 | dataset: 58 | unwanted_column: 59 | - region 60 | 61 | 62 | database_detail: 63 | training_database_name: insurance_prediction_training 64 | prediction_database_name: insurance_prediction_prediction 65 | dataset_training_collection_name: insurance_prediction_training_dataset 66 | dataset_prediction_collection_name: insurance_prediction_prediction_dataset 67 | 68 | target_columns: 69 | columns: 70 | - expenses 71 | -------------------------------------------------------------------------------- /config/schema_prediction.json: -------------------------------------------------------------------------------- 1 | { 2 | "SampleFileName": "HealthPrem_26092020_131534.csv", 3 | "LengthOfDateStampInFile": 8, 4 | "LengthOfTimeStampInFile": 6, 5 | "NumberofColumns": 6, 6 | "ColName": { 7 | "age": "Integer", 8 | "sex": "varchar", 9 | "bmi": "float", 10 | "children": "Integer", 11 | "smoker": "varchar" 12 | } 13 | } -------------------------------------------------------------------------------- /config/schema_training.json: -------------------------------------------------------------------------------- 1 | { "SampleFileName": "HealthPrem_26092020_131534.csv", 2 | "LengthOfDateStampInFile": 8, 3 | "LengthOfTimeStampInFile": 6, 4 | "NumberofColumns" : 7, 5 | "ColName": { 6 | "age" : "Integer", 7 | "sex" : "varchar", 8 | "bmi" : "float", 9 | "children" : "Integer", 10 | "smoker" : "varchar", 11 | "region": " varchar", 12 | "expenses" : "float" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /csv_to_kafka.py: -------------------------------------------------------------------------------- 1 | from streaming.producer.kafka_csv_data_producer import KafkaCSVDataProducer 2 | from streaming.spark_manager.spark_manager import SparkManager 3 | 4 | if __name__ == "__main__": 5 | try: 6 | path = "prediction_files" 7 | spark_session = SparkManager().get_spark_session_object() 8 | kfk_csv_data_producer = KafkaCSVDataProducer( 9 | spark_session=spark_session, 10 | 11 | ) 12 | kfk_csv_data_producer.send_csv_data_to_kafka_topic(directory_path=path) 13 | 
except Exception as e: 14 | print(e) 15 | -------------------------------------------------------------------------------- /data/training_batch_files/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /diagram/Drawing1.vsdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/diagram/Drawing1.vsdx -------------------------------------------------------------------------------- /diagram/streaming.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/diagram/streaming.jpg -------------------------------------------------------------------------------- /diagram/training and prediction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/diagram/training and prediction.pdf -------------------------------------------------------------------------------- /entry_point.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from utility import get_logger_object_of_training, get_logger_object_of_prediction 5 | from training.stage_00_data_loader import loader_main 6 | from training.stage_01_data_validator import validation_main 7 | from training.stage_02_data_transformer import transform_main 8 | from training.stage_03_data_exporter import export_main 9 | from training.stage_04_model_trainer import train_main 10 | 11 | from prediction.stage_00_data_loader import loader_main as pred_loader_main 12 | from prediction.stage_01_data_validator import validation_main as pred_validation_main 13 | from prediction.stage_02_data_transformer import transform_main as pred_transform_main 14 | from prediction.stage_03_data_exporter import export_main as pred_export_main 15 | from prediction.stage_04_model_predictor import predict_main 16 | from insurance_exception.insurance_exception import InsuranceException as GenericException 17 | 18 | collection_name = "main_pipeline" 19 | 20 | 21 | def begin_training(execution_id, executed_by): 22 | try: 23 | args = dict() 24 | args['config'] = os.path.join("config", "params.yaml") 25 | logger = get_logger_object_of_training(config_path=args['config'], 26 | collection_name=collection_name, 27 | execution_id=execution_id, 28 | executed_by=executed_by 29 | ) 30 | 31 | args['datasource'] = None 32 | parsed_args = args 33 | logger.log(f"dictionary created.{args}") 34 | logger.log(f"{parsed_args}") 35 | logger.log("Data loading begin..") 36 | 37 | loader_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'], execution_id=execution_id, 38 | executed_by=executed_by) 39 | logger.log("Data loading completed..") 40 | logger.log("Data validation began..") 41 | 42 | validation_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'], 43 | execution_id=execution_id, 44 | executed_by=executed_by) 45 | logger.log("Data validation completed..") 46 | logger.log("Data transformation began..") 47 | 48 | transform_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'], 49 | execution_id=execution_id, 50 | executed_by=executed_by) 51 | logger.log("Data 
transformation completed..") 52 | logger.log("Export oberation began..") 53 | 54 | export_main(config_path=parsed_args['config'], execution_id=execution_id, 55 | executed_by=executed_by) 56 | logger.log("Export oberation completed..") 57 | logger.log("Training began..") 58 | 59 | train_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'], execution_id=execution_id, 60 | executed_by=executed_by) 61 | logger.log(f"Training completed") 62 | return {'status': True, 'message': 'Training completed successfully'} 63 | except Exception as e: 64 | generic_exception = GenericException( 65 | "Error occurred in module [{0}] method [{1}]" 66 | .format(begin_training.__module__, 67 | begin_training.__name__)) 68 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 69 | 70 | 71 | def begin_prediction(execution_id, executed_by): 72 | try: 73 | args = dict() 74 | args['config'] = os.path.join("config", "params.yaml") 75 | logger = get_logger_object_of_prediction(config_path=args['config'], 76 | collection_name=collection_name, 77 | execution_id=execution_id, 78 | executed_by=executed_by 79 | ) 80 | args['datasource'] = None 81 | parsed_args = args 82 | logger.log(f"dictionary created.{args}") 83 | 84 | logger.log(f"{parsed_args}") 85 | logger.log("Data loading begin..") 86 | 87 | pred_loader_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'], 88 | execution_id=execution_id, 89 | executed_by=executed_by 90 | ) 91 | logger.log("Data loading completed..") 92 | logger.log("Data validation began..") 93 | 94 | pred_validation_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'], 95 | execution_id=execution_id, 96 | executed_by=executed_by 97 | ) 98 | logger.log("Data validation completed..") 99 | logger.log("Data transformation began..") 100 | 101 | pred_transform_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'], 102 | execution_id=execution_id, 103 | executed_by=executed_by 104 | ) 105 | logger.log("Data transformation completed..") 106 | logger.log("Export oberation began..") 107 | 108 | pred_export_main(config_path=parsed_args['config']) 109 | logger.log("Export operation completed..") 110 | logger.log("Prediction began..") 111 | 112 | predict_main(config_path=parsed_args['config'], datasource=parsed_args['datasource'], 113 | execution_id=execution_id, 114 | executed_by=executed_by 115 | ) 116 | logger.log("Prediction completed") 117 | return {'status': True, 'message': 'Prediction completed successfully'} 118 | except Exception as e: 119 | generic_exception = GenericException( 120 | "Error occurred in module [{0}] method [{1}]" 121 | .format(begin_prediction.__module__, 122 | begin_prediction.__name__)) 123 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 124 | -------------------------------------------------------------------------------- /insurance_exception/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/insurance_exception/__init__.py -------------------------------------------------------------------------------- /insurance_exception/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/insurance_exception/__pycache__/__init__.cpython-37.pyc 
-------------------------------------------------------------------------------- /insurance_exception/__pycache__/insurance_exception.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/insurance_exception/__pycache__/insurance_exception.cpython-37.pyc -------------------------------------------------------------------------------- /insurance_exception/insurance_exception.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | class InsuranceException(Exception): 5 | 6 | def __init__(self, error_message): 7 | """ 8 | 9 | :param error_message: error message in string format 10 | """ 11 | self.error_message = error_message 12 | 13 | def __repr__(self): 14 | return InsuranceException.__name__.__str__() 15 | 16 | def error_message_detail(self, error, error_detail): 17 | exc_type, exc_obj, exc_tb = error_detail.exc_info() 18 | file_name = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] 19 | error_message = "python script name [{0}] line number [{1}] error message [{2}]".format(file_name, 20 | exc_tb.tb_lineno, 21 | str(error)) 22 | self.error_message = self.error_message + " " + error_message 23 | return self.error_message 24 | 25 | def __str__(self): 26 | return self.error_message 27 | -------------------------------------------------------------------------------- /insurance_prediction.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: insurance-prediction 3 | Version: 0.0.3 4 | Summary: insurance-prediction 5 | Home-page: UNKNOWN 6 | Author: Avnish yadav 7 | License: MIT 8 | Platform: UNKNOWN 9 | 10 | UNKNOWN 11 | 12 | -------------------------------------------------------------------------------- /insurance_prediction.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | azure_blob_storage/__init__.py 3 | azure_blob_storage/azure_blob_storage.py 4 | insurance_exception/__init__.py 5 | insurance_exception/insurance_exception.py 6 | insurance_prediction.egg-info/PKG-INFO 7 | insurance_prediction.egg-info/SOURCES.txt 8 | insurance_prediction.egg-info/dependency_links.txt 9 | insurance_prediction.egg-info/top_level.txt 10 | logger/__init__.py 11 | logger/logger.py 12 | mongo_db/__init__.py 13 | mongo_db/mongo_db_atlas.py 14 | prediction/__init__.py 15 | prediction/stage_00_data_loader.py 16 | prediction/stage_01_data_validator.py 17 | prediction/stage_02_data_transformer.py 18 | prediction/stage_03_data_exporter.py 19 | prediction/stage_04_model_predictor.py 20 | streaming/__init__.py 21 | streaming/consumer/__init__.py 22 | streaming/consumer/kafka_to_spark_csv_consumer.py 23 | streaming/producer/__init__.py 24 | streaming/producer/kafka_csv_data_producer.py 25 | streaming/spark_manager/__init__.py 26 | streaming/spark_manager/spark_manager.py 27 | streaming/transformer/__init__.py 28 | streaming/transformer/spark_transformer.py 29 | training/__init__.py 30 | training/stage_00_data_loader.py 31 | training/stage_01_data_validator.py 32 | training/stage_02_data_transformer.py 33 | training/stage_03_data_exporter.py 34 | training/stage_04_model_trainer.py -------------------------------------------------------------------------------- /insurance_prediction.egg-info/dependency_links.txt: 
-------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /insurance_prediction.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | azure_blob_storage 2 | insurance_exception 3 | logger 4 | mongo_db 5 | prediction 6 | streaming 7 | training 8 | -------------------------------------------------------------------------------- /logger/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/logger/__init__.py -------------------------------------------------------------------------------- /logger/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/logger/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /logger/__pycache__/logger.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/logger/__pycache__/logger.cpython-37.pyc -------------------------------------------------------------------------------- /logger/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | import uuid 4 | import sys 5 | from insurance_exception.insurance_exception import InsuranceException as AppLoggerException 6 | from mongo_db.mongo_db_atlas import MongoDBOperation 7 | 8 | 9 | class AppLogger: 10 | def __init__(self, project_id, log_database, log_collection_name, executed_by, 11 | execution_id, is_log_enable=True): 12 | try: 13 | 14 | self.log_database = log_database 15 | self.log_collection_name = log_collection_name 16 | self.executed_by = executed_by 17 | self.execution_id = execution_id 18 | self.mongo_db_object = MongoDBOperation() 19 | self.project_id = project_id 20 | self.is_log_enable = is_log_enable 21 | except Exception as e: 22 | app_logger_exception = AppLoggerException( 23 | "Error occurred in module [{0}] class [{1}] method [{2}]" 24 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 25 | "__init__")) 26 | raise Exception(app_logger_exception.error_message_detail(str(e), sys)) from e 27 | 28 | def log(self, log_message): 29 | try: 30 | if not self.is_log_enable: 31 | return 0 32 | log_data = { 33 | 'execution_id': self.execution_id, 34 | 'message': log_message, 35 | 'executed_by': self.executed_by, 36 | 'project_id': self.project_id, 37 | 'updated_date_and_time': datetime.now().strftime("%H:%M:%S") 38 | } 39 | 40 | self.mongo_db_object.insert_record_in_collection( 41 | self.log_database, self.log_collection_name, log_data) 42 | except Exception as e: 43 | app_logger_exception = AppLoggerException( 44 | "Error occurred in module [{0}] class [{1}] method [{2}]" 45 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 46 | self.log.__name__)) 47 | raise Exception(app_logger_exception.error_message_detail(str(e), sys)) from e 48 | -------------------------------------------------------------------------------- /mongo_db/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/mongo_db/__init__.py -------------------------------------------------------------------------------- /mongo_db/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/mongo_db/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /mongo_db/__pycache__/mongo_db_atlas.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/mongo_db/__pycache__/mongo_db_atlas.cpython-37.pyc -------------------------------------------------------------------------------- /mongo_db/mongo_db_atlas.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Feb 8 06:06:50 2021 4 | 5 | @author: AvnishYadav 6 | """ 7 | # importing mongodb file 8 | import ssl 9 | import pymongo 10 | import json 11 | import pandas as pd 12 | import sys 13 | from insurance_exception.insurance_exception import InsuranceException as MongoDbException 14 | 15 | 16 | class MongoDBOperation: 17 | def __init__(self, user_name=None, password=None): 18 | try: 19 | if user_name is None or password is None: 20 | # creating initial object to fetch mongodb credentials 21 | credentials = { 22 | "user_name": "avnyadav", 23 | "password": "Aa327030" 24 | } # get_mongo_db_credentials() # return dictionary with user name and password 25 | self.__user_name = credentials['user_name'] 26 | self.__password = credentials['password'] 27 | else: 28 | self.__user_name = user_name 29 | self.__password = password 30 | 31 | except Exception as e: 32 | mongo_db_exception = MongoDbException( 33 | "Failed to instantiate mongo_db_object in module [{0}] class [{1}] method [{2}]" 34 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 35 | "__init__")) 36 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 37 | 38 | def get_mongo_db_url(self): 39 | """ 40 | :return: mongo_db_url 41 | """ 42 | try: 43 | url = "" 44 | return url 45 | except Exception as e: 46 | mongo_db_exception = MongoDbException( 47 | "Failed to fetch mongo_db url in module [{0}] class [{1}] method [{2}]" 48 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 49 | self.get_mongo_db_url.__name__)) 50 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 51 | 52 | def get_database_client_object(self): 53 | """ 54 | Return pymongoClient object to perform action with MongoDB 55 | """ 56 | try: 57 | 58 | url = 'mongodb+srv://{0}:{1}@cluster0.wz7et.mongodb.net/myFirstDatabase?retryWrites=true&w=majority'.format( 59 | self.__user_name, self.__password) 60 | client = pymongo.MongoClient(url, ssl_cert_reqs=ssl.CERT_NONE) # creating database client object 61 | return client 62 | except Exception as e: 63 | mongo_db_exception = MongoDbException( 64 | "Failed to fetch data base client object in module [{0}] class [{1}] method [{2}]" 65 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 66 | self.get_database_client_object.__name__)) 67 | raise 
Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 68 | 69 | def close_database_client_object(self, obj_name): 70 | """ 71 | 72 | 73 | Parameters 74 | ---------- 75 | obj_name : pymongo client 76 | DESCRIPTION.pymongo client object 77 | 78 | Raises 79 | ------ 80 | Exception 81 | Failed to close database connection-->. 82 | 83 | Returns 84 | ------- 85 | bool 86 | True if connection closed. 87 | 88 | """ 89 | try: 90 | obj_name.close() 91 | return True 92 | except Exception as e: 93 | mongo_db_exception = MongoDbException( 94 | "Failed to close database client object in module [{0}] class [{1}] method [{2}]" 95 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 96 | self.close_database_client_object.__name__)) 97 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 98 | 99 | def is_database_present(self, client, db_name): 100 | """ 101 | 102 | Parameters 103 | ---------- 104 | client : pymongo client 105 | DESCRIPTION. object which will be used to communicate with MongoDB 106 | db_name : string 107 | database name. 108 | 109 | Raises 110 | ------ 111 | Exception 112 | DESCRIPTION.If any exception occurs 113 | 114 | Returns 115 | ------- 116 | bool 117 | True if database already exists. 118 | 119 | """ 120 | try: 121 | if db_name in client.list_database_names(): 122 | return True 123 | else: 124 | return False 125 | except Exception as e: 126 | mongo_db_exception = MongoDbException( 127 | "Failed during checking database in module [{0}] class [{1}] method [{2}]" 128 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 129 | self.is_database_present.__name__)) 130 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 131 | 132 | def create_database(self, client, db_name): 133 | """ 134 | client: client object of database 135 | db_name: database name 136 | """ 137 | try: 138 | return client[db_name] 139 | except Exception as e: 140 | mongo_db_exception = MongoDbException( 141 | "Failure occurred during database creation steps in module [{0}] class [{1}] method [{2}]" 142 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 143 | self.create_database.__name__)) 144 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 145 | 146 | def create_collection_in_database(self, database, collection_name): 147 | """ 148 | database:database 149 | collection_name: name of collection 150 | return: 151 | collection object 152 | """ 153 | try: 154 | return database[collection_name] 155 | except Exception as e: 156 | mongo_db_exception = MongoDbException( 157 | "Failed during creating collection in database in module [{0}] class [{1}] method [{2}]" 158 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 159 | self.create_collection_in_database.__name__)) 160 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 161 | 162 | def is_collection_present(self, collection_name, database): 163 | """ 164 | 165 | 166 | Parameters 167 | ---------- 168 | collection_name : collection_name 169 | DESCRIPTION.collection name which needs to be verified 170 | database : TYPE 171 | DESCRIPTION.database in which collection needs to be checked for existence 172 | 173 | Raises 174 | ------ 175 | Exception 176 | DESCRIPTION. 177 | 178 | Returns 179 | ------- 180 | bool 181 | true if collection present in database.
182 |  183 | """ 184 | try: 185 | """It verifies the existence of collection name in a database""" 186 | collection_list = database.list_collection_names() 187 | 188 | if collection_name in collection_list: 189 | # print("Collection:'{COLLECTION_NAME}' in Database:'{DB_NAME}' exists") 190 | return True 191 | 192 | # print(f"Collection:'{COLLECTION_NAME}' in Database:'{DB_NAME}' does not exists OR \n no documents are present in the collection") 193 | return False 194 | except Exception as e: 195 | mongo_db_exception = MongoDbException( 196 | "Failed during checking collection in module [{0}] class [{1}] method [{2}]" 197 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 198 | self.is_collection_present.__name__)) 199 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 200 | 201 | def get_collection(self, collection_name, database): 202 | """ 203 | collection_name:collection name 204 | database=database 205 | ------------------------------------------ 206 | return collection object 207 | """ 208 | try: 209 | collection = self.create_collection_in_database(database, collection_name) 210 | return collection 211 | except Exception as e: 212 | mongo_db_exception = MongoDbException( 213 | "Failed in retrieval of collection in module [{0}] class [{1}] method [{2}]" 214 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 215 | self.get_collection.__name__)) 216 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 217 | 218 | def is_record_present(self, db_name, collection_name, record): 219 | """ 220 | db_name: database name 221 | collection_name: collection name 222 | record: records to search 223 | ---------------------------------------------- 224 | return True if record exists else return false 225 | """ 226 | try: 227 | client = self.get_database_client_object() # client object 228 | database = self.create_database(client, db_name) # database object 229 | collection = self.get_collection(collection_name, database) # collection object 230 | record_found = collection.find(record) # fetching record 231 | if record_found.count() > 0: 232 | client.close() 233 | return True 234 | else: 235 | client.close() 236 | return False 237 | except Exception as e: 238 | mongo_db_exception = MongoDbException( 239 | "Failed in fetching record in module [{0}] class [{1}] method [{2}]" 240 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 241 | self.is_record_present.__name__)) 242 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 243 | 244 | def create_record(self, collection, data): 245 | """ 246 | collection: collection object 247 | data: accepts a single record to insert into the collection 248 | ------------------------------------------- 249 | return 1 if record inserted 250 | """ 251 | try: 252 | collection.insert_one(data) # insertion of record in collection 253 | return 1 254 | except Exception as e: 255 | mongo_db_exception = MongoDbException( 256 | "Failed in inserting record in module [{0}] class [{1}] method [{2}]" 257 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 258 | self.create_record.__name__)) 259 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 260 | 261 | def create_records(self, collection, data): 262 | """ 263 | collection: collection object 264 | data: data which needs to be inserted 265 | -------------------------------------------- 266 | return no of record inserted 267 | """ 268 |
try: 269 | collection.insert_many(data) 270 | return len(data) 271 | except Exception as e: 272 | mongo_db_exception = MongoDbException( 273 | "Failed in inserting records in module [{0}] class [{1}] method [{2}]" 274 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 275 | self.create_records.__name__)) 276 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 277 | 278 | def insert_record_in_collection(self, db_name, collection_name, record): 279 | """ 280 | db_name: database name 281 | collection_name: collection name 282 | record: records to insert 283 | ------------------------------ 284 | return No of record inserted(int). 285 | """ 286 | try: 287 | no_of_row_inserted = 0 288 | client = self.get_database_client_object() 289 | database = self.create_database(client, db_name) 290 | collection = self.get_collection(collection_name, database) 291 | if not self.is_record_present(db_name, collection_name, record): 292 | no_of_row_inserted = self.create_record(collection=collection, data=record) 293 | client.close() 294 | return no_of_row_inserted 295 | except Exception as e: 296 | mongo_db_exception = MongoDbException( 297 | "Failed in inserting record in collection module [{0}] class [{1}] method [{2}]" 298 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 299 | self.insert_record_in_collection.__name__)) 300 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 301 | 302 | def drop_collection(self, db_name, collection_name): 303 | """ 304 | 305 | :param db_name: database name 306 | :param collection_name: collection name 307 | :return: True if collection dropped successfully. 308 | """ 309 | try: 310 | client = self.get_database_client_object() 311 | database = self.create_database(client, db_name) 312 | if self.is_collection_present(collection_name, database): 313 | collection_name = self.get_collection(collection_name, database) 314 | collection_name.drop() 315 | return True 316 | except Exception as e: 317 | mongo_db_exception = MongoDbException( 318 | "Failed in dropping collection module [{0}] class [{1}] method [{2}]" 319 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 320 | self.drop_collection.__name__)) 321 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 322 | 323 | def insert_records_in_collection(self, db_name, collection_name, records): 324 | """ 325 | db_name: database name 326 | collection_name: collection name 327 | records: records to insert 328 | """ 329 | try: 330 | no_of_row_inserted = 0 331 | client = self.get_database_client_object() 332 | database = self.create_database(client, db_name) 333 | collection = self.get_collection(collection_name, database) 334 | for record in records: 335 | if not self.is_record_present(db_name, collection_name, record): 336 | no_of_row_inserted = no_of_row_inserted + self.create_record(collection=collection, data=record) 337 | client.close() 338 | return no_of_row_inserted 339 | except Exception as e: 340 | mongo_db_exception = MongoDbException( 341 | "Failed in inserting records in collection module [{0}] class [{1}] method [{2}]" 342 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 343 | self.insert_records_in_collection.__name__)) 344 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 345 | 346 | def insert_dataframe_into_collection(self, db_name, collection_name, data_frame): 347 | """ 348 | db_name:Database Name 349 |
collection_name: collection name 350 | data_frame: dataframe which needs to be inserted 351 | return: 352 | 353 | """ 354 | try: 355 | data_frame.reset_index(drop=True, inplace=True) 356 | records = list(json.loads(data_frame.T.to_json()).values()) 357 | client = self.get_database_client_object() 358 | database = self.create_database(client, db_name) 359 | collection = self.get_collection(collection_name, database) 360 | collection.insert_many(records) 361 | return len(records) 362 | except Exception as e: 363 | mongo_db_exception = MongoDbException( 364 | "Failed in inserting dataframe in collection module [{0}] class [{1}] method [{2}]" 365 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 366 | self.insert_dataframe_into_collection.__name__)) 367 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 368 | 369 | def get_record(self, database_name, collection_name, query=None): 370 | try: 371 | client = self.get_database_client_object() 372 | database = self.create_database(client, database_name) 373 | collection = self.get_collection(collection_name=collection_name, database=database) 374 | record = collection.find_one(query) 375 | return record 376 | except Exception as e: 377 | mongo_db_exception = MongoDbException( 378 | "Failed in retriving record in collection module [{0}] class [{1}] method [{2}]" 379 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 380 | self.get_record.__name__)) 381 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 382 | 383 | def get_min_value_of_column(self, database_name, collection_name, query, column): 384 | """ 385 | 386 | :param database_name: 387 | :param collection_name: 388 | :param query: to get all record 389 | :param column: column name 390 | :return: minimum value 391 | """ 392 | try: 393 | client = self.get_database_client_object() 394 | database = self.create_database(client, database_name) 395 | collection = self.get_collection(collection_name=collection_name, database=database) 396 | min_value = collection.find(query).sort(column, pymongo.ASCENDING).limit(1) 397 | value = [min_val for min_val in min_value] 398 | if len(value) > 0: 399 | if column in value[0]: 400 | return value[0][column] 401 | else: 402 | return None 403 | else: 404 | return None 405 | except Exception as e: 406 | mongo_db_exception = MongoDbException( 407 | "Failed in getting minimum value from column in collection module [{0}] class [{1}] method [{2}]" 408 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 409 | self.get_record.__name__)) 410 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 411 | 412 | def get_max_value_of_column(self, database_name, collection_name, query, column): 413 | """ 414 | 415 | :param database_name: database name 416 | :param collection_name: collection name 417 | :param query: query 418 | :param column: column name 419 | :return: maximum value 420 | """ 421 | try: 422 | client = self.get_database_client_object() 423 | database = self.create_database(client, database_name) 424 | collection = self.get_collection(collection_name=collection_name, database=database) 425 | max_value = collection.find(query).sort(column, pymongo.DESCENDING).limit(1) 426 | value = [max_val for max_val in max_value] 427 | if len(value) > 0: 428 | if column in value[0]: 429 | return value[0][column] 430 | else: 431 | return None 432 | else: 433 | return None 434 | 435 | except Exception as e: 436 | 
mongo_db_exception = MongoDbException( 437 | "Failed in getting maximum value from column in collection module [{0}] class [{1}] method [{2}]" 438 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 439 | self.get_record.__name__)) 440 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 441 | 442 | def get_records(self, database_name, collection_name, query=None): 443 | """ 444 | 445 | :param database_name: 446 | :param collection_name: 447 | :param query: 448 | :return: cursor object you need to iterate 449 | """ 450 | try: 451 | client = self.get_database_client_object() 452 | database = self.create_database(client, database_name) 453 | collection = self.get_collection(collection_name=collection_name, database=database) 454 | record = collection.find(query) 455 | return record 456 | except Exception as e: 457 | mongo_db_exception = MongoDbException( 458 | "Failed in retriving records in collection module [{0}] class [{1}] method [{2}]" 459 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 460 | self.get_record.__name__)) 461 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 462 | 463 | def update_record_in_collection(self, database_name, collection_name, query, new_value): 464 | """ 465 | 466 | :param database_name: database name 467 | :param collection_name: collection name 468 | :param query: search for record 469 | :param new_value: updated values 470 | :return: n_updated row 471 | """ 472 | try: 473 | client = self.get_database_client_object() 474 | database = self.create_database(client, database_name) 475 | collection = self.get_collection(collection_name=collection_name, database=database) 476 | update_query = {'$set': new_value} 477 | result = collection.update_one(query, update_query) 478 | client.close() 479 | return result.raw_result["nModified"] 480 | except Exception as e: 481 | mongo_db_exception = MongoDbException( 482 | "Failed updating record in collection module [{0}] class [{1}] method [{2}]" 483 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 484 | self.update_record_in_collection.__name__)) 485 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 486 | 487 | def get_dataframe_of_collection(self, db_name, collection_name, query=None): 488 | """ 489 | 490 | Parameters 491 | ---------- 492 | db_name : string 493 | DESCRIPTION. database name 494 | collection_name : string 495 | DESCRIPTION.collection name 496 | 497 | Returns 498 | ------- 499 | Pandas data frame of collection name present database. 
500 | 501 | """ 502 | try: 503 | client = self.get_database_client_object() 504 | database = self.create_database(client, db_name) 505 | collection = self.get_collection(collection_name=collection_name, database=database) 506 | if query is None: 507 | query = {} 508 | df = pd.DataFrame(list(collection.find(query))) 509 | if "_id" in df.columns.to_list(): 510 | df = df.drop(columns=["_id"], axis=1) 511 | return df.copy() 512 | except Exception as e: 513 | mongo_db_exception = MongoDbException( 514 | "Failed in returning dataframe of collection module [{0}] class [{1}] method [{2}]" 515 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 516 | self.get_dataframe_of_collection.__name__)) 517 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 518 | 519 | def remove_record(self, db_name, collection_name, query): 520 | try: 521 | client = self.get_database_client_object() 522 | database = self.create_database(client, db_name) 523 | collection = self.get_collection(collection_name=collection_name, database=database) 524 | collection.delete_one(query) 525 | return True 526 | except Exception as e: 527 | mongo_db_exception = MongoDbException( 528 | "Failed in collection module [{0}] class [{1}] method [{2}]" 529 | .format(MongoDBOperation.__module__.__str__(), MongoDBOperation.__name__, 530 | self.remove_record.__name__)) 531 | raise Exception(mongo_db_exception.error_message_detail(str(e), sys)) from e 532 | -------------------------------------------------------------------------------- /new_data.csv: -------------------------------------------------------------------------------- 1 | age,sex,bmi,children,smoker,region,timestamp,sex_encoder,smoker_encoder,sex_encoded,smoker_encoded,input_features,prediction 2 | 21,male,25.7,2,no,northeast,2021-11-13 11:57:30.400,0.0,0.0,"(1,[0],[1.0])","(1,[0],[1.0])","[21.0,25.7,2.0,1.0,1.0]",6333.757487873788 3 | 39,female,34.3,5,no,southeast,2021-11-13 11:57:31.410,1.0,0.0,"(1,[],[])","(1,[0],[1.0])","[39.0,34.3,5.0,0.0,1.0]",9860.294896652984 4 | 50,female,28.2,3,no,southeast,2021-11-13 11:57:32.416,1.0,0.0,"(1,[],[])","(1,[0],[1.0])","[50.0,28.2,3.0,0.0,1.0]",12142.963316508103 5 | 34,female,23.6,0,no,northeast,2021-11-13 11:57:33.420,1.0,0.0,"(1,[],[])","(1,[0],[1.0])","[34.0,23.6,0.0,0.0,1.0]",6761.00125859063 6 | 22,female,20.2,0,no,northwest,2021-11-13 11:57:34.425,1.0,0.0,"(1,[],[])","(1,[0],[1.0])","[22.0,20.2,0.0,0.0,1.0]",3976.837587719984 7 | -------------------------------------------------------------------------------- /prediction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/prediction/__init__.py -------------------------------------------------------------------------------- /prediction/stage_00_data_loader.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import shutil 5 | from utility import read_params, get_logger_object_of_prediction 6 | from insurance_exception.insurance_exception import InsuranceException as GenericException 7 | from utility import clean_data_source_dir 8 | 9 | log_collection_name = "data_loader" 10 | 11 | 12 | def loader_main(config_path: str, datasource: str,is_logging_enable=True,execution_id=None,executed_by=None) -> None: 13 | try: 14 | logger = get_logger_object_of_prediction(config_path=config_path, 
collection_name=log_collection_name, 15 | execution_id=execution_id, executed_by=executed_by) 16 | 17 | 18 | logger.is_log_enable = is_logging_enable 19 | logger.log("Starting data loading operation.\nReading configuration file.") 20 | 21 | config = read_params(config_path) 22 | downloader_path=config['data_download']['cloud_prediction_directory_path'] 23 | download_path=config['data_source']['Prediction_Batch_Files'] 24 | 25 | 26 | logger.log("Configuration detail has been fetched from configuration file.") 27 | # removing existing training and additional training files from local 28 | logger.log(f"Cleaning local directory [{download_path}] for training.") 29 | clean_data_source_dir(download_path,logger=logger, is_logging_enable=is_logging_enable) # removing existing file from local system 30 | 31 | logger.log(f"Cleaning completed. Directory has been cleared now [{download_path}]") 32 | # downloading training and additional training file from cloud into local system 33 | logger.log("Data will be downloaded from cloud storage into local system") 34 | 35 | 36 | for file in os.listdir(downloader_path): 37 | if '.dvc' in file or '.gitignore' in file: 38 | continue 39 | print(f"Source dir: {downloader_path} file: {file} is being copied into destination dir: {download_path}" 40 | f" file: {file}") 41 | shutil.copy(os.path.join(downloader_path,file),os.path.join(download_path,file)) 42 | logger.log("Data has been downloaded from cloud storage into local system") 43 | 44 | except Exception as e: 45 | generic_exception = GenericException( 46 | "Error occurred in module [{0}] method [{1}]" 47 | .format(loader_main.__module__, 48 | loader_main.__name__)) 49 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 50 | 51 | 52 | if __name__ == '__main__': 53 | args = argparse.ArgumentParser() 54 | args.add_argument("--config", default=os.path.join("config", "params.yaml")) 55 | args.add_argument("--datasource", default=None) 56 | parsed_args = args.parse_args() 57 | print("started") 58 | loader_main(config_path=parsed_args.config, datasource=parsed_args.datasource) 59 | -------------------------------------------------------------------------------- /prediction/stage_01_data_validator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | 5 | import pandas as pd 6 | from utility import read_params, create_directory_path, values_from_schema_function, \ 7 | get_logger_object_of_prediction, get_date, get_time 8 | import argparse 9 | import shutil 10 | 11 | from insurance_exception.insurance_exception import InsuranceException as GenericException 12 | 13 | log_collection_name = "data_validator" 14 | 15 | 16 | class DataValidator: 17 | def __init__(self, config, logger, is_logging_enable=True): 18 | try: 19 | self.logger = logger 20 | self.logger.is_log_enable = is_logging_enable 21 | self.config = config 22 | self.file_path = self.config['data_source']['Prediction_Batch_Files'] 23 | self.good_file_path = self.config['artifacts']['prediction_data']['good_file_path'] 24 | self.bad_file_path = self.config['artifacts']['prediction_data']['bad_file_path'] 25 | self.archive_bad_file_path = self.config['artifacts']['prediction_data']['archive_bad_file_path'] 26 | self.prediction_schema_file = self.config['config']['schema_prediction'] 27 | except Exception as e: 28 | generic_exception = GenericException( 29 | "Error occurred in module [{0}] class [{1}] method [{2}]" 30 | .format(self.__module__, 
DataValidator.__name__, 31 | self.__init__.__name__)) 32 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 33 | 34 | def archive_bad_files(self): 35 | try: 36 | folder_name=f"bad_files_{get_date().replace('-','_')}_{get_time().replace(':','_')}" 37 | archive_directory_path=os.path.join(self.archive_bad_file_path,folder_name) 38 | create_directory_path(archive_directory_path) 39 | for file in os.listdir(self.bad_file_path): 40 | source_file_path=os.path.join(self.bad_file_path,file) 41 | shutil.move(source_file_path,archive_directory_path) 42 | except Exception as e: 43 | generic_exception = GenericException( 44 | "Error occurred in module [{0}] class [{1}] method [{2}]" 45 | .format(self.__module__, DataValidator.__name__, 46 | self.archive_bad_files.__name__)) 47 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 48 | 49 | 50 | def create_good_bad_archive_bad_file_path(self): 51 | try: 52 | create_directory_path(self.good_file_path) 53 | create_directory_path(self.bad_file_path) 54 | create_directory_path(self.archive_bad_file_path,is_recreate=False) 55 | except Exception as e: 56 | 57 | generic_exception = GenericException( 58 | "Error occurred in module [{0}] class [{1}] method [{2}]" 59 | .format(self.__module__, DataValidator.__name__, 60 | self.create_good_bad_archive_bad_file_path.__name__)) 61 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 62 | 63 | 64 | def value_from_schema(self): 65 | """ 66 | 67 | :return: tuple (sample_file_name,column_names,number_of_column) 68 | """ 69 | try: 70 | return values_from_schema_function(self.prediction_schema_file) 71 | except Exception as e: 72 | generic_exception = GenericException( 73 | "Error occurred in module [{0}] class [{1}] method [{2}]" 74 | .format(self.__module__, DataValidator.__name__, 75 | self.value_from_schema.__name__)) 76 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 77 | 78 | 79 | def file_name_regular_expression(self): 80 | """ 81 | 82 | :return: regular expression syntax which can be used for validation of file name 83 | """ 84 | return "['HealthPrem']+['\_'']+[\d_]+[\d]+\.csv" 85 | 86 | def validate_missing_values_in_whole_column(self): 87 | try: 88 | self.logger.log("Missing Values Validation Started!!") 89 | for file in os.listdir(self.good_file_path): 90 | csv = pd.read_csv(os.path.join(self.good_file_path, file)) 91 | count = 0 92 | for columns in csv: 93 | if (len(csv[columns]) - csv[columns].count()) == len(csv[columns]): 94 | count += 1 95 | shutil.move(os.path.join(self.good_file_path, file), 96 | self.bad_file_path) 97 | self.logger.log( 98 | "Invalid Column Length for the file!! 
File moved to Bad Raw Folder :: %s" % file) 99 | break 100 | if count == 0: 101 | print(csv.columns) 102 | csv.rename(columns={"Unnamed: 0": "Premium "}, inplace=True) 103 | csv.to_csv(os.path.join(self.good_file_path, file), index=None, header=True) 104 | except Exception as e: 105 | generic_exception = GenericException( 106 | "Error occurred in module [{0}] class [{1}] method [{2}]" 107 | .format(self.__module__, DataValidator.__name__, 108 | self.validate_missing_values_in_whole_column.__name__)) 109 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 110 | 111 | 112 | def validate_file_name(self): 113 | try: 114 | self.create_good_bad_archive_bad_file_path() 115 | pattern, length_of_date_stamp_in_file, length_of_time_stamp_in_file, column_names, number_of_columns = self.value_from_schema() 116 | 117 | file_name_reg_pattern = self.file_name_regular_expression() 118 | self.logger.log(f"Validating file names.") 119 | files = os.listdir(self.file_path) 120 | for file in files: 121 | file_path = os.path.join(self.file_path, file) 122 | split_at_dot = re.split('.csv', file) 123 | split_at_dot = (re.split('_', split_at_dot[0])) 124 | if re.match(file_name_reg_pattern, file) and len(split_at_dot[1]) == length_of_date_stamp_in_file \ 125 | and len(split_at_dot[2]) == length_of_time_stamp_in_file: 126 | destination_file_path = os.path.join(self.good_file_path, file) 127 | self.logger.log(f"file name : {file} matched hence moving file to good file path {destination_file_path}") 128 | shutil.move(file_path, destination_file_path) 129 | else: 130 | destination_file_path = os.path.join(self.bad_file_path, file) 131 | self.logger.log(f"file name: {file} does not matched hence moving file to bad file path {destination_file_path}") 132 | shutil.move(file_path, destination_file_path) 133 | except Exception as e: 134 | generic_exception = GenericException( 135 | "Error occurred in module [{0}] class [{1}] method [{2}]" 136 | .format(self.__module__, DataValidator.__name__, 137 | self.validate_file_name.__name__)) 138 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 139 | 140 | 141 | def validate_no_of_column(self, no_of_column): 142 | """ 143 | Description: 144 | If number of column matches then file will be move to good file path else bad file path 145 | ===================================================================================== 146 | :param no_of_column: int Number of column must present in each file 147 | :return: Nothing 148 | """ 149 | try: 150 | self.logger.log(f"Validating number of column in input file") 151 | files = os.listdir(self.good_file_path) 152 | for file in files: 153 | file_path = os.path.join(self.good_file_path, file) 154 | df = pd.read_csv(file_path) 155 | if df.shape[1] != no_of_column: 156 | destination_file_path = os.path.join(self.bad_file_path, file) 157 | self.logger.log(f"file: {file} has incorrect number of column hence moving file to bad file path {destination_file_path}") 158 | shutil.move(file_path, destination_file_path) 159 | except Exception as e: 160 | generic_exception = GenericException( 161 | "Error occurred in module [{0}] class [{1}] method [{2}]" 162 | .format(self.__module__, DataValidator.__name__, 163 | self.validate_no_of_column.__name__)) 164 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 165 | 166 | 167 | def validation_main(config_path: str, datasource: str, is_logging_enable=True,execution_id=None,executed_by=None) -> None: 168 | try: 169 | logger = 
get_logger_object_of_prediction(config_path=config_path, collection_name=log_collection_name, 170 | execution_id=execution_id, executed_by=executed_by) 171 | 172 | logger.is_log_enable = is_logging_enable 173 | config = read_params(config_path) 174 | logger.log("data validation started") 175 | data_validator = DataValidator(config=config, logger=logger, is_logging_enable=is_logging_enable) 176 | pattern, length_of_date_stamp_in_file, length_of_time_stamp_in_file, column_names, number_of_columns = \ 177 | data_validator.value_from_schema() 178 | data_validator.validate_file_name() 179 | data_validator.validate_no_of_column(no_of_column=number_of_columns) 180 | data_validator.validate_missing_values_in_whole_column() 181 | data_validator.archive_bad_files() 182 | except Exception as e: 183 | generic_exception = GenericException( 184 | "Error occurred in module [{0}] method [{1}]" 185 | .format(validation_main.__module__, 186 | validation_main.__name__)) 187 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 188 | 189 | 190 | if __name__ == '__main__': 191 | args = argparse.ArgumentParser() 192 | args.add_argument("--config", default=os.path.join("config", "params.yaml")) 193 | args.add_argument("--datasource", default=None) 194 | parsed_args = args.parse_args() 195 | print("started") 196 | validation_main(config_path=parsed_args.config, datasource=parsed_args.datasource) 197 | -------------------------------------------------------------------------------- /prediction/stage_02_data_transformer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import pandas as pd 5 | import argparse 6 | from utility import read_params, get_logger_object_of_prediction 7 | from mongo_db.mongo_db_atlas import MongoDBOperation 8 | from insurance_exception.insurance_exception import InsuranceException as GenericException 9 | 10 | log_collection_name = "data_transformer" 11 | 12 | 13 | class DataTransformer: 14 | def __init__(self, config, logger, is_log_enable=True): 15 | try: 16 | self.config = config 17 | self.logger = logger 18 | self.logger.is_log_enable = is_log_enable 19 | self.good_file_path = self.config["artifacts"]['prediction_data']['good_file_path'] 20 | self.unwanted_column_names=self.config["dataset"]['unwanted_column'] 21 | self.mongo_db=MongoDBOperation() 22 | self.dataset_database=self.config["dataset"]["database_detail"]["prediction_database_name"] 23 | self.dataset_collection_name=self.config["dataset"]["database_detail"]["dataset_prediction_collection_name"] 24 | self.mongo_db.drop_collection(self.dataset_database,self.dataset_collection_name) 25 | except Exception as e: 26 | generic_exception = GenericException( 27 | "Error occurred in module [{0}] class [{1}] method [{2}]" 28 | .format(self.__module__, DataTransformer.__name__, 29 | self.__init__.__name__)) 30 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 31 | 32 | def unite_dataset(self): 33 | try: 34 | dataset_list=[] 35 | for file in os.listdir(self.good_file_path): 36 | dataset_list.append(pd.read_csv(os.path.join(self.good_file_path,file))) 37 | df=pd.concat(dataset_list) 38 | df=self.remove_unwanted_column(df) 39 | self.logger.log(f"Inserting dataset into database {self.dataset_database} " 40 | f"collection_name: {self.dataset_collection_name}") 41 | self.mongo_db.insert_dataframe_into_collection(self.dataset_database,self.dataset_collection_name,df) 42 | except Exception as e: 43 | generic_exception = 
GenericException( 44 | "Error occurred in module [{0}] class [{1}] method [{2}]" 45 | .format(self.__module__, DataTransformer.__name__, 46 | self.unite_dataset.__name__)) 47 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 48 | 49 | 50 | def remove_unwanted_column(self,df): 51 | try: 52 | drop_column=list(filter(lambda x: x in df.columns ,self.unwanted_column_names)) 53 | return df.drop(drop_column,axis=1) 54 | except Exception as e: 55 | generic_exception = GenericException( 56 | "Error occurred in module [{0}] class [{1}] method [{2}]" 57 | .format(self.__module__, DataTransformer.__name__, 58 | self.remove_unwanted_column.__name__)) 59 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 60 | 61 | 62 | def transform_main(config_path: str, datasource: str, is_logging_enable=True,execution_id=None,executed_by=None) -> None: 63 | try: 64 | logger = get_logger_object_of_prediction(config_path=config_path, collection_name=log_collection_name, 65 | execution_id=execution_id, executed_by=executed_by) 66 | 67 | logger.is_log_enable = is_logging_enable 68 | config = read_params(config_path) 69 | data_transformer = DataTransformer(config=config, logger=logger, is_log_enable=is_logging_enable) 70 | logger.log('Start of Data Preprocessing before DB') 71 | data_transformer.unite_dataset() 72 | logger.log('Data Preprocessing before DB Completed !!') 73 | 74 | except Exception as e: 75 | generic_exception = GenericException( 76 | "Error occurred in module [{0}] method [{1}]" 77 | .format(transform_main.__module__, 78 | transform_main.__name__)) 79 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 80 | 81 | 82 | if __name__ == '__main__': 83 | args = argparse.ArgumentParser() 84 | args.add_argument("--config", default=os.path.join("config", "params.yaml")) 85 | args.add_argument("--datasource", default=None) 86 | parsed_args = args.parse_args() 87 | print("started") 88 | transform_main(config_path=parsed_args.config, datasource=parsed_args.datasource) 89 | -------------------------------------------------------------------------------- /prediction/stage_03_data_exporter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from utility import read_params, create_directory_path 5 | from mongo_db.mongo_db_atlas import MongoDBOperation 6 | import argparse 7 | from utility import get_logger_object_of_prediction 8 | from insurance_exception.insurance_exception import InsuranceException as GenericException 9 | 10 | log_collection_name = "data_export" 11 | 12 | 13 | class DataExporter: 14 | def __init__(self, config, logger, is_log_enable): 15 | try: 16 | self.config = config 17 | self.logger = logger 18 | self.is_log_enable = is_log_enable 19 | self.mongo_db = MongoDBOperation() 20 | self.dataset_database = self.config["dataset"]["database_detail"]["prediction_database_name"] 21 | self.dataset_collection_name = self.config["dataset"]["database_detail"]["dataset_prediction_collection_name"] 22 | self.prediction_file_from_db = self.config["artifacts"]['prediction_data']['prediction_file_from_db'] 23 | self.master_csv = self.config["artifacts"]['prediction_data']['master_csv'] 24 | except Exception as e: 25 | generic_exception = GenericException( 26 | "Error occurred in module [{0}] class [{1}] method [{2}]" 27 | .format(self.__module__, DataExporter.__name__, 28 | self.__init__.__name__)) 29 | raise Exception(generic_exception.error_message_detail(str(e), sys)) 
from e 30 | 31 | def export_dataframe_from_database(self): 32 | try: 33 | create_directory_path(self.prediction_file_from_db) 34 | self.logger.log(f"Creating dataframe of data stored db" 35 | f"[{self.dataset_database}] and collection[{self.dataset_collection_name}]") 36 | df = self.mongo_db.get_dataframe_of_collection(db_name=self.dataset_database, 37 | collection_name=self.dataset_collection_name) 38 | master_csv_file_path = os.path.join(self.prediction_file_from_db, self.master_csv) 39 | self.logger.log(f"master csv file will be generated at " 40 | f"{master_csv_file_path}.") 41 | df.to_csv(master_csv_file_path, index=None,header=True) 42 | 43 | except Exception as e: 44 | generic_exception = GenericException( 45 | "Error occurred in module [{0}] class [{1}] method [{2}]" 46 | .format(self.__module__, DataExporter.__name__, 47 | self.export_dataframe_from_database.__name__)) 48 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 49 | 50 | 51 | def export_main(config_path: str, is_logging_enable=True,execution_id=None,executed_by=None) -> None: 52 | try: 53 | logger = get_logger_object_of_prediction(config_path=config_path, collection_name=log_collection_name, 54 | execution_id=execution_id, executed_by=executed_by) 55 | 56 | logger.is_log_enable = is_logging_enable 57 | config = read_params(config_path) 58 | data_exporter = DataExporter(config=config, logger=logger, is_log_enable=is_logging_enable) 59 | logger.log("Generating csv file from dataset stored in database.") 60 | data_exporter.export_dataframe_from_database() 61 | logger.log("Dataset has been successfully exported in directory and exiting export pipeline.") 62 | except Exception as e: 63 | generic_exception = GenericException( 64 | "Error occurred in module [{0}] method [{1}]" 65 | .format(export_main.__module__, 66 | export_main.__name__)) 67 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 68 | 69 | 70 | if __name__ == '__main__': 71 | args = argparse.ArgumentParser() 72 | args.add_argument("--config", default=os.path.join("config", "params.yaml")) 73 | parsed_args = args.parse_args() 74 | print("started") 75 | export_main(config_path=parsed_args.config) 76 | -------------------------------------------------------------------------------- /prediction/stage_04_model_predictor.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | 5 | from pyspark.sql.types import IntegerType, FloatType, StringType 6 | 7 | from utility import create_directory_path,get_logger_object_of_prediction,read_params 8 | 9 | from streaming.spark_manager.spark_manager import SparkManager 10 | 11 | from insurance_exception.insurance_exception import InsuranceException as GenericException 12 | from pyspark.ml import Pipeline, PipelineModel 13 | from pyspark.ml.regression import RandomForestRegressionModel, RandomForestRegressor 14 | 15 | log_collection_name = "prediction_model" 16 | 17 | 18 | class DataPreProcessing: 19 | def __init__(self, logger, is_log_enable=True, data_frame=None, pipeline_path=None): 20 | try: 21 | self.logger = logger 22 | self.logger.is_log_enable = is_log_enable 23 | self.data_frame = data_frame 24 | print(pipeline_path) 25 | self.pipeline_obj = PipelineModel.load(pipeline_path) 26 | 27 | except Exception as e: 28 | generic_exception = GenericException( 29 | "Error occurred in module [{0}] class [{1}] method [{2}]" 30 | .format(self.__module__, DataPreProcessing.__name__, 31 | 
self.__init__.__name__)) 32 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 33 | 34 | def set_dataframe(self, dataframe): 35 | try: 36 | self.data_frame = dataframe 37 | except Exception as e: 38 | generic_exception = GenericException( 39 | "Error occurred in module [{0}] class [{1}] method [{2}]" 40 | .format(self.__module__, DataPreProcessing.__name__, 41 | self.update_dataframe_scheme.__name__)) 42 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 43 | 44 | def update_dataframe_scheme(self, schema_definition: dict): 45 | """ 46 | 47 | """ 48 | try: 49 | print(self.data_frame.printSchema()) 50 | if self.data_frame is None: 51 | raise Exception("update the attribute dataframe") 52 | for column, datatype in schema_definition.items(): 53 | self.logger.log(f"Update datatype of column: {column} to {str(datatype)}") 54 | self.data_frame = self.data_frame.withColumn(column, self.data_frame[column].cast(datatype)) 55 | except Exception as e: 56 | generic_exception = GenericException( 57 | "Error occurred in module [{0}] class [{1}] method [{2}]" 58 | .format(self.__module__, DataPreProcessing.__name__, 59 | self.update_dataframe_scheme.__name__)) 60 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 61 | 62 | def get_prepared_data(self): 63 | try: 64 | schema_definition = {"age": IntegerType(), 65 | "sex": StringType(), 66 | "bmi": FloatType(), 67 | "children": IntegerType(), 68 | "smoker": StringType(), 69 | } 70 | self.update_dataframe_scheme(schema_definition=schema_definition) 71 | self.data_frame = self.pipeline_obj.transform(self.data_frame) 72 | print(self.data_frame.printSchema()) 73 | return self.data_frame 74 | except Exception as e: 75 | generic_exception = GenericException( 76 | "Error occurred in module [{0}] class [{1}] method [{2}]" 77 | .format(self.__module__, DataPreProcessing.__name__, 78 | self.get_prepared_data.__name__)) 79 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 80 | 81 | 82 | class Predictor: 83 | 84 | def __init__(self, config, logger, is_log_enable): 85 | try: 86 | self.logger = logger 87 | self.logger.is_log_enable = is_log_enable 88 | self.config = config 89 | self.prediction_file_path = self.config['artifacts']['prediction_data']['prediction_file_from_db'] 90 | self.master_csv = self.config['artifacts']['prediction_data']['master_csv'] 91 | self.model_path = self.config['artifacts']['model']['model_path'] 92 | self.prediction_output_file_path = self.config['artifacts']['prediction_data'][ 93 | 'prediction_output_file_path'] 94 | self.prediction_file_name = self.config['artifacts']['prediction_data']['prediction_file_name'] 95 | self.target_columns = self.config['target_columns']['columns'] 96 | self.null_value_file_path = config['artifacts']['training_data']['null_value_info_file_path'] 97 | self.pipeline_path = self.config['artifacts']['training_data']['pipeline_path'] 98 | """ 99 | self.spark = SparkSession.builder. \ 100 | master("local[*]"). 
\ 101 | appName("insurance-premium-reg").getOrCreate() 102 | """ 103 | self.spark = SparkManager().get_spark_session_object() 104 | except Exception as e: 105 | generic_exception = GenericException( 106 | "Error occurred in module [{0}] class [{1}] method [{2}]" 107 | .format(self.__module__, Predictor.__name__, 108 | self.__init__.__name__)) 109 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 110 | 111 | def get_dataframe(self): 112 | try: 113 | master_file_path = os.path.join(self.prediction_file_path, self.master_csv) 114 | return self.spark.read.csv(master_file_path, header=True, inferSchema=True) 115 | except Exception as e: 116 | generic_exception = GenericException( 117 | "Error occurred in module [{0}] class [{1}] method [{2}]" 118 | .format(self.__module__, Predictor.__name__, 119 | self.get_dataframe.__name__)) 120 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 121 | 122 | def data_preparation(self): 123 | try: 124 | 125 | input_features = self.get_dataframe() 126 | data_preprocess = DataPreProcessing(logger=self.logger, 127 | is_log_enable=self.logger.is_log_enable, 128 | data_frame=input_features, 129 | pipeline_path=self.pipeline_path 130 | ) 131 | return data_preprocess.get_prepared_data() 132 | 133 | except Exception as e: 134 | generic_exception = GenericException( 135 | "Error occurred in module [{0}] class [{1}] method [{2}]" 136 | .format(self.__module__, Predictor.__name__, 137 | self.data_preparation.__name__)) 138 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 139 | 140 | 141 | def load_model(self): 142 | try: 143 | model_path = self.model_path 144 | if not os.path.exists(model_path): 145 | raise Exception(f"Model directory: {model_path} is not found.") 146 | model_names = os.listdir(model_path) 147 | if len(model_names) != 1: 148 | raise Exception(f"We have expected only one model instead we found {len(model_names)}") 149 | model_name = model_names[0] 150 | model_path = os.path.join(model_path, model_name) 151 | print(f"model path: {model_path}") 152 | return RandomForestRegressionModel.load(model_path) 153 | except Exception as e: 154 | generic_exception = GenericException( 155 | "Error occurred in module [{0}] class [{1}] method [{2}]" 156 | .format(self.__module__, Predictor.__name__, 157 | self.load_model.__name__)) 158 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 159 | 160 | def predict(self): 161 | try: 162 | 163 | input_data = self.data_preparation() 164 | model = self.load_model() 165 | print(str(model)) 166 | print(input_data.printSchema()) 167 | prediction = model.transform(input_data) 168 | prediction_output = prediction.select("age", "sex", "children", "smoker", "prediction").toPandas() 169 | create_directory_path(self.prediction_output_file_path) 170 | output_file_path = os.path.join(self.prediction_output_file_path, self.prediction_file_name) 171 | if prediction_output is not None: 172 | prediction_output.to_csv(output_file_path, index=None, header=True) 173 | except Exception as e: 174 | generic_exception = GenericException( 175 | "Error occurred in module [{0}] class [{1}] method [{2}]" 176 | .format(self.__module__, Predictor.__name__, 177 | self.predict.__name__)) 178 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 179 | 180 | 181 | def predict_main(config_path: str, datasource: str, is_logging_enable=True, execution_id=None, 182 | executed_by=None) -> None: 183 | try: 184 | logger = 
get_logger_object_of_prediction(config_path=config_path, collection_name=log_collection_name, 185 | execution_id=execution_id, executed_by=executed_by) 186 | 187 | logger.is_log_enable = is_logging_enable 188 | logger.log("Prediction begin.") 189 | config = read_params(config_path) 190 | predictor = Predictor(config=config, logger=logger, is_log_enable=is_logging_enable) 191 | predictor.predict() 192 | logger.log("Prediction completed successfully.") 193 | 194 | except Exception as e: 195 | generic_exception = GenericException( 196 | "Error occurred in module [{0}] method [{1}]" 197 | .format(predict_main.__module__, 198 | predict_main.__name__)) 199 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 200 | 201 | 202 | if __name__ == '__main__': 203 | args = argparse.ArgumentParser() 204 | args.add_argument("--config", default=os.path.join("config", "params.yaml")) 205 | args.add_argument("--datasource", default=None) 206 | parsed_args = args.parse_args() 207 | print(parsed_args.config) 208 | print(parsed_args.datasource) 209 | predict_main(config_path=parsed_args.config, datasource=parsed_args.datasource) 210 | -------------------------------------------------------------------------------- /requirement.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.0.1 2 | pymongo==3.11.0 3 | dnspython==1.16.0 4 | PyYAML 5 | pandas 6 | sklearn 7 | -e . -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="insurance-prediction", 5 | version="0.0.3", 6 | description="insurance-prediction", 7 | author="Avnish yadav", 8 | packages=find_packages(), 9 | license="MIT" 10 | ) -------------------------------------------------------------------------------- /spark_consumer_from_kafka.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyspark.ml import PipelineModel 4 | from pyspark.ml.regression import RandomForestRegressionModel 5 | 6 | from streaming.spark_manager.spark_manager import SparkManager 7 | from streaming.consumer.kafka_to_spark_csv_consumer import KafkaToSparkCSVConsumer 8 | 9 | 10 | if __name__ == "__main__": 11 | spark_session = SparkManager().get_spark_session_object() 12 | 13 | schema_string = "age INT,sex STRING,bmi DOUBLE,children INT,smoker STRING,region STRING" 14 | database_name = "stream_prediction" 15 | collection_name = "insurance_prediction_output" 16 | kfk_con = KafkaToSparkCSVConsumer(spark_session=spark_session, 17 | schema_string=schema_string, 18 | database_name=database_name, 19 | collection_name=collection_name 20 | ) 21 | transformer_list = [] 22 | pipeline_model = PipelineModel.load(os.path.join("artifacts", 23 | "pipeline", 24 | "pipeline_model")) 25 | random_forest_model = RandomForestRegressionModel.load(os.path.join("artifacts", 26 | "model", 27 | "random_forest_regressor")) 28 | 29 | transformer_list.append(pipeline_model) 30 | transformer_list.append(random_forest_model) 31 | kfk_con.spark_transformer.add_machine_learning_transformer( 32 | transformer=transformer_list 33 | ) 34 | kfk_con.receive_csv_data_from_kafka_topics() 35 | -------------------------------------------------------------------------------- /streaming/__init__.py: -------------------------------------------------------------------------------- 
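# The streaming consumer entry point listed above (spark_consumer_from_kafka.py) needs the
# Kafka SQL connector on the classpath: SparkManager pins the same package through
# spark.jars.packages, and kafka_csv_data_producer.py closes with the matching hint. Assuming
# a local Kafka broker, a typical launch would look like:
#   spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1 spark_consumer_from_kafka.py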
https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/__init__.py -------------------------------------------------------------------------------- /streaming/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /streaming/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /streaming/consumer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/consumer/__init__.py -------------------------------------------------------------------------------- /streaming/consumer/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/consumer/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /streaming/consumer/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/consumer/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /streaming/consumer/__pycache__/kafka_to_spark_csv_consumer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/consumer/__pycache__/kafka_to_spark_csv_consumer.cpython-37.pyc -------------------------------------------------------------------------------- /streaming/consumer/__pycache__/kafka_to_spark_csv_consumer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/consumer/__pycache__/kafka_to_spark_csv_consumer.cpython-38.pyc -------------------------------------------------------------------------------- /streaming/consumer/kafka_to_spark_csv_consumer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyspark.sql.functions import * 4 | 5 | from insurance_exception.insurance_exception import InsuranceException as KafkaToSparkCSVConsumerException 6 | import sys 7 | from utility import read_params 8 | from streaming.transformer.spark_transformer import SparkTransformer 9 | 10 | 11 | class KafkaToSparkCSVConsumer: 12 | def __init__(self, schema_string, database_name, collection_name, spark_session, processing_interval_second=5, 13 | config_path=None, ): 14 | try: 15 | # accepting default configuration file if no configuration file path has been specified during object 16 | # instantiation 17 
| path = os.path.join("config", "params.yaml") if config_path is None else os.path.join(config_path) 18 | self.config = read_params(config_path=path) 19 | self.kafka_topic_name = self.config['kafka']['topic_name'] 20 | self.kafka_bootstrap_server = self.config['kafka']['kafka_bootstrap_server'] 21 | self.spark_session = spark_session 22 | self.schema = schema_string # "age INT,sex STRING,bmi DOUBLE,children INT,smoker STRING,region STRING" 23 | self.spark_transformer = SparkTransformer(database_name=database_name, collection_name=collection_name) 24 | self.processing_interval_second = processing_interval_second 25 | self.query = None 26 | except Exception as e: 27 | kafka_to_csv_consumer_exception = KafkaToSparkCSVConsumerException( 28 | "Error occurred in module [{0}] class [{1}] method [{2}] ". 29 | format(self.__module__, KafkaToSparkCSVConsumer.__name__, 30 | self.__init__.__name__)) 31 | raise Exception(kafka_to_csv_consumer_exception.error_message_detail(str(e), sys)) from e 32 | 33 | def receive_csv_data_from_kafka_topics(self): 34 | try: 35 | dataframe = self.spark_session \ 36 | .readStream \ 37 | .format("kafka") \ 38 | .option("kafka.bootstrap.servers", self.kafka_bootstrap_server) \ 39 | .option("subscribe", self.kafka_topic_name) \ 40 | .option("startingOffsets", "latest") \ 41 | .load() 42 | dataframe_1 = dataframe.selectExpr("CAST(value as STRING) ", "timestamp") 43 | dataframe_2 = dataframe_1.select(from_csv(col("value"), self.schema).alias("records"), "timestamp") 44 | dataframe_3 = dataframe_2.select("records.*", "timestamp") 45 | transformed_df = dataframe_3 46 | for transformer in self.spark_transformer.ml_transformer: 47 | transformed_df = transformer.transform(transformed_df) 48 | self.query = transformed_df.writeStream.trigger( 49 | processingTime=f'{self.processing_interval_second} seconds').foreachBatch( 50 | self.spark_transformer.process_each_record).start() 51 | self.query.awaitTermination() 52 | except Exception as e: 53 | kafka_to_csv_consumer_exception = KafkaToSparkCSVConsumerException( 54 | "Error occurred in module [{0}] class [{1}] method [{2}] ". 55 | format(self.__module__, KafkaToSparkCSVConsumer.__name__, 56 | self.receive_csv_data_from_kafka_topics.__name__)) 57 | raise Exception(kafka_to_csv_consumer_exception.error_message_detail(str(e), sys)) from e 58 | 59 | def stop_stream(self): 60 | try: 61 | if self.query is not None: 62 | self.query.stop() 63 | 64 | except Exception as e: 65 | kafka_to_csv_consumer_exception = KafkaToSparkCSVConsumerException( 66 | "Error occurred in module [{0}] class [{1}] method [{2}] ". 
67 | format(self.__module__, KafkaToSparkCSVConsumer.__name__, 68 | self.receive_csv_data_from_kafka_topics.__name__)) 69 | raise Exception(kafka_to_csv_consumer_exception.error_message_detail(str(e), sys)) from e 70 | -------------------------------------------------------------------------------- /streaming/producer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/producer/__init__.py -------------------------------------------------------------------------------- /streaming/producer/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/producer/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /streaming/producer/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/producer/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /streaming/producer/__pycache__/kafka_csv_data_producer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/producer/__pycache__/kafka_csv_data_producer.cpython-37.pyc -------------------------------------------------------------------------------- /streaming/producer/__pycache__/kafka_csv_data_producer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/producer/__pycache__/kafka_csv_data_producer.cpython-38.pyc -------------------------------------------------------------------------------- /streaming/producer/kafka_csv_data_producer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from kafka import KafkaProducer 5 | from utility import read_params 6 | import time 7 | from insurance_exception.insurance_exception import InsuranceException as KafkaCSVDataProducerException 8 | from streaming.spark_manager.spark_manager import SparkManager 9 | 10 | 11 | class KafkaCSVDataProducer: 12 | 13 | def __init__(self, spark_session,config_path=None): 14 | """ 15 | Creator: 16 | ********************************************************************************************************** 17 | created date: 02 November 2021 18 | Organization: iNeuron 19 | author: avnish@ineuron.ai 20 | ********************************************************************************************************** 21 | Description: 22 | ********************************************************************************************************** 23 | KafkaCSVDataProducer is responsible to read a csv file and send data row by row to a kafka topic specified in 24 | configuration file: 25 | define below record 26 | kafka: 27 | topic_name: 28 | kafka_bootstrap_server: 29 | ************************************************************************************************************* 30 | Example: 31 | kafka: 32 | topic_name: 
insurance-prediction 33 | kafka_bootstrap_server: localhost:9092 34 | 35 | parameters: 36 | ============================================================================================================= 37 | param config_path: configuration file path default is config/param.yaml 38 | 39 | """ 40 | try: 41 | # accepting default configuration file if no configuration file path has been specified during object 42 | # instantiation 43 | path = os.path.join("config", "params.yaml") if config_path is None else os.path.join(config_path) 44 | self.config = read_params(config_path=path) 45 | self.kafka_topic_name = self.config['kafka']['topic_name'] 46 | self.kafka_bootstrap_server = self.config['kafka']['kafka_bootstrap_server'] 47 | # creating kafka producer object 48 | self.kafka_producer = KafkaProducer(bootstrap_servers=self.kafka_bootstrap_server, 49 | value_serializer=lambda x: x.encode('utf-8')) 50 | # obtain spark session object 51 | self.spark_session = spark_session 52 | except Exception as e: 53 | kafka_csv_data_producer_exp = KafkaCSVDataProducerException( 54 | "Error occurred in module [{0}] class [{1}] method [{2}] ". 55 | format(self.__module__, KafkaCSVDataProducer.__name__, 56 | self.__init__.__name__)) 57 | raise Exception(kafka_csv_data_producer_exp.error_message_detail(str(e), sys)) from e 58 | 59 | def send_csv_data_to_kafka_topic(self, directory_path): 60 | """ 61 | Creator: 62 | ********************************************************************************************************** 63 | created date: 02 November 2021 64 | Organization: iNeuron 65 | author: avnish@ineuron.ai 66 | ********************************************************************************************************** 67 | Description: 68 | ********************************************************************************************************** 69 | function will send all csv files content to kafka topics specified in configuration file. 70 | ========================================================================================================== 71 | param: 72 | directory_path: csv file directory 73 | 74 | ========================================================================================================== 75 | return: function will not return any thing 76 | """ 77 | try: 78 | files = os.listdir(directory_path) 79 | n_row = 0 80 | 81 | for file in files: 82 | 83 | # skip all files except csv 84 | if not file.endswith(".csv"): 85 | continue 86 | file_path = os.path.join(directory_path, file) 87 | # reading csv file using spark session 88 | # df = self.spark_session.read.csv(file_path) 89 | df = self.spark_session.read.csv(file_path,header=True,inferSchema=True) 90 | # sending dataframe to kafka topic iteratively 91 | for row in df.rdd.toLocalIterator(): 92 | message=",".join(map(str, list(row))) 93 | print(message) 94 | self.kafka_producer.send(self.kafka_topic_name,message) 95 | n_row += 1 96 | time.sleep(1) 97 | 98 | 99 | #df.foreach(lambda row: self.kafka_producer.send(self.kafka_topic_name, ",".join(map(str, list(row))))) 100 | return n_row 101 | except Exception as e: 102 | kafka_csv_data_producer_exp = KafkaCSVDataProducerException( 103 | "Error occurred in module [{0}] class [{1}] method [{2}] ". 
104 | format(self.__module__, KafkaCSVDataProducer.__name__, 105 | self.__init__.__name__)) 106 | raise Exception(kafka_csv_data_producer_exp.error_message_detail(str(e), sys)) from e 107 | 108 | 109 | """ 110 | spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1 111 | """ 112 | -------------------------------------------------------------------------------- /streaming/spark_manager/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/spark_manager/__init__.py -------------------------------------------------------------------------------- /streaming/spark_manager/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/spark_manager/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /streaming/spark_manager/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/spark_manager/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /streaming/spark_manager/__pycache__/spark_manager.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/spark_manager/__pycache__/spark_manager.cpython-37.pyc -------------------------------------------------------------------------------- /streaming/spark_manager/__pycache__/spark_manager.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/spark_manager/__pycache__/spark_manager.cpython-38.pyc -------------------------------------------------------------------------------- /streaming/spark_manager/spark_manager.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from pyspark.sql import SparkSession 4 | from insurance_exception.insurance_exception import InsuranceException as SparkManagerException 5 | 6 | 7 | class SparkManager: 8 | spark_session = None 9 | 10 | def __init__(self,app_name="ineuron-machine-learning"): 11 | """ 12 | Creator: 13 | ********************************************************************************************************** 14 | created date: 02 November 2021 15 | Organization: iNeuron 16 | author: avnish@ineuron.ai 17 | ********************************************************************************************************** 18 | Description: 19 | ********************************************************************************************************** 20 | SparkManager is responsible to return spark_session object. 21 | Any modification required should be done in SparkManager class 22 | """ 23 | try: 24 | self.app_name=app_name 25 | except Exception as e: 26 | spark_manager_exception = SparkManagerException("Error occurred in module [{0}] class [{1}] method [{2}] ". 
27 | format(self.__module__, SparkManager.__name__, 28 | self.__init__.__name__)) 29 | raise Exception(spark_manager_exception.error_message_detail(str(e), sys)) from e 30 | 31 | def get_spark_session_object(self): 32 | """ 33 | function will return spark session object 34 | """ 35 | try: 36 | if SparkManager.spark_session is None: 37 | SparkManager.spark_session = SparkSession.builder.master("local").appName(self.app_name) \ 38 | .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1")\ 39 | .config("spark.ui.port", "4041").getOrCreate() 40 | 41 | return SparkManager.spark_session 42 | except Exception as e: 43 | spark_manager_exception = SparkManagerException("Error occurred in module [{0}] class [{1}] method [{2}] ". 44 | format(self.__module__, SparkManager.__name__, 45 | self.get_spark_session_object.__name__)) 46 | raise Exception(spark_manager_exception.error_message_detail(str(e), sys)) from e 47 | -------------------------------------------------------------------------------- /streaming/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/transformer/__init__.py -------------------------------------------------------------------------------- /streaming/transformer/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/transformer/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /streaming/transformer/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/transformer/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /streaming/transformer/__pycache__/spark_transformer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/transformer/__pycache__/spark_transformer.cpython-37.pyc -------------------------------------------------------------------------------- /streaming/transformer/__pycache__/spark_transformer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/streaming/transformer/__pycache__/spark_transformer.cpython-38.pyc -------------------------------------------------------------------------------- /streaming/transformer/spark_transformer.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from insurance_exception.insurance_exception import InsuranceException as SparkTransformerException 4 | import os, sys 5 | from mongo_db.mongo_db_atlas import MongoDBOperation 6 | 7 | 8 | class SparkTransformer(): 9 | def __init__(self, database_name, collection_name): 10 | try: 11 | self.database_name = database_name 12 | self.collection_name = collection_name 13 | self.mongo_db = MongoDBOperation() 14 | self.ml_transformer = [] 15 | 16 | 17 | except Exception as e: 18 | 
spark_transformer_exception = SparkTransformerException("Error occurred in module [{0}] class [{1}] " 19 | "method [{2}] ". 20 | format(self.__module__, SparkTransformer.__name__, 21 | self.__init__.__name__)) 22 | raise Exception(spark_transformer_exception.error_message_detail(str(e), sys)) from e 23 | 24 | def add_machine_learning_transformer(self, transformer: list): 25 | try: 26 | self.ml_transformer.extend(transformer) 27 | except Exception as e: 28 | spark_transformer_exception = SparkTransformerException("Error occurred in module [{0}] class [{1}] " 29 | "method [{2}] ". 30 | format(self.__module__, SparkTransformer.__name__, 31 | self.__init__.__name__)) 32 | raise Exception(spark_transformer_exception.error_message_detail(str(e), sys)) from e 33 | 34 | def process_each_record(self, dataframe,epoch_id): 35 | try: 36 | dataframe = dataframe.toPandas() 37 | if dataframe.shape[0] > 0: 38 | dataframe['timestamp'] = pd.to_datetime(dataframe['timestamp']) 39 | self.mongo_db.insert_dataframe_into_collection(db_name=self.database_name, 40 | collection_name=self.collection_name, 41 | data_frame=dataframe) 42 | dataframe.to_csv("new_data.csv", index=None) 43 | except Exception as e: 44 | spark_transformer_exception = SparkTransformerException("Error occurred in module [{0}] class [{1}] " 45 | "method [{2}] ". 46 | format(self.__module__, SparkTransformer.__name__, 47 | self.process_each_record.__name__)) 48 | raise Exception(spark_transformer_exception.error_message_detail(str(e), sys)) from e 49 | -------------------------------------------------------------------------------- /training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avnyadav/spark_project/fc628253a393823e51b45735e58ccf3cbb186725/training/__init__.py -------------------------------------------------------------------------------- /training/stage_00_data_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | from utility import read_params 5 | import argparse 6 | from utility import get_logger_object_of_training 7 | from utility import clean_data_source_dir 8 | from insurance_exception.insurance_exception import InsuranceException as GenericException 9 | 10 | log_collection_name = "data_loader" 11 | 12 | 13 | def loader_main(config_path: str, datasource: str, is_logging_enable=True,execution_id=None,executed_by=None) -> None: 14 | try: 15 | logger = get_logger_object_of_training(config_path=config_path, collection_name=log_collection_name, 16 | execution_id=execution_id,executed_by=executed_by) 17 | logger.is_log_enable = is_logging_enable 18 | logger.log("Starting data loading operation.\nReading configuration file.") 19 | 20 | config = read_params(config_path) 21 | 22 | downloader_path = config['data_download']['cloud_training_directory_path'] 23 | download_path = config['data_source']['Training_Batch_Files'] 24 | 25 | logger.log("Configuration detail has been fetched from configuration file.") 26 | # removing existing training and additional training files from local 27 | logger.log(f"Cleaning local directory [{download_path}] for training.") 28 | clean_data_source_dir(download_path, logger=logger, 29 | is_logging_enable=is_logging_enable) # removing existing file from local system 30 | 31 | logger.log(f"Cleaning completed. 
Directory has been cleared now [{download_path}]") 32 | # downloading traning and additional training file from cloud into local system 33 | logger.log("Data will be downloaded from cloud storage into local system") 34 | 35 | 36 | for file in os.listdir(downloader_path): 37 | if '.dvc' in file or '.gitignore' in file: 38 | continue 39 | print(f"Source dir: {downloader_path} file: {file} is being copied into destination dir: {download_path}" 40 | f" file: {file}") 41 | shutil.copy(os.path.join(downloader_path,file),os.path.join(download_path,file)) 42 | 43 | logger.log("Data has been downloaded from cloud storage into local system") 44 | 45 | except Exception as e: 46 | generic_exception = GenericException( 47 | "Error occurred in module [{0}] method [{1}]" 48 | .format(loader_main.__module__, 49 | loader_main.__name__)) 50 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 51 | 52 | if __name__ == '__main__': 53 | args = argparse.ArgumentParser() 54 | args.add_argument("--config", default=os.path.join("config", "params.yaml")) 55 | args.add_argument("--datasource", default=None) 56 | parsed_args = args.parse_args() 57 | print("started") 58 | loader_main(config_path=parsed_args.config, datasource=parsed_args.datasource) 59 | -------------------------------------------------------------------------------- /training/stage_01_data_validator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import shutil 4 | import sys 5 | 6 | import pandas as pd 7 | from utility import read_params, create_directory_path, values_from_schema_function, get_logger_object_of_training, \ 8 | get_date, get_time 9 | from insurance_exception.insurance_exception import InsuranceException as GenericException 10 | import argparse 11 | import datetime 12 | import shutil 13 | 14 | log_collection_name = "data_validator" 15 | 16 | 17 | class DataValidator: 18 | def __init__(self, config, logger, is_logging_enable=True): 19 | try: 20 | self.logger = logger 21 | self.logger.is_log_enable = is_logging_enable 22 | self.config = config 23 | self.file_path = self.config['data_source']['Training_Batch_Files'] 24 | self.good_file_path = self.config['artifacts']['training_data']['good_file_path'] 25 | self.bad_file_path = self.config['artifacts']['training_data']['bad_file_path'] 26 | self.archive_bad_file_path = self.config['artifacts']['training_data']['archive_bad_file_path'] 27 | self.training_schema_file = self.config['config']['schema_training'] 28 | except Exception as e: 29 | generic_exception = GenericException( 30 | "Error occurred in module [{0}] class [{1}] method [{2}]" 31 | .format(self.__module__, DataValidator.__name__, 32 | self.__init__.__name__)) 33 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 34 | 35 | def archive_bad_files(self): 36 | try: 37 | folder_name = f"bad_files_{get_date().replace('-', '_')}_{get_time().replace(':', '_')}" 38 | archive_directory_path = os.path.join(self.archive_bad_file_path, folder_name) 39 | create_directory_path(archive_directory_path) 40 | for file in os.listdir(self.bad_file_path): 41 | source_file_path = os.path.join(self.bad_file_path, file) 42 | shutil.move(source_file_path, archive_directory_path) 43 | except Exception as e: 44 | generic_exception = GenericException( 45 | "Error occurred in module [{0}] class [{1}] method [{2}]" 46 | .format(self.__module__, DataValidator.__name__, 47 | self.archive_bad_files.__name__)) 48 | raise 
Exception(generic_exception.error_message_detail(str(e), sys)) from e 49 | 50 | def create_good_bad_archive_bad_file_path(self): 51 | try: 52 | create_directory_path(self.good_file_path) 53 | create_directory_path(self.bad_file_path) 54 | create_directory_path(self.archive_bad_file_path, is_recreate=False) 55 | except Exception as e: 56 | generic_exception = GenericException( 57 | "Error occurred in module [{0}] class [{1}] method [{2}]" 58 | .format(self.__module__, DataValidator.__name__, 59 | self.create_good_bad_archive_bad_file_path.__name__)) 60 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 61 | 62 | def value_from_schema(self): 63 | """ 64 | 65 | :return: tuple (sample_file_name,column_names,number_of_column) 66 | """ 67 | try: 68 | return values_from_schema_function(self.training_schema_file) 69 | except Exception as e: 70 | generic_exception = GenericException( 71 | "Error occurred in module [{0}] class [{1}] method [{2}]" 72 | .format(self.__module__, DataValidator.__name__, 73 | self.value_from_schema.__name__)) 74 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 75 | 76 | def file_name_regular_expression(self): 77 | """ 78 | 79 | :return: regular expression syntax which can be used for validation of file name 80 | """ 81 | return "['HealthPrem']+['\_'']+[\d_]+[\d]+\.csv" 82 | 83 | def validate_file_name(self): 84 | try: 85 | self.create_good_bad_archive_bad_file_path() 86 | file_name_reg_pattern = self.file_name_regular_expression() 87 | pattern, length_of_date_stamp_in_file, length_of_time_stamp_in_file, column_names, number_of_columns = self.value_from_schema() 88 | self.logger.log(f"Validating file names.") 89 | files = os.listdir(self.file_path) 90 | for file in files: 91 | file_path = os.path.join(self.file_path, file) 92 | split_at_dot = re.split('.csv', file) 93 | split_at_dot = (re.split('_', split_at_dot[0])) 94 | if re.match(file_name_reg_pattern, file) and len(split_at_dot[1]) == length_of_date_stamp_in_file \ 95 | and len(split_at_dot[2]) == length_of_time_stamp_in_file: 96 | destination_file_path = os.path.join(self.good_file_path, file) 97 | self.logger.log( 98 | f"file name : {file} matched hence moving file to good file path {destination_file_path}") 99 | shutil.move(file_path, destination_file_path) 100 | else: 101 | destination_file_path = os.path.join(self.bad_file_path, file) 102 | self.logger.log( 103 | f"file name: {file} does not matched hence moving file to bad file path {destination_file_path}") 104 | shutil.move(file_path, destination_file_path) 105 | except Exception as e: 106 | generic_exception = GenericException( 107 | "Error occurred in module [{0}] class [{1}] method [{2}]" 108 | .format(self.__module__, DataValidator.__name__, 109 | self.validate_file_name.__name__)) 110 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 111 | 112 | def validate_missing_values_in_whole_column(self): 113 | try: 114 | self.logger.log("Missing Values Validation Started!!") 115 | for file in os.listdir(self.good_file_path): 116 | csv = pd.read_csv(os.path.join(self.good_file_path, file)) 117 | count = 0 118 | for columns in csv: 119 | if (len(csv[columns]) - csv[columns].count()) == len(csv[columns]): 120 | count += 1 121 | shutil.move(os.path.join(self.good_file_path, file), 122 | self.bad_file_path) 123 | self.logger.log( 124 | "Invalid Column Length for the file!! 
File moved to Bad Raw Folder :: %s" % file) 125 | break 126 | if count == 0: 127 | print(csv.columns) 128 | csv.rename(columns={"Unnamed: 0": "Premium "}, inplace=True) 129 | csv.to_csv(os.path.join(self.good_file_path, file), index=None, header=True) 130 | except Exception as e: 131 | generic_exception = GenericException( 132 | "Error occurred in module [{0}] class [{1}] method [{2}]" 133 | .format(self.__module__, DataValidator.__name__, 134 | self.validate_missing_values_in_whole_column.__name__)) 135 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 136 | 137 | def validate_no_of_column(self, no_of_column): 138 | """ 139 | Description: 140 | If number of column matches then file will be move to good file path else bad file path 141 | ===================================================================================== 142 | :param no_of_column: int Number of column must present in each file 143 | :return: Nothing 144 | """ 145 | try: 146 | self.logger.log(f"Validating number of column in input file") 147 | files = os.listdir(self.good_file_path) 148 | for file in files: 149 | file_path = os.path.join(self.good_file_path, file) 150 | df = pd.read_csv(file_path) 151 | if df.shape[1] != no_of_column: 152 | destination_file_path = os.path.join(self.bad_file_path, file) 153 | self.logger.log( 154 | f"file: {file} has incorrect number of column hence moving file to bad file path {destination_file_path}") 155 | shutil.move(file_path, destination_file_path) 156 | except Exception as e: 157 | generic_exception = GenericException( 158 | "Error occurred in module [{0}] class [{1}] method [{2}]" 159 | .format(self.__module__, DataValidator.__name__, 160 | self.validate_no_of_column.__name__)) 161 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 162 | 163 | 164 | def validation_main(config_path: str, datasource: str, is_logging_enable=True, execution_id=None, 165 | executed_by=None) -> None: 166 | try: 167 | logger = get_logger_object_of_training(config_path=config_path, collection_name=log_collection_name, 168 | execution_id=execution_id, executed_by=executed_by) 169 | logger.is_log_enable = is_logging_enable 170 | config = read_params(config_path) 171 | logger.log("data validation started") 172 | data_validator = DataValidator(config=config, logger=logger, is_logging_enable=is_logging_enable) 173 | pattern, length_of_date_stamp_in_file, length_of_time_stamp_in_file, column_names, number_of_columns =\ 174 | data_validator.value_from_schema() 175 | data_validator.validate_file_name() 176 | data_validator.validate_no_of_column(no_of_column=number_of_columns) 177 | data_validator.validate_missing_values_in_whole_column() 178 | data_validator.archive_bad_files() 179 | except Exception as e: 180 | generic_exception = GenericException( 181 | "Error occurred in module [{0}] method [{1}]" 182 | .format(validation_main.__module__, 183 | validation_main.__name__)) 184 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 185 | 186 | 187 | if __name__ == '__main__': 188 | args = argparse.ArgumentParser() 189 | args.add_argument("--config", default=os.path.join("config", "params.yaml")) 190 | args.add_argument("--datasource", default=None) 191 | parsed_args = args.parse_args() 192 | print("started") 193 | validation_main(config_path=parsed_args.config, datasource=parsed_args.datasource) 194 | -------------------------------------------------------------------------------- /training/stage_02_data_transformer.py: 
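# The DataValidator in training/stage_01_data_validator.py above moves each raw file to the good
# or bad folder based on file_name_regular_expression() plus the date/time stamp lengths read via
# values_from_schema_function(). A minimal, self-contained sketch of that check; the sample file
# name is hypothetical and the 8-digit date / 6-digit time lengths are assumed schema values:
import re

pattern = r"['HealthPrem']+['\_'']+[\d_]+[\d]+\.csv"  # same pattern returned by file_name_regular_expression()
sample = "HealthPrem_01012020_120000.csv"             # hypothetical file name
name_parts = re.split('_', re.split('.csv', sample)[0])
print(bool(re.match(pattern, sample)),                # file name pattern matches
      len(name_parts[1]) == 8,                        # date stamp length, assumed from the schema
      len(name_parts[2]) == 6)                        # time stamp length, assumed from the schema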
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import pandas as pd 5 | import argparse 6 | from utility import read_params, get_logger_object_of_training 7 | from mongo_db.mongo_db_atlas import MongoDBOperation 8 | from insurance_exception.insurance_exception import InsuranceException as GenericException 9 | 10 | log_collection_name = "data_transformer" 11 | 12 | 13 | class DataTransformer: 14 | def __init__(self, config, logger, is_log_enable=True): 15 | try: 16 | self.config = config 17 | self.logger = logger 18 | self.logger.is_log_enable = is_log_enable 19 | self.good_file_path = self.config["artifacts"]['training_data']['good_file_path'] 20 | self.unwanted_column_names = self.config["dataset"]['unwanted_column'] 21 | self.mongo_db = MongoDBOperation() 22 | self.dataset_database = self.config["dataset"]["database_detail"]["training_database_name"] 23 | self.dataset_collection_name = self.config["dataset"]["database_detail"]["dataset_training_collection_name"] 24 | self.mongo_db.drop_collection(self.dataset_database, self.dataset_collection_name) 25 | except Exception as e: 26 | generic_exception = GenericException( 27 | "Error occurred in module [{0}] class [{1}] method [{2}]" 28 | .format(self.__module__, DataTransformer.__name__, 29 | self.__init__.__name__)) 30 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 31 | 32 | def unite_dataset(self): 33 | try: 34 | dataset_list = [] 35 | for file in os.listdir(self.good_file_path): 36 | dataset_list.append(pd.read_csv(os.path.join(self.good_file_path, file))) 37 | df = pd.concat(dataset_list) 38 | df = self.remove_unwanted_column(df) 39 | self.logger.log(f"Inserting dataset into database {self.dataset_database} " 40 | f"collection_name: {self.dataset_collection_name}") 41 | self.mongo_db.insert_dataframe_into_collection(self.dataset_database, self.dataset_collection_name, df) 42 | except Exception as e: 43 | generic_exception = GenericException( 44 | "Error occurred in module [{0}] class [{1}] method [{2}]" 45 | .format(self.__module__, DataTransformer.__name__, 46 | self.unite_dataset.__name__)) 47 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 48 | 49 | def remove_unwanted_column(self, df): 50 | try: 51 | print(self.unwanted_column_names) 52 | column_to_remove = list(filter(lambda x: x in df.columns, self.unwanted_column_names)) 53 | if len(column_to_remove) > 0: 54 | return df.drop(column_to_remove, axis=1) 55 | return df 56 | except Exception as e: 57 | generic_exception = GenericException( 58 | "Error occurred in module [{0}] class [{1}] method [{2}]" 59 | .format(self.__module__, DataTransformer.__name__, 60 | self.remove_unwanted_column.__name__)) 61 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 62 | 63 | 64 | def transform_main(config_path: str, datasource: str, is_logging_enable=True, execution_id=None, 65 | executed_by=None) -> None: 66 | try: 67 | logger = get_logger_object_of_training(config_path=config_path, collection_name=log_collection_name, 68 | execution_id=execution_id, executed_by=executed_by) 69 | logger.is_log_enable = is_logging_enable 70 | config = read_params(config_path) 71 | data_transformer = DataTransformer(config=config, logger=logger, is_log_enable=is_logging_enable) 72 | logger.log('Start of Data Preprocessing before DB') 73 | data_transformer.unite_dataset() 74 | logger.log('Data Preprocessing before DB Completed !!') 75 | 76 | except Exception as 
e: 77 | generic_exception = GenericException( 78 | "Error occurred in module [{0}] method [{1}]" 79 | .format(transform_main.__module__, 80 | transform_main.__name__)) 81 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 82 | 83 | 84 | if __name__ == '__main__': 85 | args = argparse.ArgumentParser() 86 | args.add_argument("--config", default=os.path.join("config", "params.yaml")) 87 | args.add_argument("--datasource", default=None) 88 | parsed_args = args.parse_args() 89 | print("started") 90 | transform_main(config_path=parsed_args.config, datasource=parsed_args.datasource) 91 | -------------------------------------------------------------------------------- /training/stage_03_data_exporter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from utility import read_params, create_directory_path 5 | from mongo_db.mongo_db_atlas import MongoDBOperation 6 | import argparse 7 | from utility import get_logger_object_of_training 8 | 9 | from insurance_exception.insurance_exception import InsuranceException as GenericException 10 | 11 | 12 | log_collection_name = "data_export" 13 | 14 | 15 | class DataExporter: 16 | def __init__(self, config, logger, is_log_enable): 17 | try: 18 | self.config = config 19 | self.logger = logger 20 | self.is_log_enable = is_log_enable 21 | self.mongo_db = MongoDBOperation() 22 | self.dataset_database = self.config["dataset"]["database_detail"]["training_database_name"] 23 | self.dataset_collection_name = self.config["dataset"]["database_detail"]["dataset_training_collection_name"] 24 | self.training_file_from_db = self.config["artifacts"]['training_data']['training_file_from_db'] 25 | self.master_csv = self.config["artifacts"]['training_data']['master_csv'] 26 | except Exception as e: 27 | generic_exception = GenericException( 28 | "Error occurred in module [{0}] class [{1}] method [{2}]" 29 | .format(self.__module__, DataExporter.__name__, 30 | self.__init__.__name__)) 31 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 32 | 33 | def export_dataframe_from_database(self): 34 | try: 35 | create_directory_path(self.training_file_from_db) 36 | self.logger.log(f"Creating dataframe of data stored db" 37 | f"[{self.dataset_database}] and collection[{self.dataset_collection_name}]") 38 | df = self.mongo_db.get_dataframe_of_collection(db_name=self.dataset_database, 39 | collection_name=self.dataset_collection_name) 40 | master_csv_file_path = os.path.join(self.training_file_from_db, self.master_csv) 41 | self.logger.log(f"master csv file will be generated at " 42 | f"{master_csv_file_path}.") 43 | df.to_csv(master_csv_file_path, index=None,header=True) 44 | 45 | except Exception as e: 46 | generic_exception = GenericException( 47 | "Error occurred in module [{0}] class [{1}] method [{2}]" 48 | .format(self.__module__, DataExporter.__name__, 49 | self.export_dataframe_from_database.__name__)) 50 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 51 | 52 | 53 | def export_main(config_path: str, is_logging_enable=True,execution_id=None,executed_by=None) -> None: 54 | try: 55 | logger = get_logger_object_of_training(config_path=config_path, collection_name=log_collection_name, 56 | execution_id=execution_id, executed_by=executed_by) 57 | 58 | logger.is_log_enable = is_logging_enable 59 | config = read_params(config_path) 60 | data_exporter = DataExporter(config=config, logger=logger, is_log_enable=is_logging_enable) 61 | 
logger.log("Generating csv file from dataset stored in database.") 62 | data_exporter.export_dataframe_from_database() 63 | logger.log("Dataset has been successfully exported in directory and exiting export pipeline.") 64 | except Exception as e: 65 | generic_exception = GenericException( 66 | "Error occurred in module [{0}] method [{1}]" 67 | .format(export_main.__module__, 68 | export_main.__name__)) 69 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 70 | 71 | 72 | if __name__ == '__main__': 73 | args = argparse.ArgumentParser() 74 | args.add_argument("--config", default=os.path.join("config", "params.yaml")) 75 | parsed_args = args.parse_args() 76 | print("started") 77 | export_main(config_path=parsed_args.config) 78 | -------------------------------------------------------------------------------- /training/stage_04_model_trainer.py: -------------------------------------------------------------------------------- 1 | 2 | import random 3 | import sys 4 | 5 | import os 6 | import argparse 7 | 8 | from pyspark.sql.types import IntegerType, StringType, FloatType 9 | from sklearn.metrics import r2_score, mean_squared_error 10 | 11 | from utility import create_directory_path,read_params 12 | import numpy as np 13 | from utility import get_logger_object_of_training 14 | from pyspark.ml import Pipeline 15 | from pyspark.ml.regression import RandomForestRegressor 16 | from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler 17 | 18 | from insurance_exception.insurance_exception import InsuranceException as GenericException 19 | 20 | from streaming.spark_manager.spark_manager import SparkManager 21 | 22 | log_collection_name = "training_model" 23 | 24 | 25 | class DataPreProcessing: 26 | def __init__(self, logger, is_log_enable=True, data_frame=None, pipeline_path=None): 27 | try: 28 | self.logger = logger 29 | self.logger.is_log_enable = is_log_enable 30 | self.data_frame = data_frame 31 | self.stages = [] 32 | self.pipeline_path = pipeline_path 33 | except Exception as e: 34 | generic_exception = GenericException( 35 | "Error occurred in module [{0}] class [{1}] method [{2}]" 36 | .format(self.__module__, DataPreProcessing.__name__, 37 | self.__init__.__name__)) 38 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 39 | 40 | def set_dataframe(self, dataframe): 41 | try: 42 | self.data_frame = dataframe 43 | except Exception as e: 44 | generic_exception = GenericException( 45 | "Error occurred in module [{0}] class [{1}] method [{2}]" 46 | .format(self.__module__, DataPreProcessing.__name__, 47 | self.update_dataframe_scheme.__name__)) 48 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 49 | 50 | def update_dataframe_scheme(self, schema_definition: dict): 51 | """ 52 | 53 | """ 54 | try: 55 | print(self.data_frame.printSchema()) 56 | if self.data_frame is None: 57 | raise Exception("update the attribute dataframe") 58 | for column, datatype in schema_definition.items(): 59 | self.logger.log(f"Update datatype of column: {column} to {str(datatype)}") 60 | self.data_frame = self.data_frame.withColumn(column, self.data_frame[column].cast(datatype)) 61 | except Exception as e: 62 | generic_exception = GenericException( 63 | "Error occurred in module [{0}] class [{1}] method [{2}]" 64 | .format(self.__module__, DataPreProcessing.__name__, 65 | self.update_dataframe_scheme.__name__)) 66 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 67 | 68 | def 
encode_categorical_column(self, input_columns: list): 69 | try: 70 | string_indexer = StringIndexer(inputCols=input_columns, 71 | outputCols=[f"{column}_encoder" for column in input_columns]) 72 | self.stages.append(string_indexer) 73 | one_hot_encoder = OneHotEncoder(inputCols=string_indexer.getOutputCols(), 74 | outputCols=[f"{column}_encoded" for column in input_columns]) 75 | self.stages.append(one_hot_encoder) 76 | 77 | except Exception as e: 78 | generic_exception = GenericException( 79 | "Error occurred in module [{0}] class [{1}] method [{2}]" 80 | .format(self.__module__, DataPreProcessing.__name__, 81 | self.encode_categorical_column.__name__)) 82 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 83 | 84 | def create_input_features(self, required_column: list): 85 | """ 86 | 87 | """ 88 | try: 89 | vector_assembler = VectorAssembler(inputCols=required_column, outputCol="input_features") 90 | self.stages.append(vector_assembler) 91 | except Exception as e: 92 | generic_exception = GenericException( 93 | "Error occurred in module [{0}] class [{1}] method [{2}]" 94 | .format(self.__module__, DataPreProcessing.__name__, 95 | self.create_input_features.__name__)) 96 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 97 | 98 | def get_train_test_dataframe(self, test_size=0.2): 99 | try: 100 | train_df, test_df = self.data_frame.randomSplit([1 - test_size, test_size], seed=random.randint(0, 1000)) 101 | self.logger.log(f"Training dataset count {train_df.count()}") 102 | self.logger.log(f"Test dataset count {test_df.count()}") 103 | return train_df, test_df 104 | except Exception as e: 105 | generic_exception = GenericException( 106 | "Error occurred in module [{0}] class [{1}] method [{2}]" 107 | .format(self.__module__, DataPreProcessing.__name__, 108 | self.get_train_test_dataframe.__name__)) 109 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 110 | 111 | def get_prepared_dataset(self, ): 112 | try: 113 | schema_definition = {"age": IntegerType(), 114 | "sex": StringType(), 115 | "bmi": FloatType(), 116 | "children": IntegerType(), 117 | "smoker": StringType(), 118 | "expenses": FloatType() 119 | } 120 | self.update_dataframe_scheme(schema_definition=schema_definition) 121 | self.encode_categorical_column(input_columns=["sex", "smoker"]) 122 | required_column = ['age', 'bmi', 'children', 'sex_encoded', 'smoker_encoded', ] 123 | self.create_input_features(required_column=required_column) 124 | pipeline = Pipeline(stages=self.stages) 125 | pipeline_fitted_obj = pipeline.fit(self.data_frame) 126 | self.data_frame = pipeline_fitted_obj.transform(self.data_frame) 127 | # os.remove(path=self.pipeline_path) 128 | create_directory_path(self.pipeline_path, is_recreate=True) 129 | pipeline_fitted_obj.write().overwrite().save(self.pipeline_path) 130 | return self.get_train_test_dataframe(test_size=0.2) 131 | except Exception as e: 132 | generic_exception = GenericException( 133 | "Error occurred in module [{0}] class [{1}] method [{2}]" 134 | .format(self.__module__, DataPreProcessing.__name__, 135 | self.get_prepared_dataset.__name__)) 136 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 137 | 138 | 139 | class ModelTrainer: 140 | 141 | def __init__(self, config, logger, is_log_enable): 142 | try: 143 | self.logger = logger 144 | self.logger.is_log_enable = is_log_enable 145 | self.config = config 146 | self.training_file_path = 
self.config['artifacts']['training_data']['training_file_from_db'] 147 | self.master_csv = self.config['artifacts']['training_data']['master_csv'] 148 | self.target_columns = self.config['target_columns']['columns'] 149 | self.test_size = self.config['base']['test_size'] 150 | self.random_state = self.config['base']['random_state'] 151 | self.plot = self.config['artifacts']['training_data']['plots'] 152 | self.pipeline_path = self.config['artifacts']['training_data']['pipeline_path'] 153 | self.model_path = config['artifacts']['model']['model_path'] 154 | self.null_value_file_path = config['artifacts']['training_data']['null_value_info_file_path'] 155 | """ 156 | self.spark = SparkSession.builder.\ 157 | master("local[*]").\ 158 | appName("insurance-premium-reg").getOrCreate() 159 | """ 160 | self.spark = SparkManager().get_spark_session_object() 161 | """ 162 | self.spark=SparkSession.builder.appName('app_name') \ 163 | .master('local[*]') \ 164 | .config('spark.sql.execution.arrow.pyspark.enabled', True) \ 165 | .config('spark.sql.session.timeZone', 'UTC') \ 166 | .config('spark.driver.memory', '32G') \ 167 | .config('spark.ui.showConsoleProgress', True) \ 168 | .config('spark.sql.repl.eagerEval.enabled', True) \ 169 | .getOrCreate() 170 | """ 171 | except Exception as e: 172 | generic_exception = GenericException( 173 | "Error occurred in module [{0}] class [{1}] method [{2}]" 174 | .format(self.__module__, ModelTrainer.__name__, 175 | self.__init__.__name__)) 176 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 177 | 178 | 179 | def save_regression_metric_data(self, y_true, y_pred, title): 180 | try: 181 | y_true = np.array(y_true).reshape(-1) 182 | y_pred = np.array(y_pred).reshape(-1) 183 | rmse = np.sqrt(mean_squared_error(y_true, y_pred)) 184 | r_squared_score = r2_score(y_true, y_pred) 185 | msg = f"{title} R squared score: {r_squared_score:.3%}" 186 | self.logger.log(msg) 187 | print(msg) 188 | msg = f"{title} Root mean squared error: {rmse:.3}" 189 | self.logger.log(msg) 190 | print(msg) 191 | except Exception as e: 192 | generic_exception = GenericException( 193 | "Error occurred in module [{0}] class [{1}] method [{2}]" 194 | .format(self.__module__, ModelTrainer.__name__, 195 | self.save_regression_metric_data.__name__)) 196 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 197 | 198 | def get_dataframe(self): 199 | try: 200 | master_file_path = os.path.join(self.training_file_path, self.master_csv) 201 | 202 | return self.spark.read.csv(master_file_path, header=True, inferSchema=True) 203 | except Exception as e: 204 | generic_exception = GenericException( 205 | "Error occurred in module [{0}] class [{1}] method [{2}]" 206 | .format(self.__module__, ModelTrainer.__name__, 207 | self.get_dataframe.__name__)) 208 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 209 | 210 | def data_preparation(self): 211 | try: 212 | data_frame = self.get_dataframe() 213 | preprocessing = DataPreProcessing(logger=self.logger, 214 | is_log_enable=self.logger.is_log_enable, 215 | data_frame=data_frame, 216 | pipeline_path=self.pipeline_path) 217 | return preprocessing.get_prepared_dataset() 218 | except Exception as e: 219 | generic_exception = GenericException( 220 | "Error occurred in module [{0}] class [{1}] method [{2}]" 221 | .format(self.__module__, ModelTrainer.__name__, 222 | self.data_preparation.__name__)) 223 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 224 | 225 | 
def begin_training(self): 226 | try: 227 | train_df, test_df = self.data_preparation() 228 | random_forest_regressor = RandomForestRegressor(featuresCol="input_features", labelCol="expenses") 229 | random_forest_model = random_forest_regressor.fit(train_df) 230 | train_prediction = random_forest_model.transform(train_df) 231 | testing_prediction = random_forest_model.transform(test_df) 232 | training_data = train_prediction.select("expenses", "prediction").toPandas() 233 | testing_data = testing_prediction.select("expenses", "prediction").toPandas() 234 | self.save_regression_metric_data(training_data['expenses'], training_data['prediction'], 235 | title="Training score") 236 | self.save_regression_metric_data(testing_data['expenses'], testing_data['prediction'], 237 | title="Testing score") 238 | 239 | self.save_model(model=random_forest_model, model_name="random_forest_regressor") 240 | self.spark.stop() 241 | except Exception as e: 242 | generic_exception = GenericException( 243 | "Error occurred in module [{0}] class [{1}] method [{2}]" 244 | .format(self.__module__, ModelTrainer.__name__, 245 | self.begin_training.__name__)) 246 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 247 | 248 | def save_model(self, model, model_name, intermediate_path=None): 249 | try: 250 | 251 | if intermediate_path is None: 252 | model_path = os.path.join(self.model_path) 253 | else: 254 | model_path = os.path.join(self.model_path, intermediate_path) 255 | create_directory_path(model_path) 256 | model_full_path = os.path.join(model_path, model_name) 257 | self.logger.log(f"Saving model: {model_name} at path {model_full_path}") 258 | # os.remove(path=model_full_path) 259 | model.write().overwrite().save(model_full_path) 260 | 261 | except Exception as e: 262 | generic_exception = GenericException( 263 | "Error occurred in module [{0}] class [{1}] method [{2}]" 264 | .format(self.__module__, ModelTrainer.__name__, 265 | self.save_model.__name__)) 266 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 267 | 268 | 269 | def train_main(config_path: str, datasource: str, is_logging_enable=True, execution_id=None, executed_by=None) -> None: 270 | try: 271 | logger = get_logger_object_of_training(config_path=config_path, collection_name=log_collection_name, 272 | execution_id=execution_id, executed_by=executed_by) 273 | 274 | logger.is_log_enable = is_logging_enable 275 | logger.log("Training started.") 276 | config = read_params(config_path) 277 | model_trainer = ModelTrainer(config=config, logger=logger, is_log_enable=is_logging_enable) 278 | model_trainer.begin_training() 279 | logger.log("Training completed successfully.") 280 | 281 | except Exception as e: 282 | generic_exception = GenericException( 283 | "Error occurred in module [{0}] method [{1}]" 284 | .format(train_main.__module__, 285 | train_main.__name__)) 286 | raise Exception(generic_exception.error_message_detail(str(e), sys)) from e 287 | 288 | 289 | if __name__ == '__main__': 290 | args = argparse.ArgumentParser() 291 | args.add_argument("--config", default=os.path.join("config", "params.yaml")) 292 | args.add_argument("--datasource", default=None) 293 | parsed_args = args.parse_args() 294 | print(parsed_args.config) 295 | print(parsed_args.datosource if False else parsed_args.datasource) 296 | train_main(config_path=parsed_args.config, datasource=parsed_args.datasource) 297 | --------------------------------------------------------------------------------
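Note: the trainer above persists two artifacts, the fitted preprocessing pipeline (via `pipeline_fitted_obj.write().overwrite().save(...)`) and the random forest model (via `ModelTrainer.save_model`). Below is a minimal sketch of how they could be reloaded for ad-hoc scoring; the artifact paths and the `new_data.csv` input are illustrative placeholders (the real locations come from `config/params.yaml`), and it assumes the same `SparkManager` session helper the trainer uses.

```python
# Illustrative reload/score sketch -- the string paths are placeholders; in practice
# read them from config['artifacts'][...] as the training and prediction stages do.
from pyspark.ml import PipelineModel
from pyspark.ml.regression import RandomForestRegressionModel

from streaming.spark_manager.spark_manager import SparkManager

spark = SparkManager().get_spark_session_object()

# Fitted StringIndexer + OneHotEncoder + VectorAssembler stages saved by DataPreProcessing.
pipeline_model = PipelineModel.load("artifacts/pipeline/pipeline_model")
# RandomForestRegressionModel saved by ModelTrainer.save_model().
regressor = RandomForestRegressionModel.load("artifacts/model/random_forest_regressor")

new_rows = spark.read.csv("new_data.csv", header=True, inferSchema=True)
scored = regressor.transform(pipeline_model.transform(new_rows))  # adds a "prediction" column
scored.select("age", "bmi", "children", "prediction").show()
spark.stop()
```
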
/utility.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import json 3 | from datetime import datetime 4 | 5 | import yaml 6 | import uuid 7 | import os 8 | import shutil 9 | from logger.logger import AppLogger 10 | 11 | def get_time(): 12 | """ 13 | 14 | :return: current time as an HH:MM:SS string 15 | """ 16 | return datetime.now().strftime("%H:%M:%S") 17 | 18 | def get_date(): 19 | """ 20 | 21 | :return: current date as a string 22 | """ 23 | return str(datetime.now().date()) 24 | 25 | 26 | 27 | def create_directory_path(path, is_recreate=True): 28 | """ 29 | :param path: 30 | :param is_recreate: by default the existing directory is removed and recreated; pass 31 | False if you want to keep an existing directory 32 | :return: 33 | """ 34 | try: 35 | if is_recreate: 36 | if os.path.exists(path): 37 | shutil.rmtree(path, ignore_errors=False) # remove existing directory if is_recreate is true 38 | os.makedirs(path, exist_ok=True) # if directory is present it will not alter anything 39 | return True 40 | except Exception as e: 41 | raise e 42 | 43 | 44 | def clean_data_source_dir(path, logger=None, is_logging_enable=True): 45 | try: 46 | if not os.path.exists(path): 47 | os.mkdir(path) 48 | for file in os.listdir(path): 49 | if '.gitignore' in file: 50 | continue # keep the .gitignore placeholder instead of deleting it 51 | logger.log(f"{os.path.join(path, file)} file will be deleted.") 52 | os.remove(os.path.join(path, file)) 53 | logger.log(f"{os.path.join(path, file)} file has been deleted.") 54 | except Exception as e: 55 | raise e 56 | 57 | 58 | 59 | def get_logger_object_of_training(config_path: str, collection_name, execution_id=None, executed_by=None) -> AppLogger: 60 | config = read_params(config_path) 61 | database_name = config['log_database']['training_database_name'] 62 | if execution_id is None: 63 | execution_id = str(uuid.uuid4()) 64 | if executed_by is None: 65 | executed_by = "Avnish Yadav" 66 | logger = AppLogger(project_id=5, log_database=database_name, log_collection_name=collection_name, 67 | execution_id=execution_id, executed_by=executed_by) 68 | return logger 69 | 70 | 71 | def get_logger_object_of_prediction(config_path: str, collection_name, execution_id=None, 72 | executed_by=None) -> AppLogger: 73 | config = read_params(config_path) 74 | database_name = config['log_database']['prediction_database_name'] 75 | if execution_id is None: 76 | execution_id = str(uuid.uuid4()) 77 | if executed_by is None: 78 | executed_by = "Avnish Yadav" 79 | logger = AppLogger(project_id=5, log_database=database_name, log_collection_name=collection_name, 80 | execution_id=execution_id, executed_by=executed_by) 81 | return logger 82 | 83 | 84 | def read_params(config_path: str) -> dict: 85 | with open(config_path) as yaml_file: 86 | config = yaml.safe_load(yaml_file) 87 | return config 88 | 89 | 90 | def values_from_schema_function(schema_path): 91 | try: 92 | with open(schema_path, 'r') as r: 93 | dic = json.load(r) 94 | r.close() 95 | 96 | pattern = dic['SampleFileName'] 97 | length_of_date_stamp_in_file = dic['LengthOfDateStampInFile'] 98 | length_of_time_stamp_in_file = dic['LengthOfTimeStampInFile'] 99 | column_names = dic['ColName'] 100 | number_of_columns = dic['NumberofColumns'] 101 | return pattern, length_of_date_stamp_in_file, length_of_time_stamp_in_file, column_names, number_of_columns 102 | except ValueError: 103 | raise 104 | 105 | except KeyError: 106 | raise 107 | 108 | except Exception as e: 109 | raise e 110 | 111 |
--------------------------------------------------------------------------------
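For reference, a minimal sketch of how the training stages listed above might be chained from a single driver script. The `run_training.py` file name is illustrative and not part of the repository; the function signatures are taken from `training/stage_03_data_exporter.py` and `training/stage_04_model_trainer.py`.

```python
# run_training.py -- illustrative driver, assuming the package layout shown above.
import os

from training.stage_03_data_exporter import export_main
from training.stage_04_model_trainer import train_main

CONFIG_PATH = os.path.join("config", "params.yaml")

if __name__ == "__main__":
    # Stage 03: dump the validated training rows from MongoDB into the master csv.
    export_main(config_path=CONFIG_PATH)
    # Stage 04: fit the preprocessing pipeline + RandomForestRegressor and persist both.
    train_main(config_path=CONFIG_PATH, datasource=None)
```

Each stage builds its own logger through `get_logger_object_of_training`, so nothing beyond `config/params.yaml` needs to be shared between the two calls.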