├── Project21
│   └── Emojify
│       └── readme.md
├── Project41
│   └── DBSCAN
│       └── readme.md
├── Project26
│   └── FIFA Capacity LARS
│       └── readme.md
├── Project42
│   └── OPTICS_clustering
│       └── readme.md
├── Project20
│   └── Chatbot_using_seq2seq
│       └── readme.md
├── Project34
│   └── Chatbot_with_attention
│       └── readme.md
├── Project35
│   └── Memory Network Chatbots
│       └── readme.md
├── Project38
│   └── Affinity Propagation
│       └── readme.md
├── Project39
│   └── meanshift_clustering
│       └── readme.md
├── Project43
│   └── Spectral Co Clustering
│       ├── readme.md
│       └── Spectral Co Clustering from scratch.ipynb
├── Project44
│   └── NGBoost_implementation
│       ├── readme.md
│       ├── Sample_Submission.xlsx
│       ├── Test.csv
│       └── Train.csv
├── Project9
│   └── YOLO_Object_Detection
│       ├── readme.md
│       ├── yolo-coco
│       │   ├── readme.md
│       │   ├── coco.names
│       │   └── yolov3.cfg
│       ├── sample images
│       │   ├── readme.md
│       │   └── bean and teddy.jpg
│       ├── yolo_six_lines.py
│       └── yolo_wrapper.py
├── Project10
│   └── Voice_Recognition_Adaboost
│       ├── readme.md
│       └── configuration_sheet.xlsx
├── Project13
│   └── Spam_Or_Ham_MultinomialNB
│       ├── readme.md
│       └── multinomial NB.ipynb
├── Project14
│   └── Pulsar_star_prediction_gbm
│       └── readme.md
├── Project24
│   └── Graduate_Admission_Lasso
│       └── readme.md
├── Project25
│   └── Facebook Metrics Elastic Net
│       └── readme.md
├── Project37
│   └── Nifty50_volatility_forecast
│       └── readme.md
├── Project15
│   └── Lower_back_pain_detection_KNN
│       ├── readme.md
│       └── Lower Back pain detection.ipynb
├── Project16
│   └── Parkinsons_classification_SVM
│       └── readme.md
├── Project17
│   └── Stumble Upon Bagging Classifier
│       ├── readme.md
│       └── Bagging Classifier.ipynb
├── Project18
│   └── Quality_detection_Decision_trees
│       ├── readme.md
│       └── Wine_quality_Decision_Trees.ipynb
├── Project22
│   └── House Price Prediction Regression
│       ├── readme.md
│       └── Linear Regression.ipynb
├── Project23
│   └── Insurance_claim_prediction_Lasso
│       └── readme.md
├── Project28
│   └── Air_Quality_Bayesian_Regression
│       ├── readme.md
│       └── Bayesian Regression.ipynb
├── Project30
│   └── world_war2_weather_SGDRegressor
│       ├── readme.md
│       └── SGDRegressor.ipynb
├── Project32
│   └── Logistic_Regression_credit_card
│       └── readme.md
├── Project11
│   └── Forest_Cover_Prediction_Random_Forests
│       └── readme.md
├── Project12
│   └── Fraud_detection_Extra_tree_classifier
│       └── readme.md
├── Project40
│   └── agglomerative_hierarchial_clustering
│       └── readme.md
├── Project33
│   └── Neural machine Translation with Attention
│       └── readme.md
├── Project29
│   └── weather_prediction_passive_aggressive_regression
│       ├── readme.md
│       └── Passive Aggressive Regression.ipynb
├── Project31
│   └── House_Price_Revisted_Gaussian_Process_Regression
│       ├── readme.md
│       └── Gaussian Process Regression.ipynb
├── Project7
│   └── Machine Translation using Seq2Seq architecture
│       └── readme.md
├── Project19
│   └── Instant_gratification_QDA_LDA
│       ├── readme.md
│       └── QDA_LDA.ipynb
├── Project1
│   └── Transfer_Learning_VGG16
│       ├── param1.jpg
│       ├── config1.png
│       ├── config2.png
│       ├── architecture_vgg.jpg
│       ├── utilities_to_run_code
│       │   ├── configuration.xlsx
│       │   ├── readme.md
│       │   └── monkey_labels.txt
│       └── readme.md
├── Project2
│   └── Neural Style Transfer
│       ├── styles
│       │   ├── scream.jpg
│       │   ├── facepaint.jpg
│       │   ├── flamenco.jpg
│       │   ├── lacquer.jpg
│       │   ├── pablopicaso.jpg
│       │   ├── starrynight.jpg
│       │   └── oil_painting_style.jpg
│       ├── contents
│       │   ├── beach.jpg
│       │   ├── scenic view.jpg
│       │   └── taylorswift.jpg
│       ├── configuration_sheet.xlsx
│       └── readme.md
├── Project3
│   └── Intel Image Classification
│       ├── configuration_sheet.xlsx
│       └── readme.md
├── Project36
│   └── Spectral_Clustering
│       └── readme.md
├── Project5
│   └── Sentiment Analysis using Bidirectional LSTM
│       └── readme.md
├── Project8
│   └── Attention mechanism to classify News
│       └── readme.md
├── Project4
│   └── Stock Price Prediction using LSTM
│       └── readme.md
├── README.md
├── Project27
│   └── House Price Revisited OMP vs Other regression methods
│       └── readme.md
└── Project6
    └── Text Generation from Taylor Swift's songs
        └── readme.md
/Project21/Emojify/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project41/DBSCAN/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project26/FIFA Capacity LARS/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project42/OPTICS_clustering/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project20/Chatbot_using_seq2seq/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project34/Chatbot_with_attention/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project35/Memory Network Chatbots/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project38/Affinity Propagation/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project39/meanshift_clustering/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project43/Spectral Co Clustering/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project44/NGBoost_implementation/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project9/YOLO_Object_Detection/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project10/Voice_Recognition_Adaboost/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project13/Spam_Or_Ham_MultinomialNB/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project14/Pulsar_star_prediction_gbm/readme.md: -------------------------------------------------------------------------------- 1 | 2 |
-------------------------------------------------------------------------------- /Project24/Graduate_Admission_Lasso/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project25/Facebook Metrics Elastic Net/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project37/Nifty50_volatility_forecast/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project15/Lower_back_pain_detection_KNN/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project16/Parkinsons_classification_SVM/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project17/Stumble Upon Bagging Classifier/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project18/Quality_detection_Decision_trees/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project22/House Price Prediction Regression/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project23/Insurance_claim_prediction_Lasso/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project28/Air_Quality_Bayesian_Regression/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project30/world_war2_weather_SGDRegressor/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project32/Logistic_Regression_credit_card/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project9/YOLO_Object_Detection/yolo-coco/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project11/Forest_Cover_Prediction_Random_Forests/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project12/Fraud_detection_Extra_tree_classifier/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project40/agglomerative_hierarchial_clustering/readme.md: 
-------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project9/YOLO_Object_Detection/sample images/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project33/Neural machine Translation with Attention/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project29/weather_prediction_passive_aggressive_regression/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project31/House_Price_Revisted_Gaussian_Process_Regression/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project7/Machine Translation using Seq2Seq architecture/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Project19/Instant_gratification_QDA_LDA/readme.md: -------------------------------------------------------------------------------- 1 | Still a work in progress, not final. 2 | -------------------------------------------------------------------------------- /Project1/Transfer_Learning_VGG16/param1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darkprogrammerpb/DeepLearningProjects_when_I_was_a_noob/HEAD/Project1/Transfer_Learning_VGG16/param1.jpg -------------------------------------------------------------------------------- /Project1/Transfer_Learning_VGG16/config1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darkprogrammerpb/DeepLearningProjects_when_I_was_a_noob/HEAD/Project1/Transfer_Learning_VGG16/config1.png -------------------------------------------------------------------------------- /Project1/Transfer_Learning_VGG16/config2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darkprogrammerpb/DeepLearningProjects_when_I_was_a_noob/HEAD/Project1/Transfer_Learning_VGG16/config2.png -------------------------------------------------------------------------------- /Project2/Neural Style Transfer/styles/scream.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darkprogrammerpb/DeepLearningProjects_when_I_was_a_noob/HEAD/Project2/Neural Style Transfer/styles/scream.jpg -------------------------------------------------------------------------------- /Project2/Neural Style Transfer/contents/beach.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darkprogrammerpb/DeepLearningProjects_when_I_was_a_noob/HEAD/Project2/Neural Style Transfer/contents/beach.jpg -------------------------------------------------------------------------------- /Project2/Neural Style Transfer/styles/facepaint.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Darkprogrammerpb/DeepLearningProjects_when_I_was_a_noob/HEAD/Project2/Neural Style Transfer/styles/facepaint.jpg -------------------------------------------------------------------------------- /Project2/Neural Style Transfer/styles/flamenco.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darkprogrammerpb/DeepLearningProjects_when_I_was_a_noob/HEAD/Project2/Neural Style Transfer/styles/flamenco.jpg -------------------------------------------------------------------------------- /Project2/Neural Style Transfer/styles/lacquer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darkprogrammerpb/DeepLearningProjects_when_I_was_a_noob/HEAD/Project2/Neural Style Transfer/styles/lacquer.jpg -------------------------------------------------------------------------------- /Project1/Transfer_Learning_VGG16/architecture_vgg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darkprogrammerpb/DeepLearningProjects_when_I_was_a_noob/HEAD/Project1/Transfer_Learning_VGG16/architecture_vgg.jpg -------------------------------------------------------------------------------- /Project2/Neural Style Transfer/styles/pablopicaso.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darkprogrammerpb/DeepLearningProjects_when_I_was_a_noob/HEAD/Project2/Neural Style Transfer/styles/pablopicaso.jpg -------------------------------------------------------------------------------- /Project2/Neural Style Transfer/styles/starrynight.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darkprogrammerpb/DeepLearningProjects_when_I_was_a_noob/HEAD/Project2/Neural Style Transfer/styles/starrynight.jpg -------------------------------------------------------------------------------- /Project2/Neural Style Transfer/configuration_sheet.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darkprogrammerpb/DeepLearningProjects_when_I_was_a_noob/HEAD/Project2/Neural Style Transfer/configuration_sheet.xlsx -------------------------------------------------------------------------------- /Project2/Neural Style Transfer/contents/scenic view.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darkprogrammerpb/DeepLearningProjects_when_I_was_a_noob/HEAD/Project2/Neural Style Transfer/contents/scenic view.jpg -------------------------------------------------------------------------------- /Project2/Neural Style Transfer/contents/taylorswift.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darkprogrammerpb/DeepLearningProjects_when_I_was_a_noob/HEAD/Project2/Neural Style Transfer/contents/taylorswift.jpg -------------------------------------------------------------------------------- /Project44/NGBoost_implementation/Sample_Submission.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darkprogrammerpb/DeepLearningProjects_when_I_was_a_noob/HEAD/Project44/NGBoost_implementation/Sample_Submission.xlsx -------------------------------------------------------------------------------- 
/Project10/Voice_Recognition_Adaboost/configuration_sheet.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darkprogrammerpb/DeepLearningProjects_when_I_was_a_noob/HEAD/Project10/Voice_Recognition_Adaboost/configuration_sheet.xlsx -------------------------------------------------------------------------------- /Project2/Neural Style Transfer/styles/oil_painting_style.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darkprogrammerpb/DeepLearningProjects_when_I_was_a_noob/HEAD/Project2/Neural Style Transfer/styles/oil_painting_style.jpg -------------------------------------------------------------------------------- /Project3/Intel Image Classification/configuration_sheet.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darkprogrammerpb/DeepLearningProjects_when_I_was_a_noob/HEAD/Project3/Intel Image Classification/configuration_sheet.xlsx -------------------------------------------------------------------------------- /Project9/YOLO_Object_Detection/sample images/bean and teddy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darkprogrammerpb/DeepLearningProjects_when_I_was_a_noob/HEAD/Project9/YOLO_Object_Detection/sample images/bean and teddy.jpg -------------------------------------------------------------------------------- /Project36/Spectral_Clustering/readme.md: -------------------------------------------------------------------------------- 1 | The necessary theory and explanation of spectral clustering can be found in my Medium blog :- 2 | 3 | https://medium.com/@darkprogrammerpb/spectral-clustering-cdc224001433 4 | -------------------------------------------------------------------------------- /Project5/Sentiment Analysis using Bidirectional LSTM/readme.md: -------------------------------------------------------------------------------- 1 | The data source is :- https://www.kaggle.com/crowdflower/first-gop-debate-twitter-sentiment 2 | A bidirectional LSTM is used for analysing positive and negative tweets. 3 | -------------------------------------------------------------------------------- /Project8/Attention mechanism to classify News/readme.md: -------------------------------------------------------------------------------- 1 | The data is obtained from the source https://www.kaggle.com/yufengdev/bbc-fulltext-and-category 2 | Attention models are used to classify the categories of news articles. 3 | -------------------------------------------------------------------------------- /Project1/Transfer_Learning_VGG16/utilities_to_run_code/configuration.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darkprogrammerpb/DeepLearningProjects_when_I_was_a_noob/HEAD/Project1/Transfer_Learning_VGG16/utilities_to_run_code/configuration.xlsx -------------------------------------------------------------------------------- /Project4/Stock Price Prediction using LSTM/readme.md: -------------------------------------------------------------------------------- 1 | The data source is :- https://www.kaggle.com/szrlee/stock-time-series-20050101-to-20171231 2 | We are considering the stock prices of Amazon.
I have also uploaded the data used in the code in the repository 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepLearningProjects 2 | 3 | These projects are pretty old and I have not updated them yet, because I am too busy to maintain a GitHub repository of my projects and competition code. 4 | 5 | I did these projects when I started learning data science. A lot has changed over the years, so refer to these notebooks with caution. 6 | -------------------------------------------------------------------------------- /Project1/Transfer_Learning_VGG16/utilities_to_run_code/readme.md: -------------------------------------------------------------------------------- 1 | # Data for the code :- 2 | The data is obtained from the following Kaggle link:- https://www.kaggle.com/slothkong/10-monkey-species 3 | There will be two folders. Both folders are to be saved in the same location as the Jupyter notebook and its configuration file. 4 | -------------------------------------------------------------------------------- /Project3/Intel Image Classification/readme.md: -------------------------------------------------------------------------------- 1 | We are working with a simple Convolutional Neural Network to predict the stated labels in the dataset :- 2 | https://www.kaggle.com/puneet6060/intel-image-classification 3 | 4 | An attempt is made to unbox the convolutional layers by visualizing what happens at each layer and how the final prediction is made. 5 | -------------------------------------------------------------------------------- /Project27/House Price Revisited OMP vs Other regression methods/readme.md: -------------------------------------------------------------------------------- 1 | We will look at a brief overview of what Orthogonal Matching Pursuit (OMP) means:- 2 | ![omp_1](https://user-images.githubusercontent.com/51089715/66738172-7d34fc00-ee8b-11e9-82c9-dd5f7765d68a.jpg) 3 | ![omp_2](https://user-images.githubusercontent.com/51089715/66738181-81611980-ee8b-11e9-9338-f8ead42d74c7.jpg) 4 | 5 | # Algorithmic implementation and some essential sklearn module parameters of OMP: 6 | ![omp_3](https://user-images.githubusercontent.com/51089715/66738189-83c37380-ee8b-11e9-97df-673cc257e8d9.jpg) 7 | -------------------------------------------------------------------------------- /Project6/Text Generation from Taylor Swift's songs/readme.md: -------------------------------------------------------------------------------- 1 | 2 | ![textgen_1](https://user-images.githubusercontent.com/51089715/63297065-2b8a4e00-c2ee-11e9-9200-72b732cd942c.jpg) 3 | ![textgen_2](https://user-images.githubusercontent.com/51089715/63297071-2fb66b80-c2ee-11e9-861f-a7bebb902e9c.jpg) 4 | ![textgen_3](https://user-images.githubusercontent.com/51089715/63297083-35ac4c80-c2ee-11e9-8ab9-9bd1cd98cb41.jpg) 5 | ![textgen_4](https://user-images.githubusercontent.com/51089715/63297089-393fd380-c2ee-11e9-831b-800859bef070.jpg) 6 | ![textgen_5](https://user-images.githubusercontent.com/51089715/63297094-3cd35a80-c2ee-11e9-9b36-323b6bc882b1.jpg) 7 | ![textgen_6](https://user-images.githubusercontent.com/51089715/63297098-3f35b480-c2ee-11e9-9895-ef5dcd0014ea.jpg) 8 | -------------------------------------------------------------------------------- /Project9/YOLO_Object_Detection/yolo_six_lines.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import
time 3 | import cv2 4 | import os 5 | import matplotlib 6 | matplotlib.rcParams['figure.figsize']= (5.0,5.0) 7 | import matplotlib.pyplot as plt 8 | from yolo_wrapper import * 9 | labels_path = os.getcwd()+'\\yolo-coco\\coco.names' ### load label path 10 | weights_path = os.getcwd()+'\\yolo-coco\\yolov3.weights' ### load weights path 11 | configs_path = os.getcwd()+'\\yolo-coco\\yolov3.cfg' ### Load configuration path 12 | test_image = os.getcwd()+'\\sample images\\bean and teddy.jpg' ### Load test image path 13 | yolo_class = Yolo_Implementation(labels_path,weights_path,configs_path,test_image) ### call wrapper library created 14 | yolo_class.yolo_non_max_suppress() ### Image created 15 | -------------------------------------------------------------------------------- /Project9/YOLO_Object_Detection/yolo-coco/coco.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush -------------------------------------------------------------------------------- /Project1/Transfer_Learning_VGG16/readme.md: -------------------------------------------------------------------------------- 1 | # VGG16 Architecture 2 | A typical VGG16 architecture looks like this. 3 | Each box in the Convolution and Pooling Block from Blocks 1 to 5, contains the Convolution Number, Image tensor(representing the height, width and features), kernal size and stride. 4 | 5 | ![architecture_vgg](https://user-images.githubusercontent.com/51089715/61504390-c98dae80-a9f8-11e9-8596-f38e73b4cb67.jpg) 6 | 7 | # A brief overview of Training Data, Batch, Epochs and Batch size 8 | ![param1](https://user-images.githubusercontent.com/51089715/61504563-749e6800-a9f9-11e9-816d-3c88bbf63130.jpg) 9 | 10 | In the code, I have used a configuration sheet which contains all the parameters needed as an input for both the model as well as for data augmentation. 
A snippet of the configuration file (saved as configuration.xlsx) is shown below:- 11 | ![config2](https://user-images.githubusercontent.com/51089715/61504631-b9c29a00-a9f9-11e9-9a01-d243f812e36f.png) 12 | ![config1](https://user-images.githubusercontent.com/51089715/61504632-b9c29a00-a9f9-11e9-93c9-b63e7c3d4492.png) 13 | -------------------------------------------------------------------------------- /Project1/Transfer_Learning_VGG16/utilities_to_run_code/monkey_labels.txt: -------------------------------------------------------------------------------- 1 | Label, Latin Name , Common Name , Train Images , Validation Images 2 | n0 , alouatta_palliata , mantled_howler , 131 , 26 3 | n1 , erythrocebus_patas , patas_monkey , 139 , 28 4 | n2 , cacajao_calvus , bald_uakari , 137 , 27 5 | n3 , macaca_fuscata , japanese_macaque , 152 , 30 6 | n4 , cebuella_pygmea , pygmy_marmoset , 131 , 26 7 | n5 , cebus_capucinus , white_headed_capuchin , 141 , 28 8 | n6 , mico_argentatus , silvery_marmoset , 132 , 26 9 | n7 , saimiri_sciureus , common_squirrel_monkey , 142 , 28 10 | n8 , aotus_nigriceps , black_headed_night_monkey , 133 , 27 11 | n9 , trachypithecus_johnii , nilgiri_langur , 132 , 26 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /Project2/Neural Style Transfer/readme.md: -------------------------------------------------------------------------------- 1 | ### Sample demonstration of Neural Style Transfer (Using VGG19) 2 | ![examplestyletransfer](https://user-images.githubusercontent.com/51089715/62014529-ca280100-b1bf-11e9-973e-85ea5c2c9f2b.PNG) 3 | 4 | # Neural Style Transfer explained 5 | 6 | ![style transfer_1](https://user-images.githubusercontent.com/51089715/62105102-14dc7280-b2bf-11e9-85c2-c65d9f871db2.jpg) 7 | 8 | ![style transfer_2](https://user-images.githubusercontent.com/51089715/62105105-15750900-b2bf-11e9-95c8-2ed2d57b9c6c.jpg) 9 | 10 | ![style transfer_3](https://user-images.githubusercontent.com/51089715/62105104-14dc7280-b2bf-11e9-8108-c0575870762f.jpg) 11 | 12 | ![style transfer_4](https://user-images.githubusercontent.com/51089715/62105103-14dc7280-b2bf-11e9-83f3-2fb8943406c9.jpg) 13 | 14 | 15 | # Citation 16 | 1. A Neural Algorithm of Artistic Style (Leon A. Gatys, Alexander S. Ecker, Matthias Bethge) 17 | Link :- https://arxiv.org/pdf/1508.06576.pdf 18 | 2. 
Understanding Deep Image Representations by Inverting them (Aravindh Mahendran, Andrea Vedaldi) 19 | Link :- https://arxiv.org/pdf/1412.0035.pdf 20 | -------------------------------------------------------------------------------- /Project9/YOLO_Object_Detection/yolo_wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import cv2 4 | import os 5 | import matplotlib 6 | matplotlib.rcParams['figure.figsize']= (5.0,5.0) 7 | import matplotlib.pyplot as plt 8 | 9 | 10 | class Yolo_Implementation(object): 11 | def __init__(self,labels_path,weights_path,config_path,test_image,score_threshold=0.1,nms_threshold=0.2): 12 | self.score_threshold = score_threshold 13 | self.nms_threshold = nms_threshold 14 | self.weights_path = weights_path 15 | self.config_path = config_path 16 | self.image = cv2.imread(test_image) 17 | (self.H, self.W) = self.image.shape[:2] 18 | self.labels = open(labels_path).read().strip().split("\n") 19 | self.colors = np.random.randint(0, 255, size=(len(self.labels), 3),dtype="uint8") 20 | 21 | def build_model(self): 22 | model = cv2.dnn.readNetFromDarknet(self.config_path,self.weights_path) 23 | blob = cv2.dnn.blobFromImage(self.image, 1 / 255.0, (480, 480),swapRB=True, crop=False) 24 | layers_yolo = model.getLayerNames() 25 | yolo_layers_needed = [layers_yolo[i[0]-1] for i in model.getUnconnectedOutLayers()] 26 | retval = model.setInput(blob) 27 | layer_outputs = model.forward(yolo_layers_needed) 28 | return layer_outputs 29 | 30 | def yolo_filter_boxes(self): 31 | boxes = [] 32 | probabilities = [] 33 | classIDs = [] 34 | layer_outputs = self.build_model() 35 | for output in layer_outputs: 36 | for detection in output: 37 | if detection[4]>0.0: ### Detecting the presence of object 38 | scores = detection[5:] ### Capturing the probabilities of corresponding class ID 39 | classid = np.argmax(scores) ### Finding the class ID with maximum probability 40 | prob = np.max(scores) ### Finding maximum probability 41 | if prob > self.score_threshold: ### Thresholding to filter yolo boxes (score threshold) 42 | box = detection[0:4]*np.array([self.W,self.H,self.W,self.H]) 43 | (centerX, centerY, width, height) = box.astype("int") 44 | x = int(centerX - (width / 2)) 45 | y = int(centerY - (height / 2)) 46 | boxes.append([x, y, int(width), int(height)]) 47 | probabilities.append(float(prob)) 48 | classIDs.append(classid) 49 | return boxes,probabilities,classIDs 50 | 51 | def yolo_non_max_suppress(self): 52 | boxes,probabilities,classIDs = self.yolo_filter_boxes() 53 | idxs = cv2.dnn.NMSBoxes(boxes, probabilities,self.score_threshold,self.nms_threshold) 54 | if len(idxs)>0: 55 | for i in idxs.flatten(): 56 | (x,y) = (boxes[i][0],boxes[i][1]) 57 | (w,h) = (boxes[i][2],boxes[i][3]) 58 | color = [int(c) for c in self.colors[classIDs[i]]] 59 | cv2.rectangle(self.image, (x, y), (x + w, y + h), color, 2) 60 | text = "{}: {:.4f}".format(self.labels[classIDs[i]], 100*round(probabilities[i],4)) 61 | cv2.putText(self.image, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX,0.5, color, 1) 62 | plt.figure(figsize=(20,20)); 63 | plt.imshow(self.image[:,:,::-1]) 64 | plt.axis('off'); 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /Project44/NGBoost_implementation/Test.csv: -------------------------------------------------------------------------------- 1 | City,Location_Score,Internal_Audit_Score,External_Audit_Score,Fin_Score,Loss_score,Past_Results 2 | 
41,18.272,13,12,9,7,0 3 | 17,64.799,6,10,7,4,1 4 | 31,68.89,3,3,7,8,0 5 | 3,16.492,15,10,7,4,1 6 | 9,17.178,7,3,8,3,1 7 | 30,64.151,5,8,5,8,0 8 | 23,35.149,6,7,4,3,1 9 | 31,63.258,5,6,3,5,0 10 | 29,6.163,7,14,3,4,0 11 | 5,62.562,10,14,5,3,0 12 | 35,14.521,11,6,3,5,0 13 | 4,64.968,4,7,3,4,1 14 | 23,13.614,8,4,7,7,1 15 | 37,7.754,6,7,4,5,0 16 | 12,16.445,6,15,15,5,1 17 | 37,69.472,5,3,3,6,1 18 | 9,18.84,6,5,8,4,0 19 | 37,31.485,8,3,7,8,1 20 | 20,11.418,7,9,7,8,0 21 | 10,20.22,13,14,12,6,1 22 | 38,14.292,5,8,5,5,1 23 | 22,8.949,6,4,5,8,1 24 | 40,60.878,3,11,7,4,0 25 | 37,71.927,5,7,8,4,1 26 | 5,74.542,8,7,5,6,1 27 | 9,21.54,8,15,10,4,1 28 | 6,14.902,7,4,8,3,1 29 | 41,12.679,10,15,11,3,1 30 | 40,71.995,10,14,4,3,0 31 | 41,70.479,3,6,3,3,1 32 | 4,75.694,6,8,4,5,1 33 | 26,15.29,15,8,8,4,1 34 | 23,10.552,11,5,4,5,1 35 | 23,9.689,12,10,13,3,2 36 | 37,59.47,6,6,4,7,0 37 | 23,67.137,5,7,7,6,0 38 | 28,9.16,15,11,15,3,1 39 | 37,10.364,5,5,7,3,1 40 | 41,70.015,8,7,4,5,1 41 | 37,76.166,6,3,5,6,1 42 | 40,11.909,15,3,3,8,1 43 | 9,11.532,4,4,6,6,1 44 | 40,8.813,6,6,4,7,1 45 | 31,25.807,3,6,6,6,0 46 | 34,11.649,11,12,12,3,1 47 | 2,69.585,6,8,6,6,0 48 | 40,34.46,5,5,5,8,0 49 | 13,7.14,10,10,14,5,1 50 | 1,18.098,11,7,5,3,0 51 | 23,61.045,3,11,8,6,0 52 | 41,75.06,10,4,4,4,0 53 | 9,20.981,11,5,8,7,1 54 | 6,75.149,5,6,5,8,0 55 | 31,22.93,8,13,9,6,1 56 | 37,8.929,3,8,5,8,0 57 | 4,70.452,7,3,3,6,1 58 | 40,14.09,10,9,14,5,2 59 | 8,76.782,8,8,6,3,0 60 | 9,36.991,5,4,8,7,0 61 | 23,9.347,6,8,8,8,1 62 | 21,17.141,5,4,5,3,1 63 | 39,69.422,6,3,7,6,1 64 | 19,20.011,7,8,6,8,1 65 | 13,63.497,4,15,3,3,1 66 | 9,10.867,3,6,3,4,1 67 | 9,59.895,8,6,4,8,0 68 | 40,14.722,13,12,5,6,1 69 | 40,16.382,5,6,3,3,0 70 | 9,21.779,7,6,4,8,0 71 | 38,23.923,6,6,4,5,0 72 | 2,18.137,3,3,8,3,0 73 | 37,20.891,10,10,10,7,0 74 | 10,14.552,10,6,11,4,5 75 | 2,24.254,15,11,13,8,2 76 | 38,9.483,8,5,8,7,0 77 | 0,19.968,7,10,12,8,1 78 | 9,16.489,8,5,7,5,1 79 | 40,17.641,10,10,3,8,1 80 | 38,61.137,6,5,5,8,0 81 | 28,17.913,11,4,13,7,0 82 | 11,12.582,11,12,10,5,0 83 | 11,69.736,8,11,8,4,1 84 | 1,22.301,7,6,8,4,0 85 | 37,77.668,8,5,3,3,0 86 | 12,77.497,5,5,6,8,0 87 | 37,67.345,4,3,7,3,0 88 | 39,20.108,10,9,10,7,1 89 | 23,7.627,7,8,3,3,0 90 | 22,20.249,4,7,8,6,1 91 | 36,70.052,5,9,6,3,1 92 | 37,34.943,7,4,8,6,0 93 | 40,13.047,8,7,14,5,1 94 | 41,7.052,11,4,4,3,1 95 | 37,61.237,8,4,6,6,0 96 | 5,12.156,9,8,10,6,0 97 | 3,11.083,14,11,6,5,0 98 | 2,76.327,8,7,6,8,1 99 | 9,61.427,7,3,6,6,0 100 | 9,28.466,8,3,8,4,1 101 | 40,7.724,12,5,4,8,0 102 | 22,7.278,8,6,4,6,0 103 | 37,62.974,8,5,4,4,1 104 | 22,20.781,12,13,11,6,1 105 | 3,67.933,4,10,4,8,1 106 | 41,13.286,6,3,8,4,0 107 | 13,7.108,11,14,14,4,0 108 | 6,22.053,4,7,3,6,1 109 | 5,20.186,13,15,15,6,0 110 | 38,19.025,3,5,5,8,0 111 | 30,23.931,10,13,14,8,0 112 | 2,62.952,4,6,8,7,1 113 | 6,19.707,7,4,6,5,1 114 | 40,5.808,7,11,7,8,1 115 | 31,68.645,7,3,7,5,1 116 | 31,64.455,6,8,3,5,1 117 | 23,68.512,7,12,8,5,0 118 | 22,14.062,10,12,10,7,1 119 | 2,74.233,7,12,6,3,1 120 | 38,20.05,4,14,13,6,1 121 | 3,58.576,10,15,10,5,0 122 | 2,16.898,8,4,3,5,0 123 | 4,13.668,11,13,3,6,1 124 | 41,59.899,3,6,5,4,0 125 | 9,32.142,4,3,6,4,0 126 | 12,67.397,8,6,3,6,1 127 | 8,11.921,10,14,14,5,1 128 | 10,18.416,3,5,5,8,0 129 | 41,17.444,10,6,4,5,0 130 | 3,73.677,11,4,5,6,0 131 | 0,6.041,10,3,5,6,1 132 | 40,6.917,5,4,8,3,0 133 | 12,64.758,4,6,4,8,0 134 | 41,64.43,4,4,4,3,1 135 | 37,58.964,6,7,3,6,0 136 | 1,17.652,11,14,6,7,2 137 | 19,65.905,9,11,8,4,0 138 | 37,17.567,9,8,8,4,0 139 | 23,22.81,7,6,6,5,0 140 | 2,67.802,8,5,6,4,1 141 | 
3,69.432,7,8,6,4,0 142 | 1,21.748,8,10,10,6,1 143 | 40,21.639,3,6,5,8,1 144 | 5,9.816,12,4,6,5,0 145 | 1,59.553,4,14,6,4,1 146 | 8,40.557,6,6,8,3,0 147 | 6,17.014,7,4,10,6,1 148 | 1,22.983,10,12,11,8,0 149 | 19,10.658,13,13,12,6,0 150 | 37,74.905,7,8,6,7,1 151 | 1,62.766,7,7,5,6,0 152 | 8,9.755,11,8,13,7,2 153 | 6,61.76,6,8,4,5,0 154 | 2,8.283,5,4,3,8,0 155 | 19,64.47,4,7,3,8,0 156 | 2,19.369,4,9,6,5,1 157 | 19,11.546,8,7,3,7,0 158 | 10,19.028,8,3,5,5,0 159 | 40,8.032,11,11,3,8,2 160 | 9,10.099,4,5,12,3,0 161 | 13,22.776,10,3,12,10,1 162 | 40,7.349,11,11,11,6,1 163 | 6,33.668,3,4,8,6,0 164 | 40,61.095,5,5,5,4,0 165 | 31,12.499,8,8,7,7,0 166 | 6,13.028,11,6,5,6,0 167 | 6,16.039,7,3,7,4,0 168 | 3,14.413,8,4,5,8,1 169 | 3,21.512,13,4,6,8,0 170 | 41,6.366,15,14,12,6,1 171 | 2,41.776,8,5,4,6,0 172 | 9,16.166,12,15,10,3,0 173 | 40,73.751,11,13,6,3,0 174 | 1,7.511,14,12,12,8,1 175 | 6,76.377,7,3,8,3,0 176 | 41,62.196,8,8,4,4,1 177 | 10,22.084,7,3,14,4,1 178 | 4,6.894,6,8,7,7,1 179 | 9,8.854,12,7,14,4,1 180 | 41,7.122,8,7,10,4,0 181 | 10,6.996,14,7,13,8,0 182 | 19,14.166,10,15,14,6,1 183 | 41,18.223,6,7,11,8,1 184 | 3,9.979,15,5,7,6,1 185 | 18,17.17,8,6,7,7,0 186 | 40,70.917,8,6,4,3,1 187 | 27,13.84,5,6,4,4,0 188 | 6,10.888,12,14,14,7,0 189 | 6,15.65,15,10,14,4,0 190 | 2,22.511,12,12,9,6,0 191 | 8,19.883,7,9,7,4,0 192 | 18,66.671,7,4,3,4,0 193 | 3,6.965,4,4,6,7,1 194 | 28,64.037,6,15,8,7,0 195 | 31,11.694,11,8,6,6,0 196 | 9,23.353,10,5,6,8,1 197 | 31,14.707,13,4,4,6,0 198 | 31,75.583,4,8,8,6,0 199 | 13,67.544,11,8,3,6,1 200 | 40,8.693,15,14,15,7,3 201 | 41,61.352,3,5,3,8,0 202 | 13,21.243,13,10,15,8,1 203 | 31,15.861,3,4,7,7,0 204 | 9,19.396,6,13,12,7,0 205 | 9,16.204,13,5,4,6,1 206 | 9,10.812,12,11,5,8,0 207 | 40,27.975,4,5,3,7,1 208 | 12,15.27,8,8,7,7,1 209 | 0,7.106,8,8,14,8,5 210 | 38,19.081,3,4,7,4,1 211 | 9,15.496,10,5,3,4,1 212 | 19,10.04,6,5,8,6,1 213 | 2,11.604,13,13,4,8,1 214 | 2,9.586,8,3,3,6,0 215 | 5,22.514,11,7,6,3,1 216 | 9,73.12,6,6,6,8,1 217 | 8,7.531,11,4,4,7,1 218 | 1,65.03,3,3,7,6,0 219 | 1,9.895,8,5,8,7,0 220 | 9,5.469,6,4,3,5,0 221 | 41,73.41,5,3,7,4,1 222 | 28,16.596,6,4,6,5,0 223 | 31,72.562,7,3,11,8,0 224 | 1,33.662,4,5,4,5,0 225 | 6,18.415,8,5,3,6,0 226 | 6,65.557,10,12,9,7,1 227 | 5,63.253,6,10,3,8,1 228 | 5,10.222,3,8,7,7,0 229 | 1,12.685,10,3,4,5,1 230 | 2,73.165,6,5,5,5,0 231 | 2,6.952,8,8,5,4,1 232 | 3,6.796,7,5,4,5,1 233 | 3,9.197,13,12,10,7,1 234 | 41,67.581,6,5,3,6,1 235 | -------------------------------------------------------------------------------- /Project43/Spectral Co Clustering/Spectral Co Clustering from scratch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Importing necessary libraries for analysis" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 8, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import math\n", 17 | "from scipy.linalg import *\n", 18 | "from sklearn.datasets import make_checkerboard\n", 19 | "from sklearn.cluster import KMeans\n", 20 | "import numpy as np\n", 21 | "import matplotlib.pyplot as plt\n", 22 | "%matplotlib inline\n" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Creating data for application of algorithm" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 23, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAQEAAAECCAYAAAD+eGJTAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAALt0lEQVR4nO3db4xlhVnH8e/PsmBdcLtIgQ3FooQY/8WlmaAJRjENDeIL4EVNedFsY+PyohhIqpHwprzRoCmgMYZkEdI1oTQ1gPCC2BJCgk0M7UI2sLgqTbNWYN0tLgG62paFxxdzF4dlZufuzL333PH5fpLJvffcO3OenOx+95x7z55JVSGprx8begBJwzICUnNGQGrOCEjNGQGpOSMgNTdIBJJcleRfk3w7yS1DzHCiJAeSPJ9kb5I9A85xX5LDSfYtWXZ2kseTvDi63ToHM92W5OXR9tqb5OoZz3RhkieT7E/yQpKbRssH21YnmWnQbbWazPo8gSQfAP4NuBJ4CfgWcH1V/fNMB3n/XAeAhap6deA5fgP4PvC3VfVLo2V/DhypqttH0dxaVX888Ey3Ad+vqi/Oao4TZtoGbKuqZ5OcBTwDXAt8hoG21Ulm+l0G3FarGWJP4DLg21X1nar6EfAV4JoB5phLVfUUcOSExdcAu0f3d7P4B2vomQZVVQer6tnR/TeB/cAFDLitTjLTXBsiAhcA/7Hk8UvMx4Yq4OtJnkmyc+hhTnBeVR2ExT9owLkDz3PcjUmeGx0uzPQQZakkFwGXAk8zJ9vqhJlgTrbVcoaIQJZZNg/nLl9eVR8Dfhv43GgXWCu7G7gY2A4cBO4YYogkZwIPAjdX1RtDzHCiZWaai221kiEi8BJw4ZLHHwFeGWCO96iqV0a3h4GHWTxsmReHRsebx487Dw88D1V1qKrerqp3gHsYYHsl2cTiX7b7q+qh0eJBt9VyM83DtjqZISLwLeCSJD+T5HTgU8CjA8zxriSbR2/kkGQz8Alg38m/a6YeBXaM7u8AHhlwFuDdv2DHXceMt1eSAPcC+6vqziVPDbatVppp6G21mpl/OgAw+ojkL4APAPdV1Z/MfIj3zvOzLP7rD3Aa8OWhZkryAHAFcA5wCPgC8PfAV4GfBr4LfLKqZvZG3QozXcHi7m0BB4Abjh+Lz2imXwf+EXgeeGe0+FYWj8EH2VYnmel6BtxWqxkkApLmh2cMSs0ZAak5IyA1ZwSk5oyA1NygEZjD03OdaUzONL55neu4ofcE5nHjONN4nGl88zoXMHwEJA1sXScLJbkK+EsWz/z7m6q6/WSv37R5S52x9fx3H7919HU2bd6y5vVPw7zMtOUnTn/3/tHXj7B5y9kDTvN+8zLTwf987d379dZRsmnzgNMsbx7mqh+8Rv3o6HL/eY/T1vpDRxcH+WuWXBwkyaMnuzjIGVvP55f/YNdaV9nK1R+bh/9dPf/+9M8eHHqEDeGH3/yrFZ9bz+GAFweR/h9YTwTm9eIgkk7BeiIw1sVBkuxMsifJnreOvr6O1UmahvVEYKyLg1TVrqpaqKqFeXjDTdJ7rScCc3dxEEmnbs2fDlTVsSQ3Al/j/y4O8sLEJpM0E2uOAEBVPQY8NqFZJA3AMwal5oyA1JwRkJozAlJzRkBqzghIzRkBqTkjIDVnBKTmjIDUnBGQmjMCUnNGQGrOCEjNGQGpOSMgNWcEpOaMgNScEZCaMwJSc0ZAas4ISM0ZAak5IyA1ZwSk5oyA1JwRkJozAlJz6/qFpEkOAG8CbwPHqmphEkNJmp11RWDkt6rq1Qn8HEkD8HBAam69ESjg60meSbJzEgNJmq31Hg5cXlWvJDkXeDzJv1TVU0tfMIrDToDTP3TeOlcnadLWtSdQVa+Mbg8DDwOXLfOaXVW1UFULmzZvWc/qJE3BmiOQZHOSs47fBz4B7JvUYJJmYz2HA+cBDyc5/nO+XFX/MJGpJM3MmiNQVd8BfmWCs0gagB8RSs0ZAak5IyA1ZwSk5oyA1JwRkJozAlJzRkBqzghIzRkBqTkjIDVnBKTmjIDU3CQuNDq2/z7yGnu/8nezXOWG9Xu/+UdDj7AhbP7IR4ceYUN4a+/pKz7nnoDUnBGQmjMCUnNGQGrOCEjNGQGpOSMgNWcEpOaMgNScEZCaMwJSc0ZAas4ISM0ZAam5VSOQ5L4kh5PsW7Ls7CSPJ3lxdLt1umNKmpZx9gS+BFx1wrJbgCeq6hLgidFjSRvQqhGoqqeAIycsvgbYPbq/G7h2wnNJmpG1vidwXlUdBBjdnju5kSTN0tQvL5ZkJ7ATgE1nTnt1kk7RWvcEDiXZBjC6PbzSC6tqV1UtVNVCTvvgGlcnaVrWGoFHgR2j+zuARyYzjqRZG+cjwgeAfwJ+LslLST4L3A5cmeRF4MrRY0kb0KrvCVTV9Ss89fEJzyJpAJ4xKDVnBKTmjIDUnBGQmjMCUnNGQGrOCEjNGQGpOSMgNWcEpOaMgNScEZCaMwJSc0ZAas4ISM0ZAak5IyA1ZwSk5oyA1JwRkJozAlJzRkBqzghIzRkBqTkjIDVnBKTmjIDUnBGQmjMCUnPj/Gry+5IcTrJvybLbkrycZO/o6+rpjilpWsbZE/gScNUyy++qqu2jr8cmO5akWVk1AlX1FHBkBrNIGsB63hO4Mclzo8OFrSu9KMnOJHuS7Klj/7OO1UmahrVG4G7gYmA7cBC4Y6UXVtWuqlqoqoWc9sE1rk7StKwpAlV1qKrerqp3gHuAyyY7lqRZWVMEkmxb8vA6YN9Kr5U0305b7QVJHgCuAM5J8hLwBeCKJNuBAg4AN0xxRklTtGoEqur6ZRbfO4VZJA3AMwal5oyA1JwRkJozAlJzRkBqbtVPByZp6/kf5nf+8PdnucoN67F93xt6hA3h6IvPDz3ChvDOD1Y+Zd89Aak5IyA1ZwSk5oyA1JwRkJozAlJzRkBqzghIzRkBqTkjIDVnBKTmjIDUnBGQmjMCUnNGQGrOCEjNGQGpOSMgNWcEpOaMgNScEZCaMwJSc6tGIMmFSZ5Msj/JC0luGi0/O8njSV4c3W6d/riSJm2cPYFjwOer6ueBXwM+l+QXgFuAJ6rqEuCJ0WNJG8yqEaiqg1X17Oj+m8B+4ALgGmD36GW7gWunNaSk6Tml9wSSXARcCjwNnFdVB2ExFMC5kx5O0vSNHYEkZwIPAjdX1Run8H07k+xJsueHb7y2lhklTdFYEUiyicUA3F9VD40WH0qybfT8NuDwct9bVbuqaqGqFs74Sd87lObNOJ8OBLgX2F9Vdy556lFgx+j+DuCRyY8nadrG+a3ElwOfBp5Psne07FbgduCrST4LfBf45HRGlDRNq0agqr4BZIWnPz7ZcSTNmmcMSs0ZAak5IyA1ZwSk5oyA1JwRkJozAlJzRkBqzghIzRkBqTkjIDVnBKTmjIDUnBGQmjMCUnNGQGrOCEjNGQGpOSMgNWcEpObGudrwxFz4oR/nrmt/cZar3LCOvV1Dj7AhbPnMwtAjbAiX/+rXVnzOPQGpOSMgNWcEpOaMgNScEZCaMw
JSc0ZAam6cX01+YZInk+xP8kKSm0bLb0vycpK9o6+rpz+upEkb52ShY8Dnq+rZJGcBzyR5fPTcXVX1xemNJ2naxvnV5AeBg6P7bybZD1ww7cEkzcYpvSeQ5CLgUuDp0aIbkzyX5L4kWyc8m6QZGDsCSc4EHgRurqo3gLuBi4HtLO4p3LHC9+1MsifJnv969dUJjCxpksaKQJJNLAbg/qp6CKCqDlXV21X1DnAPcNly31tVu6pqoaoWfuqccyY1t6QJGefTgQD3Avur6s4ly7ctedl1wL7Jjydp2sb5dOBy4NPA80n2jpbdClyfZDtQwAHghqlMKGmqxvl04BtAlnnqscmPI2nWPGNQas4ISM0ZAak5IyA1ZwSk5oyA1JwRkJozAlJzRkBqzghIzRkBqTkjIDVnBKTmjIDUnBGQmjMCUnNGQGrOCEjNGQGpOSMgNWcEpOaMgNScEZCaMwJSc0ZAas4ISM0ZAak5IyA1l6qa3cqS7wH/vmTROcCrMxtgPM40Hmca3zzM9dGq+vByT8w0Au9bebKnqhYGG2AZzjQeZxrfvM51nIcDUnNGQGpu6AjsGnj9y3Gm8TjT+OZ1LmDg9wQkDW/oPQFJAzMCUnNGQGrOCEjNGQGpuf8F6Tc6D2c23VMAAAAASUVORK5CYII=\n", 40 | "text/plain": [ 41 | "
" 42 | ] 43 | }, 44 | "metadata": { 45 | "needs_background": "light" 46 | }, 47 | "output_type": "display_data" 48 | } 49 | ], 50 | "source": [ 51 | "n_clusters = (4,3)\n", 52 | "A,_,_ = make_checkerboard(shape = (30,30),n_clusters = n_clusters,shuffle=False,random_state=0)\n", 53 | "plt.matshow(A, cmap=plt.cm.Blues);\n" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## Creating the Row and Column diagonal matrices and also applying SVD on An" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 10, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "D_one_half = np.diag([np.power(A[i].sum(),-0.5) for i in range(A.shape[0])])\n", 70 | "D_two_half = np.diag([np.power(A.T[i].sum(),-0.5) for i in range(A.shape[1])])\n", 71 | "An = (D_one_half.dot(A)).dot(D_two_half)\n", 72 | "U,S,Vt = svd(An)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## Input parameters for the Co clustering" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 11, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "k = 4\n", 89 | "l = int(np.ceil(math.log(k,2)))" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "## Creating the matrix for application of kmeans algorithm" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 13, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "Ul = U[:,1:(l+1)]\n", 106 | "Vl = Vt.T[:,1:(l+1)]\n", 107 | "Z = np.vstack((Ul,Vl))" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## Applying KMeans on the data Z and performing clustering " 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 15, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,\n", 126 | " n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',\n", 127 | " random_state=None, tol=0.0001, verbose=0)" 128 | ] 129 | }, 130 | "execution_count": 15, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "kmeans = KMeans(n_clusters=k)\n", 137 | "kmeans.fit(Z)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 16, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "labels = kmeans.labels_\n", 147 | "row_clusters = labels[:A.shape[0]]\n", 148 | "column_clusters = labels[A.shape[0]:]" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "## We can clearly see the 4 clusters corresponding to the rows" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 21, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "array([3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,\n", 167 | " 1, 1, 2, 2, 2, 2, 2, 2])" 168 | ] 169 | }, 170 | "execution_count": 21, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "row_clusters" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "## We can clearly see the 3 clusters corresponding to the columns" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 22, 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "data": { 193 | 
"text/plain": [ 194 | "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,\n", 195 | " 3, 3, 3, 3, 3, 3, 3, 3])" 196 | ] 197 | }, 198 | "execution_count": 22, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "column_clusters" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [] 213 | } 214 | ], 215 | "metadata": { 216 | "kernelspec": { 217 | "display_name": "Python 3", 218 | "language": "python", 219 | "name": "python3" 220 | }, 221 | "language_info": { 222 | "codemirror_mode": { 223 | "name": "ipython", 224 | "version": 3 225 | }, 226 | "file_extension": ".py", 227 | "mimetype": "text/x-python", 228 | "name": "python", 229 | "nbconvert_exporter": "python", 230 | "pygments_lexer": "ipython3", 231 | "version": "3.7.4" 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 2 236 | } 237 | -------------------------------------------------------------------------------- /Project9/YOLO_Object_Detection/yolo-coco/yolov3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | 
[convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 
| activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 
| activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | 790 | -------------------------------------------------------------------------------- /Project28/Air_Quality_Bayesian_Regression/Bayesian Regression.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import warnings\n", 10 | "warnings.filterwarnings('ignore')\n", 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import os\n", 14 | "import scipy\n", 15 | "from sklearn.decomposition import PCA\n", 16 | "from sklearn.linear_model import BayesianRidge\n", 17 | "from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder\n", 18 | "from sklearn.model_selection import train_test_split, cross_val_score\n", 19 | "from sklearn.metrics import *\n", 20 | "import hyperopt\n", 21 | "from hyperopt import *\n", 22 | "from hyperopt import fmin, tpe, hp, space_eval\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "%matplotlib inline \n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "
\n", 32 | "Loading the data: We load the data from the mentioned path\n", 33 | "
" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 7, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/html": [ 44 | "
\n", 45 | "\n", 58 | "\n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | "
DateTimeCO(GT)PT08.S1(CO)NMHC(GT)C6H6(GT)PT08.S2(NMHC)NOx(GT)PT08.S3(NOx)NO2(GT)PT08.S4(NO2)PT08.S5(O3)TRHAH
02004-03-1018:00:002.61360.0015011.8817231045.50166.01056.25113.01692.001267.5013.648.8750010.757754
12004-03-1019:00:002.01292.251129.397165954.75103.01173.7592.01558.75972.2513.347.7000000.725487
\n", 118 | "
" 119 | ], 120 | "text/plain": [ 121 | " Date Time CO(GT) PT08.S1(CO) NMHC(GT) C6H6(GT) \\\n", 122 | "0 2004-03-10 18:00:00 2.6 1360.00 150 11.881723 \n", 123 | "1 2004-03-10 19:00:00 2.0 1292.25 112 9.397165 \n", 124 | "\n", 125 | " PT08.S2(NMHC) NOx(GT) PT08.S3(NOx) NO2(GT) PT08.S4(NO2) PT08.S5(O3) \\\n", 126 | "0 1045.50 166.0 1056.25 113.0 1692.00 1267.50 \n", 127 | "1 954.75 103.0 1173.75 92.0 1558.75 972.25 \n", 128 | "\n", 129 | " T RH AH \n", 130 | "0 13.6 48.875001 0.757754 \n", 131 | "1 13.3 47.700000 0.725487 " 132 | ] 133 | }, 134 | "execution_count": 7, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "path_of_input_file = r'D:\\kaggle_trials\\AirQualityUCI\\AirQualityUCI.xlsx'\n", 141 | "df = pd.read_excel(path_of_input_file)\n", 142 | "df.head(2)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "
\n", 150 | "Preprocessing data : We separate out the numerical and categorical columns from the data to be used for scaling and encoding respectively \n", 151 | "
" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 9, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "cols_needed = list(df.columns)\n", 161 | "cols_needed = cols_needed[2:len(cols_needed)-1]\n", 162 | "\n", 163 | "possible_numeric_cols = list(df._get_numeric_data().columns)\n", 164 | "\n", 165 | "categorical_columns = list(set(cols_needed)- set(possible_numeric_cols))\n", 166 | "\n", 167 | "numerical_columns = []\n", 168 | "for i in range(len(possible_numeric_cols)):\n", 169 | " col_name = possible_numeric_cols[i]\n", 170 | " if len(df[col_name].unique())<10:\n", 171 | " categorical_columns.append(col_name)\n", 172 | " else:\n", 173 | " numerical_columns.append(col_name)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "
\n", 181 | "Missing value Treatment: We impute the numerical missing values with their respective means and the categorical values with their modes.\n", 182 | "
" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 11, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "for i in range(len(categorical_columns)):\n", 192 | " df[categorical_columns[i]] = df[categorical_columns[i]].fillna(df[categorical_columns[i]].mode()[0])\n", 193 | "mean_impute_dict ={}\n", 194 | "for i in range(len(numerical_columns)):\n", 195 | " mean_impute_dict[numerical_columns[i]] = np.nanmean(np.float_(df[numerical_columns[i]].values))\n", 196 | "for i in range(len(numerical_columns)):\n", 197 | " df[numerical_columns[i]] = df[numerical_columns[i]].fillna(mean_impute_dict[numerical_columns[i]])" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "
\n", 205 | "Scaling and Encoding: We scale and one hot encode the data to get the matrix we need for calculations\n", 206 | "
" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 13, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "ohe = OneHotEncoder()\n", 216 | "scalar = MinMaxScaler()\n", 217 | "encoded_matrix = ohe.fit_transform(df[categorical_columns])\n", 218 | "scaled_matrix = scalar.fit_transform(df[numerical_columns])\n", 219 | "X_complete_matrix = scipy.sparse.hstack((encoded_matrix,scaled_matrix)).A\n", 220 | "Y = scalar.fit_transform(df[['AH']])\n" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "
\n", 228 | "Train Test Split : We split the data to train and test set \n", 229 | "
" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 14, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "X_train, X_test, y_train, y_test = train_test_split(X_complete_matrix, Y, test_size=0.2, random_state=42)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "
\n", 246 | "Parameter Tuning and setting Grid for parameters: We set up the grid for parameter tuning and then tune the parameters to get the optimal list of parameters to use\n", 247 | "
" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 15, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "Bayesian_ridge_grid = {'n_iter' : hp.choice('n_iter',range(300,600)),\n", 257 | " 'alpha_1': hp.uniform('alpha_1',0.0,1.0),\n", 258 | " 'alpha_2': hp.uniform('alpha_2',0.0,1.0),\n", 259 | " 'lambda_1': hp.uniform('lambda_1',0.0,1.0),\n", 260 | " 'lambda_2': hp.uniform('lambda_2',0.0,1.0),\n", 261 | " }" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 18, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "name": "stdout", 271 | "output_type": "stream", 272 | "text": [ 273 | "100%|████████████████████████████████████████████████| 100/100 [00:02<00:00, 34.63it/s, best loss: -0.9999999999995861]\n", 274 | "The best parameter tuned on training set is given by :- {'alpha_1': 0.7822556011911279, 'alpha_2': 0.0014753500365261268, 'lambda_1': 0.12542036284480557, 'lambda_2': 0.8048114623228889, 'n_iter': 333}\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "def hyperopt_train_test(params):\n", 280 | " reg = BayesianRidge(**params)\n", 281 | " return cross_val_score(reg, X_train, y_train).mean()\n", 282 | "\n", 283 | "def function_to_minimise(params):\n", 284 | " accuracy = hyperopt_train_test(params)\n", 285 | " return {'loss': -1*accuracy, 'status': STATUS_OK}\n", 286 | "\n", 287 | "\n", 288 | "trials = Trials()\n", 289 | "best = fmin(function_to_minimise, Bayesian_ridge_grid, algo=tpe.suggest, max_evals=100, trials=trials)\n", 290 | "best_parameters = space_eval(Bayesian_ridge_grid, best)\n", 291 | "print('The best parameter tuned on training set is given by :- ',best_parameters)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "
\n", 299 | "Implementing the model: We now implement the model with tuned parameters and get the R^2 score\n", 300 | "
" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 21, 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "data": { 310 | "text/plain": [ 311 | "BayesianRidge(alpha_1=0.7822556011911279, alpha_2=0.0014753500365261268,\n", 312 | " compute_score=False, copy_X=True, fit_intercept=True,\n", 313 | " lambda_1=0.12542036284480557, lambda_2=0.8048114623228889,\n", 314 | " n_iter=333, normalize=False, tol=0.001, verbose=False)" 315 | ] 316 | }, 317 | "execution_count": 21, 318 | "metadata": {}, 319 | "output_type": "execute_result" 320 | } 321 | ], 322 | "source": [ 323 | "model = BayesianRidge(**best_parameters)\n", 324 | "model.fit(X_train, y_train)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 22, 330 | "metadata": {}, 331 | "outputs": [ 332 | { 333 | "name": "stdout", 334 | "output_type": "stream", 335 | "text": [ 336 | "The coefficient of determination is:- 0.9999999999999205\n" 337 | ] 338 | } 339 | ], 340 | "source": [ 341 | "y_pred = model.predict(X_test)\n", 342 | "print('The coefficient of determination is:- ',r2_score(y_pred,y_test))" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [] 351 | } 352 | ], 353 | "metadata": { 354 | "kernelspec": { 355 | "display_name": "Python 3", 356 | "language": "python", 357 | "name": "python3" 358 | }, 359 | "language_info": { 360 | "codemirror_mode": { 361 | "name": "ipython", 362 | "version": 3 363 | }, 364 | "file_extension": ".py", 365 | "mimetype": "text/x-python", 366 | "name": "python", 367 | "nbconvert_exporter": "python", 368 | "pygments_lexer": "ipython3", 369 | "version": "3.7.4" 370 | } 371 | }, 372 | "nbformat": 4, 373 | "nbformat_minor": 2 374 | } 375 | -------------------------------------------------------------------------------- /Project29/weather_prediction_passive_aggressive_regression/Passive Aggressive Regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import warnings\n", 10 | "warnings.filterwarnings('ignore')\n", 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import os\n", 14 | "import scipy\n", 15 | "from sklearn.decomposition import PCA\n", 16 | "from sklearn.linear_model import PassiveAggressiveRegressor\n", 17 | "from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder\n", 18 | "from sklearn.model_selection import train_test_split, cross_val_score\n", 19 | "from sklearn.metrics import *\n", 20 | "import hyperopt\n", 21 | "from hyperopt import *\n", 22 | "from hyperopt import fmin, tpe, hp, space_eval\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "%matplotlib inline \n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "
\n", 32 | "Loading the data: We load the data from the mentioned path\n", 33 | "
" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/html": [ 44 | "
\n", 45 | "\n", 58 | "\n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | "
Formatted DateSummaryPrecip TypeTemperature (C)Apparent Temperature (C)HumidityWind Speed (km/h)Wind Bearing (degrees)Visibility (km)Loud CoverPressure (millibars)Daily Summary
02006-04-01 00:00:00.000 +0200Partly Cloudyrain9.4722227.3888890.8914.1197251.015.82630.01015.13Partly cloudy throughout the day.
12006-04-01 01:00:00.000 +0200Partly Cloudyrain9.3555567.2277780.8614.2646259.015.82630.01015.63Partly cloudy throughout the day.
\n", 109 | "
" 110 | ], 111 | "text/plain": [ 112 | " Formatted Date Summary Precip Type Temperature (C) \\\n", 113 | "0 2006-04-01 00:00:00.000 +0200 Partly Cloudy rain 9.472222 \n", 114 | "1 2006-04-01 01:00:00.000 +0200 Partly Cloudy rain 9.355556 \n", 115 | "\n", 116 | " Apparent Temperature (C) Humidity Wind Speed (km/h) \\\n", 117 | "0 7.388889 0.89 14.1197 \n", 118 | "1 7.227778 0.86 14.2646 \n", 119 | "\n", 120 | " Wind Bearing (degrees) Visibility (km) Loud Cover Pressure (millibars) \\\n", 121 | "0 251.0 15.8263 0.0 1015.13 \n", 122 | "1 259.0 15.8263 0.0 1015.63 \n", 123 | "\n", 124 | " Daily Summary \n", 125 | "0 Partly cloudy throughout the day. \n", 126 | "1 Partly cloudy throughout the day. " 127 | ] 128 | }, 129 | "execution_count": 2, 130 | "metadata": {}, 131 | "output_type": "execute_result" 132 | } 133 | ], 134 | "source": [ 135 | "path_of_input_file = r'D:\\kaggle_trials\\szeged-weather\\weatherHistory.csv'\n", 136 | "df = pd.read_csv(path_of_input_file)\n", 137 | "df.head(2)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "
\n", 145 | "Preprocessing data : We separate out the numerical and categorical columns from the data to be used for scaling and encoding respectively \n", 146 | "
" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 3, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "cols_needed = list(df.columns)\n", 156 | "cols_needed = cols_needed[1:len(cols_needed)-1]\n", 157 | "\n", 158 | "possible_numeric_cols = list(df._get_numeric_data().columns)\n", 159 | "possible_numeric_cols.remove('Humidity')\n", 160 | "categorical_columns = list(set(cols_needed)- set(possible_numeric_cols))\n", 161 | "\n", 162 | "numerical_columns = []\n", 163 | "for i in range(len(possible_numeric_cols)):\n", 164 | " col_name = possible_numeric_cols[i]\n", 165 | " if len(df[col_name].unique())<10:\n", 166 | " categorical_columns.append(col_name)\n", 167 | " else:\n", 168 | " numerical_columns.append(col_name)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "
\n", 176 | "Missing value Treatment: We impute the numerical missing values with their respective means and the categorical values with their modes.\n", 177 | "
" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 4, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "for i in range(len(categorical_columns)):\n", 187 | " df[categorical_columns[i]] = df[categorical_columns[i]].fillna(df[categorical_columns[i]].mode()[0])\n", 188 | "mean_impute_dict ={}\n", 189 | "for i in range(len(numerical_columns)):\n", 190 | " mean_impute_dict[numerical_columns[i]] = np.nanmean(np.float_(df[numerical_columns[i]].values))\n", 191 | "for i in range(len(numerical_columns)):\n", 192 | " df[numerical_columns[i]] = df[numerical_columns[i]].fillna(mean_impute_dict[numerical_columns[i]])" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "
\n", 200 | "Scaling and Encoding: We scale and one hot encode the data to get the matrix we need for calculations\n", 201 | "
" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 5, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "ohe = OneHotEncoder()\n", 211 | "scalar = MinMaxScaler()\n", 212 | "encoded_matrix = ohe.fit_transform(df[categorical_columns])\n", 213 | "scaled_matrix = scalar.fit_transform(df[numerical_columns])\n", 214 | "X_complete_matrix = scipy.sparse.hstack((encoded_matrix,scaled_matrix)).A\n", 215 | "Y = scalar.fit_transform(df[['Humidity']])\n" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "
\n", 223 | "Train Test Split : We split the data to train and test set \n", 224 | "
" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 6, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "X_train, X_test, y_train, y_test = train_test_split(X_complete_matrix, Y, test_size=0.2, random_state=42)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "
\n", 241 | "Parameter Tuning and setting Grid for parameters: We set up the grid for parameter tuning and then tune the parameters to get the optimal list of parameters to use\n", 242 | "
" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 11, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "Pars_grid = { 'max_iter': hp.choice('max_iter',range(2,200)),\n", 252 | " 'C': hp.uniform('C',0.0,1.0),\n", 253 | " 'loss' : hp.choice('loss',['epsilon_insensitive','squared_epsilon_insensitive'])\n", 254 | " \n", 255 | " }" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 12, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | "100%|████████████████████████████████████████████████| 100/100 [00:55<00:00, 1.80it/s, best loss: -0.9739912951898163]\n", 268 | "The best parameter tuned on training set is given by :- {'C': 0.8948999920414936, 'loss': 'epsilon_insensitive', 'max_iter': 36}\n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "def hyperopt_train_test(params):\n", 274 | " reg = PassiveAggressiveRegressor(**params,random_state=19)\n", 275 | " return cross_val_score(reg, X_train, y_train).mean()\n", 276 | "\n", 277 | "def function_to_minimise(params):\n", 278 | " accuracy = hyperopt_train_test(params)\n", 279 | " return {'loss': -1*accuracy, 'status': STATUS_OK}\n", 280 | "\n", 281 | "\n", 282 | "trials = Trials()\n", 283 | "best = fmin(function_to_minimise, Pars_grid, algo=tpe.suggest, max_evals=100, trials=trials)\n", 284 | "best_parameters = space_eval(Pars_grid, best)\n", 285 | "print('The best parameter tuned on training set is given by :- ',best_parameters)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "
\n", 293 | "Implementing the model: We now implement the model with tuned parameters and get the R^2 score\n", 294 | "
" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 13, 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "data": { 304 | "text/plain": [ 305 | "PassiveAggressiveRegressor(C=0.8948999920414936, average=False,\n", 306 | " early_stopping=False, epsilon=0.1,\n", 307 | " fit_intercept=True, loss='epsilon_insensitive',\n", 308 | " max_iter=36, n_iter_no_change=5, random_state=None,\n", 309 | " shuffle=True, tol=0.001, validation_fraction=0.1,\n", 310 | " verbose=0, warm_start=False)" 311 | ] 312 | }, 313 | "execution_count": 13, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "model = PassiveAggressiveRegressor(**best_parameters)\n", 320 | "model.fit(X_train, y_train)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 14, 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "name": "stdout", 330 | "output_type": "stream", 331 | "text": [ 332 | "The coefficient of determination is:- 0.9770641814885767\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "y_pred = model.predict(X_test)\n", 338 | "print('The coefficient of determination is:- ',r2_score(y_pred,y_test))" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [] 347 | } 348 | ], 349 | "metadata": { 350 | "kernelspec": { 351 | "display_name": "Python 3", 352 | "language": "python", 353 | "name": "python3" 354 | }, 355 | "language_info": { 356 | "codemirror_mode": { 357 | "name": "ipython", 358 | "version": 3 359 | }, 360 | "file_extension": ".py", 361 | "mimetype": "text/x-python", 362 | "name": "python", 363 | "nbconvert_exporter": "python", 364 | "pygments_lexer": "ipython3", 365 | "version": "3.7.4" 366 | } 367 | }, 368 | "nbformat": 4, 369 | "nbformat_minor": 2 370 | } 371 | -------------------------------------------------------------------------------- /Project30/world_war2_weather_SGDRegressor/SGDRegressor.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import warnings\n", 10 | "warnings.filterwarnings('ignore')\n", 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import os\n", 14 | "import scipy\n", 15 | "from sklearn.decomposition import PCA\n", 16 | "from sklearn.linear_model import SGDRegressor\n", 17 | "from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder\n", 18 | "from sklearn.model_selection import train_test_split, cross_val_score\n", 19 | "from sklearn.metrics import *\n", 20 | "import hyperopt\n", 21 | "from hyperopt import *\n", 22 | "from hyperopt import fmin, tpe, hp, space_eval\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "%matplotlib inline \n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "
\n", 32 | "Loading the data: We load the data from the mentioned path\n", 33 | "
" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 21, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/html": [ 44 | "
\n", 45 | "\n", 58 | "\n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | "
STAWindGustSpdMaxTempMinTempMeanTempSnowfallPoorWeatherYRMODADRSPDMAXMINMEA
010001NaN25.55555622.22222223.8888890NaN4271NaNNaN78.072.075.0
110001NaN28.88888921.66666725.5555560NaN4272NaNNaN84.071.078.0
\n", 118 | "
" 119 | ], 120 | "text/plain": [ 121 | " STA WindGustSpd MaxTemp MinTemp MeanTemp Snowfall PoorWeather \\\n", 122 | "0 10001 NaN 25.555556 22.222222 23.888889 0 NaN \n", 123 | "1 10001 NaN 28.888889 21.666667 25.555556 0 NaN \n", 124 | "\n", 125 | " YR MO DA DR SPD MAX MIN MEA \n", 126 | "0 42 7 1 NaN NaN 78.0 72.0 75.0 \n", 127 | "1 42 7 2 NaN NaN 84.0 71.0 78.0 " 128 | ] 129 | }, 130 | "execution_count": 21, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "path_of_input_file = r'D:\\kaggle_trials\\weatherww2\\Summary of Weather.csv'\n", 137 | "cols2read = ['STA','WindGustSpd','MaxTemp','MinTemp','MeanTemp'\n", 138 | " ,'Snowfall','PoorWeather','YR','MO','DA','DR',\n", 139 | " 'SPD','MAX','MIN','MEA']\n", 140 | "\n", 141 | "df = pd.read_csv(path_of_input_file,usecols= cols2read)\n", 142 | "df.head(2)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "
\n", 157 | "Preprocessing data : We separate out the numerical and categorical columns from the data to be used for scaling and encoding respectively \n", 158 | "
" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 22, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "cols_needed = list(df.columns)\n", 168 | "cols_needed = cols_needed[:len(cols_needed)-11]\n", 169 | "cols_needed.remove('MaxTemp')\n", 170 | "possible_numeric_cols = list(df._get_numeric_data().columns)\n", 171 | "possible_numeric_cols.remove('MaxTemp')\n", 172 | "categorical_columns = list(set(cols_needed)- set(possible_numeric_cols))\n", 173 | "\n", 174 | "numerical_columns = []\n", 175 | "for i in range(len(possible_numeric_cols)):\n", 176 | " col_name = possible_numeric_cols[i]\n", 177 | " if len(df[col_name].unique())<10:\n", 178 | " categorical_columns.append(col_name)\n", 179 | " else:\n", 180 | " numerical_columns.append(col_name)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "
\n", 188 | "Missing value Treatment: We impute the numerical missing values with their respective means and the categorical values with their modes.\n", 189 | "
" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 23, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "for i in range(len(categorical_columns)):\n", 199 | " df[categorical_columns[i]] = df[categorical_columns[i]].fillna(df[categorical_columns[i]].mode()[0])\n", 200 | "mean_impute_dict ={}\n", 201 | "for i in range(len(numerical_columns)):\n", 202 | " mean_impute_dict[numerical_columns[i]] = np.nanmean(np.float_(df[numerical_columns[i]].values))\n", 203 | "for i in range(len(numerical_columns)):\n", 204 | " df[numerical_columns[i]] = df[numerical_columns[i]].fillna(mean_impute_dict[numerical_columns[i]])" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "
\n", 212 | "Scaling and Encoding: We scale and one hot encode the data to get the matrix we need for calculations\n", 213 | "
" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 24, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "ohe = OneHotEncoder()\n", 223 | "scalar = MinMaxScaler()\n", 224 | "encoded_matrix = ohe.fit_transform(df[categorical_columns])\n", 225 | "scaled_matrix = scalar.fit_transform(df[numerical_columns])\n", 226 | "X_complete_matrix = scipy.sparse.hstack((encoded_matrix,scaled_matrix)).A\n", 227 | "Y = scalar.fit_transform(df[['MaxTemp']])\n" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "
\n", 235 | "Train Test Split : We split the data to train and test set \n", 236 | "
" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 25, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "X_train, X_test, y_train, y_test = train_test_split(X_complete_matrix, Y, test_size=0.2, random_state=42)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "
\n", 253 | "Parameter Tuning and setting Grid for parameters: We set up the grid for parameter tuning and then tune the parameters to get the optimal list of parameters to use\n", 254 | "
" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 27, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "SGDR_grid = {'loss' : hp.choice('loss',['squared_loss','huber','epsilon_insensitive','squared_epsilon_insensitive']),\n", 264 | " 'penalty' : hp.choice('penalty',['l2','l1','elasticnet','none']),\n", 265 | " 'alpha': hp.uniform('alpha',0.0,1.0),\n", 266 | " 'learning_rate': hp.choice('learning_rate',['constant','optimal','invscaling','adaptive'])\n", 267 | " \n", 268 | " }" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 28, 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "name": "stdout", 278 | "output_type": "stream", 279 | "text": [ 280 | "100%|████████████████████████████████████████████████| 100/100 [03:09<00:00, 1.90s/it, best loss: -0.9939970686605907]\n", 281 | "The best parameter tuned on training set is given by :- {'alpha': 0.055081188680586174, 'learning_rate': 'adaptive', 'loss': 'squared_loss', 'penalty': 'none'}\n" 282 | ] 283 | } 284 | ], 285 | "source": [ 286 | "def hyperopt_train_test(params):\n", 287 | " reg = SGDRegressor(**params)\n", 288 | " return cross_val_score(reg, X_train, y_train).mean()\n", 289 | "\n", 290 | "def function_to_minimise(params):\n", 291 | " accuracy = hyperopt_train_test(params)\n", 292 | " return {'loss': -1*accuracy, 'status': STATUS_OK}\n", 293 | "\n", 294 | "\n", 295 | "trials = Trials()\n", 296 | "best = fmin(function_to_minimise, SGDR_grid, algo=tpe.suggest, max_evals=100, trials=trials)\n", 297 | "best_parameters = space_eval(SGDR_grid, best)\n", 298 | "print('The best parameter tuned on training set is given by :- ',best_parameters)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "
\n", 306 | "Implementing the model: We now implement the model with tuned parameters and get the R^2 score\n", 307 | "
" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 29, 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "data": { 317 | "text/plain": [ 318 | "SGDRegressor(alpha=0.055081188680586174, average=False, early_stopping=False,\n", 319 | " epsilon=0.1, eta0=0.01, fit_intercept=True, l1_ratio=0.15,\n", 320 | " learning_rate='adaptive', loss='squared_loss', max_iter=1000,\n", 321 | " n_iter_no_change=5, penalty='none', power_t=0.25,\n", 322 | " random_state=None, shuffle=True, tol=0.001,\n", 323 | " validation_fraction=0.1, verbose=0, warm_start=False)" 324 | ] 325 | }, 326 | "execution_count": 29, 327 | "metadata": {}, 328 | "output_type": "execute_result" 329 | } 330 | ], 331 | "source": [ 332 | "model = SGDRegressor(**best_parameters)\n", 333 | "model.fit(X_train, y_train)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 30, 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "name": "stdout", 343 | "output_type": "stream", 344 | "text": [ 345 | "The coefficient of determination is:- 0.9947724644478078\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "y_pred = model.predict(X_test)\n", 351 | "print('The coefficient of determination is:- ',r2_score(y_pred,y_test))" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [] 360 | } 361 | ], 362 | "metadata": { 363 | "kernelspec": { 364 | "display_name": "Python 3", 365 | "language": "python", 366 | "name": "python3" 367 | }, 368 | "language_info": { 369 | "codemirror_mode": { 370 | "name": "ipython", 371 | "version": 3 372 | }, 373 | "file_extension": ".py", 374 | "mimetype": "text/x-python", 375 | "name": "python", 376 | "nbconvert_exporter": "python", 377 | "pygments_lexer": "ipython3", 378 | "version": "3.7.4" 379 | } 380 | }, 381 | "nbformat": 4, 382 | "nbformat_minor": 2 383 | } 384 | -------------------------------------------------------------------------------- /Project13/Spam_Or_Ham_MultinomialNB/multinomial NB.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "Using TensorFlow backend.\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "import warnings\n", 18 | "warnings.filterwarnings('ignore')\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "import os\n", 22 | "from imblearn.over_sampling import SMOTE \n", 23 | "from sklearn.naive_bayes import MultinomialNB\n", 24 | "from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder\n", 25 | "from sklearn.model_selection import train_test_split, cross_val_score\n", 26 | "from sklearn.metrics import *\n", 27 | "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n", 28 | "from sklearn.decomposition import PCA,TruncatedSVD\n", 29 | "import hyperopt\n", 30 | "from hyperopt import *\n", 31 | "from hyperopt import fmin, tpe, hp, space_eval\n", 32 | "import string\n", 33 | "import matplotlib.pyplot as plt\n", 34 | "%matplotlib inline \n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "
\n", 42 | "Loading dataset: We load the dataset and rename certain columns to be used in our analysis\n", 43 | "
" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/html": [ 54 | "
\n", 55 | "\n", 68 | "\n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | "
labelsdata
0hamGo until jurong point, crazy.. Available only ...
1hamOk lar... Joking wif u oni...
2spamFree entry in 2 a wkly comp to win FA Cup fina...
3hamU dun say so early hor... U c already then say...
\n", 99 | "
" 100 | ], 101 | "text/plain": [ 102 | " labels data\n", 103 | "0 ham Go until jurong point, crazy.. Available only ...\n", 104 | "1 ham Ok lar... Joking wif u oni...\n", 105 | "2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n", 106 | "3 ham U dun say so early hor... U c already then say..." 107 | ] 108 | }, 109 | "execution_count": 2, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "path_of_input_file = 'D:\\\\kaggle_trials\\\\sms-spam-collection-dataset\\\\spam.csv'\n", 116 | "df = pd.read_csv(path_of_input_file,encoding='ISO-8859-1')\n", 117 | "df = df.drop([\"Unnamed: 2\", \"Unnamed: 3\", \"Unnamed: 4\"], axis=1)\n", 118 | "df.columns = ['labels', 'data']\n", 119 | "df.head(4)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "
\n", 127 | "Imbalance check: We can clearly see that the data is imbalanced because there will be more usual mails than spam mails." 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 3, 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "name": "stdout", 137 | "output_type": "stream", 138 | "text": [ 139 | "The number of labels are 2\n" 140 | ] 141 | } 142 | ], 143 | "source": [ 144 | "num_labels = df['labels'].unique()\n", 145 | "print('The number of labels are ',len(num_labels))" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 4, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "name": "stdout", 155 | "output_type": "stream", 156 | "text": [ 157 | "The number of ham labels are :- 4825\n", 158 | "The number of spam labels are :- 747\n", 159 | "We dont have a balanced dataset and hence we need to perform imbalanced dataset handling\n" 160 | ] 161 | } 162 | ], 163 | "source": [ 164 | "for i in range(len(num_labels)):\n", 165 | " print('The number of ', num_labels[i] ,' labels are :- ',len(df[df['labels']==num_labels[i]]))\n", 166 | "print('We dont have a balanced dataset and hence we need to perform imbalanced dataset handling')" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "
\n", 174 | "Label Binarizing: We binarize the labels to integers making it easy to feed into the model\n", 175 | "
" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 5, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "lb = LabelBinarizer()\n", 185 | "Y = lb.fit_transform(df['labels'].values)\n" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "
\n", 193 | "Text Preprocessing: We preprocess the text data by removing punctuations and converting every word to lowercase. Also we create a feature matrix X by using Tf-Idf vectorizer. We used Tf-idf because there are some words people use in usual sms conversations that may not have any word embeddings associated with them\n", 194 | "
" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 6, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "\n", 204 | "def preprocess_text(statement):\n", 205 | " punc_removed_statement = \"\".join(l for l in statement if l not in string.punctuation)\n", 206 | " splitting2words = punc_removed_statement.split()\n", 207 | " lower_cased_statement = \" \".join(word.lower() for word in splitting2words)\n", 208 | " return lower_cased_statement" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 7, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "df['preprocessed_data']= df['data'].apply(preprocess_text)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 8, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "tfidf = TfidfVectorizer(decode_error='ignore')\n", 227 | "X = tfidf.fit_transform(df['preprocessed_data'])\n" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "
\n", 235 | "Removing Imbalance : Our data is balanced now after applying SMOTE\n", 236 | "
" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 10, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "sm = SMOTE(random_state=42)\n", 246 | "X_res, Y_res = sm.fit_resample(X, Y)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 11, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "Positive examples before Oversampling is 747\n", 259 | "Negative examples before Oversampling is 4825\n", 260 | "\n", 261 | "\n", 262 | "Positive examples after Oversampling is 4825\n", 263 | "Negative examples after Oversampling is 4825\n", 264 | "\n", 265 | "\n" 266 | ] 267 | } 268 | ], 269 | "source": [ 270 | "print('Positive examples before Oversampling is ', sum(Y == [1])[0])\n", 271 | "print('Negative examples before Oversampling is ', sum(Y == [0])[0])\n", 272 | "print('\\n')\n", 273 | "print('Positive examples after Oversampling is ', sum(Y_res == [1]))\n", 274 | "print('Negative examples after Oversampling is ', sum(Y_res == [0]))\n", 275 | "print('\\n')" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "
\n", 283 | "Train test split: We create the train test split of the data\n", 284 | "
" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 12, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "X_train, X_test, y_train, y_test = train_test_split(X_res, Y_res, test_size=0.33, random_state=42)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "
\n", 301 | "Hyper parameter grid: We now set the grid for tuning the hyper parameters associated with the model.\n", 302 | "
" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 13, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "multinomial_grid = {'alpha' : hp.uniform('alpha',0.5,5),\n", 312 | " 'fit_prior' : hp.choice('fit_prior',[True,False])}" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 14, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "name": "stdout", 322 | "output_type": "stream", 323 | "text": [ 324 | "100%|████████████████████████████████████████████████| 500/500 [00:06<00:00, 75.32it/s, best loss: -0.9862334199996453]\n", 325 | "The best parameter tuned on training set is given by :- {'alpha': 0.5031703617001609, 'fit_prior': False}\n" 326 | ] 327 | } 328 | ], 329 | "source": [ 330 | "def hyperopt_train_test(params):\n", 331 | " clf = MultinomialNB(**params)\n", 332 | " return cross_val_score(clf, X_train, y_train).mean()\n", 333 | "\n", 334 | "def function_to_minimise(params):\n", 335 | " accuracy = hyperopt_train_test(params)\n", 336 | " return {'loss': -1*accuracy, 'status': STATUS_OK}\n", 337 | "\n", 338 | "\n", 339 | "trials = Trials()\n", 340 | "best = fmin(function_to_minimise, multinomial_grid, algo=tpe.suggest, max_evals=500, trials=trials)\n", 341 | "best_parameters = space_eval(multinomial_grid, best)\n", 342 | "print('The best parameter tuned on training set is given by :- ',best_parameters)" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "
\n", 350 | "Final Results and Model fitting: We finally fit the model with the tuned hyper parameters and present a classification report as our analysis \n", 351 | "
" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 15, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "data": { 361 | "text/plain": [ 362 | "MultinomialNB(alpha=0.5031703617001609, class_prior=None, fit_prior=False)" 363 | ] 364 | }, 365 | "execution_count": 15, 366 | "metadata": {}, 367 | "output_type": "execute_result" 368 | } 369 | ], 370 | "source": [ 371 | "model = MultinomialNB(**best_parameters)\n", 372 | "model.fit(X_train, y_train)" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 16, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [ 381 | "y_pred = model.predict(X_test)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 17, 387 | "metadata": {}, 388 | "outputs": [ 389 | { 390 | "name": "stdout", 391 | "output_type": "stream", 392 | "text": [ 393 | " precision recall f1-score support\n", 394 | "\n", 395 | " 0 0.99 0.99 0.99 1614\n", 396 | " 1 0.99 0.99 0.99 1571\n", 397 | "\n", 398 | " accuracy 0.99 3185\n", 399 | " macro avg 0.99 0.99 0.99 3185\n", 400 | "weighted avg 0.99 0.99 0.99 3185\n", 401 | "\n" 402 | ] 403 | } 404 | ], 405 | "source": [ 406 | "print(classification_report(y_pred,y_test))" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": {}, 413 | "outputs": [], 414 | "source": [] 415 | } 416 | ], 417 | "metadata": { 418 | "kernelspec": { 419 | "display_name": "Python 3", 420 | "language": "python", 421 | "name": "python3" 422 | }, 423 | "language_info": { 424 | "codemirror_mode": { 425 | "name": "ipython", 426 | "version": 3 427 | }, 428 | "file_extension": ".py", 429 | "mimetype": "text/x-python", 430 | "name": "python", 431 | "nbconvert_exporter": "python", 432 | "pygments_lexer": "ipython3", 433 | "version": "3.7.3" 434 | } 435 | }, 436 | "nbformat": 4, 437 | "nbformat_minor": 2 438 | } 439 | -------------------------------------------------------------------------------- /Project17/Stumble Upon Bagging Classifier/Bagging Classifier.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import warnings\n", 10 | "warnings.filterwarnings('ignore')\n", 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import os\n", 14 | "import scipy\n", 15 | "from sklearn.decomposition import PCA,TruncatedSVD\n", 16 | "from sklearn.ensemble import *\n", 17 | "from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder\n", 18 | "from sklearn.model_selection import train_test_split, cross_val_score\n", 19 | "from sklearn.metrics import *\n", 20 | "import hyperopt\n", 21 | "from hyperopt import *\n", 22 | "from hyperopt import fmin, tpe, hp, space_eval\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "%matplotlib inline \n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "
\n", 32 | "oading dataset: We load our dataset here \n", 33 | "
" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/html": [ 44 | "
\n", 45 | "\n", 58 | "\n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | "
urlurlidboilerplatealchemy_categoryalchemy_category_scoreavglinksizecommonlinkratio_1commonlinkratio_2commonlinkratio_3commonlinkratio_4...is_newslengthyLinkDomainlinkwordscorenews_front_pagenon_markup_alphanum_charactersnumberOfLinksnumwords_in_urlparametrizedLinkRatiospelling_errors_ratiolabel
0http://www.bloomberg.com/news/2010-12-23/ibm-p...4042{\"title\":\"IBM Sees Holographic Calls Air Breat...business0.7891312.0555560.6764710.2058820.0470590.023529...11240542417080.1529410.0791300
1http://www.popsci.com/technology/article/2012-...8471{\"title\":\"The Fully Electronic Futuristic Star...recreation0.5741473.6779660.5080210.2887700.2139040.144385...11400497318790.1818180.1254481
\n", 136 | "

2 rows × 27 columns

\n", 137 | "
" 138 | ], 139 | "text/plain": [ 140 | " url urlid \\\n", 141 | "0 http://www.bloomberg.com/news/2010-12-23/ibm-p... 4042 \n", 142 | "1 http://www.popsci.com/technology/article/2012-... 8471 \n", 143 | "\n", 144 | " boilerplate alchemy_category \\\n", 145 | "0 {\"title\":\"IBM Sees Holographic Calls Air Breat... business \n", 146 | "1 {\"title\":\"The Fully Electronic Futuristic Star... recreation \n", 147 | "\n", 148 | " alchemy_category_score avglinksize commonlinkratio_1 commonlinkratio_2 \\\n", 149 | "0 0.789131 2.055556 0.676471 0.205882 \n", 150 | "1 0.574147 3.677966 0.508021 0.288770 \n", 151 | "\n", 152 | " commonlinkratio_3 commonlinkratio_4 ... is_news lengthyLinkDomain \\\n", 153 | "0 0.047059 0.023529 ... 1 1 \n", 154 | "1 0.213904 0.144385 ... 1 1 \n", 155 | "\n", 156 | " linkwordscore news_front_page non_markup_alphanum_characters \\\n", 157 | "0 24 0 5424 \n", 158 | "1 40 0 4973 \n", 159 | "\n", 160 | " numberOfLinks numwords_in_url parametrizedLinkRatio \\\n", 161 | "0 170 8 0.152941 \n", 162 | "1 187 9 0.181818 \n", 163 | "\n", 164 | " spelling_errors_ratio label \n", 165 | "0 0.079130 0 \n", 166 | "1 0.125448 1 \n", 167 | "\n", 168 | "[2 rows x 27 columns]" 169 | ] 170 | }, 171 | "execution_count": 2, 172 | "metadata": {}, 173 | "output_type": "execute_result" 174 | } 175 | ], 176 | "source": [ 177 | "input_file_path = r'D:\\kaggle_trials\\stumbleupon\\train.tsv'\n", 178 | "df = pd.read_csv(input_file_path,sep = '\\t')\n", 179 | "df.head(2)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "
\n", 187 | "Preprocessing data : We handle missing data, one hot encode categorical data and then finally scale numerical data.\n", 188 | "
" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 3, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "cols_needed = df.columns\n", 198 | "df = df[cols_needed].replace('?', np.nan)\n", 199 | "irrelevant_columns = ['framebased','urlid','url','boilerplate']\n", 200 | "cols_needed = list(set(cols_needed)-set(irrelevant_columns))\n", 201 | "cols_needed_features = list(cols_needed[:len(cols_needed)-1])\n", 202 | "cols_needed_labels = cols_needed[-1]\n", 203 | "cols_encoding_needed = ['alchemy_category','hasDomainLink','is_news','lengthyLinkDomain','news_front_page']\n", 204 | "cols_scaling_needed = list(set(cols_needed)-set(cols_encoding_needed))\n", 205 | "for i in range(len(cols_encoding_needed)):\n", 206 | " df[cols_encoding_needed[i]] = df[cols_encoding_needed[i]].fillna(df[cols_encoding_needed[i]].mode()[0])\n", 207 | "mean_impute_dict ={}\n", 208 | "for i in range(len(cols_scaling_needed)):\n", 209 | " mean_impute_dict[cols_scaling_needed[i]] = np.nanmean(np.float_(df[cols_scaling_needed[i]].values))\n", 210 | "for i in range(len(cols_scaling_needed)):\n", 211 | " df[cols_scaling_needed[i]] = df[cols_scaling_needed[i]].fillna(mean_impute_dict[cols_scaling_needed[i]])\n", 212 | "ohe = OneHotEncoder()\n", 213 | "scalar = MinMaxScaler()\n", 214 | "encoded_matrix = ohe.fit_transform(df[cols_encoding_needed])\n", 215 | "scaled_matrix = scalar.fit_transform(df[cols_scaling_needed])\n", 216 | "X_complete_matrix = scipy.sparse.hstack((encoded_matrix,scaled_matrix)).A\n", 217 | "Y = df['label'].values" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "
\n", 225 | "Train Test split: We perform train test split on the data\n", 226 | "
" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 4, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "X_train, X_test, y_train, y_test = train_test_split(X_complete_matrix, Y, test_size=0.33, random_state=42)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "
\n", 243 | "Hyper parameter grid creation : We perform parameter tuning by creating a grid of hyper parameters\n", 244 | "
" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 5, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "bagging_grid = {'n_estimators' : hp.choice('n_estimators',range(5,20)),\n", 254 | " 'max_features' : hp.uniform('max_features',0.1,0.95),\n", 255 | " 'bootstrap' : hp.choice('bootstrap',[True,False])\n", 256 | " }" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 6, 262 | "metadata": {}, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "100%|█████████████████████████████████████████████████████████████████| 30/30 [00:12<00:00, 2.82it/s, best loss: -1.0]\n", 269 | "The best parameter tuned on training set is given by :- {'bootstrap': False, 'max_features': 0.8755934908566991, 'n_estimators': 10}\n" 270 | ] 271 | } 272 | ], 273 | "source": [ 274 | "def hyperopt_train_test(params):\n", 275 | " clf = BaggingClassifier(**params)\n", 276 | " return cross_val_score(clf, X_train, y_train).mean()\n", 277 | "\n", 278 | "def function_to_minimise(params):\n", 279 | " accuracy = hyperopt_train_test(params)\n", 280 | " return {'loss': -1*accuracy, 'status': STATUS_OK}\n", 281 | "\n", 282 | "\n", 283 | "trials = Trials()\n", 284 | "best = fmin(function_to_minimise, bagging_grid, algo=tpe.suggest, max_evals=30, trials=trials)\n", 285 | "best_parameters = space_eval(bagging_grid, best)\n", 286 | "print('The best parameter tuned on training set is given by :- ',best_parameters)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "
\n", 294 | "Model Fitting and analysis: We fit the model using the tuned parameters and then present a classification report as analysis\n", 295 | "
" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 7, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/plain": [ 306 | "BaggingClassifier(base_estimator=None, bootstrap=False,\n", 307 | " bootstrap_features=False, max_features=0.8755934908566991,\n", 308 | " max_samples=1.0, n_estimators=10, n_jobs=None,\n", 309 | " oob_score=False, random_state=None, verbose=0,\n", 310 | " warm_start=False)" 311 | ] 312 | }, 313 | "execution_count": 7, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "model = BaggingClassifier(**best_parameters)\n", 320 | "model.fit(X_train, y_train)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 8, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "y_pred = model.predict(X_test)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 9, 335 | "metadata": {}, 336 | "outputs": [ 337 | { 338 | "name": "stdout", 339 | "output_type": "stream", 340 | "text": [ 341 | " precision recall f1-score support\n", 342 | "\n", 343 | " 0 1.00 1.00 1.00 1198\n", 344 | " 1 1.00 1.00 1.00 1243\n", 345 | "\n", 346 | " accuracy 1.00 2441\n", 347 | " macro avg 1.00 1.00 1.00 2441\n", 348 | "weighted avg 1.00 1.00 1.00 2441\n", 349 | "\n" 350 | ] 351 | } 352 | ], 353 | "source": [ 354 | "print(classification_report(y_pred,y_test))" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [] 363 | } 364 | ], 365 | "metadata": { 366 | "kernelspec": { 367 | "display_name": "Python 3", 368 | "language": "python", 369 | "name": "python3" 370 | }, 371 | "language_info": { 372 | "codemirror_mode": { 373 | "name": "ipython", 374 | "version": 3 375 | }, 376 | "file_extension": ".py", 377 | "mimetype": "text/x-python", 378 | "name": "python", 379 | "nbconvert_exporter": "python", 380 | "pygments_lexer": "ipython3", 381 | "version": "3.7.3" 382 | } 383 | }, 384 | "nbformat": 4, 385 | "nbformat_minor": 2 386 | } 387 | -------------------------------------------------------------------------------- /Project44/NGBoost_implementation/Train.csv: -------------------------------------------------------------------------------- 1 | City,Location_Score,Internal_Audit_Score,External_Audit_Score,Fin_Score,Loss_score,Past_Results,IsUnderRisk 2 | 2,8.032,14,8,3,6,0,1 3 | 31,77.73,8,3,3,8,1,0 4 | 40,59.203,3,12,11,3,0,1 5 | 12,73.08,4,5,7,6,0,0 6 | 4,15.666,13,15,6,7,2,1 7 | 1,6.237,10,10,12,3,1,1 8 | 9,13.795,8,3,5,3,0,0 9 | 23,74.132,11,15,5,8,0,1 10 | 40,69.522,8,4,7,6,0,0 11 | 38,6.577,8,5,7,3,1,0 12 | 4,75.514,8,12,4,4,0,1 13 | 0,34.522,11,10,14,3,1,1 14 | 10,34.374,4,8,7,3,1,0 15 | 8,22.872,8,8,7,7,0,1 16 | 9,16.313,14,10,3,4,1,1 17 | 11,16.396,13,15,8,6,0,1 18 | 19,9.537,3,7,5,7,0,0 19 | 9,16.433,12,11,15,3,1,1 20 | 6,41.253,3,6,4,5,0,0 21 | 2,22.067,7,3,8,5,1,1 22 | 8,7.061,3,5,6,8,1,0 23 | 23,38.318,7,7,6,3,1,0 24 | 1,15.931,10,7,7,6,0,1 25 | 6,8.875,7,7,7,4,1,1 26 | 10,17.463,13,15,12,3,1,1 27 | 37,7.087,11,3,4,5,1,1 28 | 6,23.298,7,8,4,4,1,1 29 | 2,70.261,3,6,5,3,0,0 30 | 3,11.092,8,5,4,6,0,0 31 | 40,14.226,8,5,6,4,1,1 32 | 40,21.448,5,7,7,3,1,0 33 | 5,73.111,10,8,4,8,0,1 34 | 2,73.793,7,7,5,4,0,1 35 | 31,9.606,11,4,4,4,1,1 36 | 6,12.058,6,7,3,7,1,1 37 | 41,17.908,6,3,7,6,1,0 38 | 10,7.11,7,5,7,6,1,0 39 | 40,73.507,8,5,5,7,1,1 40 | 16,68.547,6,3,3,7,1,0 41 | 9,5.518,12,4,7,6,0,1 42 | 22,15.105,5,8,5,8,0,0 43 | 6,18.332,12,14,12,8,1,1 44 | 
40,33.219,6,7,7,5,1,0 45 | 10,14.211,11,3,13,6,1,1 46 | 40,61.459,10,8,5,8,1,1 47 | 6,75.615,8,5,4,8,1,0 48 | 23,67.797,3,4,8,3,0,0 49 | 2,10.968,15,13,15,6,2,1 50 | 40,9.348,15,10,6,5,0,1 51 | 37,22.864,12,14,13,3,2,1 52 | 2,73.972,10,3,3,3,1,1 53 | 37,9.138,6,3,7,8,1,0 54 | 41,16.482,4,3,6,4,1,0 55 | 40,17.024,12,13,7,5,1,1 56 | 31,64.502,5,8,8,5,1,0 57 | 38,16.893,11,3,7,6,0,1 58 | 41,63.372,10,15,4,5,0,1 59 | 29,12.337,12,10,11,5,1,1 60 | 9,61.625,3,3,5,4,0,0 61 | 41,62.94,7,8,8,6,0,0 62 | 4,18.072,10,3,4,3,1,1 63 | 19,64.93,6,6,5,6,1,0 64 | 0,24.597,11,12,14,3,1,1 65 | 38,66.043,6,6,5,5,1,0 66 | 37,15.586,7,3,14,3,1,1 67 | 2,9.83,3,7,6,6,0,0 68 | 6,17.653,10,10,12,6,0,1 69 | 8,6.141,8,3,5,4,1,0 70 | 5,70.881,9,13,4,3,0,1 71 | 40,11.362,4,6,6,6,1,0 72 | 40,7.031,5,5,4,11,1,1 73 | 37,11.446,5,3,8,4,0,0 74 | 3,8.165,13,11,13,9,2,1 75 | 37,22.682,8,3,8,7,1,1 76 | 38,68.387,13,14,8,6,1,1 77 | 16,18.916,8,15,11,7,1,1 78 | 40,21.057,7,5,4,8,0,1 79 | 9,9.183,11,8,8,8,0,1 80 | 2,69.686,5,5,3,5,0,0 81 | 28,17.085,12,13,3,6,1,1 82 | 10,13.895,5,6,5,8,1,0 83 | 31,65.878,5,10,8,4,1,1 84 | 38,63.619,3,5,4,3,1,0 85 | 9,12.289,9,3,7,4,0,1 86 | 9,14.239,6,7,4,6,0,0 87 | 3,16.714,7,12,5,6,0,1 88 | 5,18.76,7,13,10,3,0,1 89 | 31,20.709,8,15,6,4,0,1 90 | 37,69.554,7,3,7,3,0,0 91 | 5,16.925,12,11,12,8,1,1 92 | 40,19.273,7,8,4,7,1,1 93 | 31,32.018,8,6,4,5,1,0 94 | 9,12.78,7,4,5,4,0,0 95 | 10,13.312,13,15,15,4,1,1 96 | 9,13.948,11,4,3,6,0,1 97 | 31,20.751,12,15,11,6,0,1 98 | 2,11.506,13,10,11,8,1,1 99 | 40,68.709,6,4,3,6,0,1 100 | 22,16.466,3,4,8,8,1,1 101 | 4,73.469,8,3,5,8,1,0 102 | 10,14.56,13,4,3,8,1,1 103 | 6,15.273,15,10,7,8,0,1 104 | 9,17.042,7,6,4,7,1,0 105 | 2,64.536,6,4,7,5,0,0 106 | 9,7.33,4,5,5,11,0,1 107 | 23,9.178,10,13,10,3,2,1 108 | 18,20.105,8,6,10,5,1,1 109 | 41,60.605,3,4,7,6,0,0 110 | 35,73.107,7,5,5,4,0,0 111 | 41,74.696,8,4,7,5,0,0 112 | 10,6.99,11,7,7,5,0,1 113 | 13,10.026,13,15,14,3,1,1 114 | 32,11.969,7,8,7,7,1,0 115 | 14,13.858,12,5,7,7,0,1 116 | 40,62.755,3,8,4,7,0,1 117 | 31,16.789,13,6,6,8,0,1 118 | 5,11.703,10,14,11,7,1,1 119 | 3,58.664,11,11,10,8,0,1 120 | 41,71.89,4,6,3,7,0,0 121 | 10,8.543,11,15,13,8,3,1 122 | 30,11.016,15,14,10,7,0,1 123 | 10,70.989,9,13,8,5,1,1 124 | 12,77.731,7,7,7,7,0,1 125 | 16,14.956,14,13,14,5,1,1 126 | 8,11.475,10,14,11,6,3,1 127 | 0,6.265,6,3,13,4,0,1 128 | 10,69.63,11,7,4,6,1,1 129 | 23,21.842,8,8,4,4,0,0 130 | 9,17.917,3,3,6,7,0,0 131 | 31,11.331,6,3,8,8,1,1 132 | 29,11.587,11,11,12,5,1,1 133 | 5,17.81,10,6,8,6,1,1 134 | 3,22.057,12,4,6,7,1,1 135 | 40,15.402,7,6,8,3,0,1 136 | 37,26.522,6,5,4,7,1,0 137 | 38,62.351,6,4,8,7,0,0 138 | 38,67.086,8,8,8,4,1,0 139 | 35,72.523,6,3,6,3,0,0 140 | 43,14.394,7,4,4,8,0,0 141 | 2,9.513,10,7,7,6,0,1 142 | 39,23.428,11,12,11,6,0,1 143 | 23,16.741,11,11,6,3,0,1 144 | 41,20.429,10,9,10,6,1,1 145 | 41,6.365,4,4,4,8,1,0 146 | 6,15.527,6,8,3,3,1,1 147 | 41,7.738,11,15,11,8,2,1 148 | 40,15.243,7,5,5,4,0,1 149 | 5,15.152,15,7,7,5,0,1 150 | 21,19.288,11,13,15,3,1,1 151 | 6,71.521,5,3,8,4,0,0 152 | 24,23.122,14,3,9,5,1,1 153 | 39,16.12,14,11,10,4,0,1 154 | 41,12.131,8,3,4,5,1,0 155 | 1,61.136,3,7,6,7,1,0 156 | 10,24.4,14,5,11,6,1,1 157 | 5,15.54,8,8,5,3,0,1 158 | 23,76.248,7,5,8,7,0,0 159 | 38,20.659,5,4,5,6,1,0 160 | 41,68.617,7,10,6,3,0,1 161 | 6,7.168,7,8,4,5,1,1 162 | 3,23.528,8,11,8,3,0,1 163 | 13,34.33,7,6,6,8,0,0 164 | 8,66.838,6,14,8,3,0,1 165 | 31,13.364,10,4,3,7,1,1 166 | 19,74.089,4,6,5,4,1,0 167 | 41,69.327,4,7,6,4,0,0 168 | 41,8.501,12,10,5,6,2,1 169 | 5,5.864,10,8,5,7,0,1 170 | 2,64.828,11,12,7,4,0,1 171 | 
9,11.663,7,6,7,9,1,1 172 | 38,73.012,3,6,4,5,1,0 173 | 40,14.323,12,7,8,4,0,1 174 | 32,6.836,11,15,12,3,0,1 175 | 10,73.305,10,15,10,8,3,1 176 | 38,18.869,6,7,7,6,0,0 177 | 31,22.474,8,8,11,5,0,1 178 | 9,15.904,10,8,8,6,1,1 179 | 40,61.413,7,8,3,8,1,1 180 | 41,18.559,9,12,14,8,1,1 181 | 9,75.588,5,5,8,4,0,0 182 | 3,72.991,6,4,7,8,1,0 183 | 0,68.706,6,7,8,8,1,0 184 | 30,11.759,13,12,13,8,1,1 185 | 5,59.421,7,7,5,7,1,0 186 | 9,20.316,7,5,8,3,1,1 187 | 37,66.535,4,7,4,6,1,0 188 | 10,8.82,4,5,3,4,1,0 189 | 6,21.472,11,10,8,3,0,1 190 | 41,68.892,6,3,4,8,1,0 191 | 41,41.666,15,9,6,8,1,1 192 | 8,22.09,12,7,7,5,1,1 193 | 1,19.142,4,8,14,7,0,1 194 | 28,22.846,10,3,6,3,0,1 195 | 19,8.563,13,7,7,6,0,1 196 | 8,65.226,5,4,7,8,0,0 197 | 10,5.643,3,5,5,5,0,1 198 | 13,65.948,8,7,7,7,1,0 199 | 5,24.318,12,12,8,4,1,1 200 | 41,14.28,8,4,4,8,0,0 201 | 5,21.454,6,7,7,7,0,0 202 | 40,6.625,3,5,11,6,1,1 203 | 3,15.364,8,3,3,6,1,0 204 | 25,20.164,15,11,11,4,0,1 205 | 5,13.616,8,6,4,3,0,1 206 | 37,23.185,6,4,5,5,1,0 207 | 17,72.416,3,7,3,4,1,0 208 | 19,70.696,8,5,3,3,0,0 209 | 33,16.881,10,11,12,7,1,1 210 | 5,60.24,8,7,8,5,1,0 211 | 2,17.192,12,5,7,6,1,1 212 | 40,7.522,9,5,8,6,0,1 213 | 4,22.535,10,5,6,8,1,1 214 | 31,9.919,7,5,6,7,1,1 215 | 3,76.245,7,5,8,4,0,0 216 | 2,61.29,7,8,3,7,0,0 217 | 13,21.153,4,14,6,8,1,1 218 | 2,77.113,7,3,8,3,1,0 219 | 1,72.068,3,5,7,8,1,0 220 | 37,11.999,6,3,5,7,1,0 221 | 23,9.546,7,3,5,7,1,1 222 | 2,7.698,12,9,3,6,1,1 223 | 9,27.759,5,7,14,8,1,1 224 | 3,68.356,4,5,3,8,0,0 225 | 35,29.201,3,5,7,4,1,0 226 | 37,13.203,7,7,6,5,1,0 227 | 8,66.008,5,7,5,6,1,0 228 | 40,40.352,6,4,5,8,1,0 229 | 15,16.942,10,13,12,4,1,1 230 | 21,17.052,13,5,10,8,1,1 231 | 38,73.811,8,5,8,5,1,0 232 | 6,72.971,4,5,3,6,1,0 233 | 40,13.558,10,11,4,5,0,1 234 | 31,75.918,5,7,7,3,0,0 235 | 18,11.058,4,3,13,8,0,1 236 | 4,59.526,5,7,6,4,1,0 237 | 2,18.979,3,4,10,7,0,1 238 | 3,67.946,8,7,13,8,1,1 239 | 31,12.785,4,7,4,4,1,0 240 | 9,18.976,11,10,4,3,1,1 241 | 9,80.219,7,3,6,4,1,0 242 | 41,9.254,13,11,14,5,1,1 243 | 3,64.037,7,4,8,4,1,0 244 | 40,11.955,6,4,3,6,0,1 245 | 21,22.967,7,10,10,8,0,1 246 | 10,20.613,12,3,5,7,0,1 247 | 41,9.075,6,3,4,7,0,0 248 | 5,22.625,14,12,13,5,0,1 249 | 30,20.766,11,9,7,5,0,1 250 | 17,20.078,7,5,7,8,1,0 251 | 23,20.682,7,11,3,4,1,1 252 | 9,9.928,13,4,5,7,1,1 253 | 37,14.566,11,5,6,8,0,1 254 | 13,62.656,5,4,5,6,0,0 255 | 40,63.099,6,3,5,3,1,0 256 | 16,64.281,4,5,5,5,1,0 257 | 13,13.359,10,8,6,4,0,1 258 | 6,18.501,6,8,5,6,0,1 259 | 2,14.481,11,3,14,6,0,1 260 | 41,7.571,12,12,12,5,1,1 261 | 30,12.394,11,7,8,5,1,1 262 | 10,17.583,12,13,12,3,3,1 263 | 19,33.904,5,6,5,8,1,0 264 | 4,12.165,12,11,4,5,0,1 265 | 1,6.324,12,10,4,3,0,1 266 | 16,8.297,7,5,4,7,0,0 267 | 28,19.9,13,10,3,6,1,1 268 | 37,22.902,10,13,11,4,0,1 269 | 35,8.091,6,5,8,5,1,0 270 | 9,23.716,3,8,3,5,1,0 271 | 27,19.155,13,14,11,4,1,1 272 | 8,64.194,5,7,3,6,0,0 273 | 13,12.117,11,10,11,7,1,1 274 | 5,70.247,4,7,6,4,1,0 275 | 9,11.595,13,4,7,3,1,1 276 | 20,24.486,12,12,11,3,1,1 277 | 40,65.404,5,8,5,4,1,1 278 | 17,19.701,11,13,6,10,3,1 279 | 9,11.83,5,6,3,3,0,0 280 | 10,22.447,9,3,10,6,1,1 281 | 22,7.991,10,13,15,4,1,1 282 | 6,18.282,4,5,7,3,0,0 283 | 40,72.207,5,8,3,6,1,1 284 | 30,11.222,14,3,8,7,0,1 285 | 10,75.574,8,3,3,11,0,1 286 | 22,18.042,15,11,12,8,1,1 287 | 40,8.621,9,8,8,5,1,1 288 | 3,5.363,4,6,4,3,0,1 289 | 13,75.891,3,5,8,8,0,0 290 | 1,15.782,6,11,4,7,0,1 291 | 4,16.334,7,8,5,5,1,1 292 | 6,12.858,10,11,11,3,0,1 293 | 6,63.614,5,4,4,4,0,0 294 | 40,21.671,15,10,10,5,4,1 295 | 31,74.686,3,4,6,7,1,0 296 | 31,8.0,5,8,7,3,0,0 297 
| 9,16.56,11,8,6,9,1,1 298 | 23,16.49,10,7,7,8,0,1 299 | 13,14.713,3,7,8,4,1,1 300 | 17,19.683,7,11,13,13,1,1 301 | 6,17.594,11,7,10,4,1,1 302 | 10,12.576,5,6,10,3,0,1 303 | 4,12.268,14,10,15,4,0,1 304 | 3,18.244,15,6,5,6,1,1 305 | 9,67.553,11,5,5,8,1,1 306 | 38,21.641,11,5,3,10,0,1 307 | 3,63.693,5,6,4,4,0,0 308 | 9,69.416,10,5,12,3,0,1 309 | 5,60.086,3,4,5,6,0,0 310 | 17,59.321,7,8,8,4,1,0 311 | 6,6.261,8,5,6,4,0,1 312 | 38,10.282,7,7,7,6,1,1 313 | 41,17.047,15,8,3,3,1,1 314 | 9,13.036,11,10,7,7,1,1 315 | 41,33.32,8,5,8,4,0,0 316 | 10,16.115,12,3,12,3,2,1 317 | 37,8.369,7,6,6,5,0,0 318 | 27,72.532,7,4,5,7,0,0 319 | 3,80.664,7,3,4,6,1,0 320 | 23,19.707,13,10,10,6,0,1 321 | 10,13.762,8,8,8,7,0,1 322 | 9,10.442,11,14,7,6,1,1 323 | 32,9.691,13,10,10,6,1,1 324 | 37,21.999,11,7,3,4,1,1 325 | 37,10.15,11,15,12,7,0,1 326 | 3,30.135,3,4,8,3,1,0 327 | 23,16.807,13,10,14,3,1,1 328 | 13,10.768,7,8,11,3,0,1 329 | 4,24.984,6,8,8,8,1,0 330 | 37,20.889,15,10,8,3,0,1 331 | 40,71.992,10,11,8,6,0,1 332 | 20,24.68,6,3,8,3,1,0 333 | 6,8.325,10,14,10,7,2,1 334 | 2,14.849,8,3,11,5,1,1 335 | 28,11.034,15,15,15,6,1,1 336 | 9,8.56,10,11,14,4,1,1 337 | 9,69.772,6,7,3,3,0,0 338 | 41,66.184,7,4,6,4,1,0 339 | 12,17.618,3,11,10,7,2,1 340 | 10,40.773,12,13,13,5,0,1 341 | 6,6.112,5,6,7,7,1,0 342 | 13,75.738,7,5,6,7,0,0 343 | 18,20.051,7,4,8,4,1,1 344 | 13,18.501,12,12,5,6,1,1 345 | 41,68.023,8,3,4,8,0,0 346 | 38,19.019,4,5,7,7,1,0 347 | 0,64.929,6,5,4,8,0,1 348 | 31,30.22,5,3,5,4,0,0 349 | 13,16.512,4,8,5,7,1,0 350 | 2,21.333,13,7,5,3,0,1 351 | 9,15.379,13,10,15,3,0,1 352 | 31,14.672,11,13,11,5,0,1 353 | 18,80.138,7,4,6,6,1,0 354 | 31,20.253,5,3,8,8,0,0 355 | 37,12.874,5,8,8,8,1,0 356 | 37,72.212,5,6,5,3,0,0 357 | 5,11.535,8,4,7,3,0,0 358 | 31,16.408,15,15,10,6,1,1 359 | 9,60.309,3,6,6,3,0,0 360 | 37,18.162,5,8,3,5,0,0 361 | 6,73.498,3,7,4,5,0,0 362 | 9,14.677,10,6,13,6,0,1 363 | 40,18.887,8,6,11,3,0,1 364 | 23,11.823,5,6,11,8,0,1 365 | 19,12.789,15,14,9,8,0,1 366 | 19,58.808,5,7,8,8,0,0 367 | 37,32.721,6,6,7,7,1,1 368 | 40,21.223,14,14,7,5,1,1 369 | 6,24.267,15,11,5,8,1,1 370 | 41,70.218,8,4,8,5,1,0 371 | 6,14.561,4,5,3,8,0,0 372 | 31,58.818,6,3,5,3,1,0 373 | 38,16.393,8,8,8,8,0,0 374 | 37,10.168,10,5,5,8,1,1 375 | 4,37.642,3,7,3,5,1,0 376 | 19,75.161,6,6,5,4,0,0 377 | 10,20.034,4,5,3,5,0,1 378 | 13,18.201,11,11,14,3,0,1 379 | 5,19.175,15,12,14,3,1,1 380 | 37,61.63,7,5,3,3,1,1 381 | 28,65.894,7,3,3,6,1,0 382 | 40,17.033,9,8,5,7,0,1 383 | 4,66.136,6,6,8,3,1,0 384 | 38,71.909,6,3,7,6,1,0 385 | 13,27.942,8,8,4,4,0,0 386 | 5,67.543,10,6,3,4,0,1 387 | 9,66.707,11,13,8,6,1,1 388 | 6,18.038,8,4,11,5,0,1 389 | 37,12.487,10,5,8,7,0,1 390 | 18,20.02,7,8,5,4,1,1 391 | 40,73.983,3,4,4,3,0,0 392 | 1,11.969,10,13,11,4,0,1 393 | 7,58.874,4,6,7,6,0,0 394 | 29,16.48,3,12,13,3,1,1 395 | 8,68.296,6,7,6,6,0,0 396 | 6,22.004,7,6,6,7,0,1 397 | 3,30.951,6,5,6,3,0,0 398 | 4,23.571,5,7,5,3,0,0 399 | 5,62.532,4,6,3,8,0,0 400 | 6,37.788,8,5,4,5,1,0 401 | 13,15.208,7,8,8,5,0,1 402 | 2,20.733,11,5,4,3,0,1 403 | 38,16.718,11,13,12,4,6,1 404 | 40,21.376,14,10,13,7,1,1 405 | 38,18.017,5,4,8,6,1,0 406 | 38,21.251,8,3,5,6,0,0 407 | 6,14.154,13,10,8,4,1,1 408 | 6,20.265,10,15,4,4,0,1 409 | 11,22.732,3,3,7,7,1,1 410 | 9,20.05,6,8,4,6,0,1 411 | 1,15.202,3,4,3,7,0,0 412 | 31,80.809,4,3,4,5,0,0 413 | 2,24.728,7,7,4,4,1,0 414 | 40,16.587,15,10,10,8,0,1 415 | 6,41.128,3,4,5,7,0,0 416 | 9,18.172,6,4,8,4,1,1 417 | 40,7.385,15,6,8,7,0,1 418 | 10,19.029,5,3,6,3,0,1 419 | 23,19.803,9,10,3,8,1,1 420 | 9,42.041,6,4,4,5,1,0 421 | 3,60.161,6,6,7,5,1,1 422 | 
22,9.142,12,7,8,5,0,1 423 | 2,10.854,8,8,3,4,0,1 424 | 0,16.903,12,9,10,5,1,1 425 | 6,14.766,8,5,6,8,1,1 426 | 4,16.089,5,5,4,5,0,0 427 | 6,63.296,3,5,4,6,1,1 428 | 5,67.833,8,6,5,6,1,0 429 | 9,60.644,13,7,6,6,1,1 430 | 40,74.922,7,4,8,3,1,0 431 | 19,14.959,8,15,10,8,0,1 432 | 41,61.286,7,8,7,3,0,1 433 | 9,17.933,8,6,5,6,1,1 434 | 13,15.847,5,11,5,8,1,1 435 | 38,21.237,10,8,7,6,1,1 436 | 2,10.077,10,3,6,4,1,1 437 | 23,16.805,8,3,9,3,0,1 438 | 30,13.572,9,6,14,7,0,1 439 | 2,6.999,8,5,4,6,1,0 440 | 5,16.108,10,8,11,5,2,1 441 | 41,5.185,12,4,8,4,1,1 442 | 5,25.534,8,7,8,8,0,1 443 | 38,30.828,7,8,8,3,0,1 444 | 10,67.897,4,5,10,7,1,1 445 | 30,14.507,11,15,11,7,1,1 446 | 38,7.458,9,3,7,8,1,1 447 | 41,12.804,11,3,5,3,1,1 448 | 38,19.81,3,8,6,4,0,0 449 | 40,67.522,8,5,7,5,1,0 450 | 40,73.206,8,6,5,6,0,0 451 | 41,13.028,3,7,6,4,1,1 452 | 0,76.01,8,15,14,7,2,1 453 | 38,75.721,8,6,7,3,0,0 454 | 23,12.662,13,7,11,8,0,1 455 | 38,21.563,7,8,4,6,1,1 456 | 6,13.987,12,3,15,3,1,1 457 | 38,14.646,6,3,3,7,0,1 458 | 10,71.1,3,6,3,6,0,1 459 | 5,8.196,15,13,9,5,0,1 460 | 2,17.289,4,6,8,6,0,0 461 | 5,22.043,11,15,13,7,1,1 462 | 11,19.878,8,10,11,3,1,1 463 | 2,16.54,11,12,11,6,1,1 464 | 3,69.527,8,4,7,4,1,0 465 | 19,18.109,12,10,15,5,0,1 466 | 2,17.371,8,8,6,4,0,1 467 | 6,21.531,11,14,15,8,1,1 468 | 9,15.612,6,7,4,3,1,1 469 | 41,65.896,4,7,8,3,0,0 470 | 42,17.919,3,4,4,8,0,0 471 | 28,17.0,4,11,10,3,1,1 472 | 0,17.261,12,12,9,6,1,1 473 | 22,21.615,13,7,8,5,1,1 474 | 3,23.557,8,5,8,3,1,1 475 | 19,10.466,11,14,6,3,0,1 476 | 1,13.054,7,6,5,8,1,0 477 | 1,66.935,8,12,4,3,0,1 478 | 2,8.17,12,6,6,3,0,1 479 | 23,13.901,10,10,12,3,0,1 480 | 10,20.925,6,5,5,8,10,1 481 | 40,59.672,8,4,3,3,0,1 482 | 10,76.146,6,5,7,4,1,1 483 | 40,12.52,15,14,6,5,1,1 484 | 1,10.14,6,8,13,8,0,1 485 | 31,72.246,6,8,8,4,0,0 486 | 3,7.615,13,6,3,8,1,1 487 | 27,65.626,4,7,8,7,0,0 488 | 9,73.268,6,11,3,7,1,1 489 | 3,10.284,10,14,15,5,0,1 490 | 40,73.547,4,7,6,6,1,1 491 | 4,22.291,11,3,4,4,1,1 492 | 10,64.806,8,4,7,8,0,1 493 | 40,22.447,12,10,5,5,0,1 494 | 40,39.997,5,8,4,4,1,0 495 | 37,18.104,6,8,3,6,1,0 496 | 40,7.864,15,3,5,7,0,1 497 | 23,71.073,6,3,8,6,0,1 498 | 23,72.196,9,5,4,7,0,1 499 | 40,76.346,7,4,4,8,1,1 500 | 41,59.967,7,8,6,5,1,0 501 | 17,22.117,10,10,4,8,0,1 502 | 1,22.523,6,6,7,3,1,0 503 | 5,66.913,8,3,7,7,0,1 504 | 40,74.813,5,6,4,6,1,0 505 | 10,9.372,7,9,5,4,0,1 506 | 10,9.722,7,3,7,5,1,1 507 | 41,70.952,10,12,6,7,0,1 508 | 31,14.813,12,8,7,8,1,1 509 | 40,18.708,10,14,11,6,1,1 510 | 10,68.119,13,11,9,5,1,1 511 | 40,10.332,15,10,12,8,0,1 512 | 9,7.538,7,3,4,6,1,1 513 | 40,18.307,6,6,4,8,1,0 514 | 10,11.284,8,8,7,6,0,1 515 | 6,10.744,12,4,4,5,1,1 516 | 22,42.592,4,6,3,7,1,0 517 | 31,73.383,3,7,6,3,1,0 518 | 19,76.174,6,3,4,4,0,0 519 | 6,68.657,3,5,4,7,0,0 520 | 40,18.196,5,6,3,7,0,1 521 | 17,68.004,8,7,3,5,0,0 522 | 40,15.71,3,3,6,5,0,1 523 | 41,11.1,8,6,8,4,0,0 524 | 35,14.471,12,3,4,8,0,1 525 | 40,19.433,8,8,4,8,1,1 526 | 2,12.28,10,15,8,4,1,1 527 | 35,69.969,7,6,6,4,1,0 528 | 19,74.384,8,5,5,4,1,0 529 | 44,5.931,5,8,6,4,0,0 530 | 10,24.361,8,8,4,4,0,0 531 | 6,10.578,7,3,5,5,0,1 532 | 9,11.344,11,13,4,8,0,1 533 | 9,21.946,15,13,6,6,1,1 534 | 37,12.318,8,8,3,5,0,0 535 | 41,22.134,11,7,7,8,1,1 536 | 26,10.607,10,11,13,5,1,1 537 | 18,62.091,4,3,7,8,1,0 538 | 6,74.338,8,7,6,4,0,0 539 | 41,20.634,4,7,7,5,0,0 540 | 16,74.017,7,4,5,7,1,0 541 | 2,70.46,7,5,6,4,0,0 542 | 1,79.243,7,5,3,8,1,0 543 | 40,69.14,7,8,4,5,1,1 544 | 13,23.332,14,12,10,3,2,1 545 | -------------------------------------------------------------------------------- 
/Project19/Instant_gratification_QDA_LDA/QDA_LDA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 37, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import warnings\n", 10 | "warnings.filterwarnings('ignore')\n", 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import os\n", 14 | "from sklearn.decomposition import PCA,TruncatedSVD\n", 15 | "from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis,LinearDiscriminantAnalysis\n", 16 | "from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder\n", 17 | "from sklearn.model_selection import train_test_split, cross_val_score\n", 18 | "from sklearn.metrics import *\n", 19 | "import hyperopt\n", 20 | "from hyperopt import *\n", 21 | "from hyperopt import fmin, tpe, hp, space_eval\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "%matplotlib inline \n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 38, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/html": [ 34 | "
\n", 35 | "\n", 48 | "\n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | "
idmuggy-smalt-axolotl-pembusdorky-peach-sheepdog-ordinalslimy-seashell-cassowary-goosesnazzy-harlequin-chicken-distractionfrumpy-smalt-mau-ordinalstealthy-beige-pinscher-goldenchummy-cream-tarantula-entropyhazy-emerald-cuttlefish-unsortednerdy-indigo-wolfhound-sorted...wheezy-myrtle-mandrill-entropywiggy-lilac-lemming-sortedgloppy-cerise-snail-contributorwoozy-silver-havanese-gaussianjumpy-thistle-discus-sortedmuggy-turquoise-donkey-importantblurry-buff-hyena-entropybluesy-chocolate-kudu-fepidgamy-white-monster-experttarget
0707b395ecdcbb4dc2eabea00e4d1b179-2.0706541.0181600.2286430.8572210.0522710.230303-6.3850900.439369-0.721946...0.3518950.618824-1.5424230.5981750.6117570.6787720.247059-0.806677-0.1936490
15880c03c6582a7b42248668e56b4bdec-0.4917020.082645-0.0111931.071266-0.346347-0.0822090.110579-0.382374-0.229620...-0.645115-1.2460902.613357-0.4796641.5812890.9312580.151937-0.7665950.4743510
24ccbcb3d13e5072ff1d9c61afe2c4f77-1.6804730.860529-1.0761950.7401243.6784450.2885580.5158750.920590-1.223277...0.5164220.130521-0.4592102.028205-0.093968-0.218274-0.163136-0.8702890.0640381
3e350f17a357f12a1941f0837afb7eb8d0.1837740.919134-0.9469580.9184920.8622781.1552870.9111060.562598-1.349685...-1.1689671.385089-0.3530283.316150-0.524087-0.7943273.9363650.682989-2.5212110
\n", 174 | "

4 rows × 258 columns

\n", 175 | "
" 176 | ], 177 | "text/plain": [ 178 | " id muggy-smalt-axolotl-pembus \\\n", 179 | "0 707b395ecdcbb4dc2eabea00e4d1b179 -2.070654 \n", 180 | "1 5880c03c6582a7b42248668e56b4bdec -0.491702 \n", 181 | "2 4ccbcb3d13e5072ff1d9c61afe2c4f77 -1.680473 \n", 182 | "3 e350f17a357f12a1941f0837afb7eb8d 0.183774 \n", 183 | "\n", 184 | " dorky-peach-sheepdog-ordinal slimy-seashell-cassowary-goose \\\n", 185 | "0 1.018160 0.228643 \n", 186 | "1 0.082645 -0.011193 \n", 187 | "2 0.860529 -1.076195 \n", 188 | "3 0.919134 -0.946958 \n", 189 | "\n", 190 | " snazzy-harlequin-chicken-distraction frumpy-smalt-mau-ordinal \\\n", 191 | "0 0.857221 0.052271 \n", 192 | "1 1.071266 -0.346347 \n", 193 | "2 0.740124 3.678445 \n", 194 | "3 0.918492 0.862278 \n", 195 | "\n", 196 | " stealthy-beige-pinscher-golden chummy-cream-tarantula-entropy \\\n", 197 | "0 0.230303 -6.385090 \n", 198 | "1 -0.082209 0.110579 \n", 199 | "2 0.288558 0.515875 \n", 200 | "3 1.155287 0.911106 \n", 201 | "\n", 202 | " hazy-emerald-cuttlefish-unsorted nerdy-indigo-wolfhound-sorted ... \\\n", 203 | "0 0.439369 -0.721946 ... \n", 204 | "1 -0.382374 -0.229620 ... \n", 205 | "2 0.920590 -1.223277 ... \n", 206 | "3 0.562598 -1.349685 ... \n", 207 | "\n", 208 | " wheezy-myrtle-mandrill-entropy wiggy-lilac-lemming-sorted \\\n", 209 | "0 0.351895 0.618824 \n", 210 | "1 -0.645115 -1.246090 \n", 211 | "2 0.516422 0.130521 \n", 212 | "3 -1.168967 1.385089 \n", 213 | "\n", 214 | " gloppy-cerise-snail-contributor woozy-silver-havanese-gaussian \\\n", 215 | "0 -1.542423 0.598175 \n", 216 | "1 2.613357 -0.479664 \n", 217 | "2 -0.459210 2.028205 \n", 218 | "3 -0.353028 3.316150 \n", 219 | "\n", 220 | " jumpy-thistle-discus-sorted muggy-turquoise-donkey-important \\\n", 221 | "0 0.611757 0.678772 \n", 222 | "1 1.581289 0.931258 \n", 223 | "2 -0.093968 -0.218274 \n", 224 | "3 -0.524087 -0.794327 \n", 225 | "\n", 226 | " blurry-buff-hyena-entropy bluesy-chocolate-kudu-fepid \\\n", 227 | "0 0.247059 -0.806677 \n", 228 | "1 0.151937 -0.766595 \n", 229 | "2 -0.163136 -0.870289 \n", 230 | "3 3.936365 0.682989 \n", 231 | "\n", 232 | " gamy-white-monster-expert target \n", 233 | "0 -0.193649 0 \n", 234 | "1 0.474351 0 \n", 235 | "2 0.064038 1 \n", 236 | "3 -2.521211 0 \n", 237 | "\n", 238 | "[4 rows x 258 columns]" 239 | ] 240 | }, 241 | "execution_count": 38, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | } 245 | ], 246 | "source": [ 247 | "path_of_input_file = 'D:\\\\kaggle_trials\\\\instant-gratification\\\\train.csv'\n", 248 | "df = pd.read_csv(path_of_input_file)\n", 249 | "df.head(4)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 40, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "The number of labels are 2\n" 262 | ] 263 | } 264 | ], 265 | "source": [ 266 | "num_labels = df['target'].unique()\n", 267 | "print('The number of labels are ',len(num_labels))" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 41, 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | "The number of 0 labels are :- 131013\n", 280 | "The number of 1 labels are :- 131131\n", 281 | "We dont have a balanced dataset and hence we need to perform imbalanced dataset handling\n" 282 | ] 283 | } 284 | ], 285 | "source": [ 286 | "for i in range(len(num_labels)):\n", 287 | " print('The number of ', num_labels[i] ,' labels are :- ',len(df[df['target']==num_labels[i]]))\n", 
288 | "print('We dont have a balanced dataset and hence we need to perform imbalanced dataset handling')" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 63, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "name": "stdout", 298 | "output_type": "stream", 299 | "text": [ 300 | "0.5900932689923516\n" 301 | ] 302 | } 303 | ], 304 | "source": [ 305 | "col_names = df.columns\n", 306 | "Y = df[col_names[-1]].values\n", 307 | "\n", 308 | "columns_to_scale = col_names[1:-1]\n", 309 | "scaler = MinMaxScaler()\n", 310 | "scaled_columns = scaler.fit_transform(df[columns_to_scale]) \n", 311 | "X_processed_data = scaled_columns\n", 312 | "dim_r = TruncatedSVD(n_components=200)\n", 313 | "dim_r.fit(X_processed_data)\n", 314 | "X_train, X_test, y_train, y_test = train_test_split(X_processed_data, Y, test_size=0.2, random_state=42)\n", 315 | "qda = QuadraticDiscriminantAnalysis(0.1)\n", 316 | "qda.fit(X_train,y_train)\n", 317 | "accuracy = qda.score(X_test,y_test)\n", 318 | "print(accuracy)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 62, 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "name": "stdout", 328 | "output_type": "stream", 329 | "text": [ 330 | "0.5214480535581453\n" 331 | ] 332 | } 333 | ], 334 | "source": [ 335 | "lda = LinearDiscriminantAnalysis()\n", 336 | "lda.fit(X_train,y_train)\n", 337 | "accuracy = lda.score(X_test,y_test)\n", 338 | "print(accuracy)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [] 361 | } 362 | ], 363 | "metadata": { 364 | "kernelspec": { 365 | "display_name": "Python 3", 366 | "language": "python", 367 | "name": "python3" 368 | }, 369 | "language_info": { 370 | "codemirror_mode": { 371 | "name": "ipython", 372 | "version": 3 373 | }, 374 | "file_extension": ".py", 375 | "mimetype": "text/x-python", 376 | "name": "python", 377 | "nbconvert_exporter": "python", 378 | "pygments_lexer": "ipython3", 379 | "version": "3.7.3" 380 | } 381 | }, 382 | "nbformat": 4, 383 | "nbformat_minor": 2 384 | } 385 | -------------------------------------------------------------------------------- /Project15/Lower_back_pain_detection_KNN/Lower Back pain detection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "Using TensorFlow backend.\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "import warnings\n", 18 | "warnings.filterwarnings('ignore')\n", 19 | "from imblearn.over_sampling import SMOTE \n", 20 | "import numpy as np\n", 21 | "import pandas as pd\n", 22 | "import os\n", 23 | "from sklearn.neighbors import KNeighborsClassifier\n", 24 | "from sklearn.preprocessing import StandardScaler, LabelBinarizer\n", 25 | "from sklearn.model_selection import train_test_split, cross_val_score\n", 26 | "from sklearn.metrics import *\n", 27 | "import hyperopt\n", 28 | "from hyperopt import *\n", 29 | "from hyperopt import fmin, tpe, hp, space_eval\n", 30 | "import matplotlib.pyplot as plt\n", 31 | "%matplotlib inline \n" 32 | ] 33 | }, 
34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "
\n", 39 | "Loading the data : We loaded the data from the given data source to demonstrate K neighbors Classifier\n", 40 | "
" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/html": [ 51 | "
\n", 52 | "\n", 65 | "\n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | "
Col1Col2Col3Col4Col5Col6Col7Col8Col9Col10Col11Col12Class_attUnnamed: 13
063.02781822.55258639.60911740.47523298.672917-0.2544000.74450312.566114.538615.30468-28.65850143.5123AbnormalNaN
139.05695110.06099125.01537828.995960114.4054254.5642590.41518612.887417.532316.78486-25.53060716.1102AbnormalNaN
268.83202122.21848250.09219446.613539105.985135-3.5303170.47488926.834317.486116.65897-29.03188819.2221AbnormalPrediction is done by using binary classificat...
369.29700824.65287844.31123844.644130101.86849511.2115230.36934523.560312.707411.42447-30.47024618.8329AbnormalNaN
\n", 156 | "
" 157 | ], 158 | "text/plain": [ 159 | " Col1 Col2 Col3 Col4 Col5 Col6 \\\n", 160 | "0 63.027818 22.552586 39.609117 40.475232 98.672917 -0.254400 \n", 161 | "1 39.056951 10.060991 25.015378 28.995960 114.405425 4.564259 \n", 162 | "2 68.832021 22.218482 50.092194 46.613539 105.985135 -3.530317 \n", 163 | "3 69.297008 24.652878 44.311238 44.644130 101.868495 11.211523 \n", 164 | "\n", 165 | " Col7 Col8 Col9 Col10 Col11 Col12 Class_att \\\n", 166 | "0 0.744503 12.5661 14.5386 15.30468 -28.658501 43.5123 Abnormal \n", 167 | "1 0.415186 12.8874 17.5323 16.78486 -25.530607 16.1102 Abnormal \n", 168 | "2 0.474889 26.8343 17.4861 16.65897 -29.031888 19.2221 Abnormal \n", 169 | "3 0.369345 23.5603 12.7074 11.42447 -30.470246 18.8329 Abnormal \n", 170 | "\n", 171 | " Unnamed: 13 \n", 172 | "0 NaN \n", 173 | "1 NaN \n", 174 | "2 Prediction is done by using binary classificat... \n", 175 | "3 NaN " 176 | ] 177 | }, 178 | "execution_count": 2, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "path_of_input_file = 'D:\\\\kaggle_trials\\\\lower-back-pain-symptoms-dataset\\\\Dataset_spine.csv'\n", 185 | "df = pd.read_csv(path_of_input_file,)\n", 186 | "df.head(4)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "
\n", 194 | "Data Imbalance: We check the data imbalance here. Clearly we have an imbalanced dataset\n", 195 | "
" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 3, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | "The number of labels are 2\n" 208 | ] 209 | } 210 | ], 211 | "source": [ 212 | "num_labels = df['Class_att'].unique()\n", 213 | "print('The number of labels are ',len(num_labels))" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 4, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "The number of Abnormal labels are :- 210\n", 226 | "The number of Normal labels are :- 100\n" 227 | ] 228 | } 229 | ], 230 | "source": [ 231 | "for i in range(len(num_labels)):\n", 232 | " print('The number of ', num_labels[i] ,' labels are :- ',len(df[df['Class_att']==num_labels[i]]))" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "
\n", 240 | "Preprocessing: We now preproocess and make the dataset balanced\n", 241 | "
" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 5, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "lb = LabelBinarizer()\n", 251 | "Y = lb.fit_transform(df['Class_att'].values)\n", 252 | "X = df[df.columns[:12]].values\n", 253 | "\n", 254 | "sm = SMOTE(random_state=42)\n", 255 | "X_res, Y_res = sm.fit_resample(X, Y)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 6, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | "Positive examples before Oversampling is 100\n", 268 | "Negative examples before Oversampling is 210\n", 269 | "\n", 270 | "\n", 271 | "Positive examples after Oversampling is 210\n", 272 | "Negative examples after Oversampling is 210\n", 273 | "\n", 274 | "\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "print('Positive examples before Oversampling is ', sum(Y == [1])[0])\n", 280 | "print('Negative examples before Oversampling is ', sum(Y == [0])[0])\n", 281 | "print('\\n')\n", 282 | "print('Positive examples after Oversampling is ', sum(Y_res == [1]))\n", 283 | "print('Negative examples after Oversampling is ', sum(Y_res == [0]))\n", 284 | "print('\\n')" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "
\n", 292 | "Train Test Split: We split the data to train and test components.\n", 293 | "
" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 7, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "X_train, X_test, y_train, y_test = train_test_split(X_res, Y_res, test_size=0.33, random_state=42)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "
\n", 310 | "Hyper parameter Grid: We create a grid for different hyper parameters to iterate from \n", 311 | "
" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 8, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "kneighbors_grid = {'n_neighbors' : hp.choice('n_neighbors',range(10,20)),\n", 321 | " 'weights' : hp.choice('weights',['uniform','distance']),\n", 322 | " 'algorithm' : hp.choice('algorithm',['ball_tree','kd_tree','brute']),\n", 323 | " 'leaf_size' : hp.choice('leaf_size',range(1,50)),\n", 324 | " 'metric' : hp.choice('metric',['euclidean','manhattan','chebyshev','minkowski'])\n", 325 | "}" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 9, 331 | "metadata": {}, 332 | "outputs": [ 333 | { 334 | "name": "stdout", 335 | "output_type": "stream", 336 | "text": [ 337 | "100%|████████████████████████████████████████████████| 500/500 [00:06<00:00, 77.86it/s, best loss: -0.8609696283720053]\n", 338 | "The best parameter tuned on training set is given by :- {'algorithm': 'ball_tree', 'leaf_size': 45, 'metric': 'chebyshev', 'n_neighbors': 10, 'weights': 'distance'}\n" 339 | ] 340 | } 341 | ], 342 | "source": [ 343 | "def hyperopt_train_test(params):\n", 344 | " clf = KNeighborsClassifier(**params)\n", 345 | " return cross_val_score(clf, X_train, y_train).mean()\n", 346 | "\n", 347 | "def function_to_minimise(params):\n", 348 | " accuracy = hyperopt_train_test(params)\n", 349 | " return {'loss': -1*accuracy, 'status': STATUS_OK}\n", 350 | "\n", 351 | "\n", 352 | "trials = Trials()\n", 353 | "best = fmin(function_to_minimise, kneighbors_grid, algo=tpe.suggest, max_evals=500, trials=trials)\n", 354 | "best_parameters = space_eval(kneighbors_grid, best)\n", 355 | "print('The best parameter tuned on training set is given by :- ',best_parameters)" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "
\n", 363 | "Model Fitting and conclusion: We now fit the model and then provide a classification analysis on the model fit\n", 364 | "
" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 10, 370 | "metadata": {}, 371 | "outputs": [ 372 | { 373 | "data": { 374 | "text/plain": [ 375 | "KNeighborsClassifier(algorithm='ball_tree', leaf_size=45, metric='chebyshev',\n", 376 | " metric_params=None, n_jobs=None, n_neighbors=10, p=2,\n", 377 | " weights='distance')" 378 | ] 379 | }, 380 | "execution_count": 10, 381 | "metadata": {}, 382 | "output_type": "execute_result" 383 | } 384 | ], 385 | "source": [ 386 | "knnclf = KNeighborsClassifier(**best_parameters)\n", 387 | "knnclf.fit(X_train,y_train)" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 11, 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "y_hat = knnclf.predict(X_test)" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 12, 402 | "metadata": {}, 403 | "outputs": [ 404 | { 405 | "name": "stdout", 406 | "output_type": "stream", 407 | "text": [ 408 | " precision recall f1-score support\n", 409 | "\n", 410 | " 0 0.72 0.98 0.83 54\n", 411 | " 1 0.98 0.75 0.85 85\n", 412 | "\n", 413 | " accuracy 0.84 139\n", 414 | " macro avg 0.85 0.87 0.84 139\n", 415 | "weighted avg 0.88 0.84 0.84 139\n", 416 | "\n" 417 | ] 418 | } 419 | ], 420 | "source": [ 421 | "print(classification_report(y_hat,y_test))" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [] 430 | } 431 | ], 432 | "metadata": { 433 | "kernelspec": { 434 | "display_name": "Python 3", 435 | "language": "python", 436 | "name": "python3" 437 | }, 438 | "language_info": { 439 | "codemirror_mode": { 440 | "name": "ipython", 441 | "version": 3 442 | }, 443 | "file_extension": ".py", 444 | "mimetype": "text/x-python", 445 | "name": "python", 446 | "nbconvert_exporter": "python", 447 | "pygments_lexer": "ipython3", 448 | "version": "3.7.3" 449 | } 450 | }, 451 | "nbformat": 4, 452 | "nbformat_minor": 2 453 | } 454 | -------------------------------------------------------------------------------- /Project18/Quality_detection_Decision_trees/Wine_quality_Decision_Trees.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "Using TensorFlow backend.\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "import warnings\n", 18 | "warnings.filterwarnings('ignore')\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "import os\n", 22 | "from imblearn.over_sampling import SMOTE \n", 23 | "from sklearn.tree import DecisionTreeClassifier\n", 24 | "from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder, LabelEncoder\n", 25 | "from sklearn.model_selection import train_test_split, cross_val_score\n", 26 | "from sklearn.metrics import *\n", 27 | "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n", 28 | "from sklearn.decomposition import PCA,TruncatedSVD\n", 29 | "import hyperopt\n", 30 | "from hyperopt import *\n", 31 | "from hyperopt import fmin, tpe, hp, space_eval\n", 32 | "import string\n", 33 | "import matplotlib.pyplot as plt\n", 34 | "%matplotlib inline \n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "
\n", 42 | "Loading data: We load the dataset necessary for analysis\n", 43 | "
" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/html": [ 54 | "
\n", 55 | "\n", 68 | "\n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
07.00.270.3620.70.04545.0170.01.00103.000.458.86
16.30.300.341.60.04914.0132.00.99403.300.499.56
28.10.280.406.90.05030.097.00.99513.260.4410.16
37.20.230.328.50.05847.0186.00.99563.190.409.96
\n", 149 | "
" 150 | ], 151 | "text/plain": [ 152 | " fixed acidity volatile acidity citric acid residual sugar chlorides \\\n", 153 | "0 7.0 0.27 0.36 20.7 0.045 \n", 154 | "1 6.3 0.30 0.34 1.6 0.049 \n", 155 | "2 8.1 0.28 0.40 6.9 0.050 \n", 156 | "3 7.2 0.23 0.32 8.5 0.058 \n", 157 | "\n", 158 | " free sulfur dioxide total sulfur dioxide density pH sulphates \\\n", 159 | "0 45.0 170.0 1.0010 3.00 0.45 \n", 160 | "1 14.0 132.0 0.9940 3.30 0.49 \n", 161 | "2 30.0 97.0 0.9951 3.26 0.44 \n", 162 | "3 47.0 186.0 0.9956 3.19 0.40 \n", 163 | "\n", 164 | " alcohol quality \n", 165 | "0 8.8 6 \n", 166 | "1 9.5 6 \n", 167 | "2 10.1 6 \n", 168 | "3 9.9 6 " 169 | ] 170 | }, 171 | "execution_count": 2, 172 | "metadata": {}, 173 | "output_type": "execute_result" 174 | } 175 | ], 176 | "source": [ 177 | "path_of_input_file = 'D:\\\\kaggle_trials\\\\mlcourse\\\\winequality-white.csv'\n", 178 | "df = pd.read_csv(path_of_input_file)\n", 179 | "df.head(4)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 3, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "name": "stdout", 189 | "output_type": "stream", 190 | "text": [ 191 | "We can clearly see that every value is numerical and hence only scaling will be needed for preprocessing steps\n" 192 | ] 193 | } 194 | ], 195 | "source": [ 196 | "print('We can clearly see that every value is numerical and hence only scaling will be needed for preprocessing steps')" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "
\n", 204 | "Unbalanced data: We can clearly see that the data in unbalanced\n", 205 | "
" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 4, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "The number of labels are 7\n" 218 | ] 219 | } 220 | ], 221 | "source": [ 222 | "num_labels = df['quality'].unique()\n", 223 | "print('The number of labels are ',len(num_labels))" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 5, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "name": "stdout", 233 | "output_type": "stream", 234 | "text": [ 235 | "The number of 6 labels are :- 2198\n", 236 | "The number of 5 labels are :- 1457\n", 237 | "The number of 7 labels are :- 880\n", 238 | "The number of 8 labels are :- 175\n", 239 | "The number of 4 labels are :- 163\n", 240 | "The number of 3 labels are :- 20\n", 241 | "The number of 9 labels are :- 5\n", 242 | "We dont have a balanced dataset and hence we need to perform imbalanced dataset handling\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "for i in range(len(num_labels)):\n", 248 | " print('The number of ', num_labels[i] ,' labels are :- ',len(df[df['quality']==num_labels[i]]))\n", 249 | "print('We dont have a balanced dataset and hence we need to perform imbalanced dataset handling')" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "
\n", 257 | "Preprocessing steps: We preprocess the data and make the data balanced\n", 258 | "
" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 46, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "cols_needed = df.columns\n", 268 | "columns_to_scale = cols_needed[:-1]\n", 269 | "scaler = MinMaxScaler()\n", 270 | "scaled_columns = scaler.fit_transform(df[columns_to_scale]) \n", 271 | "X_processed_data = scaled_columns\n", 272 | "lb = LabelEncoder()\n", 273 | "Y = lb.fit_transform(df['quality'].values)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 47, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "pca = PCA(n_components=10)\n", 283 | "X_reduced = pca.fit_transform(X_processed_data)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 48, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "sm = SMOTE(random_state=42,k_neighbors=4)\n", 293 | "X_res, Y_res = sm.fit_resample(X_reduced, Y)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 49, 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "name": "stdout", 303 | "output_type": "stream", 304 | "text": [ 305 | "The number of 0 labels are :- 2198\n", 306 | "The number of 1 labels are :- 2198\n", 307 | "The number of 2 labels are :- 2198\n", 308 | "The number of 3 labels are :- 2198\n", 309 | "The number of 4 labels are :- 2198\n", 310 | "The number of 5 labels are :- 2198\n", 311 | "The number of 6 labels are :- 2198\n" 312 | ] 313 | } 314 | ], 315 | "source": [ 316 | "for i in range(len(num_labels)):\n", 317 | " print('The number of ', i ,' labels are :- ',\n", 318 | " [np.array_equal(Y_res[j],i) for j in range(len(Y_res))].count(True))" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "
\n", 326 | "Train-test split: train test split of data is performed\n", 327 | "
" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 50, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "X_train, X_test, y_train, y_test = train_test_split(X_res, Y_res, test_size=0.33, random_state=42)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 51, 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "decision_tree_grid = {'criterion' : hp.choice('criterion',['gini','entropy']),\n", 346 | " 'max_depth' : hp.choice('max_depth',range(1,150)),\n", 347 | " 'min_samples_split' : hp.choice('min_samples_split',range(2,30)),\n", 348 | " 'min_samples_leaf' : hp.uniform('min_samples_leaf',0.1,0.5),\n", 349 | " 'max_features' : hp.choice('max_features',range(1,10))\n", 350 | " \n", 351 | " }" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 52, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "name": "stdout", 361 | "output_type": "stream", 362 | "text": [ 363 | "100%|████████████████████████████████████████████████| 500/500 [00:26<00:00, 19.15it/s, best loss: -0.4014437049772785]\n", 364 | "The best parameter tuned on training set is given by :- {'criterion': 'gini', 'max_depth': 128, 'max_features': 6, 'min_samples_leaf': 0.10042508293788373, 'min_samples_split': 5}\n" 365 | ] 366 | } 367 | ], 368 | "source": [ 369 | "def hyperopt_train_test(params):\n", 370 | " clf = DecisionTreeClassifier(**params)\n", 371 | " return cross_val_score(clf, X_train, y_train).mean()\n", 372 | "\n", 373 | "def function_to_minimise(params):\n", 374 | " accuracy = hyperopt_train_test(params)\n", 375 | " return {'loss': -1*accuracy, 'status': STATUS_OK}\n", 376 | "\n", 377 | "\n", 378 | "trials = Trials()\n", 379 | "best = fmin(function_to_minimise, decision_tree_grid, algo=tpe.suggest, max_evals=500, trials=trials)\n", 380 | "best_parameters = space_eval(decision_tree_grid, best)\n", 381 | "print('The best parameter tuned on training set is given by :- ',best_parameters)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 53, 387 | "metadata": {}, 388 | "outputs": [ 389 | { 390 | "data": { 391 | "text/plain": [ 392 | "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=128,\n", 393 | " max_features=6, max_leaf_nodes=None,\n", 394 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 395 | " min_samples_leaf=0.10042508293788373,\n", 396 | " min_samples_split=5, min_weight_fraction_leaf=0.0,\n", 397 | " presort=False, random_state=None, splitter='best')" 398 | ] 399 | }, 400 | "execution_count": 53, 401 | "metadata": {}, 402 | "output_type": "execute_result" 403 | } 404 | ], 405 | "source": [ 406 | "model = DecisionTreeClassifier(**best_parameters)\n", 407 | "model.fit(X_train, y_train)" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 54, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "y_pred = model.predict(X_test)" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 55, 422 | "metadata": {}, 423 | "outputs": [ 424 | { 425 | "name": "stdout", 426 | "output_type": "stream", 427 | "text": [ 428 | " precision recall f1-score support\n", 429 | "\n", 430 | " 0 0.35 0.47 0.40 532\n", 431 | " 1 0.60 0.34 0.43 1327\n", 432 | " 2 0.20 0.23 0.21 644\n", 433 | " 3 0.00 0.00 0.00 0\n", 434 | " 4 0.31 0.31 0.31 715\n", 435 | " 5 0.35 0.47 0.40 541\n", 436 | " 6 0.89 0.47 0.62 1319\n", 437 | "\n", 438 | " accuracy 0.38 5078\n", 439 | " macro avg 0.39 0.33 0.34 5078\n", 440 | "weighted 
avg 0.53 0.38 0.43 5078\n", 441 | "\n" 442 | ] 443 | } 444 | ], 445 | "source": [ 446 | "print(classification_report(y_pred,y_test))" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": {}, 453 | "outputs": [], 454 | "source": [] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [] 462 | } 463 | ], 464 | "metadata": { 465 | "kernelspec": { 466 | "display_name": "Python 3", 467 | "language": "python", 468 | "name": "python3" 469 | }, 470 | "language_info": { 471 | "codemirror_mode": { 472 | "name": "ipython", 473 | "version": 3 474 | }, 475 | "file_extension": ".py", 476 | "mimetype": "text/x-python", 477 | "name": "python", 478 | "nbconvert_exporter": "python", 479 | "pygments_lexer": "ipython3", 480 | "version": "3.7.3" 481 | } 482 | }, 483 | "nbformat": 4, 484 | "nbformat_minor": 2 485 | } 486 | -------------------------------------------------------------------------------- /Project31/House_Price_Revisted_Gaussian_Process_Regression/Gaussian Process Regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import warnings\n", 10 | "warnings.filterwarnings('ignore')\n", 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import os\n", 14 | "import scipy\n", 15 | "from sklearn.decomposition import PCA\n", 16 | "from sklearn.gaussian_process import GaussianProcessRegressor\n", 17 | "from sklearn.ensemble import GradientBoostingRegressor\n", 18 | "from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder\n", 19 | "from sklearn.model_selection import train_test_split, cross_val_score\n", 20 | "from sklearn.metrics import *\n", 21 | "import hyperopt\n", 22 | "from hyperopt import *\n", 23 | "from hyperopt import fmin, tpe, hp, space_eval\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "%matplotlib inline \n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "
\n", 33 | "Loading the data: We load the data from the mentioned path\n", 34 | "
" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/html": [ 45 | "
\n", 46 | "\n", 59 | "\n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | "
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...PoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveNaNRegLvlAllPub...0NaNNaNNaN022008WDNormal208500
1220RL80.09600PaveNaNRegLvlAllPub...0NaNNaNNaN052007WDNormal181500
\n", 137 | "

2 rows × 81 columns

\n", 138 | "
" 139 | ], 140 | "text/plain": [ 141 | " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", 142 | "0 1 60 RL 65.0 8450 Pave NaN Reg \n", 143 | "1 2 20 RL 80.0 9600 Pave NaN Reg \n", 144 | "\n", 145 | " LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold \\\n", 146 | "0 Lvl AllPub ... 0 NaN NaN NaN 0 2 \n", 147 | "1 Lvl AllPub ... 0 NaN NaN NaN 0 5 \n", 148 | "\n", 149 | " YrSold SaleType SaleCondition SalePrice \n", 150 | "0 2008 WD Normal 208500 \n", 151 | "1 2007 WD Normal 181500 \n", 152 | "\n", 153 | "[2 rows x 81 columns]" 154 | ] 155 | }, 156 | "execution_count": 2, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "path_of_input_file = r'D:\\kaggle_trials\\house-prices-advanced-regression-techniques\\train.csv'\n", 163 | "df = pd.read_csv(path_of_input_file)\n", 164 | "df.head(2)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "
\n", 172 | "Preprocessing data : We separate out the numerical and categorical columns from the data to be used for scaling and encoding respectively \n", 173 | "
" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 3, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "cols_needed = list(df.columns)\n", 183 | "cols_needed = cols_needed[:len(cols_needed)-1]\n", 184 | "\n", 185 | "possible_numeric_cols = list(df._get_numeric_data().columns)\n", 186 | "possible_numeric_cols.remove('Id')\n", 187 | "\n", 188 | "categorical_columns = list(set(cols_needed)- set(possible_numeric_cols))\n", 189 | "\n", 190 | "numerical_columns = []\n", 191 | "for i in range(len(possible_numeric_cols)):\n", 192 | " col_name = possible_numeric_cols[i]\n", 193 | " if len(df[col_name].unique())<10:\n", 194 | " categorical_columns.append(col_name)\n", 195 | " else:\n", 196 | " numerical_columns.append(col_name)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "
\n", 204 | "Missing value Treatment: We impute the numerical missing values with their respective means and the categorical values with their modes.\n", 205 | "
" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 4, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "for i in range(len(categorical_columns)):\n", 215 | " df[categorical_columns[i]] = df[categorical_columns[i]].fillna(df[categorical_columns[i]].mode()[0])\n", 216 | "mean_impute_dict ={}\n", 217 | "for i in range(len(numerical_columns)):\n", 218 | " mean_impute_dict[numerical_columns[i]] = np.nanmean(np.float_(df[numerical_columns[i]].values))\n", 219 | "for i in range(len(numerical_columns)):\n", 220 | " df[numerical_columns[i]] = df[numerical_columns[i]].fillna(mean_impute_dict[numerical_columns[i]])" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "
\n", 228 | "Scaling and Encoding: We scale and one hot encode the data to get the matrix we need for calculations\n", 229 | "
" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 5, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "ohe = OneHotEncoder()\n", 239 | "scalar = MinMaxScaler()\n", 240 | "encoded_matrix = ohe.fit_transform(df[categorical_columns])\n", 241 | "scaled_matrix = scalar.fit_transform(df[numerical_columns])\n", 242 | "X_complete_matrix = scipy.sparse.hstack((encoded_matrix,scaled_matrix)).A\n", 243 | "Y = scalar.fit_transform(df[['SalePrice']])\n" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "
\n", 251 | "Train Test Split : We split the data to train and test set \n", 252 | "
" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 6, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "X_train, X_test, y_train, y_test = train_test_split(X_complete_matrix, Y, test_size=0.2, random_state=42)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "
\n", 269 | "Implementing the model: We now implement the model with tuned parameters and get the R^2 score\n", 270 | "
" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 7, 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/plain": [ 281 | "GaussianProcessRegressor(alpha=1e-10, copy_X_train=True, kernel=None,\n", 282 | " n_restarts_optimizer=0, normalize_y=False,\n", 283 | " optimizer='fmin_l_bfgs_b', random_state=None)" 284 | ] 285 | }, 286 | "execution_count": 7, 287 | "metadata": {}, 288 | "output_type": "execute_result" 289 | } 290 | ], 291 | "source": [ 292 | "model = GaussianProcessRegressor()\n", 293 | "model.fit(X_train, y_train)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 8, 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "name": "stdout", 303 | "output_type": "stream", 304 | "text": [ 305 | "The coefficient of determination is:- -218.8804257341233\n" 306 | ] 307 | } 308 | ], 309 | "source": [ 310 | "y_pred = model.predict(X_test)\n", 311 | "print('The coefficient of determination is:- ',r2_score(y_pred,y_test))" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "
\n", 319 | "Conclusion: Clearly Gaussian Process Regression is not giving a good coefficient of determination at all. We will use Gradient Boosting Regressor in this scenario to see what improvements we can make\n", 320 | "
" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 9, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "gradient_boost_reg_grid = {'loss' : hp.choice('loss',['ls','lad','huber','quantile']),\n", 330 | " 'learning_rate': hp.uniform('learning_rate',0.0,1.0),\n", 331 | " 'n_estimators' : hp.choice('n_estimators',range(50,300)),\n", 332 | " 'max_features' : hp.choice('max_features',['auto','sqrt','log2',None]),\n", 333 | " 'min_samples_split': hp.uniform('min_samples_split',0.0,1.0),\n", 334 | " 'min_samples_leaf' : hp.uniform('min_samples_leaf',0.0,0.5),\n", 335 | " }" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 10, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "100%|██████████████████████████████████████████████████| 20/20 [00:49<00:00, 2.46s/it, best loss: -0.9715407832381585]\n", 348 | "The best parameter tuned on training set is given by :- {'learning_rate': 0.627881784944577, 'loss': 'huber', 'max_features': None, 'min_samples_leaf': 0.01756016919912151, 'min_samples_split': 0.4790955420164398, 'n_estimators': 250}\n" 349 | ] 350 | } 351 | ], 352 | "source": [ 353 | "def hyperopt_train_test(params):\n", 354 | " reg = GradientBoostingRegressor(**params,random_state=19)\n", 355 | " return cross_val_score(reg, X_train, y_train).mean()\n", 356 | "\n", 357 | "def function_to_minimise(params):\n", 358 | " accuracy = hyperopt_train_test(params)\n", 359 | " return {'loss': -1*accuracy, 'status': STATUS_OK}\n", 360 | "\n", 361 | "\n", 362 | "trials = Trials()\n", 363 | "best = fmin(function_to_minimise, gradient_boost_reg_grid, algo=tpe.suggest, max_evals=20, trials=trials)\n", 364 | "best_parameters = space_eval(gradient_boost_reg_grid, best)\n", 365 | "print('The best parameter tuned on training set is given by :- ',best_parameters)" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 11, 371 | "metadata": {}, 372 | "outputs": [ 373 | { 374 | "data": { 375 | "text/plain": [ 376 | "GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n", 377 | " learning_rate=0.627881784944577, loss='huber',\n", 378 | " max_depth=3, max_features=None, max_leaf_nodes=None,\n", 379 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 380 | " min_samples_leaf=0.01756016919912151,\n", 381 | " min_samples_split=0.4790955420164398,\n", 382 | " min_weight_fraction_leaf=0.0, n_estimators=250,\n", 383 | " n_iter_no_change=None, presort='auto',\n", 384 | " random_state=None, subsample=1.0, tol=0.0001,\n", 385 | " validation_fraction=0.1, verbose=0, warm_start=False)" 386 | ] 387 | }, 388 | "execution_count": 11, 389 | "metadata": {}, 390 | "output_type": "execute_result" 391 | } 392 | ], 393 | "source": [ 394 | "model = GradientBoostingRegressor(**best_parameters)\n", 395 | "model.fit(X_train, y_train)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 12, 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "name": "stdout", 405 | "output_type": "stream", 406 | "text": [ 407 | "The coefficient of determination is:- 0.9195001148340186\n" 408 | ] 409 | } 410 | ], 411 | "source": [ 412 | "y_pred = model.predict(X_test)\n", 413 | "print('The coefficient of determination is:- ',r2_score(y_pred,y_test))" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "
\n", 421 | "Conclusion: Gradient Boosting regressor is an improvement over Gaussian Process Regressor in this case.\n", 422 | "
" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": {}, 443 | "outputs": [], 444 | "source": [] 445 | } 446 | ], 447 | "metadata": { 448 | "kernelspec": { 449 | "display_name": "Python 3", 450 | "language": "python", 451 | "name": "python3" 452 | }, 453 | "language_info": { 454 | "codemirror_mode": { 455 | "name": "ipython", 456 | "version": 3 457 | }, 458 | "file_extension": ".py", 459 | "mimetype": "text/x-python", 460 | "name": "python", 461 | "nbconvert_exporter": "python", 462 | "pygments_lexer": "ipython3", 463 | "version": "3.7.4" 464 | } 465 | }, 466 | "nbformat": 4, 467 | "nbformat_minor": 2 468 | } 469 | -------------------------------------------------------------------------------- /Project22/House Price Prediction Regression/Linear Regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import warnings\n", 10 | "warnings.filterwarnings('ignore')\n", 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import os\n", 14 | "import scipy\n", 15 | "from sklearn.decomposition import PCA\n", 16 | "from sklearn.linear_model import LinearRegression\n", 17 | "from sklearn.ensemble import AdaBoostRegressor\n", 18 | "from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder\n", 19 | "from sklearn.model_selection import train_test_split, cross_val_score\n", 20 | "from sklearn.metrics import *\n", 21 | "import hyperopt\n", 22 | "from hyperopt import *\n", 23 | "from hyperopt import fmin, tpe, hp, space_eval\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "%matplotlib inline \n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "
\n", 33 | "Loading the data: We load the data from the mentioned path\n", 34 | "
" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/html": [ 45 | "
\n", 46 | "\n", 59 | "\n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | "
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...PoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveNaNRegLvlAllPub...0NaNNaNNaN022008WDNormal208500
1220RL80.09600PaveNaNRegLvlAllPub...0NaNNaNNaN052007WDNormal181500
2360RL68.011250PaveNaNIR1LvlAllPub...0NaNNaNNaN092008WDNormal223500
3470RL60.09550PaveNaNIR1LvlAllPub...0NaNNaNNaN022006WDAbnorml140000
\n", 185 | "

4 rows × 81 columns

\n", 186 | "
" 187 | ], 188 | "text/plain": [ 189 | " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", 190 | "0 1 60 RL 65.0 8450 Pave NaN Reg \n", 191 | "1 2 20 RL 80.0 9600 Pave NaN Reg \n", 192 | "2 3 60 RL 68.0 11250 Pave NaN IR1 \n", 193 | "3 4 70 RL 60.0 9550 Pave NaN IR1 \n", 194 | "\n", 195 | " LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold \\\n", 196 | "0 Lvl AllPub ... 0 NaN NaN NaN 0 2 \n", 197 | "1 Lvl AllPub ... 0 NaN NaN NaN 0 5 \n", 198 | "2 Lvl AllPub ... 0 NaN NaN NaN 0 9 \n", 199 | "3 Lvl AllPub ... 0 NaN NaN NaN 0 2 \n", 200 | "\n", 201 | " YrSold SaleType SaleCondition SalePrice \n", 202 | "0 2008 WD Normal 208500 \n", 203 | "1 2007 WD Normal 181500 \n", 204 | "2 2008 WD Normal 223500 \n", 205 | "3 2006 WD Abnorml 140000 \n", 206 | "\n", 207 | "[4 rows x 81 columns]" 208 | ] 209 | }, 210 | "execution_count": 2, 211 | "metadata": {}, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [ 216 | "path_of_input_file = r'D:\\kaggle_trials\\house-prices-advanced-regression-techniques\\train.csv'\n", 217 | "df = pd.read_csv(path_of_input_file)\n", 218 | "df.head(4)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "
\n", 226 | "Categorical and Numerical Columns Identification: We identify categorical and numerical columns from the data. We do set a threshold that if any categorical value is classified as numerical, then it has to be classified back to categorical if the number of distinct values of that column in the dataframe is less than 10 \n", 227 | "
" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 3, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "cols_needed = list(df.columns)\n", 237 | "cols_needed = cols_needed[:len(cols_needed)-1]\n", 238 | "\n", 239 | "possible_numeric_cols = list(df._get_numeric_data().columns)\n", 240 | "possible_numeric_cols.remove('Id')\n", 241 | "\n", 242 | "categorical_columns = list(set(cols_needed)- set(possible_numeric_cols))\n", 243 | "\n", 244 | "numerical_columns = []\n", 245 | "for i in range(len(possible_numeric_cols)):\n", 246 | " col_name = possible_numeric_cols[i]\n", 247 | " if len(df[col_name].unique())<10:\n", 248 | " categorical_columns.append(col_name)\n", 249 | " else:\n", 250 | " numerical_columns.append(col_name)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "
\n", 258 | "Missing Value Treatment: We impute the categorical missing values with their mode and the numerical missing values with their mean\n", 259 | "
" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 4, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "for i in range(len(categorical_columns)):\n", 269 | " df[categorical_columns[i]] = df[categorical_columns[i]].fillna(df[categorical_columns[i]].mode()[0])\n", 270 | "mean_impute_dict ={}\n", 271 | "for i in range(len(numerical_columns)):\n", 272 | " mean_impute_dict[numerical_columns[i]] = np.nanmean(np.float_(df[numerical_columns[i]].values))\n", 273 | "for i in range(len(numerical_columns)):\n", 274 | " df[numerical_columns[i]] = df[numerical_columns[i]].fillna(mean_impute_dict[numerical_columns[i]])" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "
\n", 282 | "Encoding and Feature Scaling: We do the one hot encoding of categorical values and scale(by using MinMaxScaler) the numerical values to get the final feature matrix X. Subseqently, we consider the SalePrice column to be our target variable\n", 283 | "
" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 5, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "ohe = OneHotEncoder()\n", 293 | "scalar = MinMaxScaler()\n", 294 | "encoded_matrix = ohe.fit_transform(df[categorical_columns])\n", 295 | "scaled_matrix = scalar.fit_transform(df[numerical_columns])\n", 296 | "X_complete_matrix = scipy.sparse.hstack((encoded_matrix,scaled_matrix)).A\n", 297 | "Y = scalar.fit_transform(df[['SalePrice']])\n" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "
\n", 305 | "Train Test split: We perform train test split on the data\n", 306 | "
" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 6, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "X_train, X_test, y_train, y_test = train_test_split(X_complete_matrix, Y, test_size=0.33, random_state=42)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "
\n", 323 | "Linear Regression Model Fit: We fit a linear regression model on the data to get the results\n", 324 | "
" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 7, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "reg = LinearRegression()\n", 334 | "reg.fit(X_train,y_train)\n", 335 | "y_pred_linear_reg = reg.predict(X_test)" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "
\n", 343 | "Numerical Results: The coefficient of determination is given below\n", 344 | "
" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 8, 350 | "metadata": {}, 351 | "outputs": [ 352 | { 353 | "name": "stdout", 354 | "output_type": "stream", 355 | "text": [ 356 | "The coefficient of determination is:- 0.9580147190340209\n" 357 | ] 358 | } 359 | ], 360 | "source": [ 361 | "print('The coefficient of determination is:- ',r2_score(y_pred_linear_reg,y_test))" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "
\n", 369 | "Adaboost Regressor: We will try to fit an Adaboost regressor to the given data\n", 370 | " \n", 371 | "
" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 9, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "adaboost_reg_grid = {'n_estimators' : hp.choice('n_estimators',range(5,50)),\n", 381 | " 'learning_rate' : hp.uniform('learning_rate',0.05,1.01),\n", 382 | " 'loss' : hp.choice('loss',['linear','square','exponential'])\n", 383 | " }" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 10, 389 | "metadata": {}, 390 | "outputs": [ 391 | { 392 | "name": "stdout", 393 | "output_type": "stream", 394 | "text": [ 395 | "100%|██████████████████████████████████████████████████| 30/30 [00:50<00:00, 2.14s/it, best loss: -0.9887685515354506]\n", 396 | "The best parameter tuned on training set is given by :- {'learning_rate': 0.7582218518751838, 'loss': 'square', 'n_estimators': 41}\n" 397 | ] 398 | } 399 | ], 400 | "source": [ 401 | "def hyperopt_train_test(params):\n", 402 | " reg = AdaBoostRegressor(**params)\n", 403 | " return cross_val_score(reg, X_train, y_train).mean()\n", 404 | "\n", 405 | "def function_to_minimise(params):\n", 406 | " accuracy = hyperopt_train_test(params)\n", 407 | " return {'loss': -1*accuracy, 'status': STATUS_OK}\n", 408 | "\n", 409 | "\n", 410 | "trials = Trials()\n", 411 | "best = fmin(function_to_minimise, adaboost_reg_grid, algo=tpe.suggest, max_evals=30, trials=trials)\n", 412 | "best_parameters = space_eval(adaboost_reg_grid, best)\n", 413 | "print('The best parameter tuned on training set is given by :- ',best_parameters)" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 11, 419 | "metadata": {}, 420 | "outputs": [ 421 | { 422 | "data": { 423 | "text/plain": [ 424 | "AdaBoostRegressor(base_estimator=None, learning_rate=0.7582218518751838,\n", 425 | " loss='square', n_estimators=41, random_state=None)" 426 | ] 427 | }, 428 | "execution_count": 11, 429 | "metadata": {}, 430 | "output_type": "execute_result" 431 | } 432 | ], 433 | "source": [ 434 | "model = AdaBoostRegressor(**best_parameters)\n", 435 | "model.fit(X_train, y_train)" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 12, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "y_pred = model.predict(X_test)" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 13, 450 | "metadata": {}, 451 | "outputs": [ 452 | { 453 | "name": "stdout", 454 | "output_type": "stream", 455 | "text": [ 456 | "The coefficient of determination is:- 0.974902747127915\n" 457 | ] 458 | } 459 | ], 460 | "source": [ 461 | "print('The coefficient of determination is:- ',r2_score(y_pred,y_test))" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": {}, 467 | "source": [ 468 | "
\n", 469 | "Conclusion : We can clearly see that Adaboost regressor performed really well as compared to Linear Regression\n", 470 | "
" 471 | ] 472 | } 473 | ], 474 | "metadata": { 475 | "kernelspec": { 476 | "display_name": "Python 3", 477 | "language": "python", 478 | "name": "python3" 479 | }, 480 | "language_info": { 481 | "codemirror_mode": { 482 | "name": "ipython", 483 | "version": 3 484 | }, 485 | "file_extension": ".py", 486 | "mimetype": "text/x-python", 487 | "name": "python", 488 | "nbconvert_exporter": "python", 489 | "pygments_lexer": "ipython3", 490 | "version": "3.7.3" 491 | } 492 | }, 493 | "nbformat": 4, 494 | "nbformat_minor": 2 495 | } 496 | --------------------------------------------------------------------------------