├── .github
└── FUNDING.yml
├── .gitignore
├── Activation_Functions
├── README.md
├── README.tex.md
├── code
│ ├── elu.py
│ ├── gelu.py
│ ├── leaky_relu.py
│ ├── mish.py
│ ├── relu.py
│ ├── selu.py
│ ├── sigmoid.py
│ ├── silu.py
│ ├── softmax.py
│ ├── softplus.py
│ └── tanh.py
├── doc
│ ├── Activation_Functions.png
│ ├── Exponential_Linear_Unit.png
│ ├── Gaussian_Error_Linear_Unit.png
│ ├── Leaky_ReLU.png
│ ├── Mish_Function.png
│ ├── Parameteric_ReLU.png
│ ├── Rectified_Linear_Unit.png
│ ├── Scaled_Exponential_Linear_Unit.png
│ ├── Sigmoid_Function.png
│ ├── Sigmoid_Weighted_Linear_Unit_Swish.png
│ ├── SoftPlus.png
│ └── Tanh_Function.png
└── tex
│ ├── 1a3902d66dffcc33134633eb13a56e4a.svg
│ ├── 2172629849e5868eaf600934f256c186.svg
│ ├── 22b919815535e3da79a74831f137d534.svg
│ ├── 290ecca72cd3c083c37a6bdff5f8d689.svg
│ ├── 2d95939262cdc426890def2845d69e00.svg
│ ├── 3e743f8c72715fac3f04a831660936ed.svg
│ ├── 400293c7745c1271a610177098dbe49c.svg
│ ├── 57007cfe55ba83df3eeedbdc9d6485b4.svg
│ ├── 5e156666e8767505b7fdc17f061898f7.svg
│ ├── 61916d70fa806c731f6b8e12a081fdc2.svg
│ ├── 754c5b79c77621fd1c89885a39b8d291.svg
│ ├── 789a1f0365c3e83c7d1dc4a8b10d0acf.svg
│ ├── 7e28db664ad627340f7fda25a290ac36.svg
│ ├── 7eb4be07a0429a57780410969ed58d1a.svg
│ ├── 80c7c3a438606431b27cc86bce2f0135.svg
│ ├── 822646f49afad2437610e66ee730bef7.svg
│ ├── 83803c6cf357e7afb8cdabf1e530ea97.svg
│ ├── 9b171bd87aa286bf84d6621ea1204017.svg
│ ├── a44d2c33ac06d2b68df258ffa4e311c6.svg
│ ├── a4bbb3b4859a057a266b6c31e636abc7.svg
│ ├── a5d8a53e48a44e595830cd70188848a5.svg
│ ├── a7441e8a4f2fdb45cfc82da527cbafed.svg
│ ├── b106834be35dfc293ba97ae8fbe93673.svg
│ ├── b12dfd5e9d8bfe92d02c115de29172d4.svg
│ ├── b5f56261f1d93afbbe17f2cba27d68d9.svg
│ ├── d0f701c20d414f274f5a81ef8eb6be5c.svg
│ └── e859654ddf616a4d426f9a15ef699144.svg
├── Algorithms
├── adaboost
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ └── adaboost.py
│ ├── doc
│ │ ├── adaboost.png
│ │ ├── adaboost_training.gif
│ │ ├── alpha.png
│ │ └── decision_stump.PNG
│ └── tex
│ │ ├── 00cdc31549c67b60c6dff38106fea53a.svg
│ │ ├── 3826eeb617fdc1a5c8840e859a7dafbb.svg
│ │ ├── 447a46ee6fdce8100ddf3d57c464612b.svg
│ │ ├── 4f4f4e395762a3af4575de74c019ebb5.svg
│ │ ├── b4128148f8163b17d8269f72bf4e6d74.svg
│ │ ├── c2a29561d89e139b3c7bffe51570c3ce.svg
│ │ ├── cbfb1b2a33b28eab8a3e59464768e810.svg
│ │ └── dc56b266dfc19aea6656ef2dde1f1f14.svg
├── dbscan
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ └── dbscan.py
│ └── doc
│ │ ├── dbscan.gif
│ │ └── results.png
├── decision_tree
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ ├── decision_tree_classification.py
│ │ ├── decision_tree_regression.py
│ │ └── visualize_decision_trees_with_graphviz.py
│ ├── doc
│ │ ├── iris_decision_surface.png
│ │ ├── iris_decision_tree.png
│ │ ├── plot_tree.png
│ │ └── titanic_example.jpg
│ └── tex
│ │ ├── 3952bc7dadde93e3af8e54d66588d8b9.svg
│ │ ├── 55fafb270a7563e9c79658b7e1a606e2.svg
│ │ └── 99b4cda42ce5d6085705dc7458181012.svg
├── gradient_boosting
│ └── code
│ │ ├── gradient_boosting_classifier.py
│ │ └── gradient_boosting_regressor.py
├── k_nearest_neighbors
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ ├── k_nearest_neighbors.py
│ │ └── k_nearest_neighbors_regression.py
│ ├── doc
│ │ ├── effect_of_k.png
│ │ ├── effect_of_k_2.png
│ │ └── euclidean_distance.svg
│ └── tex
│ │ ├── 48e45a94b5215298962054c17e895faf.svg
│ │ └── a44a9e5e7f3ef9019ae9a21dbb98f40f.svg
├── kernel_pca
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ └── kernel_pca.py
│ ├── doc
│ │ └── kernel_pca.png
│ └── tex
│ │ ├── 12e6d8a64abd9854079af8b0622eb86a.svg
│ │ ├── 1cef6a8d14b34297d97d3e1cf812ff5c.svg
│ │ ├── 28a3c6f9dc75c8bf1b3498bbcea108be.svg
│ │ ├── 3821cc82b4d7dc6624ec03fd5a93dffc.svg
│ │ ├── 65ed4b231dcf18a70bae40e50d48c9c0.svg
│ │ ├── 87524c1390370d418a3be6af1b4136c5.svg
│ │ ├── 88a947ead8011f566945b4f207fde1a8.svg
│ │ ├── a6096ac2cee42d8fa76ec9110eb9c598.svg
│ │ └── d6328eaebbcd5c358f426dbea4bdbf70.svg
├── kmeans
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ └── kmeans.py
│ ├── doc
│ │ ├── choose_k_value.jpeg
│ │ ├── elbow_method_using_yellowbrick.png
│ │ ├── k_means.gif
│ │ ├── noisy_circles_with_true_output.png
│ │ ├── noisy_moons_with_true_output.png
│ │ ├── silhouette_analysis_3_clusters.jpeg
│ │ ├── silhouette_analysis_4_clusters.jpeg
│ │ ├── silhouette_analysis_5_clusters.jpeg
│ │ └── two_lines.png
│ └── tex
│ │ ├── 065cfac694daeb1fff1264475e035c67.svg
│ │ ├── 16ceb724dafaab6c19cf71bc5c460244.svg
│ │ ├── 1a567506286617473a9c0d9b2172f951.svg
│ │ ├── 43ca5ad9e1f094a31392f860ef481e5c.svg
│ │ ├── 44bc9d542a92714cac84e01cbbb7fd61.svg
│ │ ├── 4bdc8d9bcfb35e1c9bfb51fc69687dfc.svg
│ │ ├── 5c9a23f70c5920444f4613242c1e95fb.svg
│ │ ├── 5d2031093fe35c15cf01b562bab7d54f.svg
│ │ ├── 77a3b857d53fb44e33b53e4c8b68351a.svg
│ │ ├── b3520dc7da5f9731724eb6e1768a45a7.svg
│ │ ├── b776953fbf2b14971aa17331a8640386.svg
│ │ ├── db0e77b2ab4f495dea1f5c5c08588288.svg
│ │ └── f0c3f612efc905c5a416138c62517a36.svg
├── linear_discriminant_analysis
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ └── linear_discriminant_analysis.py
│ ├── doc
│ │ └── lda_example.png
│ └── tex
│ │ ├── 021a2e6a7f973e9edb8dcb0bf5bda569.svg
│ │ ├── 0aa7f58b7e561001f5301aa03507f552.svg
│ │ ├── 0e51a2dede42189d77627c4d742822c3.svg
│ │ ├── 3bf9c1fe4273ed003fd49e744378a5ac.svg
│ │ ├── 47b592a798cd56ccf668b67abad36a61.svg
│ │ ├── 518542ce2a067b399803d0396d9c5aae.svg
│ │ ├── 5a163b5cb124f209aed344b8f61b493f.svg
│ │ ├── 61daf4e5401f3020b1b0bfefbbd0e59e.svg
│ │ ├── 63bb9849783d01d91403bc9a5fea12a2.svg
│ │ ├── 66a81133e5715952856e2a06741f4676.svg
│ │ ├── 6711c7bae84526c845527391cb33d2e5.svg
│ │ ├── 84c95f91a742c9ceb460a83f9b5090bf.svg
│ │ ├── 874357dd0ff10af024f68c608dfc7a98.svg
│ │ ├── a9ba65368f9892beab04bf21d7e17b4f.svg
│ │ ├── c7eee0782fa9ccb115b1518f68c8908f.svg
│ │ ├── cbfb1b2a33b28eab8a3e59464768e810.svg
│ │ ├── d28140eda2d12e24b434e011b930fa23.svg
│ │ ├── d8cf0d84a4e9973bace4607b359224f4.svg
│ │ ├── deceeaf6940a8c7a5a02373728002b0f.svg
│ │ └── fcda2be66b20dba76606c4f982b63b60.svg
├── linear_regression
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ ├── elastic_net.py
│ │ ├── lasso_regression.py
│ │ ├── linear_regression_explained.ipynb
│ │ ├── multivariate_linear_regression.py
│ │ ├── normal_equation.py
│ │ ├── polynomial_regression.py
│ │ ├── ridge_regression.py
│ │ └── simple_linear_regression.py
│ ├── doc
│ │ ├── linear_regression_example.png
│ │ └── regularization.png
│ └── tex
│ │ ├── 0822727d1cb885ac043eb8c23c6a8c06.svg
│ │ ├── 18813fabfad59d1ba84fc901ede9101f.svg
│ │ ├── 2d3d16f648bb613710e8ed0a19f2fe17.svg
│ │ ├── 4b4518f1b7f0fb1347fa21506ebafb19.svg
│ │ ├── 4bf055a6a961b27706b75bc7e08a0f29.svg
│ │ ├── 660ef60b693132606dcc3aae53b147ca.svg
│ │ ├── 695de53e837a94510d8695f780f764d1.svg
│ │ ├── 87a75da6a417d9d9fd57f0b9b24473d2.svg
│ │ ├── ac342f337b60a671151324a7a222d777.svg
│ │ ├── c116dfb62bb6eadf90bac11393f97a66.svg
│ │ ├── c2a29561d89e139b3c7bffe51570c3ce.svg
│ │ ├── cbfb1b2a33b28eab8a3e59464768e810.svg
│ │ ├── deceeaf6940a8c7a5a02373728002b0f.svg
│ │ ├── e37355cc0b5b07561247c00842519c04.svg
│ │ ├── eedb3ae6d88cd2296e4c9acfe5658b09.svg
│ │ ├── ef27eeeeeadc48f3a48118fbf65ff125.svg
│ │ └── f28aee7ec74570ba081a608f7b5d88bb.svg
├── logistic_regression
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ ├── custom_implementation_vs_sklearn.ipynb
│ │ ├── data
│ │ │ └── heart.csv
│ │ ├── logistic_regression.py
│ │ └── one_vs_all_logistic_regression.py
│ ├── doc
│ │ ├── classification_vs_regression.jpeg
│ │ ├── convex_vs_non_convex.png
│ │ ├── logistic_regression_decision_boundary.png
│ │ ├── loss_functions.png
│ │ ├── one_vs_all.png
│ │ ├── overfitting_vs_underfitting.png
│ │ └── sigmoid.png
│ └── tex
│ │ ├── 068f41ab65ac2dc66989bc4b34ac6269.svg
│ │ ├── 07eebf05477a153a80ab3a1706b61874.svg
│ │ ├── 1426b496f93f9ac9c247d2c6b9feb304.svg
│ │ ├── 4f5fc085fbff8d9f0739164d34742fe9.svg
│ │ ├── 5bf04b3414ef400138e14332d52bd2a2.svg
│ │ ├── 5fb3811ee1edea3bc2a48d40d4db41aa.svg
│ │ ├── 608251b1bb31fecdd0617348db9b9a4c.svg
│ │ ├── 782c35b03d81084d082d0684a07ff03d.svg
│ │ ├── 8892ac1f6b1e6ffec35850296b02ec60.svg
│ │ ├── 949323d1374941432d95b3af55636269.svg
│ │ ├── 95413250c774b015e5c7ae8f011b158c.svg
│ │ ├── b348a8293e0acf9556a5a0a7e5fe9441.svg
│ │ ├── d866685970a1f6603602b10a76c5bf0e.svg
│ │ ├── f65d6c31a76358fd63c7ad13b74c5b2c.svg
│ │ ├── f6e98405de2edfa03d384e436eb4a6e6.svg
│ │ └── fd8be73b54f5436a5cd2e73ba9b6bfa9.svg
├── mean_shift
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ └── mean_shift.py
│ └── doc
│ │ ├── choose_bandwidth.png
│ │ ├── cluster_comparison.png
│ │ ├── kde_plot.png
│ │ ├── mean_shift.gif
│ │ ├── noisy_circles.png
│ │ ├── noisy_moons.png
│ │ └── two_lines.png
├── principal_component_analysis
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ └── principal_component_analysis.py
│ ├── doc
│ │ └── pca_example.png
│ └── tex
│ │ ├── 0aa7f58b7e561001f5301aa03507f552.svg
│ │ ├── 63bb9849783d01d91403bc9a5fea12a2.svg
│ │ ├── 84c95f91a742c9ceb460a83f9b5090bf.svg
│ │ └── a9ba65368f9892beab04bf21d7e17b4f.svg
└── random_forest
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ ├── eli5_feature_importance_example.py
│ ├── random_forest_classifier.py
│ ├── random_forest_regressor.py
│ ├── scikit-learn
│ │ ├── feature_importance_example.py
│ │ └── out_of_bag_error_example.py
│ └── shap_feature_importance_example.py
│ ├── doc
│ ├── bootstrapping_vertical.png
│ ├── decision_tree.png
│ ├── feature_importance.png
│ ├── out_of_bag_set.png
│ ├── random_forest_pipeline_horizontal_vertical.png
│ └── selecting_a_random_subset_of_variables_vertical.png
│ └── tex
│ └── 9fc20fb1d3825674c6a279cb0d5ca636.svg
├── CONTRIBUTING.md
├── Ensemble_Methods
└── code
│ ├── averaging.py
│ ├── bagging.py
│ ├── blending.py
│ ├── majority_vote.py
│ ├── stacking.py
│ ├── stacking_retrained.py
│ └── weighted_average.py
├── LICENSE
├── Metrics
├── README.md
├── README.tex.md
├── code
│ ├── accuracy_score.py
│ ├── binary_cross_entropy.py
│ ├── brier_score.py
│ ├── categorical_cross_entropy.py
│ ├── cosine_distance.py
│ ├── d2_score.py
│ ├── f1_score.py
│ ├── fbeta_score.py
│ ├── hinge.py
│ ├── huber.py
│ ├── kl_divergence.py
│ ├── logcosh.py
│ ├── mean_absolute_error.py
│ ├── mean_absolute_percentage_error.py
│ ├── mean_squared_error.py
│ ├── mean_squared_log_error.py
│ ├── median_absolute_error.py
│ ├── poisson.py
│ ├── precision.py
│ ├── r2_score.py
│ ├── recall.py
│ └── tweedie_deviance.py
├── doc
│ ├── binary_cross_entropy.png
│ └── confusion_matrix.png
└── tex
│ ├── 0a5c2da8007e2edc6de9ca962be3f3ed.svg
│ ├── 0df67ef21a0ddee56433ca033cb933c1.svg
│ ├── 15a86bf084c2654dfd8c0ab4ddda5bb3.svg
│ ├── 1ff5c2fb18f358c5a53d9f38bb1538b8.svg
│ ├── 202a192d4715ffd00cf289c10c107b43.svg
│ ├── 282f38ecf82d8d7b9d2813044262d5f3.svg
│ ├── 36b5afebdba34564d884d347484ac0c7.svg
│ ├── 44bc9d542a92714cac84e01cbbb7fd61.svg
│ ├── 4fe48dde86ac2d37419f0b35d57ac460.svg
│ ├── 53446b529aaec55cc9c04abff12141f8.svg
│ ├── 5cd6e6c44dcdc5d9134e7ff6c5b812fc.svg
│ ├── 5ce5d6877b4b1485ff9b0a48a56e5f97.svg
│ ├── 61e1a35fbe056f586e6a9dbc645eabb7.svg
│ ├── 735371fbbd0b21c453edc23b25d47a60.svg
│ ├── 77a3b857d53fb44e33b53e4c8b68351a.svg
│ ├── 793b0453fad52e1901e19f8c4489cace.svg
│ ├── 8217ed3c32a785f0b5aad4055f432ad8.svg
│ ├── 86bbcafb36f7dfddde972e1b47296b4c.svg
│ ├── 894224f3dc1a64562c781eff86cad001.svg
│ ├── 8a1f6bce1cca2d7cb34ee00ca6d18614.svg
│ ├── 8c8cdc49efc1e1ac95c5baf72e69b4e8.svg
│ ├── 8cdee07f9c86dc6c56f28b9f8fb8ae6d.svg
│ ├── 928194bd8bb89cb48374d0ab69a41c69.svg
│ ├── 9883db76caed72638544fbc209d7e157.svg
│ ├── a1b798ffc158c4ee0b440f4114c4f1c0.svg
│ ├── a92f489b7bf58458ad9a831191712560.svg
│ ├── bfcf5229cb3b2eb7b6472152c5538e88.svg
│ ├── c0f72f6ec2f0d5623ef75e15d1a9f197.svg
│ ├── c821543a0ee6e81d1a637188ab98345e.svg
│ ├── cdfab9c39216e8f199b80bc0590823ca.svg
│ ├── ce9e403e07bb796a5a4aea8e9aea8727.svg
│ ├── cf0c74f647a60274739e82cc935d32e4.svg
│ ├── cf644cbd499c18ed6f22cee5950c0d75.svg
│ ├── d5d6a7178f9ca2be9eab3bf855709944.svg
│ ├── d821640e564d2c34dbc9ee887fb60ca1.svg
│ ├── d8bc4fe1fed0596068b06f14dc5b6186.svg
│ ├── db850e0baa86d7832b5c75d7c4488d78.svg
│ ├── deceeaf6940a8c7a5a02373728002b0f.svg
│ ├── e1a2df39f105072461870caf8fa0e344.svg
│ ├── e4f967b6c1927904b60f77385e187da6.svg
│ ├── e9999393d8d1b46365ba09586571c55d.svg
│ ├── ee9dc84d168b211ff9f4b354e295af3c.svg
│ ├── f1128d54a4a5ff0cc3a487dc3f920c62.svg
│ └── ffa6eb731ed4996ab83caa1c630b1b9f.svg
├── Optimizers
├── adadelta
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ └── adadelta.py
│ ├── doc
│ │ └── adadelta_example.png
│ └── tex
│ │ ├── 11c596de17c342edeed29f489aa4b274.svg
│ │ ├── 15431539b7b73e500cc0fd3d7e0af147.svg
│ │ ├── 16423efbb7c672354f022590a8f79ed2.svg
│ │ ├── 20aafbd370a6b88bfacab3c7c49d8aa8.svg
│ │ ├── 2de22b33302abdf5e16b99d95e6bf125.svg
│ │ ├── 3ca6889677ea09e526a816322160498f.svg
│ │ ├── 407764bb35619057e9230a563546d02a.svg
│ │ ├── 4764741b6721dc727ba86e4c3ea5d106.svg
│ │ ├── 4f4f4e395762a3af4575de74c019ebb5.svg
│ │ ├── 8618e3e0464e1c4ae3ba41984874fa33.svg
│ │ ├── 8fdfd1eb52433d071078828592da25cc.svg
│ │ ├── 9d55fd72b8efdeca23093c2ed0ea5745.svg
│ │ ├── a9c79740e927ca20df42f2ac49811782.svg
│ │ ├── ae4fb5973f393577570881fc24fc2054.svg
│ │ ├── ae81fc362ffffea791fb10239e17378e.svg
│ │ ├── b999b985f5ccf08b3fce39e97a1c63b8.svg
│ │ └── e7d2063bdcfb3dfdb3f44724950543d1.svg
├── adagrad
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ └── adagrad.py
│ ├── doc
│ │ └── adagrad_example.gif
│ └── tex
│ │ ├── 43af08929a34d369038ea5f29d4b9cad.svg
│ │ ├── 45913b7ee3a34648c53cb1db66c97d75.svg
│ │ ├── 4f4f4e395762a3af4575de74c019ebb5.svg
│ │ ├── 7ccca27b5ccc533a2dd72dc6fa28ed84.svg
│ │ ├── ad3e2cec2e4e99bcb40a19ecda561e56.svg
│ │ ├── b1cc9c4f3f1d62306a8d45977e8f2946.svg
│ │ ├── c745b9b57c145ec5577b82542b2df546.svg
│ │ ├── db14eb9fda4448bde6e9d57897df8aae.svg
│ │ └── f166369f3ef0a7ff052f1e9bbf57d2e2.svg
├── adam
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ └── adam.py
│ ├── doc
│ │ └── adam_example.PNG
│ └── tex
│ │ ├── 15ef3b23ef739e47090fa0825bf9d390.svg
│ │ ├── 1c22e0ed21fd53f1f1d04d22d5d21677.svg
│ │ ├── 285dbe2a851d6e35501b39511115cd05.svg
│ │ ├── 2cae3bbfffb6ab2858054ba28bfcba80.svg
│ │ ├── 2feec3f6a85bfa367ca19d5e6d7002e6.svg
│ │ ├── 3e3c6ee78813607a4d976d92c19dd36e.svg
│ │ ├── 4ea6f1054f33b2fe4ccc258e940fdce1.svg
│ │ ├── 824123b152beebd863c67856d33ed802.svg
│ │ ├── a53a375441275f24641fc239deb138cb.svg
│ │ ├── b65d13242f56b3410177b1401dd8b7e8.svg
│ │ ├── ddb44cc6d9b5fa907d7e2d60daed1bca.svg
│ │ ├── f24bd5b399fcd2f1620d8978d4c3d069.svg
│ │ └── f4bee786ed43433221a48b27a5ed87ec.svg
├── adamax
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ └── adamax.py
│ ├── doc
│ │ └── adamax_example.PNG
│ └── tex
│ │ ├── 2ec6e630f199f589a2402fdf3e0289d5.svg
│ │ ├── 336fefe2418749fabf50594e52f7b776.svg
│ │ ├── 34ec2fa234397799e854fa7109da32c2.svg
│ │ ├── 3e3c6ee78813607a4d976d92c19dd36e.svg
│ │ ├── 485b078316d575b8a3edd55921040580.svg
│ │ ├── 5f5bf3f4ba1dd968b4cf5449b4310370.svg
│ │ ├── 6859140733d250349cb7e3623130b8d7.svg
│ │ ├── 839a0dc412c4f8670dd1064e0d6d412f.svg
│ │ ├── c88595da993fcae459ef526daedd66d7.svg
│ │ ├── c8a984d1a187544cc1d3132786b791b3.svg
│ │ ├── ca185a0f63add2baa6fe729fd1cfef60.svg
│ │ └── e6897b8647f3bd38144535d3f40078e2.svg
├── adamw
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ └── adamw.py
│ └── doc
│ │ └── adamw.png
├── amsgrad
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ └── amsgrad.py
│ ├── doc
│ │ └── amsgrad_example.png
│ └── tex
│ │ ├── 44e392b0bc182e02eec7fbcb32745a0a.svg
│ │ ├── 824123b152beebd863c67856d33ed802.svg
│ │ └── d3f0f052c885b9de14f9b3438d1ba9f0.svg
├── gradient_descent
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ ├── gradient_descent_with_momentum.py
│ │ └── gradient_descent_with_nesterov_momentum.py
│ ├── doc
│ │ ├── gradient_descent.gif
│ │ ├── momentum.png
│ │ ├── nesterov_accelerated_gradient.png
│ │ ├── pick_learning_rate.png
│ │ └── variations_comparison.png
│ └── tex
│ │ ├── 11c596de17c342edeed29f489aa4b274.svg
│ │ ├── 19f7986adf26d94218ca0cb10277f8e4.svg
│ │ ├── 1c5aa1876430bbdf7dcd8f9e641ac830.svg
│ │ ├── 1f52020ae24caeeaeeb316d2525450a2.svg
│ │ ├── 27e556cf3caa0673ac49a8f0de3c73ca.svg
│ │ ├── 386e10624041d64770c6785c1034b111.svg
│ │ ├── 55a049b8f161ae7cfeb0197d75aff967.svg
│ │ ├── 62b65f92d15f5423073762ffe8477b86.svg
│ │ ├── 666d1825fe38f52f9b0a01c2721dc4c8.svg
│ │ ├── 708d9d53037c10f462707daa2370b7df.svg
│ │ ├── 9691e94c3d0d9932f20e8f32a7908dd0.svg
│ │ ├── ad769e751231d17313953f80471b27a4.svg
│ │ ├── b9a39f2717502925e401654007e07bfd.svg
│ │ ├── bec0f956437138a98cb909f5dae6b77f.svg
│ │ ├── c745b9b57c145ec5577b82542b2df546.svg
│ │ ├── ca79e4e55e2ba419b202c4c9576a0d0e.svg
│ │ ├── d905c0dba806bdd8413af4aefb15d0be.svg
│ │ ├── e1977b3bd8b60ca5e8e3c3b921470696.svg
│ │ ├── e45b3f899e65ddee5e073ecf63f17efb.svg
│ │ └── f6ba11db1e6b10797a9ebcc12aeda2dc.svg
├── nadam
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ └── nadam.py
│ ├── doc
│ │ └── nadam_example.png
│ └── tex
│ │ ├── b860e63de84df769d7d9d6ce9295ba65.svg
│ │ └── ddb44cc6d9b5fa907d7e2d60daed1bca.svg
├── qhadam
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ └── qhadam.py
│ ├── doc
│ │ └── qhadam_example.png
│ └── tex
│ │ ├── 41922e474070adc90e7c1379c28d22fe.svg
│ │ ├── 53292819177dbb29ba6d92fe3aa2880c.svg
│ │ └── bcf57c8141818aa66812cefcf9d1a886.svg
├── qhm
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ └── qhm.py
│ ├── doc
│ │ └── qhm_update_rule.PNG
│ └── tex
│ │ ├── 4d5efe3f0b61407442322e122c778e4b.svg
│ │ ├── 693bbf447e9497167127d798d1d144cc.svg
│ │ ├── 8217ed3c32a785f0b5aad4055f432ad8.svg
│ │ ├── ba749f44b6808f949e9a35e0236f98c8.svg
│ │ ├── e59e9e7497e95821f127a65a4f975e55.svg
│ │ └── f9acdf2e58c905cd2502b16cd0f720c9.svg
├── radam
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ │ └── radam.py
│ └── doc
│ │ ├── radam_adam_comparison.png
│ │ └── radam_update_rule.png
└── rmsprop
│ ├── README.md
│ ├── README.tex.md
│ ├── code
│ └── rmsprop.py
│ ├── doc
│ └── rmsprop_example.PNG
│ └── tex
│ ├── 1d0496971a2775f4887d1df25cea4f7e.svg
│ ├── 4d0c5cb8a4df5487f9457948069c0c86.svg
│ ├── f22bcfcdd9fd04ced0345fc97d620463.svg
│ └── fffcaba90180362da033429f55895e5a.svg
└── README.md
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: TannerGilbert
4 | patreon: gilberttanner
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: # Replace with a single Ko-fi username
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
13 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.ipynb_checkpoints
2 | *__pycache__
3 | *.vscode
4 | *.idea
--------------------------------------------------------------------------------
/Activation_Functions/code/elu.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class ELU:
5 |     def __init__(self, alpha: float = 1.0) -> None:
6 |         self.alpha = alpha
7 |
8 |     def __call__(self, x: np.ndarray) -> np.ndarray:
9 |         return np.where(x >= 0.0, x, self.alpha * (np.exp(x) - 1.0))
10 |
11 |     def gradient(self, x: np.ndarray) -> np.ndarray:
12 |         return np.where(x >= 0.0, 1.0, self.alpha * np.exp(x))
13 |
--------------------------------------------------------------------------------
/Activation_Functions/code/gelu.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.special import erf
3 |
4 |
5 | class GELU:
6 |     def __call__(self, x: np.ndarray, approximate: bool = True) -> np.ndarray:
7 |         if approximate:
8 |             return x * 0.5 * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
9 |         return x * 0.5 * (1.0 + erf(x / np.sqrt(2.0)))
10 |
11 |     def gradient(self, x: np.ndarray, approximate: bool = True) -> np.ndarray:
12 |         if approximate:
13 |             return 0.5 * np.tanh(0.0356774 * np.power(x, 3) + 0.797885 * x) + (0.0535161 * np.power(x, 3) + 0.398942 * x) * np.power(1 / np.cosh(0.0356774 * np.power(x, 3) + 0.797885 * x), 2) + 0.5
14 |         return 0.5 * (1.0 + erf(x / np.sqrt(2.0))) + x * 1 / np.sqrt(2 * np.pi) * np.exp(-0.5 * np.power(x, 2))
15 |
--------------------------------------------------------------------------------
/Activation_Functions/code/leaky_relu.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class LeakyReLU:
5 |     def __init__(self, alpha: float = 0.3) -> None:
6 |         self.alpha = alpha
7 |
8 |     def __call__(self, x: np.ndarray) -> np.ndarray:
9 |         return np.where(x >= 0.0, x, self.alpha * x)
10 |
11 |     def gradient(self, x: np.ndarray) -> np.ndarray:
12 |         return np.where(x >= 0.0, 1.0, self.alpha)
13 |
--------------------------------------------------------------------------------
/Activation_Functions/code/mish.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class Mish:
5 |     def __call__(self, x: np.ndarray) -> np.ndarray:
6 |         return x * np.tanh(np.log(1 + np.exp(x)))
7 |
8 |     def gradient(self, x: np.ndarray) -> np.ndarray:
9 |         return (np.exp(x) * (4*np.exp(2*x) + np.exp(3*x) + 4*(1+x) + np.exp(x)*(6+4*x))) / np.power(2 + 2*np.exp(x) + np.exp(2*x), 2)
10 |
--------------------------------------------------------------------------------
/Activation_Functions/code/relu.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 | import numpy as np
3 |
4 |
5 | class ReLU:
6 |     def __call__(self, x: Union[list, np.ndarray]) -> np.ndarray:
7 |         return np.maximum(x, 0.0)
8 |
9 |     def gradient(self, x: np.ndarray) -> np.ndarray:
10 |         return np.where(x >= 0.0, 1.0, 0.0)
11 |
--------------------------------------------------------------------------------
/Activation_Functions/code/selu.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class SELU:
5 |     def __init__(self):
6 |         self.alpha = 1.6732632423543772848170429916717
7 |         self.scale = 1.0507009873554804934193349852946
8 |
9 |     def __call__(self, x: np.ndarray) -> np.ndarray:
10 |         return self.scale * np.where(x >= 0.0, x, self.alpha * (np.exp(x) - 1.0))
11 |
12 |     def gradient(self, x: np.ndarray) -> np.ndarray:
13 |         return self.scale * np.where(x >= 0.0, 1.0, self.alpha * np.exp(x))
14 |
--------------------------------------------------------------------------------
/Activation_Functions/code/sigmoid.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class Sigmoid:
5 |     def __call__(self, x: np.ndarray) -> np.ndarray:
6 |         return 1 / (1 + np.exp(-x))
7 |
8 |     def gradient(self, x: np.ndarray) -> np.ndarray:
9 |         return self.__call__(x) * (1 - self.__call__(x))
10 |
--------------------------------------------------------------------------------
/Activation_Functions/code/silu.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class SiLU:
5 |     def __call__(self, x: np.ndarray) -> np.ndarray:
6 |         return x / (1 + np.exp(-x))
7 |
8 |     def gradient(self, x: np.ndarray) -> np.ndarray:
9 |         return (1 + np.exp(-x) + x * np.exp(-x)) / np.power(1 + np.exp(-x), 2)
10 |
--------------------------------------------------------------------------------
/Activation_Functions/code/softmax.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 | import numpy as np
3 |
4 |
5 | class Softmax:
6 |     def __call__(self, x: Union[list, np.ndarray]) -> np.ndarray:
7 |         e_x = np.exp(x - np.max(x))
8 |         return e_x / e_x.sum(axis=0)
9 |
10 |     def gradient(self, x: Union[list, np.ndarray]) -> np.ndarray:
11 |         p = self.__call__(x)
12 |         return p * (1 - p)
13 |
--------------------------------------------------------------------------------
/Activation_Functions/code/softplus.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class SoftPlus:
5 |     def __call__(self, x: np.ndarray) -> np.ndarray:
6 |         return np.log(1 + np.exp(x))
7 |
8 |     def gradient(self, x: np.ndarray) -> np.ndarray:
9 |         return 1 / (1 + np.exp(-x))
10 |
--------------------------------------------------------------------------------
/Activation_Functions/code/tanh.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class TanH:
5 |     def __call__(self, x: np.ndarray) -> np.ndarray:
6 |         return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))
7 |
8 |     def gradient(self, x: np.ndarray) -> np.ndarray:
9 |         return 1 - np.power(self.__call__(x), 2)
--------------------------------------------------------------------------------
/Activation_Functions/doc/Activation_Functions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Activation_Functions.png
--------------------------------------------------------------------------------
/Activation_Functions/doc/Exponential_Linear_Unit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Exponential_Linear_Unit.png
--------------------------------------------------------------------------------
/Activation_Functions/doc/Gaussian_Error_Linear_Unit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Gaussian_Error_Linear_Unit.png
--------------------------------------------------------------------------------
/Activation_Functions/doc/Leaky_ReLU.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Leaky_ReLU.png
--------------------------------------------------------------------------------
/Activation_Functions/doc/Mish_Function.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Mish_Function.png
--------------------------------------------------------------------------------
/Activation_Functions/doc/Parameteric_ReLU.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Parameteric_ReLU.png
--------------------------------------------------------------------------------
/Activation_Functions/doc/Rectified_Linear_Unit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Rectified_Linear_Unit.png
--------------------------------------------------------------------------------
/Activation_Functions/doc/Scaled_Exponential_Linear_Unit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Scaled_Exponential_Linear_Unit.png
--------------------------------------------------------------------------------
/Activation_Functions/doc/Sigmoid_Function.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Sigmoid_Function.png
--------------------------------------------------------------------------------
/Activation_Functions/doc/Sigmoid_Weighted_Linear_Unit_Swish.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Sigmoid_Weighted_Linear_Unit_Swish.png
--------------------------------------------------------------------------------
/Activation_Functions/doc/SoftPlus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/SoftPlus.png
--------------------------------------------------------------------------------
/Activation_Functions/doc/Tanh_Function.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Tanh_Function.png
--------------------------------------------------------------------------------
/Algorithms/adaboost/README.tex.md:
--------------------------------------------------------------------------------
1 | # AdaBoost - Adaptive Boosting
2 |
3 | 
4 |
5 | AdaBoost, short for **Ada**ptive [**Boost**ing](https://en.wikipedia.org/wiki/Boosting_(meta-algorithm)), of Freund and Schapire, was the first practical boosting algorithm and remains one of the most widely used and studied ones even today. Boosting is a general strategy for learning "strong models" by combining multiple simpler ones (weak models or weak learners).
6 |
7 | A "weak learner" is a model that will do at least slightly better than chance. AdaBoost can be applied to any classification algorithm, but most often, it's used with **Decision Stumps** - Decision Trees with only one node and two leaves.
8 |
9 | 
10 |
11 | Decision Stumps alone are not an excellent way to make predictions. A full-grown decision tree combines the decisions from all features to predict the target value. A stump, on the other hand, can only use one feature to make predictions.
12 |
13 | ## How does the AdaBoost algorithm work?
14 |
15 | 1. Initialize sample weights uniformly as $w_i^1=\frac{1}{N}$.
16 | 2. For each iteration $t$:
17 |
18 | **Step 1:** A weak learner (e.g. a decision stump) is trained on top of the weighted training data $X$. The weight of each sample $w_i$ indicates how important it is to classify the sample correctly.
19 |
20 | **Step 2:** After training, the weak learner gets a weight based on its accuracy $\alpha_t = \frac{1}{2} \ln \Big( \frac{1-\epsilon_t}{\epsilon_t} \Big)$
21 |
22 | 
23 |
24 | **Step 3:** The sample weights are updated so that misclassified samples receive more weight and correctly classified samples receive less: $w_i^{(t+1)} = w_i^{(t)} \cdot e^{-\alpha_t y_i h_t(x_i)}$
25 |
26 | **Step 4:** Renormalize weights so they sum up to 1 $\sum_{i=1}^n w_i^{(t+1)}=1$
27 |
28 | 3. Make predictions using a weighted linear combination of the weak learners, $H(x) = \text{sign} \Big(\sum_{t=1}^T \alpha_t h_t(x) \Big)$ (see the sketch of a single boosting round below).
29 |
30 | 
31 |
32 | ## Code
33 |
34 | - [Adaboost Python](code/adaboost.py)
35 |
36 | ## Resources
37 |
38 | - [https://scikit-learn.org/stable/modules/ensemble.html#adaboost](https://scikit-learn.org/stable/modules/ensemble.html#adaboost)
39 | - [https://www.youtube.com/watch?v=LsK-xG1cLYA](https://www.youtube.com/watch?v=LsK-xG1cLYA)
40 | - [https://blog.paperspace.com/adaboost-optimizer/](https://blog.paperspace.com/adaboost-optimizer/)
41 | - [https://en.wikipedia.org/wiki/AdaBoost](https://en.wikipedia.org/wiki/AdaBoost)
42 | - [https://geoffruddock.com/adaboost-from-scratch-in-python/](https://geoffruddock.com/adaboost-from-scratch-in-python/)
43 | - [https://www.cs.toronto.edu/~mbrubake/teaching/C11/Handouts/AdaBoost.pdf](https://www.cs.toronto.edu/~mbrubake/teaching/C11/Handouts/AdaBoost.pdf)
44 | - [https://jeremykun.com/2015/05/18/boosting-census/](https://jeremykun.com/2015/05/18/boosting-census/)
45 | - [https://ml-explained.com/blog/decision-tree-explained](https://ml-explained.com/blog/decision-tree-explained)
--------------------------------------------------------------------------------
/Algorithms/adaboost/code/adaboost.py:
--------------------------------------------------------------------------------
1 | # based on https://geoffruddock.com/adaboost-from-scratch-in-python/
2 |
3 | from __future__ import annotations
4 | from typing import Union
5 | import numpy as np
6 | from sklearn.tree import DecisionTreeClassifier
7 |
8 |
9 | class AdaBoost:
10 | """AdaBoost
11 | Parameters:
12 | -----------
13 | n_estimators: int
14 | Number of weak learners
15 | """
16 | def __init__(self, n_estimators: int) -> None:
17 | self.n_estimators = n_estimators
18 | self.stumps = np.zeros(shape=n_estimators, dtype=object)
19 | self.stump_weights = np.zeros(shape=n_estimators)
20 | self.sample_weights = None
21 |
22 | def fit(self, X: np.ndarray, y: np.ndarray) -> AdaBoost:
23 | n = X.shape[0]
24 | self.sample_weights = np.zeros(shape=(self.n_estimators, n))
25 |
26 | # Initialize weights
27 | self.sample_weights[0] = np.ones(shape=n) / n
28 |
29 | for i in range(self.n_estimators):
30 | # fit weak learner
31 | curr_sample_weights = self.sample_weights[i]
32 | stump = DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2)
33 | stump.fit(X, y, sample_weight=curr_sample_weights)
34 |
35 | # calculate error and stump weight
36 | pred = stump.predict(X)
37 | err = curr_sample_weights[(pred != y)].sum()
38 | stump_weight = np.log((1 - err) / err) / 2
39 |
40 | # update sample weights
41 | new_sample_weights = (
42 | curr_sample_weights * np.exp(-stump_weight * y * pred)
43 | )
44 |
45 | # normalize sample weights
46 | new_sample_weights /= new_sample_weights.sum()
47 |
48 | if i+1 < self.n_estimators:
49 | self.sample_weights[i+1] = new_sample_weights
50 |
51 | self.stumps[i] = stump
52 | self.stump_weights[i] = stump_weight
53 |
54 | return self
55 |
56 | def predict(self, X: Union[list, np.ndarray]) -> np.ndarray:
57 | stump_preds = np.array([stump.predict(X) for stump in self.stumps])
58 | return np.sign(np.dot(self.stump_weights, stump_preds))
59 |
--------------------------------------------------------------------------------
/Algorithms/adaboost/doc/adaboost.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/adaboost/doc/adaboost.png
--------------------------------------------------------------------------------
/Algorithms/adaboost/doc/adaboost_training.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/adaboost/doc/adaboost_training.gif
--------------------------------------------------------------------------------
/Algorithms/adaboost/doc/alpha.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/adaboost/doc/alpha.png
--------------------------------------------------------------------------------
/Algorithms/adaboost/doc/decision_stump.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/adaboost/doc/decision_stump.PNG
--------------------------------------------------------------------------------
/Algorithms/adaboost/tex/4f4f4e395762a3af4575de74c019ebb5.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/Algorithms/dbscan/code/dbscan.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 | import numpy as np
3 |
4 |
5 | class DBSCAN:
6 | """DBSCAN
7 | Parameters:
8 | -----------
9 | eps: float = 0.3
10 | The maximum distance between two samples for one to be considered as in the neighborhood of the other.
11 | min_points: int = 5
12 | The number of samples (or total weight) in a neighborhood for a point to be considered as a core point.
13 | """
14 | def __init__(self, eps: float = 0.3, min_points: int = 5) -> None:
15 | self.eps = eps
16 | self.min_points = min_points
17 | self.labels = []
18 | self.c = 1 # number of clusters
19 |
20 | def fit_predict(self, data: Union[list, np.ndarray]) -> list:
21 | self.labels = [0] * len(data)
22 | for i in range(len(data)):
23 | if not (self.labels[i] == 0):
24 | continue
25 |
26 | neighbours = self.find_neighbours(data, i)
27 |
28 | # If the number of points is below min_points the point is a outlier
29 | if len(neighbours) < self.min_points:
30 | self.labels[i] = -1
31 | else:
32 | self.grow_cluster(data, i, neighbours)
33 | self.c += 1
34 | return self.labels
35 |
36 | def find_neighbours(self, data: Union[list, np.ndarray], index: int) -> list:
37 | neighbors = []
38 |
39 | for p in range(len(data)):
40 | if np.linalg.norm(data[index]-data[p]) < self.eps and index != p:
41 | neighbors.append(p)
42 | return neighbors
43 |
44 | def grow_cluster(self, data: Union[list, np.ndarray], index: int, neighbours: list) -> None:
45 | # Assign seed point to cluster
46 | self.labels[index] = self.c
47 |
48 | i = 0
49 | while i < len(neighbours):
50 | p = neighbours[i]
51 | if self.labels[p] == -1:
52 | self.labels[p] = self.c
53 | elif self.labels[p] == 0:
54 | self.labels[p] = self.c
55 | neighbours_new = self.find_neighbours(data, p)
56 | # check neighbours length
57 | if len(neighbours_new) >= self.min_points:
58 | neighbours = neighbours + neighbours_new
59 | i += 1
60 |
61 |
62 | if __name__ == '__main__':
63 | import matplotlib.pyplot as plt
64 | from sklearn.datasets import make_blobs
65 | from sklearn.preprocessing import MinMaxScaler
66 |
67 | X, y = make_blobs(n_samples=30, centers=3, n_features=2)
68 | X = MinMaxScaler(feature_range=(0, 1)).fit_transform(X)
69 | model = DBSCAN()
70 | predictions = model.fit_predict(X)
71 | colors = ['r', 'g', 'b', 'c', 'k', 'y']
72 | for classification, x in zip(predictions, X):
73 | color = colors[classification]
74 | plt.scatter(x[0], x[1], color=color, s=150, linewidths=5, zorder=10)
75 | plt.show()
76 |
--------------------------------------------------------------------------------
/Algorithms/dbscan/doc/dbscan.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/dbscan/doc/dbscan.gif
--------------------------------------------------------------------------------
/Algorithms/dbscan/doc/results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/dbscan/doc/results.png
--------------------------------------------------------------------------------
/Algorithms/decision_tree/code/visualize_decision_trees_with_graphviz.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import load_iris
2 | from sklearn import tree
3 | import graphviz
4 |
5 |
6 | iris = load_iris()
7 | X = iris.data
8 | y = iris.target
9 |
10 | clf = tree.DecisionTreeClassifier()
11 | clf = clf.fit(X, y)
12 |
13 | dot_data = tree.export_graphviz(clf, out_file=None,
14 |                                 feature_names=iris.feature_names,
15 |                                 class_names=iris.target_names,
16 |                                 filled=True, rounded=True,
17 |                                 special_characters=True)
18 |
19 | graph = graphviz.Source(dot_data, format="png")
20 | graph.render("decision_tree")
21 |
--------------------------------------------------------------------------------
/Algorithms/decision_tree/doc/iris_decision_surface.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/decision_tree/doc/iris_decision_surface.png
--------------------------------------------------------------------------------
/Algorithms/decision_tree/doc/iris_decision_tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/decision_tree/doc/iris_decision_tree.png
--------------------------------------------------------------------------------
/Algorithms/decision_tree/doc/plot_tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/decision_tree/doc/plot_tree.png
--------------------------------------------------------------------------------
/Algorithms/decision_tree/doc/titanic_example.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/decision_tree/doc/titanic_example.jpg
--------------------------------------------------------------------------------
/Algorithms/gradient_boosting/code/gradient_boosting_regressor.py:
--------------------------------------------------------------------------------
1 | # based on https://github.com/eriklindernoren/ML-From-Scratch/blob/master/mlfromscratch/supervised_learning/gradient_boosting.py
2 |
3 | from __future__ import annotations
4 | from typing import Union
5 | import numpy as np
6 | from sklearn.tree import DecisionTreeRegressor
7 |
8 |
9 | def square_error_gradient(y: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
10 |     return -(y - y_pred)
11 |
12 |
13 | class GradientBoostingRegressor:
14 |     """Gradient Boosting Regressor
15 |     Parameters:
16 |     -----------
17 |     n_estimators: int
18 |         The number of regression trees that are used.
19 |     learning_rate: float
20 |         The step length that will be taken when following the negative gradient.
21 |     min_samples_split: int
22 |         The minimum number of samples required to split an internal node.
23 |     max_depth: int
24 |         The maximum depth of the individual regression estimators.
25 |     """
26 |     def __init__(self, n_estimators: int = 100, learning_rate: float = 0.1, min_samples_split: int = 2,
27 |                  max_depth: int = 3) -> None:
28 |         self.n_estimators = n_estimators
29 |         self.learning_rate = learning_rate
30 |         self.min_samples_split = min_samples_split
31 |         self.max_depth = max_depth
32 |
33 |         # Initialize trees
34 |         self.initial_prediction = None
35 |         self.trees = []
36 |         for _ in range(n_estimators):
37 |             tree = DecisionTreeRegressor(min_samples_split=self.min_samples_split,
38 |                                          max_depth=self.max_depth)
39 |             self.trees.append(tree)
40 |
41 |     def fit(self, X: Union[list, np.ndarray], y: np.ndarray) -> GradientBoostingRegressor:
42 |         self.initial_prediction = np.mean(y, axis=0)
43 |         y_pred = np.full(np.shape(y), np.mean(y, axis=0))  # initial prediction
44 |         for i in range(self.n_estimators):
45 |             gradient = square_error_gradient(y, y_pred)
46 |             self.trees[i].fit(X, gradient)
47 |             update = self.trees[i].predict(X)
48 |             # Update y predictions
49 |             y_pred -= np.multiply(self.learning_rate, update)
50 |
51 |         return self
52 |
53 |     def predict(self, X: Union[list, np.ndarray]) -> np.ndarray:
54 |         y_pred = np.array([])
55 |         # Make predictions
56 |         for tree in self.trees:
57 |             update = tree.predict(X)
58 |             update = np.multiply(self.learning_rate, update)
59 |             y_pred = self.initial_prediction - update if not y_pred.any() else y_pred - update
60 |         return y_pred
61 |
62 |
63 | if __name__ == '__main__':
64 |     from sklearn import datasets
65 |     # Load the diabetes dataset
66 |     X, y = datasets.load_diabetes(return_X_y=True)
67 |     model = GradientBoostingRegressor(max_depth=8)
68 |     model.fit(X, y)
69 |     print(model.predict(X[:5]))
70 |     print(y[:5])
71 |
--------------------------------------------------------------------------------
/Algorithms/k_nearest_neighbors/code/k_nearest_neighbors.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | from typing import Union
3 | import numpy as np
4 | from collections import Counter
5 |
6 |
7 | class KNearestNeighbors:
8 | """K Nearest Neighbors classifier.
9 | Parameters:
10 | -----------
11 | k: int
12 | The number of closest neighbors
13 | """
14 | def __init__(self, k: int) -> None:
15 | self.X = None
16 | self.y = None
17 | self.k = k
18 |
19 | def fit(self, X: Union[list, np.ndarray], y: Union[list, np.ndarray]) -> KNearestNeighbors:
20 | self.X = X
21 | self.y = y
22 | return self
23 |
24 | def euclidean_distance(self, X_test: Union[list, np.ndarray]) -> list:
25 | return [np.linalg.norm(X - X_test) for X in self.X]
26 |
27 | def k_nearest(self, X: Union[list, np.ndarray]) -> np.ndarray:
28 | idx = np.argpartition(X, self.k)
29 | return np.take(self.y, idx[:self.k])
30 |
31 | def predict(self, X: Union[list, np.ndarray]) -> np.ndarray:
32 | distances_list = [self.euclidean_distance(x) for x in X]
33 | return np.array([Counter(self.k_nearest(distances)).most_common()[0][0] for distances in distances_list])
34 |
35 |
36 | if __name__ == '__main__':
37 | import pandas as pd
38 | from sklearn.model_selection import train_test_split
39 | from sklearn.preprocessing import LabelEncoder
40 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
41 | names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label'])
42 | X, y = (np.array(df.drop('label', axis=1)),
43 | LabelEncoder().fit_transform(np.array(df['label'])))
44 | X_train, X_test, y_train, y_test = train_test_split(
45 | X, y, test_size=0.2, random_state=42)
46 | model = KNearestNeighbors(4)
47 | model.fit(X_train, y_train)
48 | predictions = model.predict(X_test)
49 | print('Accuracy:', (predictions == y_test).sum()/len(predictions)*100)
50 |
--------------------------------------------------------------------------------
/Algorithms/k_nearest_neighbors/code/k_nearest_neighbors_regression.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | from typing import Union
3 | import numpy as np
4 |
5 |
6 | class KNearestNeighbors:
7 | """K Nearest Neighbors regressor.
8 | Parameters:
9 | -----------
10 | k: int
11 | The number of closest neighbors
12 | """
13 | def __init__(self, k: int) -> None:
14 | self.X = None
15 | self.y = None
16 | self.k = k
17 |
18 | def fit(self, X: Union[list, np.ndarray], y: Union[list, np.ndarray]) -> KNearestNeighbors:
19 | self.X = X
20 | self.y = y
21 | return self
22 |
23 | def euclidean_distance(self, X_test: Union[list, np.ndarray]) -> list:
24 | return [np.linalg.norm(X - X_test) for X in self.X]
25 |
26 | def k_nearest(self, X: Union[list, np.ndarray]) -> np.ndarray:
27 | idx = np.argpartition(X, self.k)
28 | return np.take(self.y, idx[:self.k])
29 |
30 | def predict(self, X: Union[list, np.ndarray]) -> np.ndarray:
31 | distances_list = [self.euclidean_distance(x) for x in X]
32 | return np.array([np.mean(self.k_nearest(distances)) for distances in distances_list])
33 |
34 |
35 | if __name__ == '__main__':
36 | import pandas as pd
37 | iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
38 | names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label'])
39 | iris = iris.sample(frac=1).reset_index(drop=True)
40 | X = np.array(iris.drop(['petal_width', 'label'], axis=1))
41 | y = np.array(iris['petal_width'])
42 | model = KNearestNeighbors(3)
43 | model.fit(X, y)
44 | print(model.predict(X[:5]))
45 | print(y[:5])
46 |
--------------------------------------------------------------------------------
/Algorithms/k_nearest_neighbors/doc/effect_of_k.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/k_nearest_neighbors/doc/effect_of_k.png
--------------------------------------------------------------------------------
/Algorithms/k_nearest_neighbors/doc/effect_of_k_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/k_nearest_neighbors/doc/effect_of_k_2.png
--------------------------------------------------------------------------------
/Algorithms/kernel_pca/README.md:
--------------------------------------------------------------------------------
1 | # Kernel PCA
2 |
3 | 
4 |
5 | Kernel PCA is an extension of [PCA](https://ml-explained.com/blog/principal-component-analysis-explained) that allows for the separability of nonlinear data by making use of kernels. The basic idea behind it is to project the linearly inseparable data onto a higher dimensional space where it becomes linearly separable.
6 |
7 | Kernel PCA can be summarized as a 4 step process [1]:
8 |
9 | 1. Construct the kernel matrix $K$ from the training dataset
10 |
11 | $$K_{i,j} = \kappa(\mathbf{x_i, x_j})$$
12 |
13 | 2. If the projected dataset $\left\{\phi (\mathbf{x}_i) \right\}$ doesn't have zero mean, use the Gram matrix $\stackrel{\sim}{K}$ to substitute for the kernel matrix $K$.
14 |
15 | $$\stackrel{\sim}{K} = K - \mathbf{1_N} K - K \mathbf{1_N} + \mathbf{1_N} K \mathbf{1_N}$$
16 |
17 | 3. Use $K a_k = \lambda_k N a_k$ to solve for the vector $a_k$.
18 |
19 | 4. Compute the kernel principal components $y_k\left(x\right)$
20 |
21 | $$y_k(\mathbf{x})= \phi \left(\mathbf{x}\right)^T \mathbf{v}_k = \sum_{i=1}^N a_{ki} \kappa(\mathbf{x_i}, \mathbf{x})$$
22 |
23 | [1] Kernel Principal Component Analysis and its Applications in Face Recognition and Active Shape Models
24 |
25 | ## Resources
26 |
27 | - [Kernel Principal Component Analysis and its Applications in Face Recognition and Active Shape Models](https://arxiv.org/pdf/1207.3538.pdf)
28 | - [Kernel tricks and nonlinear dimensionality reduction via RBF kernel PCA](https://sebastianraschka.com/Articles/2014_kernel_pca.html)
29 | - [PCA and kernel PCA explained](https://nirpyresearch.com/pca-kernel-pca-explained/)
30 | - [What are the advantages of kernel PCA over standard PCA?](https://stats.stackexchange.com/questions/94463/what-are-the-advantages-of-kernel-pca-over-standard-pca)
--------------------------------------------------------------------------------
/Algorithms/kernel_pca/README.tex.md:
--------------------------------------------------------------------------------
1 | # Kernel PCA
2 |
3 | 
4 |
5 | Kernel PCA is an extension of [PCA](https://ml-explained.com/blog/principal-component-analysis-explained) that allows for the separability of nonlinear data by making use of kernels. The basic idea behind it is to project the linearly inseparable data onto a higher dimensional space where it becomes linearly separable.
6 |
7 | Kernel PCA can be summarized as a 4-step process [1] (a short NumPy sketch of the steps follows the list):
8 |
9 | 1. Construct the kernel matrix $K$ from the training dataset
10 |
11 | $$K_{i,j} = \kappa(\mathbf{x_i, x_j})$$
12 |
13 | 2. If the projected dataset $\left\{\phi (\mathbf{x}_i) \right\}$ doesn't have zero mean, use the Gram matrix $\stackrel{\sim}{K}$ to substitute for the kernel matrix $K$.
14 |
15 | $$\stackrel{\sim}{K} = K - \mathbf{1_N} K - K \mathbf{1_N} + \mathbf{1_N} K \mathbf{1_N}$$
16 |
17 | 3. Use $K a_k = \lambda_k N a_k$ to solve for the vector $a_k$.
18 |
19 | 4. Compute the kernel principal components $y_k\left(x\right)$
20 |
21 | $$y_k(\mathbf{x})= \phi \left(\mathbf{x}\right)^T \mathbf{v}_k = \sum_{i=1}^N a_{ki} \kappa(\mathbf{x_i}, \mathbf{x})$$
22 |
23 | [1] Kernel Principal Component Analysis and its Applications in Face Recognition and Active Shape Models
24 |
25 | ## Resources
26 |
27 | - [Kernel Principal Component Analysis and its Applications in Face Recognition and Active Shape Models](https://arxiv.org/pdf/1207.3538.pdf)
28 | - [Kernel tricks and nonlinear dimensionality reduction via RBF kernel PCA](https://sebastianraschka.com/Articles/2014_kernel_pca.html)
29 | - [PCA and kernel PCA explained](https://nirpyresearch.com/pca-kernel-pca-explained/)
30 | - [What are the advantages of kernel PCA over standard PCA?](https://stats.stackexchange.com/questions/94463/what-are-the-advantages-of-kernel-pca-over-standard-pca)
--------------------------------------------------------------------------------
/Algorithms/kernel_pca/code/kernel_pca.py:
--------------------------------------------------------------------------------
1 | # based on https://sebastianraschka.com/Articles/2014_kernel_pca.html
2 |
3 | from __future__ import annotations
4 | from typing import Union
5 | import numpy as np
6 | from scipy.spatial.distance import pdist, squareform
7 | from scipy.linalg import eigh
8 |
9 |
10 | class KernelPCA:
11 | """KernelPCA
12 | Parameters:
13 | -----------
14 | n_components: int = 2
15 | Number of components to keep.
16 | gamma: float = None
17 | Kernel coefficient
18 | """
19 | def __init__(self, n_components: int = 2, gamma: float = None):
20 | self.n_components = n_components
21 | self.gamma = gamma
22 | self.alphas = None
23 | self.lambdas = None
24 | self.X = None
25 |
26 | def fit(self, X: Union[list, np.ndarray]) -> KernelPCA:
27 | if self.gamma == None:
28 | self.gamma = 1 / X.shape[1]
29 |
30 | sq_dists = pdist(X, 'sqeuclidean')
31 |
32 | mat_sq_dists = squareform(sq_dists)
33 |
34 | K = np.exp(-self.gamma * mat_sq_dists)
35 |
36 | N = K.shape[0]
37 | one_n = np.ones((N,N)) / N
38 | K_norm = K - one_n.dot(K) - K.dot(one_n) + one_n.dot(K).dot(one_n)
39 |
40 | eigenvalues, eigenvectors = eigh(K_norm)
41 |
42 | alphas = np.column_stack((eigenvectors[:,-i] for i in range(1, self.n_components+1)))
43 | lambdas = [eigenvalues[-i] for i in range(1, self.n_components+1)]
44 |
45 | self.alphas = alphas
46 | self.lambdas = lambdas
47 | self.X = X
48 |
49 | return self
50 |
51 | def fit_transform(self, X: Union[list, np.ndarray]) -> np.ndarray:
52 | self.fit(X)
53 | return self.alphas * np.sqrt(self.lambdas)
54 |
55 | def transform(self, X: Union[list, np.ndarray]) -> np.ndarray:
56 | # TODO: Rewrite as this is very inefficient
57 | def transform_row(X_r):
58 | pair_dist = np.array([np.sum((X_r-row)**2) for row in self.X])
59 | k = np.exp(-self.gamma * pair_dist)
60 | return k.dot(self.alphas / self.lambdas)
61 |
62 | return np.array(list(map(transform_row, X)))
63 |
64 |
65 |
66 |
67 | if __name__ == '__main__':
68 | from sklearn.datasets import make_circles
69 | import matplotlib.pyplot as plt
70 |
71 | X, y = make_circles(n_samples=1000, random_state=123, noise=0.1, factor=0.2)
72 |
73 | plt.figure(figsize=(8,6))
74 |
75 | pca = KernelPCA(n_components=3)
76 | pca.fit(X)
77 | X = pca.transform(X)
78 |
79 | print(X)
80 | plt.plot(X[0], X[1])
81 | plt.show()
--------------------------------------------------------------------------------
/Algorithms/kernel_pca/doc/kernel_pca.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/kernel_pca/doc/kernel_pca.png
--------------------------------------------------------------------------------
/Algorithms/kmeans/doc/choose_k_value.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/kmeans/doc/choose_k_value.jpeg
--------------------------------------------------------------------------------
/Algorithms/kmeans/doc/elbow_method_using_yellowbrick.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/kmeans/doc/elbow_method_using_yellowbrick.png
--------------------------------------------------------------------------------
/Algorithms/kmeans/doc/k_means.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/kmeans/doc/k_means.gif
--------------------------------------------------------------------------------
/Algorithms/kmeans/doc/noisy_circles_with_true_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/kmeans/doc/noisy_circles_with_true_output.png
--------------------------------------------------------------------------------
/Algorithms/kmeans/doc/noisy_moons_with_true_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/kmeans/doc/noisy_moons_with_true_output.png
--------------------------------------------------------------------------------
/Algorithms/kmeans/doc/silhouette_analysis_3_clusters.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/kmeans/doc/silhouette_analysis_3_clusters.jpeg
--------------------------------------------------------------------------------
/Algorithms/kmeans/doc/silhouette_analysis_4_clusters.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/kmeans/doc/silhouette_analysis_4_clusters.jpeg
--------------------------------------------------------------------------------
/Algorithms/kmeans/doc/silhouette_analysis_5_clusters.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/kmeans/doc/silhouette_analysis_5_clusters.jpeg
--------------------------------------------------------------------------------
/Algorithms/kmeans/doc/two_lines.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/kmeans/doc/two_lines.png
--------------------------------------------------------------------------------
/Algorithms/kmeans/tex/44bc9d542a92714cac84e01cbbb7fd61.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Algorithms/kmeans/tex/4bdc8d9bcfb35e1c9bfb51fc69687dfc.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Algorithms/kmeans/tex/77a3b857d53fb44e33b53e4c8b68351a.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Algorithms/linear_discriminant_analysis/doc/lda_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/linear_discriminant_analysis/doc/lda_example.png
--------------------------------------------------------------------------------
/Algorithms/linear_regression/code/elastic_net.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | from typing import Tuple
3 | import numpy as np
4 |
5 |
6 | class ElasticNet:
7 | """ElasticNet
8 | Parameters:
9 | -----------
10 | learning_rate: float
11 | The step length used when following the negative gradient during training.
12 | alpha: float, default=1.0
13 | Regularization strength; l1_ratio (default 0.5) controls the mix between the L1 and L2 penalties.
14 | """
15 | def __init__(self, learning_rate: float, alpha: float = 1.0, l1_ratio: float = 0.5) -> None:
16 | self.learning_rate = learning_rate
17 | self.alpha = alpha
18 | self.l1_ratio = l1_ratio
19 | self.w = ""
20 |
21 | def cost_function(self, x: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, float]:
22 | dif = np.dot(x, self.w) - y
23 | cost = (np.sum(dif**2) + self.alpha * (self.l1_ratio * np.sum(np.absolute(self.w)) +
24 | (1 - self.l1_ratio) * np.sum(np.square(self.w)))) / (2*np.shape(x)[0])
25 |
26 | return dif, cost
27 |
28 | def fit(self, x: np.ndarray, y: np.ndarray, num_iterations: int = 10000) -> ElasticNet:
29 | if self.w == "":
30 | _, num_features = np.shape(x)
31 | self.w = np.random.uniform(-1, 1, num_features)
32 | for _ in range(num_iterations):
33 | dif, cost = self.cost_function(x, y)
34 | gradient = (np.dot(x.transpose(), dif) + self.alpha * (self.l1_ratio * np.sign(self.w) / 2 + (1 - self.l1_ratio) * self.w)) / np.shape(x)[0]
35 | self.w = self.w - self.learning_rate * gradient
36 | return self
37 |
38 | def predict(self, x: np.ndarray) -> np.ndarray:
39 | return np.dot(x, self.w)
40 |
41 |
42 | # Testing functionality
43 | if __name__ == '__main__':
44 | import pandas as pd
45 | from sklearn.preprocessing import LabelEncoder
46 | from sklearn.model_selection import train_test_split
47 | iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
48 | names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label'])
49 | le = LabelEncoder()
50 | iris['label'] = le.fit_transform(iris['label'])
51 | X = np.array(iris.drop(['petal_width'], axis=1))
52 | y = np.array(iris['petal_width'])
53 |
54 | X_train, X_test, y_train, y_test = train_test_split(
55 | X, y, test_size=0.2, random_state=42)
56 |
57 | model = ElasticNet(0.0001)
58 | model.fit(X_train, y_train, 10000)
59 | predictions = model.predict(X_test)
60 | mse = ((y_test - predictions)**2).mean(axis=0)
61 | print('Loss:', mse)
62 |
--------------------------------------------------------------------------------
/Algorithms/linear_regression/code/lasso_regression.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | from typing import Tuple
3 | import numpy as np
4 |
5 |
6 | class LassoRegression:
7 | """Lasso Regression
8 | Parameters:
9 | -----------
10 | learning_rate: float
11 | The step length used when following the negative gradient during training.
12 | C: float, default=1
13 | Regularization strength
14 | """
15 | def __init__(self, learning_rate: float, C: float = 1) -> None:
16 | self.learning_rate = learning_rate
17 | self.C = C
18 | self.w = ""
19 |
20 | def cost_function(self, x: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, float]:
21 | dif = np.dot(x, self.w) - y
22 | cost = (np.sum(dif**2) + self.C * np.sum(np.absolute(self.w))) / (2*np.shape(x)[0])
23 |
24 | return dif, cost
25 |
26 | def fit(self, x: np.ndarray, y: np.ndarray, num_iterations: int = 10000) -> LassoRegression:
27 | if self.w == "":
28 | _, num_features = np.shape(x)
29 | self.w = np.random.uniform(-1, 1, num_features)
30 | for _ in range(num_iterations):
31 | dif, cost = self.cost_function(x, y)
32 | gradient = (np.dot(x.transpose(), dif) + self.C * np.sign(self.w) / 2) / np.shape(x)[0]
33 | self.w = self.w - self.learning_rate * gradient
34 | return self
35 |
36 | def predict(self, x: np.ndarray) -> np.ndarray:
37 | return np.dot(x, self.w)
38 |
39 |
40 | # Testing functionality
41 | if __name__ == '__main__':
42 | import pandas as pd
43 | from sklearn.preprocessing import LabelEncoder
44 | from sklearn.model_selection import train_test_split
45 | iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
46 | names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label'])
47 | le = LabelEncoder()
48 | iris['label'] = le.fit_transform(iris['label'])
49 | X = np.array(iris.drop(['petal_width'], axis=1))
50 | y = np.array(iris['petal_width'])
51 |
52 | X_train, X_test, y_train, y_test = train_test_split(
53 | X, y, test_size=0.2, random_state=42)
54 |
55 | model = LassoRegression(0.0001)
56 | model.fit(X_train, y_train, 10000)
57 | predictions = model.predict(X_test)
58 | mse = ((y_test - predictions)**2).mean(axis=0)
59 | print('Loss:', mse)
60 |
--------------------------------------------------------------------------------
/Algorithms/linear_regression/code/multivariate_linear_regression.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | from typing import Tuple
3 | import numpy as np
4 |
5 |
6 | class MultivariateLinearRegression:
7 | """Multivariate Linear Regression
8 | Parameters:
9 | -----------
10 | learning_rate: float
11 | The step length used when following the negative gradient during training.
12 | """
13 | def __init__(self, learning_rate: float) -> None:
14 | self.learning_rate = learning_rate
15 | self.w = ""
16 |
17 | def cost_function(self, x: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, float]:
18 | dif = np.dot(x, self.w) - y
19 | cost = np.sum(dif**2) / (2*np.shape(x)[0])
20 |
21 | return dif, cost
22 |
23 | def fit(self, x: np.ndarray, y: np.ndarray, num_iterations: int = 10000) -> MultivariateLinearRegression:
24 | if self.w == "":
25 | _, num_features = np.shape(x)
26 | self.w = np.random.uniform(-1, 1, num_features)
27 | for i in range(num_iterations):
28 | dif, cost = self.cost_function(x, y)
29 | gradient = np.dot(x.transpose(), dif) / np.shape(x)[0]
30 | self.w = self.w - self.learning_rate * gradient
31 | return self
32 |
33 | def predict(self, x: np.ndarray) -> np.ndarray:
34 | return np.dot(x, self.w)
35 |
36 |
37 | # Testing functionality
38 | if __name__ == '__main__':
39 | import pandas as pd
40 | from sklearn.preprocessing import LabelEncoder
41 | from sklearn.model_selection import train_test_split
42 | iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
43 | names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label'])
44 | le = LabelEncoder()
45 | iris['label'] = le.fit_transform(iris['label'])
46 | X = np.array(iris.drop(['petal_width'], axis=1))
47 | y = np.array(iris['petal_width'])
48 |
49 | X_train, X_test, y_train, y_test = train_test_split(
50 | X, y, test_size=0.2, random_state=42)
51 |
52 | model = MultivariateLinearRegression(0.0001)
53 | model.fit(X_train, y_train, 10000)
54 | predictions = model.predict(X_test)
55 | mse = ((y_test - predictions)**2).mean(axis=0)
56 | print('Loss:', mse)
57 |
--------------------------------------------------------------------------------
/Algorithms/linear_regression/code/normal_equation.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | import numpy as np
3 |
4 |
5 | class NormalEquation:
6 |
7 | def __init__(self):
8 | self.w = None
9 |
10 |
11 | def fit(self, x: np.ndarray, y: np.ndarray) -> NormalEquation:
12 | x = np.append(np.ones([len(x), 1]), x, 1)
13 | z = np.linalg.inv(np.dot(x.transpose(), x))
14 | self.w = np.dot(np.dot(z, x.transpose()), y)
15 | return self
16 |
17 | def predict(self, x: np.ndarray):
18 | if self.w is None:
19 | raise Exception('Call .fit before using predict method')
20 |
21 | x = np.append(np.ones([len(x), 1]), x, 1)
22 | return np.dot(x, self.w)
23 |
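# Testing functionality - illustrative sketch (not part of the original file);
# the synthetic data below is made up purely for demonstration.
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    X = rng.uniform(0, 10, (100, 2))
    y = 3 + 2 * X[:, 0] - 0.5 * X[:, 1] + rng.normal(0, 0.1, 100)

    model = NormalEquation().fit(X, y)
    print('Weights (intercept first):', model.w)
    print('Predictions:', model.predict(X[:5]))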
--------------------------------------------------------------------------------
/Algorithms/linear_regression/code/ridge_regression.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | from typing import Tuple
3 | import numpy as np
4 |
5 |
6 | class RidgeRegression:
7 | """Ridge Regression
8 | Parameters:
9 | -----------
10 | learning_rate: float
11 | The step length used when following the negative gradient during training.
12 | C: float, default=1
13 | Regularization strength
14 | """
15 | def __init__(self, learning_rate: float, C: float = 1) -> None:
16 | self.learning_rate = learning_rate
17 | self.C = C
18 | self.w = ""
19 |
20 | def cost_function(self, x: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, float]:
21 | dif = np.dot(x, self.w) - y
22 | cost = (np.sum(dif**2) + self.C * np.sum(np.square(self.w))) / (2*np.shape(x)[0])
23 |
24 | return dif, cost
25 |
26 | def fit(self, x: np.ndarray, y: np.ndarray, num_iterations: int = 10000) -> RidgeRegression:
27 | if self.w == "":
28 | _, num_features = np.shape(x)
29 | self.w = np.random.uniform(-1, 1, num_features)
30 | for _ in range(num_iterations):
31 | dif, cost = self.cost_function(x, y)
32 | gradient = (np.dot(x.transpose(), dif) + self.C * self.w) / np.shape(x)[0]
33 | self.w = self.w - self.learning_rate * gradient
34 | return self
35 |
36 | def predict(self, x: np.ndarray) -> np.ndarray:
37 | return np.dot(x, self.w)
38 |
39 |
40 | # Testing functionality
41 | if __name__ == '__main__':
42 | import pandas as pd
43 | from sklearn.preprocessing import LabelEncoder
44 | from sklearn.model_selection import train_test_split
45 | iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
46 | names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label'])
47 | le = LabelEncoder()
48 | iris['label'] = le.fit_transform(iris['label'])
49 | X = np.array(iris.drop(['petal_width'], axis=1))
50 | y = np.array(iris['petal_width'])
51 |
52 | X_train, X_test, y_train, y_test = train_test_split(
53 | X, y, test_size=0.2, random_state=42)
54 |
55 | model = RidgeRegression(0.0001)
56 | model.fit(X_train, y_train, 10000)
57 | predictions = model.predict(X_test)
58 | mse = ((y_test - predictions)**2).mean(axis=0)
59 | print('Loss:', mse)
60 |
--------------------------------------------------------------------------------
/Algorithms/linear_regression/code/simple_linear_regression.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | import numpy as np
3 |
4 |
5 | class SimpleLinearRegression:
6 | """Simple Linear Regression
7 | Parameters:
8 | -----------
9 | learning_rate: float
10 | The step length used when following the negative gradient during training.
11 | """
12 | def __init__(self, learning_rate: float) -> None:
13 | self.m = 0
14 | self.b = 0
15 | self.learning_rate = learning_rate
16 |
17 | def cost_function(self, x: np.ndarray, y: np.ndarray) -> float:
18 | total_error = 0
19 | for i in range(0, len(x)):
20 | total_error += (y[i]-(self.m*x[i]+self.b))**2
21 | return total_error/float(len(x))
22 |
23 | def fit(self, x: np.ndarray, y: np.ndarray, num_iterations: int) -> SimpleLinearRegression:
24 | N = float(len(x))
25 | for j in range(num_iterations):
26 | b_gradient = 0
27 | m_gradient = 0
28 | for i in range(0, len(x)):
29 | b_gradient += -(2/N) * (y[i] - ((self.m * x[i]) + self.b))
30 | m_gradient += -(2/N) * x[i] * \
31 | (y[i] - ((self.m * x[i]) + self.b))
32 | self.b -= (self.learning_rate * b_gradient)
33 | self.m -= (self.learning_rate * m_gradient)
34 | return self
35 |
36 | def predict(self, xs: np.ndarray) -> list:
37 | return [(self.m * x + self.b) for x in xs]
38 |
39 |
40 | # Testing functionality
41 | if __name__ == '__main__':
42 | x = np.linspace(0, 100, 50)
43 | delta = np.random.uniform(-10, 10, x.size)
44 | y = 0.5 * x + 3 + delta
45 |
46 | model = SimpleLinearRegression(0.0001)
47 | model.fit(x, y, 100)
48 | print('Error:', model.cost_function(x, y))
49 |
--------------------------------------------------------------------------------
/Algorithms/linear_regression/doc/linear_regression_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/linear_regression/doc/linear_regression_example.png
--------------------------------------------------------------------------------
/Algorithms/linear_regression/doc/regularization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/linear_regression/doc/regularization.png
--------------------------------------------------------------------------------
/Algorithms/logistic_regression/code/logistic_regression.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | import numpy as np
3 |
4 |
5 | def sigmoid(x: np.ndarray) -> np.ndarray:
6 | return 1/(1+np.exp(-x))
7 |
8 |
9 | class LogisticRegression:
10 | """Logistic Regression
11 | Parameters:
12 | -----------
13 | learning_rate: float
14 | The step length used when following the negative gradient during training.
15 | num_features: int
16 | The number of features in the data
17 | penalty: str, default='l2'
18 | The type of penalty used.
19 | C: float, default=0.1
20 | Regularization strength
21 | """
22 | def __init__(self, learning_rate: float, num_features: int, penalty: str = 'l2', C: float = 0.1) -> None:
23 | self.learning_rate = learning_rate
24 | self.penalty = penalty
25 | self.C = C
26 | self.b = 0
27 | self.w = np.zeros((1, num_features))
28 | assert penalty in ['l2', 'l1', None]
29 |
30 | def cost_function(self, y: np.ndarray, y_pred: np.ndarray) -> float:
31 | y_T = y.T
32 | if self.penalty == 'l1':
33 | return (-1/y.shape[0]) * (np.sum((y_T*np.log(y_pred)) + ((1-y_T) * np.log(1-y_pred))) + self.C * np.sum(np.absolute(self.w)))
34 | elif self.penalty == 'l2':
35 | return (-1/y.shape[0]) * (np.sum((y_T*np.log(y_pred)) + ((1-y_T) * np.log(1-y_pred))) + self.C * np.sum(np.square(self.w)))
36 | else:
37 | return (-1/y.shape[0]) * (np.sum((y_T*np.log(y_pred)) + ((1-y_T) * np.log(1-y_pred))))
38 |
39 | def fit(self, X: np.ndarray, y: np.ndarray, num_iterations) -> LogisticRegression:
40 | for i in range(num_iterations):
41 | pred = sigmoid(np.dot(self.w, X.T) + self.b)
42 | cost = self.cost_function(y, pred)
43 |
44 | # Calculate Gradients/Derivatives
45 | dw = (1 / X.shape[0]) * (np.dot(X.T, (pred - y.T).T))
46 | db = (1 / X.shape[0]) * (np.sum(pred - y.T))
47 |
48 | self.w = self.w - (self.learning_rate * dw.T)
49 | self.b = self.b - (self.learning_rate * db)
50 | return self
51 |
52 | def predict(self, X: np.ndarray) -> list:
53 | predictions = sigmoid(np.dot(self.w, X.T) + self.b)[0]
54 | return [1 if pred >= 0.5 else 0 for pred in predictions]
55 |
56 | def predict_proba(self, X: np.ndarray) -> np.ndarray:
57 | return sigmoid(np.dot(self.w, X.T) + self.b)[0]
58 |
59 |
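# Testing functionality - illustrative sketch (not part of the original file);
# the dataset and hyperparameters below are arbitrary examples.
if __name__ == '__main__':
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=500, n_features=5, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LogisticRegression(learning_rate=0.1, num_features=X.shape[1])
    model.fit(X_train, y_train, 1000)
    predictions = model.predict(X_test)
    print('Accuracy:', np.mean(np.array(predictions) == y_test))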
--------------------------------------------------------------------------------
/Algorithms/logistic_regression/code/one_vs_all_logistic_regression.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | import numpy as np
3 | from logistic_regression import LogisticRegression
4 |
5 |
6 | class LogisticRegressionOneVsAll:
7 | """One vs. All Logistic Regression
8 | Parameters:
9 | -----------
10 | learning_rate: float
11 | The step length used when following the negative gradient during training.
12 | num_features: int
13 | The number of features in the data
14 | num_classes: int
15 | The number of classes in the data-set
16 | """
17 | def __init__(self, learning_rate: float, num_features: int, num_classes: int) -> None:
18 | self.models = [LogisticRegression(learning_rate, num_features) for _ in range(num_classes)]
19 |
20 | def fit(self, X: np.ndarray, y: np.ndarray, num_iterations: int) -> LogisticRegressionOneVsAll:
21 | for i, model in enumerate(self.models):
22 | y_tmp = (y == i).astype(int)
23 | model.fit(X, y_tmp, num_iterations)
24 | return self
25 |
26 | def predict(self, X: np.ndarray) -> np.ndarray:
27 | predictions = np.array([model.predict_proba(X) for model in self.models])
28 | return np.argmax(predictions, axis=0)
29 |
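# Testing functionality - illustrative sketch (not part of the original file);
# the dataset and hyperparameters below are arbitrary examples.
if __name__ == '__main__':
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    iris = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=42)

    model = LogisticRegressionOneVsAll(learning_rate=0.01, num_features=X_train.shape[1], num_classes=3)
    model.fit(X_train, y_train, 1000)
    predictions = model.predict(X_test)
    print('Accuracy:', np.mean(predictions == y_test))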
--------------------------------------------------------------------------------
/Algorithms/logistic_regression/doc/classification_vs_regression.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/logistic_regression/doc/classification_vs_regression.jpeg
--------------------------------------------------------------------------------
/Algorithms/logistic_regression/doc/convex_vs_non_convex.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/logistic_regression/doc/convex_vs_non_convex.png
--------------------------------------------------------------------------------
/Algorithms/logistic_regression/doc/logistic_regression_decision_boundary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/logistic_regression/doc/logistic_regression_decision_boundary.png
--------------------------------------------------------------------------------
/Algorithms/logistic_regression/doc/loss_functions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/logistic_regression/doc/loss_functions.png
--------------------------------------------------------------------------------
/Algorithms/logistic_regression/doc/one_vs_all.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/logistic_regression/doc/one_vs_all.png
--------------------------------------------------------------------------------
/Algorithms/logistic_regression/doc/overfitting_vs_underfitting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/logistic_regression/doc/overfitting_vs_underfitting.png
--------------------------------------------------------------------------------
/Algorithms/logistic_regression/doc/sigmoid.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/logistic_regression/doc/sigmoid.png
--------------------------------------------------------------------------------
/Algorithms/logistic_regression/tex/fd8be73b54f5436a5cd2e73ba9b6bfa9.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Algorithms/mean_shift/doc/choose_bandwidth.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/mean_shift/doc/choose_bandwidth.png
--------------------------------------------------------------------------------
/Algorithms/mean_shift/doc/cluster_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/mean_shift/doc/cluster_comparison.png
--------------------------------------------------------------------------------
/Algorithms/mean_shift/doc/kde_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/mean_shift/doc/kde_plot.png
--------------------------------------------------------------------------------
/Algorithms/mean_shift/doc/mean_shift.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/mean_shift/doc/mean_shift.gif
--------------------------------------------------------------------------------
/Algorithms/mean_shift/doc/noisy_circles.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/mean_shift/doc/noisy_circles.png
--------------------------------------------------------------------------------
/Algorithms/mean_shift/doc/noisy_moons.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/mean_shift/doc/noisy_moons.png
--------------------------------------------------------------------------------
/Algorithms/mean_shift/doc/two_lines.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/mean_shift/doc/two_lines.png
--------------------------------------------------------------------------------
/Algorithms/principal_component_analysis/doc/pca_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/principal_component_analysis/doc/pca_example.png
--------------------------------------------------------------------------------
/Algorithms/random_forest/code/eli5_feature_importance_example.py:
--------------------------------------------------------------------------------
1 | from sklearn.ensemble import RandomForestClassifier
2 | from sklearn import datasets
3 | from sklearn.model_selection import train_test_split
4 | from IPython.display import display
5 |
6 | import eli5
7 | from eli5.sklearn import PermutationImportance
8 |
9 | RANDOM_STATE = 0
10 |
11 | # Get Iris data
12 | iris = datasets.load_iris()
13 | X = iris.data
14 | y = iris.target
15 |
16 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
17 |
18 | # Create and train Random Forest
19 | model = RandomForestClassifier(random_state=RANDOM_STATE)
20 | model.fit(X_train, y_train)
21 |
22 | perm = PermutationImportance(model, random_state=1).fit(X_test, y_test)
23 |
24 | display(eli5.show_weights(perm, feature_names=iris.feature_names))
25 |
26 | eli5_weights = eli5.explain_weights(model, feature_names=iris.feature_names)
27 | print(eli5_weights)
--------------------------------------------------------------------------------
/Algorithms/random_forest/code/random_forest_classifier.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | from typing import Union, Optional
3 | import numpy as np
4 | from sklearn.tree import DecisionTreeClassifier
5 |
6 |
7 | class RandomForest:
8 | """Random Forest Classifier
9 | Parameters:
10 | -----------
11 | n_estimators: int = 10
12 | The number of trees in the forest.
13 | n_features: Optional[Union[str, int]] = 'sqrt'
14 | The number of features to consider when looking for the best split
15 | sample_size: float = 0.8
16 | Amount of data used (0-1)
17 | max_depth: Optional[int] = 10
18 | The maximum depth of the tree.
19 | min_leaf: Union[int, float] = 5
20 | The minimum number of samples required to be at a leaf node.
21 | """
22 | def __init__(self, n_estimators: int = 10, n_features: Optional[Union[str, int]] = 'sqrt', sample_size: float = 0.8,
23 | max_depth: Optional[int] = 10, min_leaf: Union[int, float] = 5) -> None:
24 | self.n_estimators = n_estimators
25 | self.n_features = n_features
26 | self.sample_size = sample_size
27 | self.max_depth = max_depth
28 | self.min_leaf = min_leaf
29 | self.trees = []
30 |
31 | def fit(self, X: Union[list, np.ndarray], y: Union[list, np.ndarray]) -> RandomForest:
32 | for _ in range(self.n_estimators):
33 | idxs = np.random.permutation(len(X))[:int(self.sample_size*len(X))]
34 |
35 | self.trees.append(DecisionTreeClassifier(
36 | max_depth=self.max_depth, min_samples_leaf=self.min_leaf, max_features=self.n_features).fit(X[idxs], y[idxs]))
37 | return self
38 |
39 | def predict(self, X: Union[list, np.ndarray]) -> np.ndarray:
40 | predictions_array = np.column_stack([t.predict(X) for t in self.trees])
41 | return np.array([np.argmax(np.bincount(predictions)) for predictions in predictions_array])
42 |
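# Testing functionality - illustrative sketch (not part of the original file);
# the dataset and hyperparameters below are arbitrary examples.
if __name__ == '__main__':
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    iris = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=42)

    model = RandomForest(n_estimators=20)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print('Accuracy:', np.mean(predictions == y_test))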
--------------------------------------------------------------------------------
/Algorithms/random_forest/code/random_forest_regressor.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | from typing import Union, Optional
3 | import numpy as np
4 | from sklearn.tree import DecisionTreeRegressor
5 |
6 |
7 | class RandomForest:
8 | """Random Forest Regressor
9 | Parameters:
10 | -----------
11 | n_estimators: int = 10
12 | The number of trees in the forest.
13 | n_features: Optional[Union[str, int]] = 'sqrt'
14 | The number of features to consider when looking for the best split
15 | sample_size: float = 0.8
16 | Amount of data used (0-1)
17 | max_depth: Optional[int] = 10
18 | The maximum depth of the tree.
19 | min_leaf: Union[int, float] = 5
20 | The minimum number of samples required to be at a leaf node.
21 | """
22 | def __init__(self, n_estimators: int = 10, n_features: Optional[Union[str, int]] = 'sqrt', sample_size: float = 0.8,
23 | max_depth: Optional[int] = 10, min_leaf: Union[int, float] = 5) -> None:
24 | self.n_estimators = n_estimators
25 | self.n_features = n_features
26 | self.sample_size = sample_size
27 | self.max_depth = max_depth
28 | self.min_leaf = min_leaf
29 | self.trees = []
30 |
31 | def fit(self, X: Union[list, np.ndarray], y: Union[list, np.ndarray]) -> RandomForest:
32 | for _ in range(self.n_estimators):
33 | idxs = np.random.permutation(len(X))[:int(self.sample_size*len(X))]
34 |
35 | self.trees.append(DecisionTreeRegressor(
36 | max_depth=self.max_depth, min_samples_leaf=self.min_leaf, max_features=self.n_features).fit(X[idxs], y[idxs]))
37 | return self
38 |
39 | def predict(self, X: Union[list, np.ndarray]) -> np.ndarray:
40 | return np.mean([t.predict(X) for t in self.trees], axis=0)
41 |
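# Testing functionality - illustrative sketch (not part of the original file);
# the dataset and hyperparameters below are arbitrary examples.
if __name__ == '__main__':
    from sklearn.datasets import make_regression
    from sklearn.model_selection import train_test_split

    X, y = make_regression(n_samples=500, n_features=5, noise=10.0, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForest(n_estimators=20)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print('MSE:', np.mean((y_test - predictions) ** 2))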
--------------------------------------------------------------------------------
/Algorithms/random_forest/code/scikit-learn/feature_importance_example.py:
--------------------------------------------------------------------------------
1 | # based on https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
5 | from sklearn.datasets import make_classification
6 | from sklearn.ensemble import RandomForestClassifier
7 |
8 |
9 | RANDOM_STATE = 0
10 |
11 | # Build a classification task using 3 informative features
12 | X, y = make_classification(n_samples=1000,
13 | n_features=10,
14 | n_informative=3,
15 | n_redundant=0,
16 | n_repeated=0,
17 | n_classes=2,
18 | random_state=RANDOM_STATE,
19 | shuffle=False)
20 |
21 | # Create and train Random Forest
22 | model = RandomForestClassifier(n_estimators=250, random_state=RANDOM_STATE)
23 | model.fit(X, y)
24 |
25 | # Get feature importance
26 | importances = model.feature_importances_
27 | std = np.std([tree.feature_importances_ for tree in model.estimators_],
28 | axis=0)
29 | indices = np.argsort(importances)[::-1]
30 |
31 | # Print the feature ranking
32 | print("Feature ranking:")
33 |
34 | for f in range(X.shape[1]):
35 | print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
36 |
37 | # Plot the impurity-based feature importances of the forest
38 | plt.figure()
39 | plt.title("Feature importances")
40 | plt.bar(range(X.shape[1]), importances[indices],
41 | color="r", yerr=std[indices], align="center")
42 | plt.xticks(range(X.shape[1]), indices)
43 | plt.xlim([-1, X.shape[1]])
44 | plt.show()
45 |
--------------------------------------------------------------------------------
/Algorithms/random_forest/code/scikit-learn/out_of_bag_error_example.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import make_classification
2 | from sklearn.ensemble import RandomForestClassifier
3 |
4 |
5 | RANDOM_STATE = 123
6 |
7 | # Generate a binary classification dataset.
8 | X, y = make_classification(n_samples=500, n_features=25,
9 | n_clusters_per_class=1, n_informative=15,
10 | random_state=RANDOM_STATE)
11 |
12 | model = RandomForestClassifier(oob_score=True, random_state=RANDOM_STATE)
13 |
14 | model.fit(X, y)
15 |
16 | print('Out-of-bag error:', 1 - model.oob_score_)
17 |
--------------------------------------------------------------------------------
/Algorithms/random_forest/code/shap_feature_importance_example.py:
--------------------------------------------------------------------------------
1 | import shap
2 | from sklearn.ensemble import RandomForestClassifier
3 | from sklearn import datasets
4 | from sklearn.model_selection import train_test_split
5 |
6 | RANDOM_STATE = 0
7 |
8 | # Get Iris data
9 | iris = datasets.load_iris()
10 | X = iris.data
11 | y = iris.target
12 |
13 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
14 |
15 | # Create and train Random Forest
16 | model = RandomForestClassifier(random_state=RANDOM_STATE)
17 | model.fit(X_train, y_train)
18 |
19 |
20 | explainer = shap.TreeExplainer(model)
21 | shap_values = explainer.shap_values(X_test)
22 |
23 | shap.summary_plot(shap_values, X_test)
--------------------------------------------------------------------------------
/Algorithms/random_forest/doc/bootstrapping_vertical.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/random_forest/doc/bootstrapping_vertical.png
--------------------------------------------------------------------------------
/Algorithms/random_forest/doc/decision_tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/random_forest/doc/decision_tree.png
--------------------------------------------------------------------------------
/Algorithms/random_forest/doc/feature_importance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/random_forest/doc/feature_importance.png
--------------------------------------------------------------------------------
/Algorithms/random_forest/doc/out_of_bag_set.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/random_forest/doc/out_of_bag_set.png
--------------------------------------------------------------------------------
/Algorithms/random_forest/doc/random_forest_pipeline_horizontal_vertical.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/random_forest/doc/random_forest_pipeline_horizontal_vertical.png
--------------------------------------------------------------------------------
/Algorithms/random_forest/doc/selecting_a_random_subset_of_variables_vertical.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/random_forest/doc/selecting_a_random_subset_of_variables_vertical.png
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | ## Contributing
2 |
3 | Thank you for wanting to contribute to Machine-Learning Explained. Machine-Learning Explained is an open-source repository containing explanations and implementations of machine learning algorithms and concepts, and as such any contributions that add to the current explanations or add new ones are more than welcome.
4 |
5 | ## Setup Machine-Learning-Explained and version control
6 |
7 | 1. Make a fork of this repository on Github. You will need an account with Github. This will allow you to make pull requests (PRs) later on.
8 | 2. Clone your fork.
9 | ```bash
10 | git clone https://github.com/<your-username>/Machine-Learning-Explained.git
11 | cd Machine-Learning-Explained
12 | ```
13 | 3. Make `git` aware of the Machine-Learning-Explained repo.
14 | ```bash
15 | git remote add upstream https://github.com/TannerGilbert/Machine-Learning-Explained.git
16 | git fetch upstream
17 | ```
18 |
19 | ## Changing/Adding source code
20 |
21 | 1. Choose the branch for your changes.
22 | ```bash
23 | git checkout -b <name-of-your-branch>
24 | ```
25 | 2. Write some awesome code! (Make sure only to write code inside the `code` folders)
26 |
27 | ## Changing/Adding documentation
28 |
29 | 1. Choose the branch for your changes.
30 | ```bash
31 | git checkout -b <name-of-your-branch>
32 | ```
33 | 2. Make changes / add new documentation.
34 | > Note: Make sure to only work inside the `README.tex.md` files and not inside the `README.md` files.
35 | 3. Generate `README.md` file from `README.tex.md`
36 |
37 | 1. Install [`readme2tex`](https://github.com/leegao/readme2tex)
38 | ```bash
39 | pip install readme2tex
40 | ```
41 | 2. Convert `README.tex.md` to `README.md`
42 | ```bash
43 | python3 -m readme2tex --output README.md README.tex.md --svgdir tex --nocdn
44 | ```
--------------------------------------------------------------------------------
/Ensemble_Methods/code/averaging.py:
--------------------------------------------------------------------------------
1 | # based on https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
2 | # and https://www.kaggle.com/eikedehling/trying-out-stacking-approaches
3 | from sklearn.base import BaseEstimator, TransformerMixin, clone, RegressorMixin
4 | import numpy as np
5 |
6 |
7 | class AveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
8 | def __init__(self, models):
9 | self.models = models
10 |
11 | def fit(self, X, y):
12 | self.models_ = [clone(x) for x in self.models]
13 |
14 | # Train cloned base models
15 | for model in self.models_:
16 | model.fit(X, y)
17 |
18 | return self
19 |
20 | def predict(self, X):
21 | predictions = np.column_stack([
22 | model.predict(X) for model in self.models_
23 | ])
24 | return np.mean(predictions, axis=1)
25 |
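# Illustrative usage sketch (not part of the original file); the base models and
# synthetic dataset below are arbitrary examples.
if __name__ == '__main__':
    from sklearn.datasets import make_regression
    from sklearn.linear_model import Ridge
    from sklearn.tree import DecisionTreeRegressor

    X, y = make_regression(n_samples=200, n_features=5, noise=5.0, random_state=0)

    model = AveragedModels([Ridge(), DecisionTreeRegressor(max_depth=4)])
    model.fit(X, y)
    print(model.predict(X[:5]))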
--------------------------------------------------------------------------------
/Ensemble_Methods/code/bagging.py:
--------------------------------------------------------------------------------
1 | from sklearn.base import BaseEstimator, TransformerMixin, clone, RegressorMixin
2 | import numpy as np
3 |
4 |
5 | class BaggingModels(BaseEstimator, RegressorMixin, TransformerMixin):
6 | def __init__(self, models, task_type='classification'):
7 | self.models = models
8 | self.task_type = task_type
9 |
10 | def fit(self, X, y):
11 | self.models_ = [clone(x) for x in self.models]
12 |
13 | for model in self.models_:
14 | X_tmp, y_tmp = self.subsample(X, y)
15 | model.fit(X_tmp, y_tmp)
16 |
17 | return self
18 |
19 | # Create a random subsample from the dataset with replacement
20 | @staticmethod
21 | def subsample(X, y, ratio=1.0):
22 | X_new, y_new = list(), list()
23 | n_sample = round(len(X) * ratio)
24 | while len(X_new) < n_sample:
25 | index = np.random.randint(len(X))
26 | X_new.append(X[index])
27 | y_new.append(y[index])
28 | return X_new, y_new
29 |
30 | def predict(self, X):
31 | predictions_array = np.column_stack([
32 | model.predict(X) for model in self.models_
33 | ])
34 | if self.task_type == 'classification':
35 | return np.array([np.argmax(np.bincount(predictions)) for predictions in predictions_array])
36 | else:
37 | return np.mean(predictions_array, axis=1)
38 |
39 | def predict_proba(self, X):
40 | if self.task_type == 'classification':
41 | predictions = []
42 | for x in X:
43 | prediction = np.row_stack([
44 | model.predict_proba([x]) for model in self.models_
45 | ])
46 | predictions.append(np.mean(prediction, axis=0))
47 | return np.array(predictions)
48 | return None
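

# Illustrative usage sketch (not part of the original file); the base models and
# dataset below are arbitrary examples.
if __name__ == '__main__':
    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier

    iris = load_iris()

    model = BaggingModels([DecisionTreeClassifier(max_depth=3) for _ in range(5)])
    model.fit(iris.data, iris.target)
    predictions = model.predict(iris.data)
    print('Training accuracy:', np.mean(predictions == iris.target))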
--------------------------------------------------------------------------------
/Ensemble_Methods/code/blending.py:
--------------------------------------------------------------------------------
1 | from sklearn.base import BaseEstimator, TransformerMixin, clone, RegressorMixin
2 | from sklearn.model_selection import train_test_split
3 | import numpy as np
4 |
5 |
6 | class BlendingModels(BaseEstimator, RegressorMixin, TransformerMixin):
7 | def __init__(self, base_models, meta_model, holdout_pct=0.2, use_features_in_secondary=False):
8 | self.base_models = base_models
9 | self.meta_model = meta_model
10 | self.holdout_pct = holdout_pct
11 | self.use_features_in_secondary = use_features_in_secondary
12 |
13 | def fit(self, X, y):
14 | """Fit all the models on the given dataset"""
15 | self.base_models_ = [clone(x) for x in self.base_models]
16 | self.meta_model_ = clone(self.meta_model)
17 |
18 | X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=self.holdout_pct)
19 |
20 | holdout_predictions = np.zeros((X_holdout.shape[0], len(self.base_models)))
21 | for i, model in enumerate(self.base_models_):
22 | model.fit(X_train, y_train)
23 | y_pred = model.predict(X_holdout)
24 | holdout_predictions[:, i] = y_pred
25 | if self.use_features_in_secondary:
26 | self.meta_model_.fit(np.hstack((X_holdout, holdout_predictions)), y_holdout)
27 | else:
28 | self.meta_model_.fit(holdout_predictions, y_holdout)
29 |
30 | return self
31 |
32 | def predict(self, X):
33 | meta_features = np.column_stack([
34 | model.predict(X) for model in self.base_models_
35 | ])
36 | if self.use_features_in_secondary:
37 | return self.meta_model_.predict(np.hstack((X, meta_features)))
38 | else:
39 | return self.meta_model_.predict(meta_features)
40 |
41 | def predict_proba(self, X):
42 | meta_features = np.column_stack([
43 | model.predict(X) for model in self.base_models_
44 | ])
45 | if self.use_features_in_secondary:
46 | return self.meta_model_.predict_proba(np.hstack((X, meta_features)))
47 | else:
48 | return self.meta_model_.predict_proba(meta_features)
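

# Illustrative usage sketch (not part of the original file); the base models,
# meta model and synthetic dataset below are arbitrary examples.
if __name__ == '__main__':
    from sklearn.datasets import make_regression
    from sklearn.linear_model import LinearRegression, Ridge
    from sklearn.tree import DecisionTreeRegressor

    X, y = make_regression(n_samples=300, n_features=5, noise=5.0, random_state=0)

    model = BlendingModels(base_models=[Ridge(), DecisionTreeRegressor(max_depth=4)],
                           meta_model=LinearRegression())
    model.fit(X, y)
    print(model.predict(X[:5]))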
--------------------------------------------------------------------------------
/Ensemble_Methods/code/majority_vote.py:
--------------------------------------------------------------------------------
1 | from sklearn.base import BaseEstimator, TransformerMixin, clone, ClassifierMixin
2 | import numpy as np
3 |
4 |
5 | class MajorityVote(BaseEstimator, ClassifierMixin, TransformerMixin):
6 | def __init__(self, models):
7 | self.models = models
8 |
9 | def fit(self, X, y):
10 | self.models_ = [clone(x) for x in self.models]
11 |
12 | # Train cloned base models
13 | for model in self.models_:
14 | model.fit(X, y)
15 |
16 | return self
17 |
18 | def predict(self, X):
19 | predictions_array = np.column_stack([
20 | model.predict(X) for model in self.models_
21 | ])
22 | return np.array([np.argmax(np.bincount(predictions)) for predictions in predictions_array])
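

# Illustrative usage sketch (not part of the original file); the base models and
# dataset below are arbitrary examples.
if __name__ == '__main__':
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier

    iris = load_iris()

    model = MajorityVote([LogisticRegression(max_iter=1000), KNeighborsClassifier(), DecisionTreeClassifier()])
    model.fit(iris.data, iris.target)
    predictions = model.predict(iris.data)
    print('Training accuracy:', np.mean(predictions == iris.target))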
--------------------------------------------------------------------------------
/Ensemble_Methods/code/stacking_retrained.py:
--------------------------------------------------------------------------------
1 | # based on https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
2 | # and https://www.kaggle.com/eikedehling/trying-out-stacking-approaches
3 | from sklearn.base import BaseEstimator, TransformerMixin, clone, RegressorMixin
4 | from sklearn.model_selection import KFold
5 | import numpy as np
6 |
7 |
8 | class StackingModelsRetrained(BaseEstimator, RegressorMixin, TransformerMixin):
9 | def __init__(self, base_models, meta_model, n_folds=5, use_features_in_secondary=False):
10 | self.base_models = base_models
11 | self.meta_model = meta_model
12 | self.n_folds = n_folds
13 | self.use_features_in_secondary = use_features_in_secondary
14 |
15 | def fit(self, X, y):
16 | """Fit all the models on the given dataset"""
17 | self.base_models_ = [clone(x) for x in self.base_models]
18 | self.meta_model_ = clone(self.meta_model)
19 | kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)
20 |
21 | # Train cloned base models and create out-of-fold predictions
22 | out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
23 | for i, model in enumerate(self.base_models):
24 | for train_index, holdout_index in kfold.split(X, y):
25 | instance = clone(model)
26 | instance.fit(X[train_index], y[train_index])
27 | y_pred = instance.predict(X[holdout_index])
28 | out_of_fold_predictions[holdout_index, i] = y_pred
29 |
30 | if self.use_features_in_secondary:
31 | self.meta_model_.fit(np.hstack((X, out_of_fold_predictions)), y)
32 | else:
33 | self.meta_model_.fit(out_of_fold_predictions, y)
34 |
35 | for model in self.base_models_:
36 | model.fit(X, y)
37 |
38 | return self
39 |
40 | def predict(self, X):
41 | meta_features = np.column_stack([
42 | base_model.predict(X) for base_model in self.base_models_])
43 | if self.use_features_in_secondary:
44 | return self.meta_model_.predict(np.hstack((X, meta_features)))
45 | else:
46 | return self.meta_model_.predict(meta_features)
47 |
48 | def predict_proba(self, X):
49 | meta_features = np.column_stack([
50 | base_model.predict(X) for base_model in self.base_models_])
51 | if self.use_features_in_secondary:
52 | return self.meta_model_.predict_proba(np.hstack((X, meta_features)))
53 | else:
54 | return self.meta_model_.predict_proba(meta_features)
55 |
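# Illustrative usage sketch (not part of the original file); the base models,
# meta model and synthetic dataset below are arbitrary examples.
if __name__ == '__main__':
    from sklearn.datasets import make_regression
    from sklearn.linear_model import LinearRegression, Ridge
    from sklearn.tree import DecisionTreeRegressor

    X, y = make_regression(n_samples=300, n_features=5, noise=5.0, random_state=0)

    model = StackingModelsRetrained(base_models=[Ridge(), DecisionTreeRegressor(max_depth=4)],
                                    meta_model=LinearRegression())
    model.fit(X, y)
    print(model.predict(X[:5]))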
--------------------------------------------------------------------------------
/Ensemble_Methods/code/weighted_average.py:
--------------------------------------------------------------------------------
1 | # based on https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
2 | # and https://www.kaggle.com/eikedehling/trying-out-stacking-approaches
3 | from sklearn.base import BaseEstimator, TransformerMixin, clone, RegressorMixin
4 | import numpy as np
5 |
6 |
7 | class WeightedAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
8 | def __init__(self, models, weights):
9 | self.models = models
10 | self.weights = weights
11 | assert sum(self.weights) == 1
12 |
13 | def fit(self, X, y):
14 | self.models_ = [clone(x) for x in self.models]
15 |
16 | # Train cloned base models
17 | for model in self.models_:
18 | model.fit(X, y)
19 |
20 | return self
21 |
22 | def predict(self, X):
23 | predictions = np.column_stack([
24 | model.predict(X) for model in self.models_
25 | ])
26 | return np.sum(predictions * self.weights, axis=1)
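

# Illustrative usage sketch (not part of the original file); the base models,
# weights and synthetic dataset below are arbitrary examples (weights must sum to 1).
if __name__ == '__main__':
    from sklearn.datasets import make_regression
    from sklearn.linear_model import Ridge
    from sklearn.tree import DecisionTreeRegressor

    X, y = make_regression(n_samples=200, n_features=5, noise=5.0, random_state=0)

    model = WeightedAveragedModels([Ridge(), DecisionTreeRegressor(max_depth=4)], weights=[0.75, 0.25])
    model.fit(X, y)
    print(model.predict(X[:5]))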
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Gilbert Tanner
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/Metrics/code/accuracy_score.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class Accuracy:
5 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
6 | return self.loss(y, y_pred)
7 |
8 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
9 | return np.sum(y == y_pred) / y.shape[0]
--------------------------------------------------------------------------------
/Metrics/code/binary_cross_entropy.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class BinaryCrossentropy:
5 | def __init__(self):
6 | self.epsilon = 1e-15
7 |
8 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
9 | return self.loss(y, y_pred)
10 |
11 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
12 | # Avoid division by zero
13 | y_pred = np.clip(y_pred, self.epsilon, 1 - self.epsilon)
14 | return - y * np.log(y_pred) - (1 - y) * np.log(1 - y_pred)
15 |
16 | def gradient(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
17 | # Avoid division by zero
18 | y_pred = np.clip(y_pred, self.epsilon, 1 - self.epsilon)
19 | return - (y / y_pred) + (1 - y) / (1 - y_pred)
20 |
--------------------------------------------------------------------------------
/Metrics/code/brier_score.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class BrierScore:
5 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
6 | return self.loss(y, y_pred)
7 |
8 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
9 | return np.sum(np.power(y - y_pred, 2)) / y.shape[0]
10 |
--------------------------------------------------------------------------------
/Metrics/code/categorical_cross_entropy.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class CategoricalCrossentropy:
5 | def __init__(self):
6 | self.epsilon = 1e-15
7 |
8 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
9 | return self.loss(y, y_pred)
10 |
11 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
12 | # Avoid division by zero
13 | y_pred = np.clip(y_pred, self.epsilon, 1 - self.epsilon)
14 | return - np.sum(y * np.log(y_pred)) / y.shape[0]
15 |
--------------------------------------------------------------------------------
/Metrics/code/cosine_distance.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class CosineDistance:
5 |
6 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
7 | return self.loss(y, y_pred)
8 |
9 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
10 | return 1 - np.dot(y, y_pred) / (np.linalg.norm(y) * np.linalg.norm(y_pred))
11 |
--------------------------------------------------------------------------------
/Metrics/code/d2_score.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from tweedie_deviance import TweedieDeviance
3 |
4 |
5 | class D2Score:
6 | def __init__(self, power: int) -> None:
7 | self.tweedie = TweedieDeviance(power)
8 |
9 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
10 | return self.loss(y, y_pred)
11 |
12 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
13 | return 1 - self.tweedie(y, y_pred) / self.tweedie(y, np.mean(y))
14 |
--------------------------------------------------------------------------------
/Metrics/code/f1_score.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from recall import Recall
3 | from precision import Precision
4 |
5 |
6 | class F1Score:
7 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
8 | return self.loss(y, y_pred)
9 |
10 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
11 | precision = Precision()
12 | recall = Recall()
13 | return 2 * (precision(y, y_pred) * recall(y, y_pred)) / (precision(y, y_pred) + recall(y, y_pred))
--------------------------------------------------------------------------------
/Metrics/code/fbeta_score.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from recall import Recall
3 | from precision import Precision
4 |
5 |
6 | class FBetaScore:
7 | def __init__(self, beta: float = 1.) -> None:
8 | self.beta = beta
9 |
10 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
11 | return self.loss(y, y_pred)
12 |
13 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
14 | precision = Precision()
15 | recall = Recall()
16 | return (1 + pow(self.beta, 2)) * (precision(y, y_pred) * recall(y, y_pred)) / ((pow(self.beta, 2) * precision(y, y_pred)) + recall(y, y_pred))
--------------------------------------------------------------------------------
/Metrics/code/hinge.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class Hinge:
5 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
6 | return self.loss(y, y_pred)
7 |
8 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
9 | return np.sum(np.maximum(0, 1 - y * y_pred)) / len(y)
10 |
--------------------------------------------------------------------------------
/Metrics/code/huber.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class Huber:
5 | def __init__(self, delta: float = 1.) -> None:
6 | self.delta = delta
7 |
8 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
9 | return self.loss(y, y_pred)
10 |
11 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
12 | return np.where(np.abs(y - y_pred) < self.delta, 0.5 * (y - y_pred)**2, self.delta * (np.abs(y - y_pred)- 0.5 * self.delta))
13 |
--------------------------------------------------------------------------------
/Metrics/code/kl_divergence.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class KLDivergence:
5 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
6 | return self.loss(y, y_pred)
7 |
8 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
9 | return np.sum(np.where(y != 0, y * np.log(y / y_pred), 0))
--------------------------------------------------------------------------------
/Metrics/code/logcosh.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class LogCosh:
5 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
6 | return self.loss(y, y_pred)
7 |
8 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
9 | return np.sum(np.log(np.cosh(y_pred - y))) / y.shape[0]
10 |
--------------------------------------------------------------------------------
/Metrics/code/mean_absolute_error.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class MeanAbsoluteError:
5 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
6 | return self.loss(y, y_pred)
7 |
8 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
9 | return 0.5 * np.sum(np.absolute(y - y_pred)) / y.shape[0]
10 |
--------------------------------------------------------------------------------
/Metrics/code/mean_absolute_percentage_error.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class MeanAbsolutePercentageError:
5 | def __init__(self, eps: float = 1e-07):
6 | self.eps = eps
7 |
8 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
9 | return self.loss(y, y_pred)
10 |
11 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
12 | return np.sum(np.absolute((y - y_pred)) / np.maximum(self.eps, np.absolute(y))) / y.shape[0]
13 |
--------------------------------------------------------------------------------
/Metrics/code/mean_squared_error.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class MeanSquaredError:
5 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
6 | return self.loss(y, y_pred)
7 |
8 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
9 | return 0.5 * np.linalg.norm(y_pred - y) ** 2 / y.shape[0]
10 |
11 |     def gradient(self, y: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
12 |         return (y_pred - y) / y.shape[0]
13 |
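14 | 
15 | if __name__ == '__main__':
16 |     # Illustrative check (editor's addition, not part of the original file):
17 |     # compare the analytic gradient with a central finite-difference approximation of the loss.
18 |     rng = np.random.default_rng(0)
19 |     y, y_pred = rng.normal(size=5), rng.normal(size=5)
20 |     mse, eps = MeanSquaredError(), 1e-6
21 |     numeric_grad = np.array([(mse(y, y_pred + eps * e) - mse(y, y_pred - eps * e)) / (2 * eps)
22 |                              for e in np.eye(5)])
23 |     print(np.allclose(numeric_grad, mse.gradient(y, y_pred)))  # expected: True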
--------------------------------------------------------------------------------
/Metrics/code/mean_squared_log_error.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class MeanSquaredLogarithmicError:
5 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
6 | return self.loss(y, y_pred)
7 |
8 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
9 | return np.sum(np.power(np.log(1 + y) - np.log(1 + y_pred), 2)) / y.shape[0]
10 |
--------------------------------------------------------------------------------
/Metrics/code/median_absolute_error.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class MedianAbsoluteError:
5 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
6 | return self.loss(y, y_pred)
7 |
8 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
9 | return np.median(np.absolute(y - y_pred))
10 |
--------------------------------------------------------------------------------
/Metrics/code/poisson.py:
--------------------------------------------------------------------------------
1 | # based on https://keras.io/api/losses/probabilistic_losses/#poisson-class
2 | import numpy as np
3 |
4 |
5 | class Poisson:
6 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
7 | return self.loss(y, y_pred)
8 |
9 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
10 | return np.sum(y_pred - y * np.log(y_pred)) / y.shape[0]
11 |
--------------------------------------------------------------------------------
/Metrics/code/precision.py:
--------------------------------------------------------------------------------
1 | # based on https://stats.stackexchange.com/questions/51296/how-do-you-calculate-precision-and-recall-for-multiclass-classification-using-co
2 |
3 | from sklearn.metrics import confusion_matrix
4 | import numpy as np
5 |
6 |
7 | class Precision:
8 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
9 | return self.loss(y, y_pred)
10 |
11 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
12 | cm = confusion_matrix(y, y_pred)
13 | return np.mean(np.diag(cm) / np.sum(cm, axis=0))
--------------------------------------------------------------------------------
/Metrics/code/r2_score.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class R2Score:
5 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
6 | return self.loss(y, y_pred)
7 |
8 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
9 | return 1 - (np.sum(np.power(y-y_pred, 2))) / (np.sum(np.power(y-np.mean(y), 2)))
10 |
--------------------------------------------------------------------------------
/Metrics/code/recall.py:
--------------------------------------------------------------------------------
1 | # based on https://stats.stackexchange.com/questions/51296/how-do-you-calculate-precision-and-recall-for-multiclass-classification-using-co
2 |
3 | from sklearn.metrics import confusion_matrix
4 | import numpy as np
5 |
6 |
7 | class Recall:
8 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
9 | return self.loss(y, y_pred)
10 |
11 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
12 | cm = confusion_matrix(y, y_pred)
13 | return np.mean(np.diag(cm) / np.sum(cm, axis=1))
--------------------------------------------------------------------------------
/Metrics/code/tweedie_deviance.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class TweedieDeviance:
5 | def __init__(self, power: int) -> None:
6 | self.power = power
7 |
8 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
9 | return self.loss(y, y_pred)
10 |
11 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64:
12 | if self.power == 0:
13 | return np.sum(np.power(y - y_pred, 2)) / y.shape[0]
14 | elif self.power == 1:
15 | return np.sum(2 * (y * np.log(y / y_pred) + y_pred - y)) / y.shape[0]
16 | elif self.power == 2:
17 | return np.sum(2 * (np.log(y_pred / y) + y / y_pred - 1)) / y.shape[0]
18 | else:
19 | return np.sum(2 * (np.power(np.maximum(y, 0), 2-self.power) / ((1-self.power) * (2-self.power)) - (y * np.power(y_pred, 1 - self.power)) / (1 - self.power) + np.power(y_pred, 2 - self.power) / (2 - self.power))) / y.shape[0]
20 |
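21 | 
22 | if __name__ == '__main__':
23 |     # Illustrative check (editor's addition, not part of the original file):
24 |     # with power = 0 the Tweedie deviance reduces to the ordinary mean squared error.
25 |     y = np.array([1.0, 2.0, 3.0])
26 |     y_pred = np.array([1.5, 2.0, 2.0])
27 |     print(TweedieDeviance(power=0)(y, y_pred))  # 0.4166...
28 |     print(np.mean((y - y_pred) ** 2))           # same value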
--------------------------------------------------------------------------------
/Metrics/doc/binary_cross_entropy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Metrics/doc/binary_cross_entropy.png
--------------------------------------------------------------------------------
/Metrics/doc/confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Metrics/doc/confusion_matrix.png
--------------------------------------------------------------------------------
/Metrics/tex/36b5afebdba34564d884d347484ac0c7.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Metrics/tex/44bc9d542a92714cac84e01cbbb7fd61.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Metrics/tex/77a3b857d53fb44e33b53e4c8b68351a.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Metrics/tex/8217ed3c32a785f0b5aad4055f432ad8.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Metrics/tex/cf644cbd499c18ed6f22cee5950c0d75.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Metrics/tex/deceeaf6940a8c7a5a02373728002b0f.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Optimizers/adadelta/README.tex.md:
--------------------------------------------------------------------------------
1 | # ADADELTA: An Adaptive Learning Rate Method
2 |
3 | 
4 |
5 | Adadelta is a stochastic gradient-based optimization algorithm that allows for per-dimension learning rates. Adadelta is an extension of Adagrad that seeks to reduce its aggressive, monotonically decreasing learning rate. Instead of accumulating all past squared gradients, Adadelta restricts the window of accumulated past gradients to a fixed size $\omega$. [1]
6 |
7 | Instead of inefficiently storing $\omega$ previous squared gradients, the sum of gradients is recursively defined as a decaying average of all past squared gradients. The running average $E\left[g^{2}\right]_{t}$ at time step $t$ therefore only depends on the previous average and the current gradient [2]:
8 |
9 | $$E\left[g^{2}\right]_{t} = \gamma{E}\left[g^{2}\right]_{t-1} + \left(1-\gamma\right)g^{2}_{t}$$
10 |
11 | $\gamma$ is usually set to around 0.9. Rewriting SGD updates in terms of the parameter update vector:
12 |
13 | $$ \Delta\theta_{t} = -\eta\cdot{g_{t, i}}$$
14 |
15 | $$\theta_{t+1} = \theta_{t} + \Delta\theta_{t}$$
16 |
17 | AdaDelta takes the form:
18 |
19 | $$RMS[g]_{t}=\sqrt{E\left[g^{2}\right]_{t} + \epsilon}$$
20 |
21 | $$ \Delta\theta_{t} = -\frac{\eta}{RMS[g]_{t}}g_{t} $$
22 |
23 | The authors note that the units in the weight update don't match, i.e., the update should have the same hypothetical units as the parameters/weights. To realize this, they use the root mean square (RMS) of the parameter updates.
24 |
25 | $$E[\Delta \theta^2]_t = \gamma E[\Delta \theta^2]_{t-1} + (1 - \gamma) \Delta \theta^2_t$$
26 |
27 | $$RMS[\Delta \theta]_{t} = \sqrt{E[\Delta \theta^2]_t + \epsilon}$$
28 |
29 | Since $RMS[\Delta \theta]_{t}$ is unknown, it's approximated with the RMS of the parameter updates until the previous time step $RMS[\Delta \theta]_{t-1}$.
30 |
31 | $$\Delta \theta_t = - \dfrac{RMS[\Delta \theta]_{t-1}}{RMS[g]_{t}} g_{t}$$
32 | $$\theta_{t+1} = \theta_t + \Delta \theta_t$$
33 |
34 | For more information on how to derive this formula, take a look at '[An overview of gradient descent optimization algorithms](https://ruder.io/optimizing-gradient-descent/index.html#adadelta)' by [Sebastian Ruder](https://twitter.com/seb_ruder) and the [original Adadelta paper](https://arxiv.org/abs/1212.5701) by [Matthew D. Zeiler](https://arxiv.org/search/cs?searchtype=author&query=Zeiler%2C+M+D).
35 |
36 | Adadelta's main advantages over Adagrad are that it doesn't need a default learning rate and that it doesn't decrease the learning rate as aggressively and monotonically as Adagrad.
37 |
38 | [1] Sebastian Ruder (2016). An overview of gradient descent optimization algorithms. arXiv preprint arXiv:1609.04747.
39 |
40 | [2] https://paperswithcode.com/method/adadelta
41 |
42 | ## Code
43 |
44 | * [Adadelta Numpy Implementation](code/adadelta.py)
--------------------------------------------------------------------------------
/Optimizers/adadelta/code/adadelta.py:
--------------------------------------------------------------------------------
1 | # based on https://ruder.io/optimizing-gradient-descent/#adadelta
2 | # and https://github.com/eriklindernoren/ML-From-Scratch/blob/master/mlfromscratch/deep_learning/optimizers.py#L56
3 |
4 | import numpy as np
5 |
6 |
7 | class Adadelta:
8 | """Adadelta
9 | Parameters:
10 | -----------
11 | rho: float = 0.95
12 | The decay rate.
13 | epsilon: float = 1e-07
14 | A small floating point value to avoid zero denominator.
15 | """
16 | def __init__(self, rho: float = 0.95, epsilon: float = 1e-7) -> None:
17 | self.E_w_update = None # Running average of squared parameter updates
18 | self.E_grad = None # Running average of the squared gradient of w
19 | self.w_update = None # Parameter update
20 | self.epsilon = epsilon
21 | self.rho = rho
22 |
23 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray:
24 | if self.w_update is None:
25 | self.w_update = np.zeros(np.shape(w))
26 | self.E_w_update = np.zeros(np.shape(w))
27 | self.E_grad = np.zeros(np.shape(grad_wrt_w))
28 |
29 | # Update average of gradients at w
30 | self.E_grad = self.rho * self.E_grad + (1 - self.rho) * np.power(grad_wrt_w, 2)
31 |
32 | # Calculate root mean squared error of the weight update and gradients
33 | RMS_delta_w = np.sqrt(self.E_w_update + self.epsilon)
34 | RMS_grad = np.sqrt(self.E_grad + self.epsilon)
35 |
36 | # Calculate adaptive learning rate
37 | adaptive_lr = RMS_delta_w / RMS_grad
38 |
39 | # Calculate the update
40 | self.w_update = adaptive_lr * grad_wrt_w
41 |
42 | # Update the running average of w updates
43 | self.E_w_update = self.rho * self.E_w_update + (1 - self.rho) * np.power(self.w_update, 2)
44 |
45 | return w - self.w_update
46 |
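47 | 
48 | if __name__ == '__main__':
49 |     # Illustrative usage sketch (editor's addition, not part of the original file):
50 |     # minimize f(w) = w^2 by repeatedly passing the gradient 2w to the optimizer.
51 |     opt = Adadelta()
52 |     w = np.array([5.0])
53 |     for _ in range(500):
54 |         w = opt.update(w, 2 * w)
55 |     print(w)  # should have moved towards the minimum at w = 0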
--------------------------------------------------------------------------------
/Optimizers/adadelta/doc/adadelta_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/adadelta/doc/adadelta_example.png
--------------------------------------------------------------------------------
/Optimizers/adadelta/tex/11c596de17c342edeed29f489aa4b274.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Optimizers/adadelta/tex/4f4f4e395762a3af4575de74c019ebb5.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Optimizers/adadelta/tex/ae4fb5973f393577570881fc24fc2054.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Optimizers/adagrad/README.tex.md:
--------------------------------------------------------------------------------
1 | # Adagrad
2 |
3 | 
4 |
5 | Adagrad [1] is a gradient-based optimization algorithm that adaptively scales the learning rate to the parameters, performing smaller updates for parameters associated with frequently occurring features and larger updates for parameters associated with infrequent features, thereby eliminating the need to tune the learning rate manually. This behavior makes Adagrad well-suited for dealing with sparse data, and Dean et al. [2] found that Adagrad is much more robust than SGD.
6 |
7 | Reminder: The SGD update for each parameter $\theta_i$ looks as follows:
8 |
9 | $$\theta_{t+1, i} = \theta_{t, i} - \alpha \cdot \nabla_\theta J( \theta_{t, i} )$$
10 |
11 | To scale the learning rate to each parameter Adagrad modifies the learning rate $\alpha$ at each time step $t$ for every parameter $\theta_i$ based on the past gradients of $\theta_i$:
12 |
13 | $$\theta_{t+1, i} = \theta_{t, i} - \dfrac{\alpha}{\sqrt{G_{t, ii} + \epsilon}} \cdot \nabla_\theta J( \theta_{t, i} )$$
14 |
15 | Here $G_{t} \in \mathbb{R}^{d \times d}$ is a diagonal matrix where each diagonal element $i, i$ is the sum of the squares of the gradients w.r.t. $\theta_i$ up to time step $t$ and $\epsilon$ is a smoothing term used to avoid division by zero.
16 |
17 | The above can be vectorized as follows:
18 |
19 | $$\theta_{t+1} = \theta_{t} - \dfrac{\alpha}{\sqrt{G_{t} + \epsilon}} \odot \nabla_\theta J( \theta_{t} )$$
20 |
21 | Adagrad's most significant benefit is that it eliminates the need to tune the learning rate manually, but it still isn't perfect. Its main weakness is that it accumulates the squared gradients in the denominator. Since all the squared terms are positive, the accumulated sum keeps growing during training. Therefore, the learning rate keeps shrinking as training continues, and it eventually becomes infinitesimally small. Other algorithms like Adadelta, RMSprop, and Adam try to resolve this flaw. [3]
22 |
23 | [1] Duchi, J., Hazan, E., & Singer, Y. (2011). Adaptive Subgradient Methods for Online Learning and Stochastic Optimization. Journal of Machine Learning Research, 12, 2121–2159. Retrieved from [http://jmlr.org/papers/v12/duchi11a.html](http://jmlr.org/papers/v12/duchi11a.html)
24 | [2] Dean, J., Corrado, G. S., Monga, R., Chen, K., Devin, M., Le, Q. V, … Ng, A. Y. (2012). Large Scale Distributed Deep Networks. NIPS 2012: Neural Information Processing Systems, 1–11. [http://papers.nips.cc/paper/4687-large-scale-distributed-deep-networks.pdf](http://papers.nips.cc/paper/4687-large-scale-distributed-deep-networks.pdf)
25 | [3] Sebastian Ruder (2016). An overview of gradient descent optimization algorithms. arXiv preprint arXiv:1609.04747.
26 |
27 | ## Code
28 |
29 | - [Adagrad Numpy Implementation](code/adagrad.py)
--------------------------------------------------------------------------------
/Optimizers/adagrad/code/adagrad.py:
--------------------------------------------------------------------------------
1 | # based on https://ruder.io/optimizing-gradient-descent/#adagrad
2 | # and https://github.com/eriklindernoren/ML-From-Scratch/blob/master/mlfromscratch/deep_learning/optimizers.py#L41
3 |
4 | import numpy as np
5 |
6 |
7 | class Adagrad:
8 | """Adagrad
9 | Parameters:
10 | -----------
11 | learning_rate: float = 0.001
12 | The step length used when following the negative gradient.
13 | initial_accumulator_value: float = 0.1
14 | Starting value for the accumulators, must be non-negative.
15 | epsilon: float = 1e-07
16 | A small floating point value to avoid zero denominator.
17 | """
18 | def __init__(self, learning_rate: float = 0.001, initial_accumulator_value: float = 0.1, epsilon: float = 1e-07) -> None:
19 | self.learning_rate = learning_rate
20 | self.initial_accumulator_value = initial_accumulator_value
21 | self.epsilon = epsilon
22 | self.G = np.array([]) # Sum of squares of the gradients
23 |
24 |         assert self.initial_accumulator_value >= 0, "initial_accumulator_value must be non-negative"
25 |
26 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray:
27 | # Initialize w_update if not initialized yet
28 | if not self.G.any():
29 | self.G = np.full(np.shape(w), self.initial_accumulator_value)
30 | # Add the square of the gradient of the loss function at w
31 | self.G += np.power(grad_wrt_w, 2)
32 | return w - self.learning_rate * grad_wrt_w / np.sqrt(self.G + self.epsilon)
33 |
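34 | 
35 | if __name__ == '__main__':
36 |     # Illustrative usage sketch (editor's addition, not part of the original file):
37 |     # with a constant gradient the accumulator G keeps growing, so the effective
38 |     # step size learning_rate / sqrt(G + epsilon) shrinks from one update to the next.
39 |     opt = Adagrad(learning_rate=0.1)
40 |     w = np.array([1.0])
41 |     for i in range(5):
42 |         new_w = opt.update(w, np.array([1.0]))
43 |         print(i, w - new_w)  # the step taken gets smaller every iteration
44 |         w = new_w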
--------------------------------------------------------------------------------
/Optimizers/adagrad/doc/adagrad_example.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/adagrad/doc/adagrad_example.gif
--------------------------------------------------------------------------------
/Optimizers/adagrad/tex/4f4f4e395762a3af4575de74c019ebb5.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Optimizers/adagrad/tex/7ccca27b5ccc533a2dd72dc6fa28ed84.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Optimizers/adagrad/tex/c745b9b57c145ec5577b82542b2df546.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Optimizers/adam/README.tex.md:
--------------------------------------------------------------------------------
1 | # Adaptive Moment Estimation (Adam)
2 |
3 | 
4 |
5 | Adaptive Moment Estimation, better known as Adam, is another adaptive learning rate method, first published in 2014 by Kingma et al. [1] In addition to storing an exponentially decaying average of past squared gradients $v_t$ like Adadelta or RMSprop, Adam also keeps an exponentially decaying average of past gradients $m_t$, similar to SGD with momentum. [2]
6 |
7 | $$m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t$$
8 |
9 | $$v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2$$
10 |
11 | $m_t$ is an estimate of the first [moment](https://en.wikipedia.org/wiki/Moment_(mathematics)) (the mean) and $v_t$ is the estimate of the second moment (the uncentered variance) of the gradients respectively. As $m_t$ and $v_t$ are initialized as vectors of 0's, the authors of Adam observe that they are biased towards zero, especially during the initial time steps, and especially when the decay rates are small (i.e. $\beta_1$ and $\beta_2$ are close to 1). [2]
12 |
13 | To counteract these biases, bias-corrected first and second moment estimates are calculated:
14 |
15 | $$\hat{m}_t = \dfrac{m_t}{1 - \beta^t_1}$$
16 |
17 | $$\hat{v}_t = \dfrac{v_t}{1 - \beta^t_2}$$
18 |
19 | $\hat{m}_t$ and $\hat{v}_t$ are then used to update the parameters as follows:
20 |
21 | $$\theta_{t+1} = \theta_{t} - \dfrac{\eta}{\sqrt{\hat{v}_t} + \epsilon} \hat{m}_t$$
22 |
23 | As default values for $\beta_1$ and $\beta_2$ the authors propose $0.9$ for $\beta_1$ and $0.999$ for $\beta_2$.
24 |
25 | [1] Diederik P. Kingma and Jimmy Ba (2014). Adam: A Method for Stochastic Optimization.
26 |
27 | [2] Sebastian Ruder (2016). An overview of gradient descent optimization algorithms. arXiv preprint arXiv:1609.04747.
28 |
29 | ## Code
30 |
31 | - [Adam Numpy Implementation](code/adam.py)
32 |
33 | ## Resources
34 |
35 | - [https://arxiv.org/abs/1412.6980](https://arxiv.org/abs/1412.6980)
36 | - [https://ruder.io/optimizing-gradient-descent/index.html#adam](https://ruder.io/optimizing-gradient-descent/index.html#adam)
37 | - [https://towardsdatascience.com/adam-latest-trends-in-deep-learning-optimization-6be9a291375c](https://towardsdatascience.com/adam-latest-trends-in-deep-learning-optimization-6be9a291375c)
--------------------------------------------------------------------------------
/Optimizers/adam/code/adam.py:
--------------------------------------------------------------------------------
1 | # based on https://ruder.io/optimizing-gradient-descent/#adam
2 | # and https://github.com/eriklindernoren/ML-From-Scratch/blob/master/mlfromscratch/deep_learning/optimizers.py#L106
3 |
4 | import numpy as np
5 |
6 |
7 | class Adam:
8 | """Adam - Adaptive Moment Estimation
9 | Parameters:
10 | -----------
11 | learning_rate: float = 0.001
12 | The step length used when following the negative gradient.
13 | beta_1: float = 0.9
14 | The exponential decay rate for the 1st moment estimates.
15 | beta_2: float = 0.999
16 | The exponential decay rate for the 2nd moment estimates.
17 | epsilon: float = 1e-07
18 | A small floating point value to avoid zero denominator.
19 | """
20 | def __init__(self, learning_rate: float = 0.001, beta_1: float = 0.9, beta_2: float = 0.999, epsilon: float = 1e-7) -> None:
21 | self.learning_rate = learning_rate
22 | self.epsilon = epsilon
23 | self.beta_1 = beta_1
24 | self.beta_2 = beta_2
25 |
26 | self.t = 0
27 | self.m = None # Decaying averages of past gradients
28 | self.v = None # Decaying averages of past squared gradients
29 |
30 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray:
31 | self.t += 1
32 | if self.m is None:
33 | self.m = np.zeros(np.shape(grad_wrt_w))
34 | self.v = np.zeros(np.shape(grad_wrt_w))
35 |
36 | self.m = self.beta_1 * self.m + (1 - self.beta_1) * grad_wrt_w
37 | self.v = self.beta_2 * self.v + (1 - self.beta_2) * np.power(grad_wrt_w, 2)
38 |
39 | m_hat = self.m / (1 - self.beta_1**self.t)
40 | v_hat = self.v / (1 - self.beta_2**self.t)
41 |
42 | w_update = self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)
43 |
44 | return w - w_update
45 |
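46 | 
47 | if __name__ == '__main__':
48 |     # Illustrative check (editor's addition, not part of the original file):
49 |     # after the very first update m = (1 - beta_1) * g, which is biased towards zero;
50 |     # the bias correction m / (1 - beta_1**t) recovers the gradient g exactly.
51 |     g = np.array([0.5])
52 |     opt = Adam()
53 |     opt.update(np.array([0.0]), g)
54 |     print(opt.m)                             # ~[0.05] -> biased first moment estimate
55 |     print(opt.m / (1 - opt.beta_1**opt.t))   # ~[0.5]  -> bias-corrected estimate equals g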
--------------------------------------------------------------------------------
/Optimizers/adam/doc/adam_example.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/adam/doc/adam_example.PNG
--------------------------------------------------------------------------------
/Optimizers/adamax/README.tex.md:
--------------------------------------------------------------------------------
1 | # AdaMax
2 |
3 | 
4 |
5 | In [Adam](https://ml-explained.com/blog/adam-explained), the update rule for individual weights scales their gradients inversely proportionally to the $\ell_2$ norm of their past and current gradients.
6 |
7 | $$v_t = \beta_2 v_{t-1} + (1 - \beta_2) |g_t|^2$$
8 |
9 | The $\ell_2$ norm can be generalized to the $\ell_p$ norm.
10 |
11 | $$v_t = \beta_2^p v_{t-1} + (1 - \beta_2^p) |g_t|^p$$
12 |
13 | Such variants generally become numerically unstable for large $p$, which is why $\ell_1$ and $\ell_2$ norms are most common in practice. However, in the special case where we let $p \rightarrow \infty$, a surprisingly simple and stable algorithm emerges.
14 |
15 | To avoid confusion with Adam, we use $u_t$ to denote the infinity norm-constrained $v_t$:
16 |
17 | $$
18 | u_t = \beta_2^\infty v_{t-1} + (1 - \beta_2^\infty) |g_t|^\infty
19 | = \max(\beta_2 \cdot v_{t-1}, |g_t|)
20 | $$
21 |
22 | We can now plug $u_t$ into the Adam update equation replacing $\sqrt{\hat{v}_t} + \epsilon$ to obtain the AdaMax update rule:
23 |
24 | $$\theta_{t+1} = \theta_{t} - \dfrac{\eta}{u_t} \hat{m}_t$$
25 |
26 | ## Code
27 |
28 | - [AdaMax Numpy Implementation](code/adamax.py)
29 |
30 | ## Resources
31 |
32 | - [https://arxiv.org/abs/1412.6980](https://arxiv.org/abs/1412.6980)
33 | - [https://ruder.io/optimizing-gradient-descent/index.html#adamax](https://ruder.io/optimizing-gradient-descent/index.html#adamax)
34 | - [https://keras.io/api/optimizers/adamax/](https://keras.io/api/optimizers/adamax/)
--------------------------------------------------------------------------------
/Optimizers/adamax/code/adamax.py:
--------------------------------------------------------------------------------
1 | # based on https://ruder.io/optimizing-gradient-descent/#adamax
2 |
3 | import numpy as np
4 |
5 |
6 | class AdaMax:
7 | """AdaMax
8 | Parameters:
9 | -----------
10 | learning_rate: float = 0.001
11 | The step length used when following the negative gradient.
12 | beta_1: float = 0.9
13 | The exponential decay rate for the 1st moment estimates.
14 | beta_2: float = 0.999
15 | The exponential decay rate for the 2nd moment estimates.
16 | epsilon: float = 1e-07
17 | A small floating point value to avoid zero denominator.
18 | """
19 | def __init__(self, learning_rate: float = 0.001, beta_1: float = 0.9, beta_2: float = 0.999, epsilon: float = 1e-7) -> None:
20 | self.learning_rate = learning_rate
21 | self.epsilon = epsilon
22 | self.beta_1 = beta_1
23 | self.beta_2 = beta_2
24 |
25 | self.t = 0
26 | self.m = None # Decaying averages of past gradients
27 | self.v = None # Decaying averages of past squared gradients
28 |
29 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray:
30 | self.t += 1
31 | if self.m is None:
32 | self.m = np.zeros(np.shape(grad_wrt_w))
33 | self.v = np.zeros(np.shape(grad_wrt_w))
34 |
35 | self.m = self.beta_1 * self.m + (1 - self.beta_1) * grad_wrt_w
36 | self.v = np.maximum(self.beta_2 * self.v, np.abs(grad_wrt_w))
37 |
38 | m_hat = self.m / (1 - self.beta_1**self.t)
39 |
40 | w_update = self.learning_rate * m_hat / (self.v + self.epsilon)
41 |
42 | return w - w_update
43 |
--------------------------------------------------------------------------------
/Optimizers/adamax/doc/adamax_example.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/adamax/doc/adamax_example.PNG
--------------------------------------------------------------------------------
/Optimizers/adamw/README.md:
--------------------------------------------------------------------------------
1 | # AdamW
2 |
3 | AdamW is a stochastic optimization method that modifies the typical implementation of weight decay in Adam to combat Adam's known convergence problems by decoupling the weight decay from the gradient updates.
4 |
5 | 
6 |
7 | ## Code
8 |
9 | - [AdamW Numpy Implementation](code/adamw.py)
10 |
11 | ## Resources
12 |
13 | - [https://arxiv.org/abs/1711.05101](https://arxiv.org/abs/1711.05101)
14 | - [https://paperswithcode.com/method/adamw](https://paperswithcode.com/method/adamw)
15 | - [https://www.fast.ai/2018/07/02/adam-weight-decay/](https://www.fast.ai/2018/07/02/adam-weight-decay/)
16 | - [https://towardsdatascience.com/why-adamw-matters-736223f31b5d](https://towardsdatascience.com/why-adamw-matters-736223f31b5d)
--------------------------------------------------------------------------------
/Optimizers/adamw/README.tex.md:
--------------------------------------------------------------------------------
1 | # AdamW
2 |
3 | AdamW is a stochastic optimization method that modifies the typical implementation of weight decay in Adam to combat Adam's known convergence problems by decoupling the weight decay from the gradient updates.
4 |
5 | 
6 |
7 | ## Code
8 |
9 | - [AdamW Numpy Implementation](code/adamw.py)
10 |
11 | ## Resources
12 |
13 | - [https://arxiv.org/abs/1711.05101](https://arxiv.org/abs/1711.05101)
14 | - [https://paperswithcode.com/method/adamw](https://paperswithcode.com/method/adamw)
15 | - [https://www.fast.ai/2018/07/02/adam-weight-decay/](https://www.fast.ai/2018/07/02/adam-weight-decay/)
16 | - [https://towardsdatascience.com/why-adamw-matters-736223f31b5d](https://towardsdatascience.com/why-adamw-matters-736223f31b5d)
--------------------------------------------------------------------------------
/Optimizers/adamw/code/adamw.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class AdamW:
5 | """AdamW
6 | Parameters:
7 | -----------
8 | learning_rate: float = 0.001
9 | The step length used when following the negative gradient.
10 | beta_1: float = 0.9
11 | The exponential decay rate for the 1st moment estimates.
12 | beta_2: float = 0.999
13 | The exponential decay rate for the 2nd moment estimates.
14 | epsilon: float = 1e-07
15 | A small floating point value to avoid zero denominator.
16 | weight_decay: float = 0.01
17 | Amount of weight decay to be applied.
18 | """
19 | def __init__(self, learning_rate: float = 0.001, beta_1: float = 0.9, beta_2: float = 0.999, epsilon: float = 1e-7, weight_decay: float = 0.01) -> None:
20 | self.learning_rate = learning_rate
21 | self.epsilon = epsilon
22 | self.beta_1 = beta_1
23 | self.beta_2 = beta_2
24 | self.weight_decay = weight_decay
25 |
26 | self.t = 0
27 | self.m = None # Decaying averages of past gradients
28 | self.v = None # Decaying averages of past squared gradients
29 |
30 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray:
31 | self.t += 1
32 | if self.m is None:
33 | self.m = np.zeros(np.shape(grad_wrt_w))
34 | self.v = np.zeros(np.shape(grad_wrt_w))
35 |
36 | self.m = self.beta_1 * self.m + (1 - self.beta_1) * grad_wrt_w
37 | self.v = self.beta_2 * self.v + (1 - self.beta_2) * np.power(grad_wrt_w, 2)
38 |
39 | m_hat = self.m / (1 - self.beta_1**self.t)
40 | v_hat = self.v / (1 - self.beta_2**self.t)
41 |
42 |         w_update = self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon) + self.weight_decay * w  # decoupled weight decay acts on the weights, not on the gradient
43 |
44 | return w - w_update
45 |
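46 | 
47 | if __name__ == '__main__':
48 |     # Illustrative check (editor's addition, not part of the original file):
49 |     # with a zero gradient the Adam part of the update vanishes and only the decoupled
50 |     # weight decay acts, shrinking the weights multiplicatively: w <- (1 - weight_decay) * w.
51 |     opt = AdamW(weight_decay=0.01)
52 |     w = np.array([1.0])
53 |     print(opt.update(w, np.zeros_like(w)))  # [0.99]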
--------------------------------------------------------------------------------
/Optimizers/adamw/doc/adamw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/adamw/doc/adamw.png
--------------------------------------------------------------------------------
/Optimizers/amsgrad/README.md:
--------------------------------------------------------------------------------
1 | # AMSGrad
2 |
3 | 
4 |
5 | The motivation for AMSGrad lies in the observation that [Adam](https://ml-explained.com/blog/adam-explained) fails to converge to an optimal solution for some datasets and is outperformed by SGD with momentum.
6 |
7 | Reddi et al. (2018) [1] show that one cause of the issue described above is the use of the exponential moving average of the past squared gradients.
8 |
9 | To fix the above-described behavior, the authors propose a new algorithm called AMSGrad that keeps a running maximum of the squared gradients instead of an exponential moving average.
10 |
11 | 
12 |
13 | 
14 |
15 | For simplicity, the authors also removed the debiasing step, which leads to the following update rule:
16 |
17 | 
18 |
19 | For more information, check out the paper '[On the Convergence of Adam and Beyond](https://arxiv.org/abs/1904.09237v1)' and the [AMSGrad section](https://ruder.io/optimizing-gradient-descent/index.html#amsgrad) of the '[An overview of gradient descent optimization algorithms](https://ruder.io/optimizing-gradient-descent/index.html)' article.
20 |
21 | [1] Reddi, Sashank J., Kale, Satyen, & Kumar, Sanjiv. [On the Convergence of Adam and Beyond](https://arxiv.org/abs/1904.09237v1).
22 |
23 | ## Code
24 |
25 | - [AMSGrad Numpy Implementation](code/amsgrad.py)
26 |
27 | ## Resources
28 |
29 | - [https://arxiv.org/abs/1904.09237v1](https://arxiv.org/abs/1904.09237v1)
30 | - [https://paperswithcode.com/method/amsgrad](https://paperswithcode.com/method/amsgrad)
31 | - [https://ruder.io/optimizing-gradient-descent/index.html#amsgrad](https://ruder.io/optimizing-gradient-descent/index.html#amsgrad)
--------------------------------------------------------------------------------
/Optimizers/amsgrad/README.tex.md:
--------------------------------------------------------------------------------
1 | # AMSGrad
2 |
3 | 
4 |
5 | The motivation for AMSGrad lies in the observation that [Adam](https://ml-explained.com/blog/adam-explained) fails to converge to an optimal solution for some datasets and is outperformed by SGD with momentum.
6 |
7 | Reddi et al. (2018) [1] show that one cause of the issue described above is the use of the exponential moving average of the past squared gradients.
8 |
9 | To fix the above-described behavior, the authors propose a new algorithm called AMSGrad that keeps a running maximum of the squared gradients instead of an exponential moving average.
10 |
11 | $$v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2$$
12 |
13 | $$\hat{v}_t = \text{max}(\hat{v}_{t-1}, v_t)$$
14 |
15 | For simplicity, the authors also removed the debiasing step, which leads to the following update rule:
16 |
17 | $$\begin{align} \begin{split} m_t &= \beta_1 m_{t-1} + (1 - \beta_1) g_t \\ v_t &= \beta_2 v_{t-1} + (1 - \beta_2) g_t^2\\ \hat{v}_t &= \text{max}(\hat{v}_{t-1}, v_t) \\ \theta_{t+1} &= \theta_{t} - \dfrac{\eta}{\sqrt{\hat{v}_t} + \epsilon} m_t \end{split} \end{align}$$
18 |
19 | For more information, check out the paper '[On the Convergence of Adam and Beyond](https://arxiv.org/abs/1904.09237v1)' and the [AMSGrad section](https://ruder.io/optimizing-gradient-descent/index.html#amsgrad) of the '[An overview of gradient descent optimization algorithms](https://ruder.io/optimizing-gradient-descent/index.html)' article.
20 |
21 | [1] Reddi, Sashank J., Kale, Satyen, & Kumar, Sanjiv. [On the Convergence of Adam and Beyond](https://arxiv.org/abs/1904.09237v1).
22 |
23 | ## Code
24 |
25 | - [AMSGrad Numpy Implementation](code/amsgrad.py)
26 |
27 | ## Resources
28 |
29 | - [https://arxiv.org/abs/1904.09237v1](https://arxiv.org/abs/1904.09237v1)
30 | - [https://paperswithcode.com/method/amsgrad](https://paperswithcode.com/method/amsgrad)
31 | - [https://ruder.io/optimizing-gradient-descent/index.html#amsgrad](https://ruder.io/optimizing-gradient-descent/index.html#amsgrad)
--------------------------------------------------------------------------------
/Optimizers/amsgrad/code/amsgrad.py:
--------------------------------------------------------------------------------
1 | # based on https://ruder.io/optimizing-gradient-descent/#amsgrad
2 |
3 | import numpy as np
4 |
5 |
6 | class AMSGrad:
7 | """AMSGrad
8 | Parameters:
9 | -----------
10 | learning_rate: float = 0.001
11 | The step length used when following the negative gradient.
12 | beta_1: float = 0.9
13 | The exponential decay rate for the 1st moment estimates.
14 | beta_2: float = 0.999
15 | The exponential decay rate for the 2nd moment estimates.
16 | epsilon: float = 1e-07
17 | A small floating point value to avoid zero denominator.
18 | """
19 | def __init__(self, learning_rate: float = 0.001, beta_1: float = 0.9, beta_2: float = 0.999, epsilon: float = 1e-7) -> None:
20 | self.learning_rate = learning_rate
21 | self.epsilon = epsilon
22 | self.beta_1 = beta_1
23 | self.beta_2 = beta_2
24 |
25 | self.m = None # Decaying averages of past gradients
26 | self.v = None # Decaying averages of past squared gradients
27 |
28 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray:
29 |         if self.m is None:
30 |             self.m = np.zeros(np.shape(grad_wrt_w))
31 |             self.v = np.zeros(np.shape(grad_wrt_w))
32 |             self.v_hat = np.zeros(np.shape(grad_wrt_w))  # Running maximum of the second moment estimates
33 | 
34 |         self.m = self.beta_1 * self.m + (1 - self.beta_1) * grad_wrt_w
35 |         self.v = self.beta_2 * self.v + (1 - self.beta_2) * np.power(grad_wrt_w, 2)
36 | 
37 |         self.v_hat = np.maximum(self.v_hat, self.v)  # Keep the running maximum, as described in the paper
38 |
39 |         w_update = self.learning_rate * self.m / (np.sqrt(self.v_hat) + self.epsilon)
40 |
41 | return w - w_update
42 |
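43 | 
44 | if __name__ == '__main__':
45 |     # Illustrative check (editor's addition, not part of the original file):
46 |     # the running maximum v_hat never decreases, even after the squared gradients shrink.
47 |     opt = AMSGrad()
48 |     w = np.array([0.0])
49 |     for g in [5.0, 0.1, 0.1, 0.1]:
50 |         w = opt.update(w, np.array([g]))
51 |         print(opt.v_hat)  # non-decreasing from one update to the next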
--------------------------------------------------------------------------------
/Optimizers/amsgrad/doc/amsgrad_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/amsgrad/doc/amsgrad_example.png
--------------------------------------------------------------------------------
/Optimizers/gradient_descent/code/gradient_descent_with_momentum.py:
--------------------------------------------------------------------------------
1 | # based on https://ruder.io/optimizing-gradient-descent/
2 | # and https://github.com/eriklindernoren/ML-From-Scratch/blob/master/mlfromscratch/deep_learning/optimizers.py#L9
3 |
4 | import numpy as np
5 |
6 |
7 | class GradientDescent:
8 | """Gradient Descent with Momentum
9 | Parameters:
10 | -----------
11 | learning_rate: float = 0.01
12 | The step length used when following the negative gradient.
13 | momentum: float = 0.0
14 | Amount of momentum to use.
15 | Momentum accelerates gradient descent in the relevant direction and dampens oscillations.
16 | """
17 | def __init__(self, learning_rate: float = 0.01, momentum: float = 0.0) -> None:
18 | self.learning_rate = learning_rate
19 | self.momentum = momentum
20 | self.w_update = np.array([])
21 |
22 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray:
23 | # Initialize w_update if not initialized yet
24 | if not self.w_update.any():
25 | self.w_update = np.zeros(np.shape(w))
26 | # Use momentum if set
27 | self.w_update = self.momentum * self.w_update + (1 - self.momentum) * grad_wrt_w
28 | # Move against the gradient to minimize loss
29 | return w - self.learning_rate * self.w_update
30 |
--------------------------------------------------------------------------------
/Optimizers/gradient_descent/code/gradient_descent_with_nesterov_momentum.py:
--------------------------------------------------------------------------------
1 | # based on https://ruder.io/optimizing-gradient-descent/#nesterovacceleratedgradient
2 | # and https://github.com/eriklindernoren/ML-From-Scratch/blob/master/mlfromscratch/deep_learning/optimizers.py#L24
3 |
4 | from typing import Callable
5 | import numpy as np
6 |
7 |
8 | class NesterovAcceleratedGradientDescent:
9 | """Gradient Descent with Nesterov Momentum
10 | Parameters:
11 | -----------
12 | learning_rate: float = 0.01
13 | The step length used when following the negative gradient.
14 | momentum: float = 0.0
15 | Amount of momentum to use.
16 | Momentum accelerates gradient descent in the relevant direction and dampens oscillations.
17 | """
18 | def __init__(self, learning_rate: float = 0.01, momentum: float = 0.0) -> None:
19 | self.learning_rate = learning_rate
20 | self.momentum = momentum
21 | self.w_update = np.array([])
22 |
23 | def update(self, w: np.ndarray, grad_func: Callable) -> np.ndarray:
24 |         # Initialize w_update if not initialized yet (must happen before the look-ahead step)
25 |         if not self.w_update.any():
26 |             self.w_update = np.zeros(np.shape(w))
27 |         # Calculate the gradient of the loss a bit further down the slope from w
28 |         approx_future_grad = np.clip(grad_func(w - self.momentum * self.w_update), -1, 1)
29 |
30 | self.w_update = self.momentum * self.w_update + self.learning_rate * approx_future_grad
31 | # Move against the gradient to minimize loss
32 | return w - self.w_update
33 |
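34 | 
35 | if __name__ == '__main__':
36 |     # Illustrative usage sketch (editor's addition, not part of the original file):
37 |     # unlike the other optimizers, update() takes a callable that returns the gradient,
38 |     # because the gradient is evaluated at the look-ahead position w - momentum * w_update.
39 |     grad_func = lambda w: 2 * w  # gradient of f(w) = w^2
40 |     opt = NesterovAcceleratedGradientDescent(learning_rate=0.1, momentum=0.9)
41 |     w = np.array([5.0])
42 |     for _ in range(200):
43 |         w = opt.update(w, grad_func)
44 |     print(w)  # should end up close to the minimum at w = 0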
--------------------------------------------------------------------------------
/Optimizers/gradient_descent/doc/gradient_descent.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/gradient_descent/doc/gradient_descent.gif
--------------------------------------------------------------------------------
/Optimizers/gradient_descent/doc/momentum.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/gradient_descent/doc/momentum.png
--------------------------------------------------------------------------------
/Optimizers/gradient_descent/doc/nesterov_accelerated_gradient.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/gradient_descent/doc/nesterov_accelerated_gradient.png
--------------------------------------------------------------------------------
/Optimizers/gradient_descent/doc/pick_learning_rate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/gradient_descent/doc/pick_learning_rate.png
--------------------------------------------------------------------------------
/Optimizers/gradient_descent/doc/variations_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/gradient_descent/doc/variations_comparison.png
--------------------------------------------------------------------------------
/Optimizers/gradient_descent/tex/11c596de17c342edeed29f489aa4b274.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Optimizers/gradient_descent/tex/27e556cf3caa0673ac49a8f0de3c73ca.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Optimizers/gradient_descent/tex/c745b9b57c145ec5577b82542b2df546.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Optimizers/nadam/README.md:
--------------------------------------------------------------------------------
1 | # Nadam (Nesterov-accelerated Adaptive Moment Estimation)
2 |
3 | 
4 |
5 | Nadam (Nesterov-accelerated Adaptive Moment Estimation) combines NAG (Nesterov accelerated gradient) and Adam. To do so, the momentum term m_t needs to be updated. For more information, check out [the paper](http://cs229.stanford.edu/proj2015/054_report.pdf) or the [Nadam section](https://ruder.io/optimizing-gradient-descent/index.html#nadam) of ['An overview of gradient descent optimization algorithms'](https://ruder.io/optimizing-gradient-descent/index.html).
6 |
7 | The final update rule looks as follows:
8 |
9 | 
10 |
11 | ## Code
12 |
13 | - [Nadam Numpy Implementation](code/nadam.py)
14 |
15 | ## Resources
16 |
17 | - [http://cs229.stanford.edu/proj2015/054_report.pdf](http://cs229.stanford.edu/proj2015/054_report.pdf)
18 | - [https://paperswithcode.com/method/nadam](https://paperswithcode.com/method/nadam)
19 | - [https://ruder.io/optimizing-gradient-descent/index.html#nadam](https://ruder.io/optimizing-gradient-descent/index.html#nadam)
--------------------------------------------------------------------------------
/Optimizers/nadam/README.tex.md:
--------------------------------------------------------------------------------
1 | # Nadam (Nesterov-accelerated Adaptive Moment Estimation)
2 |
3 | 
4 |
5 | Nadam (Nesterov-accelerated Adaptive Moment Estimation) combines NAG (Nesterov accelerated gradient) and Adam. To do so, the momentum term $m_t$ needs to be updated. For more information, check out [the paper](http://cs229.stanford.edu/proj2015/054_report.pdf) or the [Nadam section](https://ruder.io/optimizing-gradient-descent/index.html#nadam) of ['An overview of gradient descent optimization algorithms'](https://ruder.io/optimizing-gradient-descent/index.html).
6 |
7 | The final update rule looks as follows:
8 |
9 | $$\theta_{t+1} = \theta_{t} - \dfrac{\eta}{\sqrt{\hat{v}_t} + \epsilon} (\beta_1 \hat{m}_t + \dfrac{(1 - \beta_1) g_t}{1 - \beta^t_1})$$
10 |
11 | ## Code
12 |
13 | - [Nadam Numpy Implementation](code/nadam.py)
14 |
15 | ## Resources
16 |
17 | - [http://cs229.stanford.edu/proj2015/054_report.pdf](http://cs229.stanford.edu/proj2015/054_report.pdf)
18 | - [https://paperswithcode.com/method/nadam](https://paperswithcode.com/method/nadam)
19 | - [https://ruder.io/optimizing-gradient-descent/index.html#nadam](https://ruder.io/optimizing-gradient-descent/index.html#nadam)
--------------------------------------------------------------------------------
/Optimizers/nadam/code/nadam.py:
--------------------------------------------------------------------------------
1 | # based on https://ruder.io/optimizing-gradient-descent/#nadam
2 |
3 | import numpy as np
4 |
5 |
6 | class Nadam:
7 | """Nadam
8 | Parameters:
9 | -----------
10 | learning_rate: float = 0.001
11 | The step length used when following the negative gradient.
12 | beta_1: float = 0.9
13 | The exponential decay rate for the 1st moment estimates.
14 | beta_2: float = 0.999
15 | The exponential decay rate for the 2nd moment estimates.
16 | epsilon: float = 1e-07
17 | A small floating point value to avoid zero denominator.
18 | """
19 |
20 | def __init__(self, learning_rate: float = 0.001, beta_1: float = 0.9, beta_2: float = 0.999,
21 | epsilon: float = 1e-7) -> None:
22 | self.learning_rate = learning_rate
23 | self.epsilon = epsilon
24 | self.beta_1 = beta_1
25 | self.beta_2 = beta_2
26 |
27 | self.t = 0
28 | self.m = None # Decaying averages of past gradients
29 | self.v = None # Decaying averages of past squared gradients
30 |
31 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray:
32 | self.t += 1
33 | if self.m is None:
34 | self.m = np.zeros(np.shape(grad_wrt_w))
35 | self.v = np.zeros(np.shape(grad_wrt_w))
36 |
37 | self.m = self.beta_1 * self.m + (1 - self.beta_1) * grad_wrt_w
38 | self.v = self.beta_2 * self.v + (1 - self.beta_2) * np.power(grad_wrt_w, 2)
39 |
40 | m_hat = self.m / (1 - self.beta_1**self.t)
41 | v_hat = self.v / (1 - self.beta_2**self.t)
42 |
43 | w_update = self.learning_rate / (np.sqrt(v_hat) + self.epsilon) * (self.beta_1 * m_hat + (1 - self.beta_1)
44 | * grad_wrt_w / (1 - self.beta_1**self.t))
45 |
46 | return w - w_update
47 |
--------------------------------------------------------------------------------
/Optimizers/nadam/doc/nadam_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/nadam/doc/nadam_example.png
--------------------------------------------------------------------------------
/Optimizers/qhadam/README.md:
--------------------------------------------------------------------------------
1 | # QHAdam (Quasi-Hyperbolic Adam)
2 |
3 | 
4 |
5 | **Quasi-Hyperbolic Momentum Algorithm (QHM)** is a simple alteration of [SGD with momentum](https://paperswithcode.com/method/sgd-with-momentum), averaging a plain SGD step with a momentum step. **QHAdam (Quasi-Hyperbolic Adam)** is a QH augmented version of [Adam](https://ml-explained.com/blog/adam-explained) that replaces both of Adam's moment estimators with quasi-hyperbolic terms. Namely, QHAdam decouples the momentum term from the current gradient when updating the weights, and decouples the mean squared gradients term from the current squared gradient when updating the weights. [1, 2, 3]
6 |
7 | Essentially, it's a weighted average of the momentum and plain SGD, weighting the current gradient with an immediate discount factor v_1, divided by a weighted average of the mean squared gradients and the current squared gradient, weighting the current squared gradient with an immediate discount factor v_2. [2]
8 |
9 | 
10 |
11 | [1] Ma, J. and Yarats, D. Quasi-hyperbolic momentum and Adam for deep learning. arXiv preprint arXiv:1810.06801, 2018
12 |
13 | [2] QHAdam Papers With Code
14 |
15 | [3] John Chen. An updated overview of recent gradient descent algorithms
16 |
17 | ## Code
18 |
19 | - [QHAdam Numpy Implementation](code/qhadam.py)
--------------------------------------------------------------------------------
/Optimizers/qhadam/README.tex.md:
--------------------------------------------------------------------------------
1 | # QHAdam (Quasi-Hyperbolic Adam)
2 |
3 | 
4 |
5 | **Quasi-Hyperbolic Momentum Algorithm (QHM)** is a simple alteration of [SGD with momentum](https://paperswithcode.com/method/sgd-with-momentum), averaging a plain SGD step with a momentum step. **QHAdam (Quasi-Hyperbolic Adam)** is a QH augmented version of [Adam](https://ml-explained.com/blog/adam-explained) that replaces both of Adam's moment estimators with quasi-hyperbolic terms. Namely, QHAdam decouples the momentum term from the current gradient when updating the weights, and decouples the mean squared gradients term from the current squared gradient when updating the weights. [1, 2, 3]
6 |
7 | Essentially, it's a weighted average of the momentum and plain SGD, weighting the current gradient with an immediate discount factor $v_1$ divided by a weighted average of the mean squared gradients and the current squared gradient, weighting the current squared gradient with an immediate discount factor $v_2$. [2]
8 |
9 | $$ \theta_{t+1, i} = \theta_{t, i} - \eta\left[\frac{\left(1-v_{1}\right)\cdot{g_{t}} + v_{1}\cdot\hat{m}_{t}}{\sqrt{\left(1-v_{2}\right)g^{2}_{t} + v_{2}\cdot{\hat{v}_{t}}} + \epsilon}\right], \forall{t} $$
10 |
11 | [1] Ma, J. and Yarats, D. Quasi-hyperbolic momentum and Adam for deep learning. arXiv preprint arXiv:1810.06801, 2018
12 |
13 | [2] QHAdam Papers With Code
14 |
15 | [3] John Chen. An updated overview of recent gradient descent algorithms
16 |
17 | ## Code
18 |
19 | - [QHAdam Numpy Implementation](code/qhadam.py)
--------------------------------------------------------------------------------
/Optimizers/qhadam/code/qhadam.py:
--------------------------------------------------------------------------------
1 | # based on https://arxiv.org/pdf/1810.06801.pdf
2 |
3 | import numpy as np
4 |
5 |
6 | class QHAdam:
7 | """QHAdam - Quasi-Hyperbolic Adam
8 | Parameters:
9 | -----------
10 | learning_rate: float = 0.001
11 | The step length used when following the negative gradient.
12 | beta_1: float = 0.9
13 | The exponential decay rate for the 1st moment estimates.
14 | beta_2: float = 0.999
15 | The exponential decay rate for the 2nd moment estimates.
16 | epsilon: float = 1e-07
17 | A small floating point value to avoid zero denominator.
18 | v_1: float = 0.7
19 | Immediate discount factor
20 | v_2: float = 1.0
21 | Immediate discount factor
22 | """
23 | def __init__(self, learning_rate: float = 0.001, beta_1: float = 0.9, beta_2: float = 0.999, epsilon: float = 1e-7, v_1: float = 0.7, v_2: float = 1.0) -> None:
24 | self.learning_rate = learning_rate
25 | self.epsilon = epsilon
26 | self.beta_1 = beta_1
27 | self.beta_2 = beta_2
28 | self.v_1 = v_1
29 | self.v_2 = v_2
30 |
31 | self.t = 0
32 | self.m = None # Decaying averages of past gradients
33 | self.v = None # Decaying averages of past squared gradients
34 |
35 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray:
36 | self.t += 1
37 | if self.m is None:
38 | self.m = np.zeros(np.shape(grad_wrt_w))
39 | self.v = np.zeros(np.shape(grad_wrt_w))
40 |
41 | self.m = self.beta_1 * self.m + (1 - self.beta_1) * grad_wrt_w
42 | self.v = self.beta_2 * self.v + (1 - self.beta_2) * np.power(grad_wrt_w, 2)
43 |
44 | m_hat = self.m / (1 - self.beta_1**self.t)
45 | v_hat = self.v / (1 - self.beta_2**self.t)
46 |
47 | w_update = self.learning_rate * ((1 - self.v_1) * grad_wrt_w + self.v_1 * m_hat) / (np.sqrt((1 - self.v_2) * np.power(grad_wrt_w, 2) + self.v_2 * v_hat) + self.epsilon)
48 |
49 | return w - w_update
50 |
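51 | 
52 | if __name__ == '__main__':
53 |     # Illustrative check (editor's addition, not part of the original file):
54 |     # with v_1 = v_2 = 1 the quasi-hyperbolic terms collapse and a QHAdam step equals
55 |     # the plain Adam step learning_rate * m_hat / (sqrt(v_hat) + epsilon).
56 |     w, g = np.array([1.0]), np.array([0.3])
57 |     opt = QHAdam(v_1=1.0, v_2=1.0)
58 |     new_w = opt.update(w, g)
59 |     m_hat, v_hat = g, g**2  # after one step the bias correction recovers g and g^2 exactly
60 |     adam_step = opt.learning_rate * m_hat / (np.sqrt(v_hat) + opt.epsilon)
61 |     print(np.allclose(new_w, w - adam_step))  # expected: True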
--------------------------------------------------------------------------------
/Optimizers/qhadam/doc/qhadam_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/qhadam/doc/qhadam_example.png
--------------------------------------------------------------------------------
/Optimizers/qhadam/tex/41922e474070adc90e7c1379c28d22fe.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Optimizers/qhm/README.md:
--------------------------------------------------------------------------------
1 | # QHM (Quasi-Hyperbolic Momentum)
2 |
3 | 
4 |
5 | Quasi-Hyperbolic Momentum Algorithm (QHM) is a simple alteration of SGD with momentum, averaging a plain SGD step with a momentum step, thereby decoupling the momentum term β from the current gradient ∇_t when updating the weights.
6 |
7 | 
8 |
9 | 
10 |
11 | The authors recommend ν = 0.7 and β = 0.999 as a good starting point. For more information about QHM, check out the resources below.
12 |
13 | ## Code
14 |
15 | - [QHM Numpy Implementation](code/qhm.py)
16 |
17 | ## Resources
18 |
19 | - [https://arxiv.org/pdf/1810.06801.pdf](https://arxiv.org/pdf/1810.06801.pdf)
20 | - [https://paperswithcode.com/method/qhadam](https://paperswithcode.com/method/qhadam)
21 | - [https://johnchenresearch.github.io/demon/](https://johnchenresearch.github.io/demon/)
22 | - [https://facebookresearch.github.io/qhoptim/](https://facebookresearch.github.io/qhoptim/)
--------------------------------------------------------------------------------
/Optimizers/qhm/README.tex.md:
--------------------------------------------------------------------------------
1 | # QHM (Quasi-Hyperbolic Momentum)
2 |
3 | 
4 |
5 | Quasi-Hyperbolic Momentum Algorithm (QHM) is a simple alteration of SGD with momentum, averaging a plain SGD step with a momentum step, thereby decoupling the momentum term $\beta$ from the current gradient $\nabla_t$ when updating the weights.
6 |
7 | $$g_{t + 1} \leftarrow \beta \cdot g_t + (1 - \beta) \cdot \nabla_t$$
8 |
9 | $$\theta_{t + 1} \leftarrow \theta_t + \alpha \left[ (1 - \nu) \cdot \nabla_t + \nu \cdot g_{t + 1} \right]$$
10 |
11 | The authors recommend $\nu=0.7$ and $\beta=0.999$ as a good starting point. For more information about QHM, check out the resources below.
12 |
13 | ## Code
14 |
15 | - [QHM Numpy Implementation](code/qhm.py)
16 |
17 | ## Resources
18 |
19 | - [https://arxiv.org/pdf/1810.06801.pdf](https://arxiv.org/pdf/1810.06801.pdf)
20 | - [https://paperswithcode.com/method/qhadam](https://paperswithcode.com/method/qhadam)
21 | - [https://johnchenresearch.github.io/demon/](https://johnchenresearch.github.io/demon/)
22 | - [https://facebookresearch.github.io/qhoptim/](https://facebookresearch.github.io/qhoptim/)
--------------------------------------------------------------------------------
/Optimizers/qhm/code/qhm.py:
--------------------------------------------------------------------------------
1 | # based on https://arxiv.org/pdf/1810.06801.pdf
2 |
3 | import numpy as np
4 |
5 |
6 | class QHM:
7 | """QHM -Quasi-Hyperbolic Momentum
8 | Parameters:
9 | -----------
10 | learning_rate: float = 0.001
11 | The step length used when following the negative gradient.
12 | beta: float = 0.999
13 | Momentum factor.
14 | v: float = 0.7
15 | Immediate discount factor.
16 | """
17 | def __init__(self, learning_rate: float = 0.001, beta: float = 0.999, v: float = 0.7) -> None:
18 | self.learning_rate = learning_rate
19 | self.beta = beta
20 | self.v = v
21 |
22 | self.g_t = np.array([])
23 |
24 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray:
25 | if not self.g_t.any():
26 | self.g_t = np.zeros(np.shape(w))
27 |
28 | self.g_t = self.beta * self.g_t + (1 - self.beta) * grad_wrt_w
29 |
30 | return w - self.learning_rate * ((1 - self.v) * grad_wrt_w + self.v * self.g_t)
31 |
--------------------------------------------------------------------------------
/Optimizers/qhm/doc/qhm_update_rule.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/qhm/doc/qhm_update_rule.PNG
--------------------------------------------------------------------------------
/Optimizers/radam/README.md:
--------------------------------------------------------------------------------
1 | # RAdam - Rectified Adam
2 |
3 | 
4 |
5 | RAdam, or "Rectified Adam", is a variant of the Adam optimizer that addresses Adam's convergence problems by introducing a term that rectifies the variance of the adaptive learning rate.
6 |
7 | The authors argue that the root cause of these convergence problems is that the adaptive learning rate has an undesirably large variance in the early stages of training, when only a limited number of training samples have been seen.
8 |
9 | RAdam deals with the large variance of the adaptive learning rate by adding a rectifier term:
10 |
11 | 
12 |
13 | ## Code
14 |
15 | - [RAdam Numpy Implementation](code/radam.py)
16 |
17 | ## Resources
18 |
19 | - [https://arxiv.org/abs/1908.03265](https://arxiv.org/abs/1908.03265)
20 | - [https://paperswithcode.com/method/radam](https://paperswithcode.com/method/radam)
21 | - [https://github.com/LiyuanLucasLiu/RAdam](https://github.com/LiyuanLucasLiu/RAdam)
22 | - [https://lessw.medium.com/new-state-of-the-art-ai-optimizer-rectified-adam-radam-5d854730807b](https://lessw.medium.com/new-state-of-the-art-ai-optimizer-rectified-adam-radam-5d854730807b)
--------------------------------------------------------------------------------
/Optimizers/radam/README.tex.md:
--------------------------------------------------------------------------------
1 | # RAdam - Rectified Adam
2 |
3 | 
4 |
5 | RAdam, or "Rectified Adam", is a variant of the Adam optimizer that addresses Adam's convergence problems by introducing a term that rectifies the variance of the adaptive learning rate.
6 |
7 | The authors argue that the root cause of these convergence problems is that the adaptive learning rate has an undesirably large variance in the early stages of training, when only a limited number of training samples have been seen.
8 |
9 | RAdam deals with the large variance of the adaptive learning rate by adding a rectifier term:
10 |
11 | 
12 |
13 | ## Code
14 |
15 | - [RAdam Numpy Implementation](code/radam.py)
16 |
17 | ## Resources
18 |
19 | - [https://arxiv.org/abs/1908.03265](https://arxiv.org/abs/1908.03265)
20 | - [https://paperswithcode.com/method/radam](https://paperswithcode.com/method/radam)
21 | - [https://github.com/LiyuanLucasLiu/RAdam](https://github.com/LiyuanLucasLiu/RAdam)
22 | - [https://lessw.medium.com/new-state-of-the-art-ai-optimizer-rectified-adam-radam-5d854730807b](https://lessw.medium.com/new-state-of-the-art-ai-optimizer-rectified-adam-radam-5d854730807b)
--------------------------------------------------------------------------------
/Optimizers/radam/code/radam.py:
--------------------------------------------------------------------------------
1 | # based on https://arxiv.org/pdf/1908.03265.pdf
2 |
3 | import numpy as np
4 |
5 |
6 | class RAdam:
7 | """RAdam
8 | Parameters:
9 | -----------
10 | learning_rate: float = 0.001
11 | The step length used when following the negative gradient.
12 | beta_1: float = 0.9
13 | The exponential decay rate for the 1st moment estimates.
14 | beta_2: float = 0.999
15 | The exponential decay rate for the 2nd moment estimates.
16 | """
17 | def __init__(self, learning_rate: float = 0.001, beta_1: float = 0.9, beta_2: float = 0.999) -> None:
18 | self.learning_rate = learning_rate
19 | self.beta_1 = beta_1
20 | self.beta_2 = beta_2
21 |
22 | self.p_max = 2 / (1 - self.beta_2) - 1
23 |
24 | self.t = 0
25 | self.m = None # Decaying averages of past gradients
26 | self.v = None # Decaying averages of past squared gradients
27 |
28 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray:
29 | self.t += 1
30 | if self.m is None:
31 | self.m = np.zeros(np.shape(grad_wrt_w))
32 | self.v = np.zeros(np.shape(grad_wrt_w))
33 |
34 | self.m = self.beta_1 * self.m + (1 - self.beta_1) * grad_wrt_w
35 | self.v = self.beta_2 * self.v + (1 - self.beta_2) * np.power(grad_wrt_w, 2)
36 |
37 | m_hat = self.m / (1 - self.beta_1**self.t)
38 | p_t = self.p_max - 2 * self.t * self.beta_2**self.t / (1 - self.beta_2**self.t)
39 |
40 | if p_t > 4:
41 | l_t = np.sqrt((1 - self.beta_2**self.t) / self.v)
42 | r_t = np.sqrt(((p_t - 4) * (p_t - 2) * self.p_max) / ((self.p_max - 4) * (self.p_max - 2) * p_t))
43 | w_update = self.learning_rate * r_t * m_hat * l_t
44 | else:
45 | w_update = self.learning_rate * m_hat
46 |
47 | return w - w_update
48 |
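# Example usage (illustrative): minimise the toy quadratic f(w) = w^2, whose
# gradient with respect to w is 2 * w, by calling update() repeatedly:
#
#   optimizer = RAdam(learning_rate=0.1)
#   w = np.array([5.0])
#   for _ in range(200):
#       w = optimizer.update(w, 2 * w)   # w moves towards the minimum at 0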
--------------------------------------------------------------------------------
/Optimizers/radam/doc/radam_adam_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/radam/doc/radam_adam_comparison.png
--------------------------------------------------------------------------------
/Optimizers/radam/doc/radam_update_rule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/radam/doc/radam_update_rule.png
--------------------------------------------------------------------------------
/Optimizers/rmsprop/README.md:
--------------------------------------------------------------------------------
1 | # RMSprop
2 |
3 | 
4 |
5 | RMSprop is an unpublished, adaptive learning rate optimization algorithm first proposed by [Geoff Hinton](https://en.wikipedia.org/wiki/Geoffrey_Hinton) in lecture 6 of his online class "[Neural Networks for Machine Learning](http://www.cs.toronto.edu/~hinton/coursera/lecture6/lec6.pdf)". RMSprop and Adadelta were developed independently around the same time, and both try to resolve Adagrad's diminishing learning rate problem. [1]
6 |
7 | 
8 | 
9 |
10 | The difference between Adadelta and RMSprop is that Adadelta removes the learning rate η entirely and replaces it with the root mean square of recent parameter updates.
11 |
12 | [1] Sebastian Ruder (2016). An overview of gradient descent optimization algorithms. arXiv preprint arXiv:1609.04747.
13 |
14 | ## Code
15 |
16 | * [RMSprop Numpy Implementation](code/rmsprop.py)
--------------------------------------------------------------------------------
/Optimizers/rmsprop/README.tex.md:
--------------------------------------------------------------------------------
1 | # RMSprop
2 |
3 | 
4 |
5 | RMSprop is an unpublished, adaptive learning rate optimization algorithm first proposed by [Geoff Hinton](https://en.wikipedia.org/wiki/Geoffrey_Hinton) in lecture 6 of his online class "[Neural Networks for Machine Learning](http://www.cs.toronto.edu/~hinton/coursera/lecture6/lec6.pdf)". RMSprop and Adadelta were developed independently around the same time, and both try to resolve Adagrad's diminishing learning rate problem. [1]
6 |
7 | $$E[g^2]_t = 0.9 E[g^2]_{t-1} + 0.1 g^2_t$$
8 |
9 | $$\theta_{t+1} = \theta_{t} - \dfrac{\eta}{\sqrt{E[g^2]_t + \epsilon}} g_{t}$$
10 |
11 | The difference between Adadelta and RMSprop is that Adadelta removes the learning rate $\eta$ entirely and replaces it with the root mean square of recent parameter updates.
12 |
13 | [1] Sebastian Ruder (2016). An overview of gradient descent optimization algorithms. arXiv preprint arXiv:1609.04747.
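
Written out, the two equations above amount to only a couple of lines of NumPy. The following is a minimal sketch (the learning rate, the decay factor 0.9 and $\epsilon$ are the usual placeholder values); the class-based version is linked under "Code":

```python
import numpy as np

eta, eps = 0.001, 1e-7   # learning rate and numerical-stability constant (placeholders)

def rmsprop_step(theta, grad, E_g2):
    E_g2 = 0.9 * E_g2 + 0.1 * grad**2                 # running average E[g^2]_t of squared gradients
    theta = theta - eta * grad / np.sqrt(E_g2 + eps)  # divide the step by the root of that average
    return theta, E_g2
```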
14 |
15 | ## Code
16 |
17 | * [RMSprop Numpy Implementation](code/rmsprop.py)
--------------------------------------------------------------------------------
/Optimizers/rmsprop/code/rmsprop.py:
--------------------------------------------------------------------------------
1 | # based on https://ruder.io/optimizing-gradient-descent/#rmsprop
2 | # and https://github.com/eriklindernoren/ML-From-Scratch/blob/master/mlfromscratch/deep_learning/optimizers.py#L88
3 |
4 | import numpy as np
5 |
6 |
7 | class RMSprop:
8 | """RMSprop
9 | Parameters:
10 | -----------
11 | learning_rate: float = 0.001
12 | The step length used when following the negative gradient.
13 | rho: float = 0.9
14 | Discounting factor (decay rate) for the moving average of squared gradients.
15 | epsilon: float = 1e-07
16 | A small floating-point value to avoid division by zero.
17 | """
18 | def __init__(self, learning_rate: float = 0.001, rho: float = 0.9, epsilon: float = 1e-7) -> None:
19 | self.learning_rate = learning_rate
20 | self.rho = rho
21 | self.epsilon = epsilon
22 | self.E_grad = None # Running average of the squared gradients at w
23 |
24 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray:
25 | if self.E_grad is None:
26 | self.E_grad = np.zeros(np.shape(grad_wrt_w))
27 |
28 | # Update running average of squared gradients at w
29 | self.E_grad = self.rho * self.E_grad + (1 - self.rho) * np.power(grad_wrt_w, 2)
30 |
31 | return w - self.learning_rate * grad_wrt_w / np.sqrt(self.E_grad + self.epsilon)
32 |
--------------------------------------------------------------------------------
/Optimizers/rmsprop/doc/rmsprop_example.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/rmsprop/doc/rmsprop_example.PNG
--------------------------------------------------------------------------------