├── .github └── FUNDING.yml ├── .gitignore ├── Activation_Functions ├── README.md ├── README.tex.md ├── code │ ├── elu.py │ ├── gelu.py │ ├── leaky_relu.py │ ├── mish.py │ ├── relu.py │ ├── selu.py │ ├── sigmoid.py │ ├── silu.py │ ├── softmax.py │ ├── softplus.py │ └── tanh.py ├── doc │ ├── Activation_Functions.png │ ├── Exponential_Linear_Unit.png │ ├── Gaussian_Error_Linear_Unit.png │ ├── Leaky_ReLU.png │ ├── Mish_Function.png │ ├── Parameteric_ReLU.png │ ├── Rectified_Linear_Unit.png │ ├── Scaled_Exponential_Linear_Unit.png │ ├── Sigmoid_Function.png │ ├── Sigmoid_Weighted_Linear_Unit_Swish.png │ ├── SoftPlus.png │ └── Tanh_Function.png └── tex │ ├── 1a3902d66dffcc33134633eb13a56e4a.svg │ ├── 2172629849e5868eaf600934f256c186.svg │ ├── 22b919815535e3da79a74831f137d534.svg │ ├── 290ecca72cd3c083c37a6bdff5f8d689.svg │ ├── 2d95939262cdc426890def2845d69e00.svg │ ├── 3e743f8c72715fac3f04a831660936ed.svg │ ├── 400293c7745c1271a610177098dbe49c.svg │ ├── 57007cfe55ba83df3eeedbdc9d6485b4.svg │ ├── 5e156666e8767505b7fdc17f061898f7.svg │ ├── 61916d70fa806c731f6b8e12a081fdc2.svg │ ├── 754c5b79c77621fd1c89885a39b8d291.svg │ ├── 789a1f0365c3e83c7d1dc4a8b10d0acf.svg │ ├── 7e28db664ad627340f7fda25a290ac36.svg │ ├── 7eb4be07a0429a57780410969ed58d1a.svg │ ├── 80c7c3a438606431b27cc86bce2f0135.svg │ ├── 822646f49afad2437610e66ee730bef7.svg │ ├── 83803c6cf357e7afb8cdabf1e530ea97.svg │ ├── 9b171bd87aa286bf84d6621ea1204017.svg │ ├── a44d2c33ac06d2b68df258ffa4e311c6.svg │ ├── a4bbb3b4859a057a266b6c31e636abc7.svg │ ├── a5d8a53e48a44e595830cd70188848a5.svg │ ├── a7441e8a4f2fdb45cfc82da527cbafed.svg │ ├── b106834be35dfc293ba97ae8fbe93673.svg │ ├── b12dfd5e9d8bfe92d02c115de29172d4.svg │ ├── b5f56261f1d93afbbe17f2cba27d68d9.svg │ ├── d0f701c20d414f274f5a81ef8eb6be5c.svg │ └── e859654ddf616a4d426f9a15ef699144.svg ├── Algorithms ├── adaboost │ ├── README.md │ ├── README.tex.md │ ├── code │ │ └── adaboost.py │ ├── doc │ │ ├── adaboost.png │ │ ├── adaboost_training.gif │ │ ├── alpha.png │ │ └── decision_stump.PNG │ └── tex │ │ ├── 00cdc31549c67b60c6dff38106fea53a.svg │ │ ├── 3826eeb617fdc1a5c8840e859a7dafbb.svg │ │ ├── 447a46ee6fdce8100ddf3d57c464612b.svg │ │ ├── 4f4f4e395762a3af4575de74c019ebb5.svg │ │ ├── b4128148f8163b17d8269f72bf4e6d74.svg │ │ ├── c2a29561d89e139b3c7bffe51570c3ce.svg │ │ ├── cbfb1b2a33b28eab8a3e59464768e810.svg │ │ └── dc56b266dfc19aea6656ef2dde1f1f14.svg ├── dbscan │ ├── README.md │ ├── README.tex.md │ ├── code │ │ └── dbscan.py │ └── doc │ │ ├── dbscan.gif │ │ └── results.png ├── decision_tree │ ├── README.md │ ├── README.tex.md │ ├── code │ │ ├── decision_tree_classification.py │ │ ├── decision_tree_regression.py │ │ └── visualize_decision_trees_with_graphviz.py │ ├── doc │ │ ├── iris_decision_surface.png │ │ ├── iris_decision_tree.png │ │ ├── plot_tree.png │ │ └── titanic_example.jpg │ └── tex │ │ ├── 3952bc7dadde93e3af8e54d66588d8b9.svg │ │ ├── 55fafb270a7563e9c79658b7e1a606e2.svg │ │ └── 99b4cda42ce5d6085705dc7458181012.svg ├── gradient_boosting │ └── code │ │ ├── gradient_boosting_classifier.py │ │ └── gradient_boosting_regressor.py ├── k_nearest_neighbors │ ├── README.md │ ├── README.tex.md │ ├── code │ │ ├── k_nearest_neighbors.py │ │ └── k_nearest_neighbors_regression.py │ ├── doc │ │ ├── effect_of_k.png │ │ ├── effect_of_k_2.png │ │ └── euclidean_distance.svg │ └── tex │ │ ├── 48e45a94b5215298962054c17e895faf.svg │ │ └── a44a9e5e7f3ef9019ae9a21dbb98f40f.svg ├── kernel_pca │ ├── README.md │ ├── README.tex.md │ ├── code │ │ └── kernel_pca.py │ ├── doc │ │ └── kernel_pca.png │ └── tex │ │ 
├── 12e6d8a64abd9854079af8b0622eb86a.svg │ │ ├── 1cef6a8d14b34297d97d3e1cf812ff5c.svg │ │ ├── 28a3c6f9dc75c8bf1b3498bbcea108be.svg │ │ ├── 3821cc82b4d7dc6624ec03fd5a93dffc.svg │ │ ├── 65ed4b231dcf18a70bae40e50d48c9c0.svg │ │ ├── 87524c1390370d418a3be6af1b4136c5.svg │ │ ├── 88a947ead8011f566945b4f207fde1a8.svg │ │ ├── a6096ac2cee42d8fa76ec9110eb9c598.svg │ │ └── d6328eaebbcd5c358f426dbea4bdbf70.svg ├── kmeans │ ├── README.md │ ├── README.tex.md │ ├── code │ │ └── kmeans.py │ ├── doc │ │ ├── choose_k_value.jpeg │ │ ├── elbow_method_using_yellowbrick.png │ │ ├── k_means.gif │ │ ├── noisy_circles_with_true_output.png │ │ ├── noisy_moons_with_true_output.png │ │ ├── silhouette_analysis_3_clusters.jpeg │ │ ├── silhouette_analysis_4_clusters.jpeg │ │ ├── silhouette_analysis_5_clusters.jpeg │ │ └── two_lines.png │ └── tex │ │ ├── 065cfac694daeb1fff1264475e035c67.svg │ │ ├── 16ceb724dafaab6c19cf71bc5c460244.svg │ │ ├── 1a567506286617473a9c0d9b2172f951.svg │ │ ├── 43ca5ad9e1f094a31392f860ef481e5c.svg │ │ ├── 44bc9d542a92714cac84e01cbbb7fd61.svg │ │ ├── 4bdc8d9bcfb35e1c9bfb51fc69687dfc.svg │ │ ├── 5c9a23f70c5920444f4613242c1e95fb.svg │ │ ├── 5d2031093fe35c15cf01b562bab7d54f.svg │ │ ├── 77a3b857d53fb44e33b53e4c8b68351a.svg │ │ ├── b3520dc7da5f9731724eb6e1768a45a7.svg │ │ ├── b776953fbf2b14971aa17331a8640386.svg │ │ ├── db0e77b2ab4f495dea1f5c5c08588288.svg │ │ └── f0c3f612efc905c5a416138c62517a36.svg ├── linear_discriminant_analysis │ ├── README.md │ ├── README.tex.md │ ├── code │ │ └── linear_discriminant_analysis.py │ ├── doc │ │ └── lda_example.png │ └── tex │ │ ├── 021a2e6a7f973e9edb8dcb0bf5bda569.svg │ │ ├── 0aa7f58b7e561001f5301aa03507f552.svg │ │ ├── 0e51a2dede42189d77627c4d742822c3.svg │ │ ├── 3bf9c1fe4273ed003fd49e744378a5ac.svg │ │ ├── 47b592a798cd56ccf668b67abad36a61.svg │ │ ├── 518542ce2a067b399803d0396d9c5aae.svg │ │ ├── 5a163b5cb124f209aed344b8f61b493f.svg │ │ ├── 61daf4e5401f3020b1b0bfefbbd0e59e.svg │ │ ├── 63bb9849783d01d91403bc9a5fea12a2.svg │ │ ├── 66a81133e5715952856e2a06741f4676.svg │ │ ├── 6711c7bae84526c845527391cb33d2e5.svg │ │ ├── 84c95f91a742c9ceb460a83f9b5090bf.svg │ │ ├── 874357dd0ff10af024f68c608dfc7a98.svg │ │ ├── a9ba65368f9892beab04bf21d7e17b4f.svg │ │ ├── c7eee0782fa9ccb115b1518f68c8908f.svg │ │ ├── cbfb1b2a33b28eab8a3e59464768e810.svg │ │ ├── d28140eda2d12e24b434e011b930fa23.svg │ │ ├── d8cf0d84a4e9973bace4607b359224f4.svg │ │ ├── deceeaf6940a8c7a5a02373728002b0f.svg │ │ └── fcda2be66b20dba76606c4f982b63b60.svg ├── linear_regression │ ├── README.md │ ├── README.tex.md │ ├── code │ │ ├── elastic_net.py │ │ ├── lasso_regression.py │ │ ├── linear_regression_explained.ipynb │ │ ├── multivariate_linear_regression.py │ │ ├── normal_equation.py │ │ ├── polynomial_regression.py │ │ ├── ridge_regression.py │ │ └── simple_linear_regression.py │ ├── doc │ │ ├── linear_regression_example.png │ │ └── regularization.png │ └── tex │ │ ├── 0822727d1cb885ac043eb8c23c6a8c06.svg │ │ ├── 18813fabfad59d1ba84fc901ede9101f.svg │ │ ├── 2d3d16f648bb613710e8ed0a19f2fe17.svg │ │ ├── 4b4518f1b7f0fb1347fa21506ebafb19.svg │ │ ├── 4bf055a6a961b27706b75bc7e08a0f29.svg │ │ ├── 660ef60b693132606dcc3aae53b147ca.svg │ │ ├── 695de53e837a94510d8695f780f764d1.svg │ │ ├── 87a75da6a417d9d9fd57f0b9b24473d2.svg │ │ ├── ac342f337b60a671151324a7a222d777.svg │ │ ├── c116dfb62bb6eadf90bac11393f97a66.svg │ │ ├── c2a29561d89e139b3c7bffe51570c3ce.svg │ │ ├── cbfb1b2a33b28eab8a3e59464768e810.svg │ │ ├── deceeaf6940a8c7a5a02373728002b0f.svg │ │ ├── e37355cc0b5b07561247c00842519c04.svg │ │ ├── 
eedb3ae6d88cd2296e4c9acfe5658b09.svg │ │ ├── ef27eeeeeadc48f3a48118fbf65ff125.svg │ │ └── f28aee7ec74570ba081a608f7b5d88bb.svg ├── logistic_regression │ ├── README.md │ ├── README.tex.md │ ├── code │ │ ├── custom_implementation_vs_sklearn.ipynb │ │ ├── data │ │ │ └── heart.csv │ │ ├── logistic_regression.py │ │ └── one_vs_all_logistic_regression.py │ ├── doc │ │ ├── classification_vs_regression.jpeg │ │ ├── convex_vs_non_convex.png │ │ ├── logistic_regression_decision_boundary.png │ │ ├── loss_functions.png │ │ ├── one_vs_all.png │ │ ├── overfitting_vs_underfitting.png │ │ └── sigmoid.png │ └── tex │ │ ├── 068f41ab65ac2dc66989bc4b34ac6269.svg │ │ ├── 07eebf05477a153a80ab3a1706b61874.svg │ │ ├── 1426b496f93f9ac9c247d2c6b9feb304.svg │ │ ├── 4f5fc085fbff8d9f0739164d34742fe9.svg │ │ ├── 5bf04b3414ef400138e14332d52bd2a2.svg │ │ ├── 5fb3811ee1edea3bc2a48d40d4db41aa.svg │ │ ├── 608251b1bb31fecdd0617348db9b9a4c.svg │ │ ├── 782c35b03d81084d082d0684a07ff03d.svg │ │ ├── 8892ac1f6b1e6ffec35850296b02ec60.svg │ │ ├── 949323d1374941432d95b3af55636269.svg │ │ ├── 95413250c774b015e5c7ae8f011b158c.svg │ │ ├── b348a8293e0acf9556a5a0a7e5fe9441.svg │ │ ├── d866685970a1f6603602b10a76c5bf0e.svg │ │ ├── f65d6c31a76358fd63c7ad13b74c5b2c.svg │ │ ├── f6e98405de2edfa03d384e436eb4a6e6.svg │ │ └── fd8be73b54f5436a5cd2e73ba9b6bfa9.svg ├── mean_shift │ ├── README.md │ ├── README.tex.md │ ├── code │ │ └── mean_shift.py │ └── doc │ │ ├── choose_bandwidth.png │ │ ├── cluster_comparison.png │ │ ├── kde_plot.png │ │ ├── mean_shift.gif │ │ ├── noisy_circles.png │ │ ├── noisy_moons.png │ │ └── two_lines.png ├── principal_component_analysis │ ├── README.md │ ├── README.tex.md │ ├── code │ │ └── principal_component_analysis.py │ ├── doc │ │ └── pca_example.png │ └── tex │ │ ├── 0aa7f58b7e561001f5301aa03507f552.svg │ │ ├── 63bb9849783d01d91403bc9a5fea12a2.svg │ │ ├── 84c95f91a742c9ceb460a83f9b5090bf.svg │ │ └── a9ba65368f9892beab04bf21d7e17b4f.svg └── random_forest │ ├── README.md │ ├── README.tex.md │ ├── code │ ├── eli5_feature_importance_example.py │ ├── random_forest_classifier.py │ ├── random_forest_regressor.py │ ├── scikit-learn │ │ ├── feature_importance_example.py │ │ └── out_of_bag_error_example.py │ └── shap_feature_importance_example.py │ ├── doc │ ├── bootstrapping_vertical.png │ ├── decision_tree.png │ ├── feature_importance.png │ ├── out_of_bag_set.png │ ├── random_forest_pipeline_horizontal_vertical.png │ └── selecting_a_random_subset_of_variables_vertical.png │ └── tex │ └── 9fc20fb1d3825674c6a279cb0d5ca636.svg ├── CONTRIBUTING.md ├── Ensemble_Methods └── code │ ├── averaging.py │ ├── bagging.py │ ├── blending.py │ ├── majority_vote.py │ ├── stacking.py │ ├── stacking_retrained.py │ └── weighted_average.py ├── LICENSE ├── Metrics ├── README.md ├── README.tex.md ├── code │ ├── accuracy_score.py │ ├── binary_cross_entropy.py │ ├── brier_score.py │ ├── categorical_cross_entropy.py │ ├── cosine_distance.py │ ├── d2_score.py │ ├── f1_score.py │ ├── fbeta_score.py │ ├── hinge.py │ ├── huber.py │ ├── kl_divergence.py │ ├── logcosh.py │ ├── mean_absolute_error.py │ ├── mean_absolute_percentage_error.py │ ├── mean_squared_error.py │ ├── mean_squared_log_error.py │ ├── median_absolute_error.py │ ├── poisson.py │ ├── precision.py │ ├── r2_score.py │ ├── recall.py │ └── tweedie_deviance.py ├── doc │ ├── binary_cross_entropy.png │ └── confusion_matrix.png └── tex │ ├── 0a5c2da8007e2edc6de9ca962be3f3ed.svg │ ├── 0df67ef21a0ddee56433ca033cb933c1.svg │ ├── 15a86bf084c2654dfd8c0ab4ddda5bb3.svg │ ├── 
1ff5c2fb18f358c5a53d9f38bb1538b8.svg │ ├── 202a192d4715ffd00cf289c10c107b43.svg │ ├── 282f38ecf82d8d7b9d2813044262d5f3.svg │ ├── 36b5afebdba34564d884d347484ac0c7.svg │ ├── 44bc9d542a92714cac84e01cbbb7fd61.svg │ ├── 4fe48dde86ac2d37419f0b35d57ac460.svg │ ├── 53446b529aaec55cc9c04abff12141f8.svg │ ├── 5cd6e6c44dcdc5d9134e7ff6c5b812fc.svg │ ├── 5ce5d6877b4b1485ff9b0a48a56e5f97.svg │ ├── 61e1a35fbe056f586e6a9dbc645eabb7.svg │ ├── 735371fbbd0b21c453edc23b25d47a60.svg │ ├── 77a3b857d53fb44e33b53e4c8b68351a.svg │ ├── 793b0453fad52e1901e19f8c4489cace.svg │ ├── 8217ed3c32a785f0b5aad4055f432ad8.svg │ ├── 86bbcafb36f7dfddde972e1b47296b4c.svg │ ├── 894224f3dc1a64562c781eff86cad001.svg │ ├── 8a1f6bce1cca2d7cb34ee00ca6d18614.svg │ ├── 8c8cdc49efc1e1ac95c5baf72e69b4e8.svg │ ├── 8cdee07f9c86dc6c56f28b9f8fb8ae6d.svg │ ├── 928194bd8bb89cb48374d0ab69a41c69.svg │ ├── 9883db76caed72638544fbc209d7e157.svg │ ├── a1b798ffc158c4ee0b440f4114c4f1c0.svg │ ├── a92f489b7bf58458ad9a831191712560.svg │ ├── bfcf5229cb3b2eb7b6472152c5538e88.svg │ ├── c0f72f6ec2f0d5623ef75e15d1a9f197.svg │ ├── c821543a0ee6e81d1a637188ab98345e.svg │ ├── cdfab9c39216e8f199b80bc0590823ca.svg │ ├── ce9e403e07bb796a5a4aea8e9aea8727.svg │ ├── cf0c74f647a60274739e82cc935d32e4.svg │ ├── cf644cbd499c18ed6f22cee5950c0d75.svg │ ├── d5d6a7178f9ca2be9eab3bf855709944.svg │ ├── d821640e564d2c34dbc9ee887fb60ca1.svg │ ├── d8bc4fe1fed0596068b06f14dc5b6186.svg │ ├── db850e0baa86d7832b5c75d7c4488d78.svg │ ├── deceeaf6940a8c7a5a02373728002b0f.svg │ ├── e1a2df39f105072461870caf8fa0e344.svg │ ├── e4f967b6c1927904b60f77385e187da6.svg │ ├── e9999393d8d1b46365ba09586571c55d.svg │ ├── ee9dc84d168b211ff9f4b354e295af3c.svg │ ├── f1128d54a4a5ff0cc3a487dc3f920c62.svg │ └── ffa6eb731ed4996ab83caa1c630b1b9f.svg ├── Optimizers ├── adadelta │ ├── README.md │ ├── README.tex.md │ ├── code │ │ └── adadelta.py │ ├── doc │ │ └── adadelta_example.png │ └── tex │ │ ├── 11c596de17c342edeed29f489aa4b274.svg │ │ ├── 15431539b7b73e500cc0fd3d7e0af147.svg │ │ ├── 16423efbb7c672354f022590a8f79ed2.svg │ │ ├── 20aafbd370a6b88bfacab3c7c49d8aa8.svg │ │ ├── 2de22b33302abdf5e16b99d95e6bf125.svg │ │ ├── 3ca6889677ea09e526a816322160498f.svg │ │ ├── 407764bb35619057e9230a563546d02a.svg │ │ ├── 4764741b6721dc727ba86e4c3ea5d106.svg │ │ ├── 4f4f4e395762a3af4575de74c019ebb5.svg │ │ ├── 8618e3e0464e1c4ae3ba41984874fa33.svg │ │ ├── 8fdfd1eb52433d071078828592da25cc.svg │ │ ├── 9d55fd72b8efdeca23093c2ed0ea5745.svg │ │ ├── a9c79740e927ca20df42f2ac49811782.svg │ │ ├── ae4fb5973f393577570881fc24fc2054.svg │ │ ├── ae81fc362ffffea791fb10239e17378e.svg │ │ ├── b999b985f5ccf08b3fce39e97a1c63b8.svg │ │ └── e7d2063bdcfb3dfdb3f44724950543d1.svg ├── adagrad │ ├── README.md │ ├── README.tex.md │ ├── code │ │ └── adagrad.py │ ├── doc │ │ └── adagrad_example.gif │ └── tex │ │ ├── 43af08929a34d369038ea5f29d4b9cad.svg │ │ ├── 45913b7ee3a34648c53cb1db66c97d75.svg │ │ ├── 4f4f4e395762a3af4575de74c019ebb5.svg │ │ ├── 7ccca27b5ccc533a2dd72dc6fa28ed84.svg │ │ ├── ad3e2cec2e4e99bcb40a19ecda561e56.svg │ │ ├── b1cc9c4f3f1d62306a8d45977e8f2946.svg │ │ ├── c745b9b57c145ec5577b82542b2df546.svg │ │ ├── db14eb9fda4448bde6e9d57897df8aae.svg │ │ └── f166369f3ef0a7ff052f1e9bbf57d2e2.svg ├── adam │ ├── README.md │ ├── README.tex.md │ ├── code │ │ └── adam.py │ ├── doc │ │ └── adam_example.PNG │ └── tex │ │ ├── 15ef3b23ef739e47090fa0825bf9d390.svg │ │ ├── 1c22e0ed21fd53f1f1d04d22d5d21677.svg │ │ ├── 285dbe2a851d6e35501b39511115cd05.svg │ │ ├── 2cae3bbfffb6ab2858054ba28bfcba80.svg │ │ ├── 2feec3f6a85bfa367ca19d5e6d7002e6.svg │ │ ├── 
3e3c6ee78813607a4d976d92c19dd36e.svg │ │ ├── 4ea6f1054f33b2fe4ccc258e940fdce1.svg │ │ ├── 824123b152beebd863c67856d33ed802.svg │ │ ├── a53a375441275f24641fc239deb138cb.svg │ │ ├── b65d13242f56b3410177b1401dd8b7e8.svg │ │ ├── ddb44cc6d9b5fa907d7e2d60daed1bca.svg │ │ ├── f24bd5b399fcd2f1620d8978d4c3d069.svg │ │ └── f4bee786ed43433221a48b27a5ed87ec.svg ├── adamax │ ├── README.md │ ├── README.tex.md │ ├── code │ │ └── adamax.py │ ├── doc │ │ └── adamax_example.PNG │ └── tex │ │ ├── 2ec6e630f199f589a2402fdf3e0289d5.svg │ │ ├── 336fefe2418749fabf50594e52f7b776.svg │ │ ├── 34ec2fa234397799e854fa7109da32c2.svg │ │ ├── 3e3c6ee78813607a4d976d92c19dd36e.svg │ │ ├── 485b078316d575b8a3edd55921040580.svg │ │ ├── 5f5bf3f4ba1dd968b4cf5449b4310370.svg │ │ ├── 6859140733d250349cb7e3623130b8d7.svg │ │ ├── 839a0dc412c4f8670dd1064e0d6d412f.svg │ │ ├── c88595da993fcae459ef526daedd66d7.svg │ │ ├── c8a984d1a187544cc1d3132786b791b3.svg │ │ ├── ca185a0f63add2baa6fe729fd1cfef60.svg │ │ └── e6897b8647f3bd38144535d3f40078e2.svg ├── adamw │ ├── README.md │ ├── README.tex.md │ ├── code │ │ └── adamw.py │ └── doc │ │ └── adamw.png ├── amsgrad │ ├── README.md │ ├── README.tex.md │ ├── code │ │ └── amsgrad.py │ ├── doc │ │ └── amsgrad_example.png │ └── tex │ │ ├── 44e392b0bc182e02eec7fbcb32745a0a.svg │ │ ├── 824123b152beebd863c67856d33ed802.svg │ │ └── d3f0f052c885b9de14f9b3438d1ba9f0.svg ├── gradient_descent │ ├── README.md │ ├── README.tex.md │ ├── code │ │ ├── gradient_descent_with_momentum.py │ │ └── gradient_descent_with_nesterov_momentum.py │ ├── doc │ │ ├── gradient_descent.gif │ │ ├── momentum.png │ │ ├── nesterov_accelerated_gradient.png │ │ ├── pick_learning_rate.png │ │ └── variations_comparison.png │ └── tex │ │ ├── 11c596de17c342edeed29f489aa4b274.svg │ │ ├── 19f7986adf26d94218ca0cb10277f8e4.svg │ │ ├── 1c5aa1876430bbdf7dcd8f9e641ac830.svg │ │ ├── 1f52020ae24caeeaeeb316d2525450a2.svg │ │ ├── 27e556cf3caa0673ac49a8f0de3c73ca.svg │ │ ├── 386e10624041d64770c6785c1034b111.svg │ │ ├── 55a049b8f161ae7cfeb0197d75aff967.svg │ │ ├── 62b65f92d15f5423073762ffe8477b86.svg │ │ ├── 666d1825fe38f52f9b0a01c2721dc4c8.svg │ │ ├── 708d9d53037c10f462707daa2370b7df.svg │ │ ├── 9691e94c3d0d9932f20e8f32a7908dd0.svg │ │ ├── ad769e751231d17313953f80471b27a4.svg │ │ ├── b9a39f2717502925e401654007e07bfd.svg │ │ ├── bec0f956437138a98cb909f5dae6b77f.svg │ │ ├── c745b9b57c145ec5577b82542b2df546.svg │ │ ├── ca79e4e55e2ba419b202c4c9576a0d0e.svg │ │ ├── d905c0dba806bdd8413af4aefb15d0be.svg │ │ ├── e1977b3bd8b60ca5e8e3c3b921470696.svg │ │ ├── e45b3f899e65ddee5e073ecf63f17efb.svg │ │ └── f6ba11db1e6b10797a9ebcc12aeda2dc.svg ├── nadam │ ├── README.md │ ├── README.tex.md │ ├── code │ │ └── nadam.py │ ├── doc │ │ └── nadam_example.png │ └── tex │ │ ├── b860e63de84df769d7d9d6ce9295ba65.svg │ │ └── ddb44cc6d9b5fa907d7e2d60daed1bca.svg ├── qhadam │ ├── README.md │ ├── README.tex.md │ ├── code │ │ └── qhadam.py │ ├── doc │ │ └── qhadam_example.png │ └── tex │ │ ├── 41922e474070adc90e7c1379c28d22fe.svg │ │ ├── 53292819177dbb29ba6d92fe3aa2880c.svg │ │ └── bcf57c8141818aa66812cefcf9d1a886.svg ├── qhm │ ├── README.md │ ├── README.tex.md │ ├── code │ │ └── qhm.py │ ├── doc │ │ └── qhm_update_rule.PNG │ └── tex │ │ ├── 4d5efe3f0b61407442322e122c778e4b.svg │ │ ├── 693bbf447e9497167127d798d1d144cc.svg │ │ ├── 8217ed3c32a785f0b5aad4055f432ad8.svg │ │ ├── ba749f44b6808f949e9a35e0236f98c8.svg │ │ ├── e59e9e7497e95821f127a65a4f975e55.svg │ │ └── f9acdf2e58c905cd2502b16cd0f720c9.svg ├── radam │ ├── README.md │ ├── README.tex.md │ ├── code │ │ └── radam.py │ └── 
doc │ │ ├── radam_adam_comparison.png │ │ └── radam_update_rule.png └── rmsprop │ ├── README.md │ ├── README.tex.md │ ├── code │ └── rmsprop.py │ ├── doc │ └── rmsprop_example.PNG │ └── tex │ ├── 1d0496971a2775f4887d1df25cea4f7e.svg │ ├── 4d0c5cb8a4df5487f9457948069c0c86.svg │ ├── f22bcfcdd9fd04ced0345fc97d620463.svg │ └── fffcaba90180362da033429f55895e5a.svg └── README.md /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: TannerGilbert 4 | patreon: gilberttanner 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.ipynb_checkpoints 2 | *__pycache__ 3 | *.vscode 4 | *.idea -------------------------------------------------------------------------------- /Activation_Functions/code/elu.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class ELU: 5 | def __init__(self, alpha: float = 1.0) -> None: 6 | self.alpha = alpha 7 | 8 | def __call__(self, x: np.ndarray) -> np.ndarray: 9 | return np.where(x >= 0.0, x, self.alpha * (np.exp(x) - 1.0)) 10 | 11 | def gradient(self, x: np.ndarray) -> np.ndarray: 12 | return np.where(x >= 0.0, 1.0, self.alpha * np.exp(x)) 13 | -------------------------------------------------------------------------------- /Activation_Functions/code/gelu.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.special import erf 3 | 4 | 5 | class GELU: 6 | def __call__(self, x: np.ndarray, approximate: bool = True) -> np.ndarray: 7 | if approximate: 8 | return x * 0.5 * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3)))) 9 | return x * 0.5 * (1.0 + erf(x / np.sqrt(2.0))) 10 | 11 | def gradient(self, x: np.ndarray, approximate: bool = True) -> np.ndarray: 12 | if approximate: 13 | return 0.5 * np.tanh(0.0356774 * np.power(x, 3) + 0.797885 * x) + (0.0535161 * np.power(x, 3) + 0.398942 * x) * np.power(1 / np.cosh(x), 2) * (0.0356774 * np.power(x, 3) + 0.797885 * x) + 0.5 14 | return 0.5 * (1.0 + erf(x / np.sqrt(2.0))) + x * 1 / np.sqrt(2 * np.pi) * np.exp(-0.5 * np.power(x, 2)) 15 | -------------------------------------------------------------------------------- /Activation_Functions/code/leaky_relu.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class LeakyReLU: 5 | def __init__(self, alpha: float = 0.3) -> None: 6 | self.alpha = alpha 7 | 8 | def __call__(self, x: np.ndarray) -> np.ndarray: 9 | return np.where(x >= 0.0, x, self.alpha * x) 10 | 11 | def gradient(self, x: np.ndarray) -> np.ndarray: 12 | return np.where(x >= 0.0, 1.0, self.alpha) 13 | -------------------------------------------------------------------------------- 
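All of the activation classes in this folder share the same small interface: calling the object applies the function element-wise, and `gradient` returns the analytic derivative at the same points. A minimal usage sketch, assuming the modules above are importable from the current directory (the import paths below are illustrative, not part of the repository):

```python
# Minimal usage sketch (assumes elu.py, gelu.py and leaky_relu.py are on the import path;
# module and class names mirror the files shown above).
import numpy as np
from elu import ELU
from gelu import GELU
from leaky_relu import LeakyReLU

x = np.linspace(-3.0, 3.0, 7)

for act in (ELU(alpha=1.0), GELU(), LeakyReLU(alpha=0.3)):
    y = act(x)            # forward pass, element-wise
    dy = act.gradient(x)  # analytic derivative at the same points
    print(type(act).__name__, y.round(3), dy.round(3))
```

Keeping the forward pass and its derivative on one object is what lets a from-scratch network backpropagate through these activations without any framework support.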
/Activation_Functions/code/mish.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Mish: 5 | def __call__(self, x: np.ndarray) -> np.ndarray: 6 | return x * np.tanh(np.log(1 + np.exp(x))) 7 | 8 | def gradient(self, x: np.ndarray) -> np.ndarray: 9 | return (np.exp(x) * (4*np.exp(2*x) + np.exp(3*x) + 4*(1+x) + np.exp(x)*(6+4*x))) / np.power(2 + 2*np.exp(x) + np.exp(2*x), 2) 10 | -------------------------------------------------------------------------------- /Activation_Functions/code/relu.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | import numpy as np 3 | 4 | 5 | class ReLU: 6 | def __call__(self, x: Union[list, np.ndarray]) -> np.ndarray: 7 | return np.maximum(x, 0.0) 8 | 9 | def gradient(self, x: np.ndarray) -> np.ndarray: 10 | return np.where(x >= 0.0, 1.0, 0.0) 11 | -------------------------------------------------------------------------------- /Activation_Functions/code/selu.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class SELU: 5 | def __init__(self): 6 | self.alpha = 1.6732632423543772848170429916717 7 | self.scale = 1.0507009873554804934193349852946 8 | 9 | def __call__(self, x: np.ndarray) -> np.ndarray: 10 | return self.scale * np.where(x >= 0.0, x, self.alpha * (np.exp(x) - 1.0)) 11 | 12 | def gradient(self, x: np.ndarray) -> np.ndarray: 13 | return self.scale * np.where(x >= 0.0, 1.0, self.alpha * np.exp(x)) 14 | -------------------------------------------------------------------------------- /Activation_Functions/code/sigmoid.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Sigmoid: 5 | def __call__(self, x: np.ndarray) -> np.ndarray: 6 | return 1 / (1 + np.exp(-x)) 7 | 8 | def gradient(self, x: np.ndarray) -> np.ndarray: 9 | return self.__call__(x) * (1 - self.__call__(x)) 10 | -------------------------------------------------------------------------------- /Activation_Functions/code/silu.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class SiLU: 5 | def __call__(self, x: np.ndarray) -> np.ndarray: 6 | return x / (1 + np.exp(-x)) 7 | 8 | def gradient(self, x: np.ndarray) -> np.ndarray: 9 | return (1 + np.exp(-x) + x * np.exp(-x)) / np.power(1 + np.exp(-x), 2) 10 | -------------------------------------------------------------------------------- /Activation_Functions/code/softmax.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | import numpy as np 3 | 4 | 5 | class Softmax: 6 | def __call__(self, x: Union[list, np.ndarray]) -> np.ndarray: 7 | e_x = np.exp(x - np.max(x)) 8 | return e_x / e_x.sum(axis=0) 9 | 10 | def gradient(self, x: Union[list, np.ndarray]) -> np.ndarray: 11 | p = self.__call__(x) 12 | return p * (1 - p) 13 | -------------------------------------------------------------------------------- /Activation_Functions/code/softplus.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class SoftPlus: 5 | def __call__(self, x: np.ndarray) -> np.ndarray: 6 | return np.log(1 + np.exp(x)) 7 | 8 | def gradient(self, x: np.ndarray) -> np.ndarray: 9 | return 1 / (1 + np.exp(-x)) 10 | -------------------------------------------------------------------------------- 
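Because every class pairs `__call__` with an analytic `gradient`, a central-difference check is a cheap way to validate the hand-derived formulas. This is a sketch under the same import assumptions as above, not a file from the repository:

```python
# Central-difference gradient check for the scalar activation classes above
# (illustrative; assumes sigmoid.py and softplus.py are importable as shown).
import numpy as np
from sigmoid import Sigmoid
from softplus import SoftPlus

def check_gradient(act, x, eps=1e-6):
    # numeric derivative via central differences vs. the class's analytic gradient
    numeric = (act(x + eps) - act(x - eps)) / (2 * eps)
    return np.max(np.abs(numeric - act.gradient(x)))

x = np.random.randn(1000)
for act in (Sigmoid(), SoftPlus()):
    print(type(act).__name__, check_gradient(act, x))  # should be tiny, roughly 1e-8 or smaller
```

Note that `Softmax.gradient` returns only the diagonal of the Jacobian (each output also depends on the other inputs), so an element-wise check like this is only meaningful for the scalar activations.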
/Activation_Functions/code/tanh.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class TanH: 5 | def __call__(self, x: np.ndarray) -> np.ndarray: 6 | return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x)) 7 | 8 | def gradient(self, x: np.ndarray) -> np.ndarray: 9 | return 1 - np.power(self.__call__(x), 2) -------------------------------------------------------------------------------- /Activation_Functions/doc/Activation_Functions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Activation_Functions.png -------------------------------------------------------------------------------- /Activation_Functions/doc/Exponential_Linear_Unit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Exponential_Linear_Unit.png -------------------------------------------------------------------------------- /Activation_Functions/doc/Gaussian_Error_Linear_Unit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Gaussian_Error_Linear_Unit.png -------------------------------------------------------------------------------- /Activation_Functions/doc/Leaky_ReLU.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Leaky_ReLU.png -------------------------------------------------------------------------------- /Activation_Functions/doc/Mish_Function.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Mish_Function.png -------------------------------------------------------------------------------- /Activation_Functions/doc/Parameteric_ReLU.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Parameteric_ReLU.png -------------------------------------------------------------------------------- /Activation_Functions/doc/Rectified_Linear_Unit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Rectified_Linear_Unit.png -------------------------------------------------------------------------------- /Activation_Functions/doc/Scaled_Exponential_Linear_Unit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Scaled_Exponential_Linear_Unit.png -------------------------------------------------------------------------------- /Activation_Functions/doc/Sigmoid_Function.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Sigmoid_Function.png -------------------------------------------------------------------------------- /Activation_Functions/doc/Sigmoid_Weighted_Linear_Unit_Swish.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Sigmoid_Weighted_Linear_Unit_Swish.png -------------------------------------------------------------------------------- /Activation_Functions/doc/SoftPlus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/SoftPlus.png -------------------------------------------------------------------------------- /Activation_Functions/doc/Tanh_Function.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Activation_Functions/doc/Tanh_Function.png -------------------------------------------------------------------------------- /Algorithms/adaboost/README.tex.md: -------------------------------------------------------------------------------- 1 | # AdaBoost - Adaptive Boosting 2 | 3 | ![Adaboost Decision Boundary](doc/adaboost.png) 4 | 5 | AdaBoost, short for **Ada**ptive [**Boost**ing](https://en.wikipedia.org/wiki/Boosting_(meta-algorithm)), of Freund and Schapire, was the first practical boosting algorithm and remains one of the most widely used and studied ones even today. Boosting is a general strategy for learning "strong models" by combining multiple simpler ones (weak models or weak learners). 6 | 7 | A "weak learner" is a model that will do at least slightly better than chance. AdaBoost can be applied to any classification algorithm, but most often, it's used with **Decision Stumps** - Decision Trees with only one node and two leaves. 8 | 9 | ![Decision Stump](doc/decision_stump.PNG) 10 | 11 | Decision Stumps alone are not an excellent way to make predictions. A full-grown decision tree combines the decisions from all features to predict the target value. A stump, on the other hand, can only use one feature to make predictions. 12 | 13 | ## How does the AdaBoost algorithm work? 14 | 15 | 1. Initialize sample weights uniformly as $w_i^1=\frac{1}{N}$. 16 | 2. For each iteration $t$: 17 | 18 | **Step 1:** A weak learner (e.g. a decision stump) is trained on top of the weighted training data $X$. The weight of each sample $w_i$ indicates how important it is to classify the sample correctly. 19 | 20 | **Step 2:** After training, the weak learner gets a weight based on its accuracy $\alpha_t = \frac{1}{2} \ln \Big( \frac{1-\epsilon_t}{\epsilon_t} \Big)$ 21 | 22 | ![Alpha](doc/alpha.png) 23 | 24 | **Step 3:** The weights of misclassified samples are updated $w_i^{(t+1)} = w_i^{(t)} \cdot e^{-\alpha^t y_i h_t(x_i)}$ 25 | 26 | **Step 4:** Renormalize weights so they sum up to 1 $\sum_{i=1}^n w_i^{(t+1)}=1$ 27 | 28 | 3. 
Make predictions using a linear combination of the weak learners $H(x) = \text{sign} \Big(\sum_{t=1}^T \alpha_t h_t(x) \Big)$ 29 | 30 | ![Adaboost Training](doc/adaboost_training.gif) 31 | 32 | ## Code 33 | 34 | - [Adaboost Python](code/adaboost.py) 35 | 36 | ## Resources 37 | 38 | - [https://scikit-learn.org/stable/modules/ensemble.html#adaboost](https://scikit-learn.org/stable/modules/ensemble.html#adaboost) 39 | - [https://www.youtube.com/watch?v=LsK-xG1cLYA](https://www.youtube.com/watch?v=LsK-xG1cLYA) 40 | - [https://blog.paperspace.com/adaboost-optimizer/](https://blog.paperspace.com/adaboost-optimizer/) 41 | - [https://en.wikipedia.org/wiki/AdaBoost](https://en.wikipedia.org/wiki/AdaBoost) 42 | - [https://geoffruddock.com/adaboost-from-scratch-in-python/](https://geoffruddock.com/adaboost-from-scratch-in-python/) 43 | - [https://www.cs.toronto.edu/~mbrubake/teaching/C11/Handouts/AdaBoost.pdf](https://www.cs.toronto.edu/~mbrubake/teaching/C11/Handouts/AdaBoost.pdf) 44 | - [https://jeremykun.com/2015/05/18/boosting-census/](https://jeremykun.com/2015/05/18/boosting-census/) 45 | - [https://ml-explained.com/blog/decision-tree-explained](https://ml-explained.com/blog/decision-tree-explained) -------------------------------------------------------------------------------- /Algorithms/adaboost/code/adaboost.py: -------------------------------------------------------------------------------- 1 | # based on https://geoffruddock.com/adaboost-from-scratch-in-python/ 2 | 3 | from __future__ import annotations 4 | from typing import Union 5 | import numpy as np 6 | from sklearn.tree import DecisionTreeClassifier 7 | 8 | 9 | class AdaBoost: 10 | """AdaBoost 11 | Parameters: 12 | ----------- 13 | n_estimators: int 14 | Number of weak learners 15 | """ 16 | def __init__(self, n_estimators: int) -> None: 17 | self.n_estimators = n_estimators 18 | self.stumps = np.zeros(shape=n_estimators, dtype=object) 19 | self.stump_weights = np.zeros(shape=n_estimators) 20 | self.sample_weights = None 21 | 22 | def fit(self, X: np.ndarray, y: np.ndarray) -> AdaBoost: 23 | n = X.shape[0] 24 | self.sample_weights = np.zeros(shape=(self.n_estimators, n)) 25 | 26 | # Initialize weights 27 | self.sample_weights[0] = np.ones(shape=n) / n 28 | 29 | for i in range(self.n_estimators): 30 | # fit weak learner 31 | curr_sample_weights = self.sample_weights[i] 32 | stump = DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2) 33 | stump.fit(X, y, sample_weight=curr_sample_weights) 34 | 35 | # calculate error and stump weight 36 | pred = stump.predict(X) 37 | err = curr_sample_weights[(pred != y)].sum() 38 | stump_weight = np.log((1 - err) / err) / 2 39 | 40 | # update sample weights 41 | new_sample_weights = ( 42 | curr_sample_weights * np.exp(-stump_weight * y * pred) 43 | ) 44 | 45 | # normalize sample weights 46 | new_sample_weights /= new_sample_weights.sum() 47 | 48 | if i+1 < self.n_estimators: 49 | self.sample_weights[i+1] = new_sample_weights 50 | 51 | self.stumps[i] = stump 52 | self.stump_weights[i] = stump_weight 53 | 54 | return self 55 | 56 | def predict(self, X: Union[list, np.ndarray]) -> np.ndarray: 57 | stump_preds = np.array([stump.predict(X) for stump in self.stumps]) 58 | return np.sign(np.dot(self.stump_weights, stump_preds)) 59 | -------------------------------------------------------------------------------- /Algorithms/adaboost/doc/adaboost.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/adaboost/doc/adaboost.png -------------------------------------------------------------------------------- /Algorithms/adaboost/doc/adaboost_training.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/adaboost/doc/adaboost_training.gif -------------------------------------------------------------------------------- /Algorithms/adaboost/doc/alpha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/adaboost/doc/alpha.png -------------------------------------------------------------------------------- /Algorithms/adaboost/doc/decision_stump.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/adaboost/doc/decision_stump.PNG -------------------------------------------------------------------------------- /Algorithms/adaboost/tex/4f4f4e395762a3af4575de74c019ebb5.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Algorithms/dbscan/code/dbscan.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | import numpy as np 3 | 4 | 5 | class DBSCAN: 6 | """DBSCAN 7 | Parameters: 8 | ----------- 9 | eps: float = 0.3 10 | The maximum distance between two samples for one to be considered as in the neighborhood of the other. 11 | min_points: int = 5 12 | The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. 
13 | """ 14 | def __init__(self, eps: float = 0.3, min_points: int = 5) -> None: 15 | self.eps = eps 16 | self.min_points = min_points 17 | self.labels = [] 18 | self.c = 1 # number of clusters 19 | 20 | def fit_predict(self, data: Union[list, np.ndarray]) -> list: 21 | self.labels = [0] * len(data) 22 | for i in range(len(data)): 23 | if not (self.labels[i] == 0): 24 | continue 25 | 26 | neighbours = self.find_neighbours(data, i) 27 | 28 | # If the number of points is below min_points the point is a outlier 29 | if len(neighbours) < self.min_points: 30 | self.labels[i] = -1 31 | else: 32 | self.grow_cluster(data, i, neighbours) 33 | self.c += 1 34 | return self.labels 35 | 36 | def find_neighbours(self, data: Union[list, np.ndarray], index: int) -> list: 37 | neighbors = [] 38 | 39 | for p in range(len(data)): 40 | if np.linalg.norm(data[index]-data[p]) < self.eps and index != p: 41 | neighbors.append(p) 42 | return neighbors 43 | 44 | def grow_cluster(self, data: Union[list, np.ndarray], index: int, neighbours: list) -> None: 45 | # Assign seed point to cluster 46 | self.labels[index] = self.c 47 | 48 | i = 0 49 | while i < len(neighbours): 50 | p = neighbours[i] 51 | if self.labels[p] == -1: 52 | self.labels[p] = self.c 53 | elif self.labels[p] == 0: 54 | self.labels[p] = self.c 55 | neighbours_new = self.find_neighbours(data, p) 56 | # check neighbours length 57 | if len(neighbours_new) >= self.min_points: 58 | neighbours = neighbours + neighbours_new 59 | i += 1 60 | 61 | 62 | if __name__ == '__main__': 63 | import matplotlib.pyplot as plt 64 | from sklearn.datasets import make_blobs 65 | from sklearn.preprocessing import MinMaxScaler 66 | 67 | X, y = make_blobs(n_samples=30, centers=3, n_features=2) 68 | X = MinMaxScaler(feature_range=(0, 1)).fit_transform(X) 69 | model = DBSCAN() 70 | predictions = model.fit_predict(X) 71 | colors = ['r', 'g', 'b', 'c', 'k', 'y'] 72 | for classification, x in zip(predictions, X): 73 | color = colors[classification] 74 | plt.scatter(x[0], x[1], color=color, s=150, linewidths=5, zorder=10) 75 | plt.show() 76 | -------------------------------------------------------------------------------- /Algorithms/dbscan/doc/dbscan.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/dbscan/doc/dbscan.gif -------------------------------------------------------------------------------- /Algorithms/dbscan/doc/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/dbscan/doc/results.png -------------------------------------------------------------------------------- /Algorithms/decision_tree/code/visualize_decision_trees_with_graphviz.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_iris 2 | from sklearn import tree 3 | import graphviz 4 | 5 | 6 | iris = load_iris() 7 | X = iris.data 8 | y = iris.target 9 | 10 | clf = tree.DecisionTreeClassifier() 11 | clf = clf.fit(X, y) 12 | 13 | dot_data = tree.export_graphviz(clf, out_file=None, 14 | feature_names=iris.feature_names, 15 | class_names=iris.target_names, 16 | filled=True, rounded=True, 17 | special_characters=True) 18 | 19 | graph = graphviz.Source(dot_data, format="png") 20 | graph.render("decision_tree") 21 | 
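As an aside: if a Graphviz installation is not available, scikit-learn's built-in `plot_tree` produces a comparable figure with matplotlib alone. The snippet below is an illustrative alternative to the `export_graphviz` flow above, not a file from this repository; `doc/plot_tree.png` shows the kind of output this approach yields.

```python
# Matplotlib-only alternative to the Graphviz export above.
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.datasets import load_iris

iris = load_iris()
clf = tree.DecisionTreeClassifier().fit(iris.data, iris.target)

# Render the fitted tree directly with matplotlib (no system Graphviz needed)
fig, ax = plt.subplots(figsize=(12, 8))
tree.plot_tree(clf, feature_names=iris.feature_names,
               class_names=iris.target_names, filled=True, rounded=True, ax=ax)
fig.savefig("decision_tree.png", dpi=150)
```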
-------------------------------------------------------------------------------- /Algorithms/decision_tree/doc/iris_decision_surface.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/decision_tree/doc/iris_decision_surface.png -------------------------------------------------------------------------------- /Algorithms/decision_tree/doc/iris_decision_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/decision_tree/doc/iris_decision_tree.png -------------------------------------------------------------------------------- /Algorithms/decision_tree/doc/plot_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/decision_tree/doc/plot_tree.png -------------------------------------------------------------------------------- /Algorithms/decision_tree/doc/titanic_example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/decision_tree/doc/titanic_example.jpg -------------------------------------------------------------------------------- /Algorithms/gradient_boosting/code/gradient_boosting_regressor.py: -------------------------------------------------------------------------------- 1 | # based on https://github.com/eriklindernoren/ML-From-Scratch/blob/master/mlfromscratch/supervised_learning/gradient_boosting.py 2 | 3 | from __future__ import annotations 4 | from typing import Union 5 | import numpy as np 6 | from sklearn.tree import DecisionTreeRegressor 7 | 8 | 9 | def square_error_gradient(y: np.ndarray, y_pred: np.ndarray) -> np.ndarray: 10 | return -(y - y_pred) 11 | 12 | 13 | class GradientBoostingRegressor: 14 | """Gradient Boosting Regressor 15 | Parameters: 16 | ----------- 17 | n_estimators: int 18 | The number of classification trees that are used. 19 | learning_rate: float 20 | The step length that will be taken when following the negative gradient. 21 | min_samples_split: int 22 | The minimum number of samples required to split an internal node. 23 | max_depth: int 24 | The maximum depth of the individual regression estimators.. 
25 | """ 26 | def __init__(self, n_estimators: int = 100, learning_rate: float = 0.1, min_samples_split: int = 2, 27 | max_depth: int = 3) -> None: 28 | self.n_estimators = n_estimators 29 | self.learning_rate = learning_rate 30 | self.min_samples_split = min_samples_split 31 | self.max_depth = max_depth 32 | 33 | # Initialize trees 34 | self.initial_prediction = None 35 | self.trees = [] 36 | for _ in range(n_estimators): 37 | tree = DecisionTreeRegressor(min_samples_split=self.min_samples_split, 38 | max_depth=self.max_depth) 39 | self.trees.append(tree) 40 | 41 | def fit(self, X: Union[list, np.ndarray], y: np.ndarray) -> GradientBoostingRegressor: 42 | self.initial_prediction = np.mean(y, axis=0) 43 | y_pred = np.full(np.shape(y), np.mean(y, axis=0)) # initial prediction 44 | for i in range(self.n_estimators): 45 | gradient = square_error_gradient(y, y_pred) 46 | self.trees[i].fit(X, gradient) 47 | update = self.trees[i].predict(X) 48 | # Update y predictions 49 | y_pred -= np.multiply(self.learning_rate, update) 50 | 51 | return self 52 | 53 | def predict(self, X: Union[list, np.ndarray]) -> np.ndarray: 54 | y_pred = np.array([]) 55 | # Make predictions 56 | for tree in self.trees: 57 | update = tree.predict(X) 58 | update = np.multiply(self.learning_rate, update) 59 | y_pred = self.initial_prediction - update if not y_pred.any() else y_pred - update 60 | return y_pred 61 | 62 | 63 | if __name__ == '__main__': 64 | from sklearn import datasets 65 | # Load the diabetes dataset 66 | X, y = datasets.load_diabetes(return_X_y=True) 67 | model = GradientBoostingRegressor(max_depth=8) 68 | model.fit(X, y) 69 | print(model.predict(X[:5])) 70 | print(y[:5]) 71 | -------------------------------------------------------------------------------- /Algorithms/k_nearest_neighbors/code/k_nearest_neighbors.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Union 3 | import numpy as np 4 | from collections import Counter 5 | 6 | 7 | class KNearestNeighbors: 8 | """K Nearest Neighbors classifier. 
9 | Parameters: 10 | ----------- 11 | k: int 12 | The number of closest neighbors 13 | """ 14 | def __init__(self, k: int) -> None: 15 | self.X = None 16 | self.y = None 17 | self.k = k 18 | 19 | def fit(self, X: Union[list, np.ndarray], y: Union[list, np.ndarray]) -> KNearestNeighbors: 20 | self.X = X 21 | self.y = y 22 | return self 23 | 24 | def euclidean_distance(self, X_test: Union[list, np.ndarray]) -> list: 25 | return [np.linalg.norm(X - X_test) for X in self.X] 26 | 27 | def k_nearest(self, X: Union[list, np.ndarray]) -> np.ndarray: 28 | idx = np.argpartition(X, self.k) 29 | return np.take(self.y, idx[:self.k]) 30 | 31 | def predict(self, X: Union[list, np.ndarray]) -> np.ndarray: 32 | distances_list = [self.euclidean_distance(x) for x in X] 33 | return np.array([Counter(self.k_nearest(distances)).most_common()[0][0] for distances in distances_list]) 34 | 35 | 36 | if __name__ == '__main__': 37 | import pandas as pd 38 | from sklearn.model_selection import train_test_split 39 | from sklearn.preprocessing import LabelEncoder 40 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', 41 | names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label']) 42 | X, y = (np.array(df.drop('label', axis=1)), 43 | LabelEncoder().fit_transform(np.array(df['label']))) 44 | X_train, X_test, y_train, y_test = train_test_split( 45 | X, y, test_size=0.2, random_state=42) 46 | model = KNearestNeighbors(4) 47 | model.fit(X_train, y_train) 48 | predictions = model.predict(X_test) 49 | print('Accuracy:', (predictions == y_test).sum()/len(predictions)*100) 50 | -------------------------------------------------------------------------------- /Algorithms/k_nearest_neighbors/code/k_nearest_neighbors_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Union 3 | import numpy as np 4 | 5 | 6 | class KNearestNeighbors: 7 | """K Nearest Neighbors regressor. 
8 | Parameters: 9 | ----------- 10 | k: int 11 | The number of closest neighbors 12 | """ 13 | def __init__(self, k: int) -> None: 14 | self.X = None 15 | self.y = None 16 | self.k = k 17 | 18 | def fit(self, X: Union[list, np.ndarray], y: Union[list, np.ndarray]) -> KNearestNeighbors: 19 | self.X = X 20 | self.y = y 21 | return self 22 | 23 | def euclidean_distance(self, X_test: Union[list, np.ndarray]) -> list: 24 | return [np.linalg.norm(X - X_test) for X in self.X] 25 | 26 | def k_nearest(self, X: Union[list, np.ndarray]) -> np.ndarray: 27 | idx = np.argpartition(X, self.k) 28 | return np.take(self.y, idx[:self.k]) 29 | 30 | def predict(self, X: Union[list, np.ndarray]) -> np.ndarray: 31 | distances_list = [self.euclidean_distance(x) for x in X] 32 | return np.array([np.mean(self.k_nearest(distances)) for distances in distances_list]) 33 | 34 | 35 | if __name__ == '__main__': 36 | import pandas as pd 37 | iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', 38 | names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label']) 39 | iris = iris.sample(frac=1).reset_index(drop=True) 40 | X = np.array(iris.drop(['petal_width', 'label'], axis=1)) 41 | y = np.array(iris['petal_width']) 42 | model = KNearestNeighbors(3) 43 | model.fit(X, y) 44 | print(model.predict(X[:5])) 45 | print(y[:5]) 46 | -------------------------------------------------------------------------------- /Algorithms/k_nearest_neighbors/doc/effect_of_k.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/k_nearest_neighbors/doc/effect_of_k.png -------------------------------------------------------------------------------- /Algorithms/k_nearest_neighbors/doc/effect_of_k_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/k_nearest_neighbors/doc/effect_of_k_2.png -------------------------------------------------------------------------------- /Algorithms/kernel_pca/README.md: -------------------------------------------------------------------------------- 1 | # Kernel PCA 2 | 3 | ![Kernel PCA Example](doc/kernel_pca.png) 4 | 5 | Kernel PCA is an extension of [PCA](https://ml-explained.com/blog/principal-component-analysis-explained) that allows for the separability of nonlinear data by making use of kernels. The basic idea behind it is to project the linearly inseparable data onto a higher dimensional space where it becomes linearly separable. 6 | 7 | Kernel PCA can be summarized as a 4 step process [1]: 8 | 9 | 1. Construct the kernel matrix from the training dataset 10 | 11 |

$$K_{i,j} = \kappa(\mathbf{x_i}, \mathbf{x_j})$$

2. If the projected dataset $\left\{\phi (\mathbf{x}_i) \right\}$ doesn't have zero mean, use the Gram matrix $\stackrel{\sim}{K}$ as a substitute for the kernel matrix $K$.

$$\stackrel{\sim}{K} = K - \mathbf{1_N} K - K \mathbf{1_N} + \mathbf{1_N} K \mathbf{1_N}$$

3. Use $K a_k = \lambda_k N a_k$ to solve for the vectors $a_k$.

4. Compute the kernel principal components $y_k(\mathbf{x})$

$$y_k(\mathbf{x}) = \phi \left(\mathbf{x}\right)^T \mathbf{v}_k = \sum_{i=1}^N a_{ki} \kappa(\mathbf{x_i}, \mathbf{x})$$

[1] Kernel Principal Component Analysis and its Applications in Face Recognition and Active Shape Models
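The four steps above translate almost line for line into NumPy/SciPy. The sketch below assumes an RBF kernel and mirrors the full implementation in [code/kernel_pca.py](code/kernel_pca.py); the function name and signature are illustrative.

```python
# Condensed sketch of the four kernel PCA steps with an RBF kernel (illustrative).
import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.linalg import eigh

def rbf_kernel_pca(X, gamma, n_components):
    # 1. Kernel matrix K from pairwise squared Euclidean distances
    K = np.exp(-gamma * squareform(pdist(X, 'sqeuclidean')))
    # 2. Center in feature space via the Gram matrix
    N = K.shape[0]
    one_n = np.ones((N, N)) / N
    K = K - one_n @ K - K @ one_n + one_n @ K @ one_n
    # 3. Eigendecomposition (eigh returns eigenvalues in ascending order)
    eigvals, eigvecs = eigh(K)
    # 4. Project onto the top n_components eigenvectors
    alphas = np.column_stack([eigvecs[:, -i] for i in range(1, n_components + 1)])
    lambdas = np.array([eigvals[-i] for i in range(1, n_components + 1)])
    return alphas * np.sqrt(lambdas)
```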

24 | 25 | ## Resources 26 | 27 | - [Kernel Principal Component Analysis and its Applications in Face Recognition and Active Shape Models](https://arxiv.org/pdf/1207.3538.pdf) 28 | - [Kernel tricks and nonlinear dimensionality reduction via RBF kernel PCA](https://sebastianraschka.com/Articles/2014_kernel_pca.html) 29 | - [PCA and kernel PCA explained](https://nirpyresearch.com/pca-kernel-pca-explained/) 30 | - [What are the advantages of kernel PCA over standard PCA?](https://stats.stackexchange.com/questions/94463/what-are-the-advantages-of-kernel-pca-over-standard-pca) -------------------------------------------------------------------------------- /Algorithms/kernel_pca/README.tex.md: -------------------------------------------------------------------------------- 1 | # Kernel PCA 2 | 3 | ![Kernel PCA Example](doc/kernel_pca.png) 4 | 5 | Kernel PCA is an extension of [PCA](https://ml-explained.com/blog/principal-component-analysis-explained) that allows for the separability of nonlinear data by making use of kernels. The basic idea behind it is to project the linearly inseparable data onto a higher dimensional space where it becomes linearly separable. 6 | 7 | Kernel PCA can be summarized as a 4 step process [1]: 8 | 9 | 1. Construct the kernel matrix $K$ from the training dataset 10 | 11 | $$K_{i,j} = \kappa(\mathbf{x_i, x_j})$$ 12 | 13 | 2. If the projected dataset $\left\{\phi (\mathbf{x}_i) \right\}$ doesn’t have zero mean use the Gram matrix $\stackrel{\sim}{K}$ to substitute the kernel matrix $K$. 14 | 15 | $$\stackrel{\sim}{K} = K - \mathbf{1_N} K - K \mathbf{1_N} + \mathbf{1_N} K \mathbf{1_N}$$ 16 | 17 | 3. Use $K_{a_k} = \lambda_k N_{a_{k}}$ to solve for the vector $a_i$. 18 | 19 | 4. Compute the kernel principal components $y_k\left(x\right)$ 20 | 21 | $$y_k(\mathbf{x})= \phi \left(\mathbf{x}\right)^T \mathbf{v}_k = \sum_{i=1}^N a_{ki} \kappa(\mathbf{x_i, x_j})$$ 22 | 23 |

[1] Kernel Principal Component Analysis and its Applications in Face Recognition and Active Shape Models

24 | 25 | ## Resources 26 | 27 | - [Kernel Principal Component Analysis and its Applications in Face Recognition and Active Shape Models](https://arxiv.org/pdf/1207.3538.pdf) 28 | - [Kernel tricks and nonlinear dimensionality reduction via RBF kernel PCA](https://sebastianraschka.com/Articles/2014_kernel_pca.html) 29 | - [PCA and kernel PCA explained](https://nirpyresearch.com/pca-kernel-pca-explained/) 30 | - [What are the advantages of kernel PCA over standard PCA?](https://stats.stackexchange.com/questions/94463/what-are-the-advantages-of-kernel-pca-over-standard-pca) -------------------------------------------------------------------------------- /Algorithms/kernel_pca/code/kernel_pca.py: -------------------------------------------------------------------------------- 1 | # based on https://sebastianraschka.com/Articles/2014_kernel_pca.html 2 | 3 | from __future__ import annotations 4 | from typing import Union 5 | import numpy as np 6 | from scipy.spatial.distance import pdist, squareform 7 | from scipy.linalg import eigh 8 | 9 | 10 | class KernelPCA: 11 | """KernelPCA 12 | Parameters: 13 | ----------- 14 | n_components: int = 2 15 | Number of components to keep. 16 | gamma: float = None 17 | Kernel coefficient 18 | """ 19 | def __init__(self, n_components: int = 2, gamma: float = None): 20 | self.n_components = n_components 21 | self.gamma = gamma 22 | self.alphas = None 23 | self.lambdas = None 24 | self.X = None 25 | 26 | def fit(self, X: Union[list, np.ndarray]) -> KernelPCA: 27 | if self.gamma == None: 28 | self.gamma = 1 / X.shape[1] 29 | 30 | sq_dists = pdist(X, 'sqeuclidean') 31 | 32 | mat_sq_dists = squareform(sq_dists) 33 | 34 | K = np.exp(-self.gamma * mat_sq_dists) 35 | 36 | N = K.shape[0] 37 | one_n = np.ones((N,N)) / N 38 | K_norm = K - one_n.dot(K) - K.dot(one_n) + one_n.dot(K).dot(one_n) 39 | 40 | eigenvalues, eigenvectors = eigh(K_norm) 41 | 42 | alphas = np.column_stack((eigenvectors[:,-i] for i in range(1, self.n_components+1))) 43 | lambdas = [eigenvalues[-i] for i in range(1, self.n_components+1)] 44 | 45 | self.alphas = alphas 46 | self.lambdas = lambdas 47 | self.X = X 48 | 49 | return self 50 | 51 | def fit_transform(self, X: Union[list, np.ndarray]) -> np.ndarray: 52 | self.fit(X) 53 | return self.alphas * np.sqrt(self.lambdas) 54 | 55 | def transform(self, X: Union[list, np.ndarray]) -> np.ndarray: 56 | # TODO: Rewrite as this is very inefficient 57 | def transform_row(X_r): 58 | pair_dist = np.array([np.sum((X_r-row)**2) for row in self.X]) 59 | k = np.exp(-self.gamma * pair_dist) 60 | return k.dot(self.alphas / self.lambdas) 61 | 62 | return np.array(list(map(transform_row, X))) 63 | 64 | 65 | 66 | 67 | if __name__ == '__main__': 68 | from sklearn.datasets import make_circles 69 | import matplotlib.pyplot as plt 70 | 71 | X, y = make_circles(n_samples=1000, random_state=123, noise=0.1, factor=0.2) 72 | 73 | plt.figure(figsize=(8,6)) 74 | 75 | pca = KernelPCA(n_components=3) 76 | pca.fit(X) 77 | X = pca.transform(X) 78 | 79 | print(X) 80 | plt.plot(X[0], X[1]) 81 | plt.show() -------------------------------------------------------------------------------- /Algorithms/kernel_pca/doc/kernel_pca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/kernel_pca/doc/kernel_pca.png -------------------------------------------------------------------------------- /Algorithms/kmeans/doc/choose_k_value.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/kmeans/doc/choose_k_value.jpeg -------------------------------------------------------------------------------- /Algorithms/kmeans/doc/elbow_method_using_yellowbrick.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/kmeans/doc/elbow_method_using_yellowbrick.png -------------------------------------------------------------------------------- /Algorithms/kmeans/doc/k_means.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/kmeans/doc/k_means.gif -------------------------------------------------------------------------------- /Algorithms/kmeans/doc/noisy_circles_with_true_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/kmeans/doc/noisy_circles_with_true_output.png -------------------------------------------------------------------------------- /Algorithms/kmeans/doc/noisy_moons_with_true_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/kmeans/doc/noisy_moons_with_true_output.png -------------------------------------------------------------------------------- /Algorithms/kmeans/doc/silhouette_analysis_3_clusters.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/kmeans/doc/silhouette_analysis_3_clusters.jpeg -------------------------------------------------------------------------------- /Algorithms/kmeans/doc/silhouette_analysis_4_clusters.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/kmeans/doc/silhouette_analysis_4_clusters.jpeg -------------------------------------------------------------------------------- /Algorithms/kmeans/doc/silhouette_analysis_5_clusters.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/kmeans/doc/silhouette_analysis_5_clusters.jpeg -------------------------------------------------------------------------------- /Algorithms/kmeans/doc/two_lines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/kmeans/doc/two_lines.png -------------------------------------------------------------------------------- /Algorithms/kmeans/tex/44bc9d542a92714cac84e01cbbb7fd61.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 
| 8 | 9 | -------------------------------------------------------------------------------- /Algorithms/kmeans/tex/4bdc8d9bcfb35e1c9bfb51fc69687dfc.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Algorithms/kmeans/tex/77a3b857d53fb44e33b53e4c8b68351a.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Algorithms/linear_discriminant_analysis/doc/lda_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/linear_discriminant_analysis/doc/lda_example.png -------------------------------------------------------------------------------- /Algorithms/linear_regression/code/elastic_net.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Tuple 3 | import numpy as np 4 | 5 | 6 | class ElasticNet: 7 | """ElasticNet 8 | Parameters: 9 | ----------- 10 | learning_rate: float 11 | The step length used when following the negative gradient during training. 12 | alpha: float, default=1.0 13 | Regularization strength; l1_ratio (float, default=0.5) sets the mix between the L1 and L2 penalty terms. 14 | """ 15 | def __init__(self, learning_rate: float, alpha: float = 1.0, l1_ratio: float = 0.5) -> None: 16 | self.learning_rate = learning_rate 17 | self.alpha = alpha 18 | self.l1_ratio = l1_ratio 19 | self.w = "" 20 | 21 | def cost_function(self, x: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, float]: 22 | dif = np.dot(x, self.w) - y 23 | cost = (np.sum(dif**2) + self.alpha * (self.l1_ratio * np.sum(np.absolute(self.w)) + 24 | (1 - self.l1_ratio) * np.sum(np.square(self.w)))) / (2*np.shape(x)[0]) 25 | 26 | return dif, cost 27 | 28 | def fit(self, x: np.ndarray, y: np.ndarray, num_iterations: int = 10000) -> ElasticNet: 29 | if self.w == "": 30 | _, num_features = np.shape(x) 31 | self.w = np.random.uniform(-1, 1, num_features) 32 | for _ in range(num_iterations): 33 | dif, cost = self.cost_function(x, y) 34 | gradient = np.dot(x.transpose(), dif) / np.shape(x)[0] 35 | self.w = self.w - self.learning_rate * gradient 36 | return self 37 | 38 | def predict(self, x: np.ndarray) -> np.ndarray: 39 | return np.dot(x, self.w) 40 | 41 | 42 | # Testing functionality 43 | if __name__ == '__main__': 44 | import pandas as pd 45 | from sklearn.preprocessing import LabelEncoder 46 | from sklearn.model_selection import train_test_split 47 | iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', 48 | names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label']) 49 | le = LabelEncoder() 50 | iris['label'] = le.fit_transform(iris['label']) 51 | X = np.array(iris.drop(['petal_width'], axis=1)) 52 | y = np.array(iris['petal_width']) 53 | 54 | X_train, X_test, y_train, y_test = train_test_split( 55 | X, y, test_size=0.2, random_state=42) 56 | 57 | model = ElasticNet(0.0001) 58 | model.fit(X_train, y_train, 10000) 59 | predictions = model.predict(X_test) 60 | mse = ((y_test - predictions)**2).mean(axis=0) 61 | print('Loss:', mse) 62 | -------------------------------------------------------------------------------- /Algorithms/linear_regression/code/lasso_regression.py:
-------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Tuple 3 | import numpy as np 4 | 5 | 6 | class LassoRegression: 7 | """Lasso Regression 8 | Parameters: 9 | ----------- 10 | learning_rate: float 11 | The step length used when following the negative gradient during training. 12 | C: float, default=1 13 | Regularization strength 14 | """ 15 | def __init__(self, learning_rate: float, C: float = 1) -> None: 16 | self.learning_rate = learning_rate 17 | self.C = C 18 | self.w = "" 19 | 20 | def cost_function(self, x: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, float]: 21 | dif = np.dot(x, self.w) - y 22 | cost = (np.sum(dif**2) + self.C * np.sum(np.absolute(self.w))) / (2*np.shape(x)[0]) 23 | 24 | return dif, cost 25 | 26 | def fit(self, x: np.ndarray, y: np.ndarray, num_iterations: int = 10000) -> LassoRegression: 27 | if self.w == "": 28 | _, num_features = np.shape(x) 29 | self.w = np.random.uniform(-1, 1, num_features) 30 | for _ in range(num_iterations): 31 | dif, cost = self.cost_function(x, y) 32 | gradient = np.dot(x.transpose(), dif) / np.shape(x)[0] 33 | self.w = self.w - self.learning_rate * gradient 34 | return self 35 | 36 | def predict(self, x: np.ndarray) -> np.ndarray: 37 | return np.dot(x, self.w) 38 | 39 | 40 | # Testing functionality 41 | if __name__ == '__main__': 42 | import pandas as pd 43 | from sklearn.preprocessing import LabelEncoder 44 | from sklearn.model_selection import train_test_split 45 | iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', 46 | names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label']) 47 | le = LabelEncoder() 48 | iris['label'] = le.fit_transform(iris['label']) 49 | X = np.array(iris.drop(['petal_width'], axis=1)) 50 | y = np.array(iris['petal_width']) 51 | 52 | X_train, X_test, y_train, y_test = train_test_split( 53 | X, y, test_size=0.2, random_state=42) 54 | 55 | model = LassoRegression(0.0001) 56 | model.fit(X_train, y_train, 10000) 57 | predictions = model.predict(X_test) 58 | mse = ((y_test - predictions)**2).mean(axis=0) 59 | print('Loss:', mse) 60 | -------------------------------------------------------------------------------- /Algorithms/linear_regression/code/multivariate_linear_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Tuple 3 | import numpy as np 4 | 5 | 6 | class MultivariateLinearRegression: 7 | """Multivariate Linear Regression 8 | Parameters: 9 | ----------- 10 | learning_rate: float 11 | The step length used when following the negative gradient during training. 
12 | """ 13 | def __init__(self, learning_rate: float) -> None: 14 | self.learning_rate = learning_rate 15 | self.w = "" 16 | 17 | def cost_function(self, x: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, float]: 18 | dif = np.dot(x, self.w) - y 19 | cost = np.sum(dif**2) / (2*np.shape(x)[0]) 20 | 21 | return dif, cost 22 | 23 | def fit(self, x: np.ndarray, y: np.ndarray, num_iterations: int = 10000) -> MultivariateLinearRegression: 24 | if self.w == "": 25 | _, num_features = np.shape(x) 26 | self.w = np.random.uniform(-1, 1, num_features) 27 | for i in range(num_iterations): 28 | dif, cost = self.cost_function(x, y) 29 | gradient = np.dot(x.transpose(), dif) / np.shape(x)[0] 30 | self.w = self.w - self.learning_rate * gradient 31 | return self 32 | 33 | def predict(self, x: np.ndarray) -> np.ndarray: 34 | return np.dot(x, self.w) 35 | 36 | 37 | # Testing functionality 38 | if __name__ == '__main__': 39 | import pandas as pd 40 | from sklearn.preprocessing import LabelEncoder 41 | from sklearn.model_selection import train_test_split 42 | iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', 43 | names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label']) 44 | le = LabelEncoder() 45 | iris['label'] = le.fit_transform(iris['label']) 46 | X = np.array(iris.drop(['petal_width'], axis=1)) 47 | y = np.array(iris['petal_width']) 48 | 49 | X_train, X_test, y_train, y_test = train_test_split( 50 | X, y, test_size=0.2, random_state=42) 51 | 52 | model = MultivariateLinearRegression(0.0001) 53 | model.fit(X_train, y_train, 10000) 54 | predictions = model.predict(X_test) 55 | mse = ((y_test - predictions)**2).mean(axis=0) 56 | print('Loss:', mse) 57 | -------------------------------------------------------------------------------- /Algorithms/linear_regression/code/normal_equation.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import numpy as np 3 | 4 | 5 | class NormalEquation: 6 | 7 | def __init__(self): 8 | self.w = None 9 | 10 | 11 | def fit(self, x: np.ndarray, y: np.ndarray) -> NormalEquation: 12 | x = np.append(np.ones([len(x), 1]), x, 1) 13 | z = np.linalg.inv(np.dot(x.transpose(), x)) 14 | self.w = np.dot(np.dot(z, x.transpose()), y) 15 | return self 16 | 17 | def predict(self, x: np.ndarray): 18 | if self.w == None: 19 | raise Exception('Call .fit before using predict method') 20 | 21 | x = np.append(np.ones([len(x), 1]), x, 1) 22 | return np.dot(x, self.w) 23 | -------------------------------------------------------------------------------- /Algorithms/linear_regression/code/ridge_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Tuple 3 | import numpy as np 4 | 5 | 6 | class RidgeRegression: 7 | """Ridge Regression 8 | Parameters: 9 | ----------- 10 | learning_rate: float 11 | The step length used when following the negative gradient during training. 
12 | C: float, default=1 13 | Regularization strength 14 | """ 15 | def __init__(self, learning_rate: float, C: float = 1) -> None: 16 | self.learning_rate = learning_rate 17 | self.C = C 18 | self.w = "" 19 | 20 | def cost_function(self, x: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, float]: 21 | dif = np.dot(x, self.w) - y 22 | cost = (np.sum(dif**2) + self.C * np.sum(np.square(self.w))) / (2*np.shape(x)[0]) 23 | 24 | return dif, cost 25 | 26 | def fit(self, x: np.ndarray, y: np.ndarray, num_iterations: int = 10000) -> RidgeRegression: 27 | if self.w == "": 28 | _, num_features = np.shape(x) 29 | self.w = np.random.uniform(-1, 1, num_features) 30 | for _ in range(num_iterations): 31 | dif, cost = self.cost_function(x, y) 32 | gradient = np.dot(x.transpose(), dif) / np.shape(x)[0] 33 | self.w = self.w - self.learning_rate * gradient 34 | return self 35 | 36 | def predict(self, x: np.ndarray) -> np.ndarray: 37 | return np.dot(x, self.w) 38 | 39 | 40 | # Testing functionality 41 | if __name__ == '__main__': 42 | import pandas as pd 43 | from sklearn.preprocessing import LabelEncoder 44 | from sklearn.model_selection import train_test_split 45 | iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', 46 | names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label']) 47 | le = LabelEncoder() 48 | iris['label'] = le.fit_transform(iris['label']) 49 | X = np.array(iris.drop(['petal_width'], axis=1)) 50 | y = np.array(iris['petal_width']) 51 | 52 | X_train, X_test, y_train, y_test = train_test_split( 53 | X, y, test_size=0.2, random_state=42) 54 | 55 | model = RidgeRegression(0.0001) 56 | model.fit(X_train, y_train, 10000) 57 | predictions = model.predict(X_test) 58 | mse = ((y_test - predictions)**2).mean(axis=0) 59 | print('Loss:', mse) 60 | -------------------------------------------------------------------------------- /Algorithms/linear_regression/code/simple_linear_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import numpy as np 3 | 4 | 5 | class SimpleLinearRegression: 6 | """Simple Linear Regression 7 | Parameters: 8 | ----------- 9 | learning_rate: float 10 | The step length used when following the negative gradient during training. 
11 | """ 12 | def __init__(self, learning_rate: float) -> None: 13 | self.m = 0 14 | self.b = 0 15 | self.learning_rate = learning_rate 16 | 17 | def cost_function(self, x: np.ndarray, y: np.ndarray) -> float: 18 | total_error = 0 19 | for i in range(0, len(x)): 20 | total_error += (y[i]-(self.m*x[i]+self.b))**2 21 | return total_error/float(len(x)) 22 | 23 | def fit(self, x: np.ndarray, y: np.ndarray, num_iterations: int) -> SimpleLinearRegression: 24 | N = float(len(x)) 25 | for j in range(num_iterations): 26 | b_gradient = 0 27 | m_gradient = 0 28 | for i in range(0, len(x)): 29 | b_gradient += -(2/N) * (y[i] - ((self.m * x[i]) + self.b)) 30 | m_gradient += -(2/N) * x[i] * \ 31 | (y[i] - ((self.m * x[i]) + self.b)) 32 | self.b -= (self.learning_rate * b_gradient) 33 | self.m -= (self.learning_rate * m_gradient) 34 | return self 35 | 36 | def predict(self, xs: np.ndarray) -> list: 37 | return [(self.m * x + self.b) for x in xs] 38 | 39 | 40 | # Testing functionality 41 | if __name__ == '__main__': 42 | x = np.linspace(0, 100, 50) 43 | delta = np.random.uniform(-10, 10, x.size) 44 | y = 0.5 * x + 3 + delta 45 | 46 | model = SimpleLinearRegression(0.0001) 47 | model.fit(x, y, 100) 48 | print('Error:', model.cost_function(x, y)) 49 | -------------------------------------------------------------------------------- /Algorithms/linear_regression/doc/linear_regression_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/linear_regression/doc/linear_regression_example.png -------------------------------------------------------------------------------- /Algorithms/linear_regression/doc/regularization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/linear_regression/doc/regularization.png -------------------------------------------------------------------------------- /Algorithms/logistic_regression/code/logistic_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import numpy as np 3 | 4 | 5 | def sigmoid(x: np.ndarray) -> np.ndarray: 6 | return 1/(1+np.exp(-x)) 7 | 8 | 9 | class LogisticRegression: 10 | """Logistic Regression 11 | Parameters: 12 | ----------- 13 | learning_rate: float 14 | The step length used when following the negative gradient during training. 15 | num_features: int 16 | The number of feature in the data 17 | penalty: str, default='l2' 18 | The type of penalty used. 
19 | C: float, default=1 20 | Regularization strength 21 | """ 22 | def __init__(self, learning_rate: float, num_features: int, penalty: str = 'l2', C: float = 0.1) -> None: 23 | self.learning_rate = learning_rate 24 | self.penalty = penalty 25 | self.C = C 26 | self.b = 0 27 | self.w = np.zeros((1, num_features)) 28 | assert penalty in ['l2', 'l1', None] 29 | 30 | def cost_function(self, y: np.ndarray, y_pred: np.ndarray) -> float: 31 | y_T = y.T 32 | if self.penalty == 'l1': 33 | return (-1/y.shape[0]) * (np.sum((y_T*np.log(y_pred)) + ((1-y_T) * np.log(1-y_pred))) + self.C * np.sum(np.absolute(self.w))) 34 | elif self.penalty == 'l2': 35 | return (-1/y.shape[0]) * (np.sum((y_T*np.log(y_pred)) + ((1-y_T) * np.log(1-y_pred))) + self.C * np.sum(np.square(self.w))) 36 | else: 37 | return (-1/y.shape[0]) * (np.sum((y_T*np.log(y_pred)) + ((1-y_T) * np.log(1-y_pred)))) 38 | 39 | def fit(self, X: np.ndarray, y: np.ndarray, num_iterations) -> LogisticRegression: 40 | for i in range(num_iterations): 41 | pred = sigmoid(np.dot(self.w, X.T) + self.b) 42 | cost = self.cost_function(y, pred) 43 | 44 | # Calculate Gradients/Derivatives 45 | dw = (1 / X.shape[0]) * (np.dot(X.T, (pred - y.T).T)) 46 | db = (1 / X.shape[0]) * (np.sum(pred - y.T)) 47 | 48 | self.w = self.w - (self.learning_rate * dw.T) 49 | self.b = self.b - (self.learning_rate * db) 50 | return self 51 | 52 | def predict(self, X: np.ndarray) -> list: 53 | predictions = sigmoid(np.dot(self.w, X.T) + self.b)[0] 54 | return [1 if pred >= 0.5 else 0 for pred in predictions] 55 | 56 | def predict_proba(self, X: np.ndarray) -> np.ndarray: 57 | return sigmoid(np.dot(self.w, X.T) + self.b)[0] 58 | 59 | -------------------------------------------------------------------------------- /Algorithms/logistic_regression/code/one_vs_all_logistic_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import numpy as np 3 | from logistic_regression import LogisticRegression 4 | 5 | 6 | class LogisticRegressionOneVsAll: 7 | """One vs. All Logistic Regression 8 | Parameters: 9 | ----------- 10 | learning_rate: float 11 | The step length used when following the negative gradient during training. 
12 | num_features: int 13 | The number of feature in the data 14 | num_classes: int 15 | The number of classes in the data-set 16 | """ 17 | def __init__(self, learning_rate: float, num_features: int, num_classes: int) -> None: 18 | self.models = [LogisticRegression(learning_rate, num_features) for _ in range(num_classes)] 19 | 20 | def fit(self, X: np.ndarray, y: np.ndarray, num_iterations: int) -> LogisticRegressionOneVsAll: 21 | for i, model in enumerate(self.models): 22 | y_tmp = (y == i).astype(int) 23 | model.fit(X, y_tmp, num_iterations) 24 | return self 25 | 26 | def predict(self, X: np.ndarray) -> np.ndarray: 27 | predictions = np.array([model.predict_proba(X) for model in self.models]) 28 | return np.argmax(predictions, axis=0) 29 | -------------------------------------------------------------------------------- /Algorithms/logistic_regression/doc/classification_vs_regression.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/logistic_regression/doc/classification_vs_regression.jpeg -------------------------------------------------------------------------------- /Algorithms/logistic_regression/doc/convex_vs_non_convex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/logistic_regression/doc/convex_vs_non_convex.png -------------------------------------------------------------------------------- /Algorithms/logistic_regression/doc/logistic_regression_decision_boundary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/logistic_regression/doc/logistic_regression_decision_boundary.png -------------------------------------------------------------------------------- /Algorithms/logistic_regression/doc/loss_functions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/logistic_regression/doc/loss_functions.png -------------------------------------------------------------------------------- /Algorithms/logistic_regression/doc/one_vs_all.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/logistic_regression/doc/one_vs_all.png -------------------------------------------------------------------------------- /Algorithms/logistic_regression/doc/overfitting_vs_underfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/logistic_regression/doc/overfitting_vs_underfitting.png -------------------------------------------------------------------------------- /Algorithms/logistic_regression/doc/sigmoid.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/logistic_regression/doc/sigmoid.png -------------------------------------------------------------------------------- /Algorithms/logistic_regression/tex/fd8be73b54f5436a5cd2e73ba9b6bfa9.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Algorithms/mean_shift/doc/choose_bandwidth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/mean_shift/doc/choose_bandwidth.png -------------------------------------------------------------------------------- /Algorithms/mean_shift/doc/cluster_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/mean_shift/doc/cluster_comparison.png -------------------------------------------------------------------------------- /Algorithms/mean_shift/doc/kde_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/mean_shift/doc/kde_plot.png -------------------------------------------------------------------------------- /Algorithms/mean_shift/doc/mean_shift.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/mean_shift/doc/mean_shift.gif -------------------------------------------------------------------------------- /Algorithms/mean_shift/doc/noisy_circles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/mean_shift/doc/noisy_circles.png -------------------------------------------------------------------------------- /Algorithms/mean_shift/doc/noisy_moons.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/mean_shift/doc/noisy_moons.png -------------------------------------------------------------------------------- /Algorithms/mean_shift/doc/two_lines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/mean_shift/doc/two_lines.png -------------------------------------------------------------------------------- /Algorithms/principal_component_analysis/doc/pca_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/principal_component_analysis/doc/pca_example.png -------------------------------------------------------------------------------- /Algorithms/random_forest/code/eli5_feature_importance_example.py: 
-------------------------------------------------------------------------------- 1 | from sklearn.ensemble import RandomForestClassifier 2 | from sklearn import datasets 3 | from sklearn.model_selection import train_test_split 4 | from IPython.display import display 5 | 6 | import eli5 7 | from eli5.sklearn import PermutationImportance 8 | 9 | RANDOM_STATE = 0 10 | 11 | # Get Iris data 12 | iris = datasets.load_iris() 13 | X = iris.data 14 | y = iris.target 15 | 16 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE) 17 | 18 | # Create and train Random Forest 19 | model = RandomForestClassifier(random_state=RANDOM_STATE) 20 | model.fit(X_train, y_train) 21 | 22 | perm = PermutationImportance(model, random_state=1).fit(X_test, y_test) 23 | 24 | display(eli5.show_weights(perm, feature_names=iris.feature_names)) 25 | 26 | eli5_weights = eli5.explain_weights(model, feature_names=iris.feature_names) 27 | print(eli5_weights) -------------------------------------------------------------------------------- /Algorithms/random_forest/code/random_forest_classifier.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Union, Optional 3 | import numpy as np 4 | from sklearn.tree import DecisionTreeClassifier 5 | 6 | 7 | class RandomForest: 8 | """Random Forest Classifier 9 | Parameters: 10 | ----------- 11 | n_estimators: int = 10 12 | The number of trees in the forest. 13 | n_features: Optional[Union[str, int]] = 'sqrt' 14 | The number of features to consider when looking for the best split 15 | sample_size: float = 0.8 16 | Amount of data used (0-1) 17 | max_depth: Optional[int] = 10 18 | The maximum depth of the tree. 19 | min_leaf: Union[int, float] = 5 20 | The minimum number of samples required to be at a leaf node. 21 | """ 22 | def __init__(self, n_estimators: int = 10, n_features: Optional[Union[str, int]] = 'sqrt', sample_size: float = 0.8, 23 | max_depth: Optional[int] = 10, min_leaf: Union[int, float] = 5) -> None: 24 | self.n_estimators = n_estimators 25 | self.n_features = n_features 26 | self.sample_size = sample_size 27 | self.max_depth = max_depth 28 | self.min_leaf = min_leaf 29 | self.trees = [] 30 | 31 | def fit(self, X: Union[list, np.ndarray], y: Union[list, np.ndarray]) -> RandomForest: 32 | for _ in range(self.n_estimators): 33 | idxs = np.random.permutation(len(X))[:int(self.sample_size*len(X))] 34 | 35 | self.trees.append(DecisionTreeClassifier( 36 | max_depth=self.max_depth, min_samples_leaf=self.min_leaf, max_features=self.n_features).fit(X[idxs], y[idxs])) 37 | return self 38 | 39 | def predict(self, X: Union[list, np.ndarray]) -> np.ndarray: 40 | predictions_array = np.column_stack([t.predict(X) for t in self.trees]) 41 | return np.array([np.argmax(np.bincount(predictions)) for predictions in predictions_array]) 42 | -------------------------------------------------------------------------------- /Algorithms/random_forest/code/random_forest_regressor.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Union, Optional 3 | import numpy as np 4 | from sklearn.tree import DecisionTreeRegressor 5 | 6 | 7 | class RandomForest: 8 | """Random Forest Regressor 9 | Parameters: 10 | ----------- 11 | n_estimators: int = 10 12 | The number of trees in the forest. 
13 | n_features: Optional[Union[str, int]] = 'sqrt' 14 | The number of features to consider when looking for the best split 15 | sample_size: float = 0.8 16 | Amount of data used (0-1) 17 | max_depth: Optional[int] = 10 18 | The maximum depth of the tree. 19 | min_leaf: Union[int, float] = 5 20 | The minimum number of samples required to be at a leaf node. 21 | """ 22 | def __init__(self, n_estimators: int = 10, n_features: Optional[Union[str, int]] = 'sqrt', sample_size: float = 0.8, 23 | max_depth: Optional[int] = 10, min_leaf: Union[int, float] = 5) -> None: 24 | self.n_estimators = n_estimators 25 | self.n_features = n_features 26 | self.sample_size = sample_size 27 | self.max_depth = max_depth 28 | self.min_leaf = min_leaf 29 | self.trees = [] 30 | 31 | def fit(self, X: Union[list, np.ndarray], y: Union[list, np.ndarray]) -> RandomForest: 32 | for _ in range(self.n_estimators): 33 | idxs = np.random.permutation(len(X))[:int(self.sample_size*len(X))] 34 | 35 | self.trees.append(DecisionTreeRegressor( 36 | max_depth=self.max_depth, min_samples_leaf=self.min_leaf, max_features=self.n_features).fit(X[idxs], y[idxs])) 37 | return self 38 | 39 | def predict(self, X: Union[list, np.ndarray]) -> np.ndarray: 40 | return np.mean([t.predict(X) for t in self.trees], axis=0) 41 | -------------------------------------------------------------------------------- /Algorithms/random_forest/code/scikit-learn/feature_importance_example.py: -------------------------------------------------------------------------------- 1 | # based on https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | from sklearn.datasets import make_classification 6 | from sklearn.ensemble import RandomForestClassifier 7 | 8 | 9 | RANDOM_STATE = 0 10 | 11 | # Build a classification task using 3 informative features 12 | X, y = make_classification(n_samples=1000, 13 | n_features=10, 14 | n_informative=3, 15 | n_redundant=0, 16 | n_repeated=0, 17 | n_classes=2, 18 | random_state=RANDOM_STATE, 19 | shuffle=False) 20 | 21 | # Create and train Random Forest 22 | model = RandomForestClassifier(n_estimators=250, random_state=RANDOM_STATE) 23 | model.fit(X, y) 24 | 25 | # Get feature importance 26 | importances = model.feature_importances_ 27 | std = np.std([tree.feature_importances_ for tree in model.estimators_], 28 | axis=0) 29 | indices = np.argsort(importances)[::-1] 30 | 31 | # Print the feature ranking 32 | print("Feature ranking:") 33 | 34 | for f in range(X.shape[1]): 35 | print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]])) 36 | 37 | # Plot the impurity-based feature importances of the forest 38 | plt.figure() 39 | plt.title("Feature importances") 40 | plt.bar(range(X.shape[1]), importances[indices], 41 | color="r", yerr=std[indices], align="center") 42 | plt.xticks(range(X.shape[1]), indices) 43 | plt.xlim([-1, X.shape[1]]) 44 | plt.show() 45 | -------------------------------------------------------------------------------- /Algorithms/random_forest/code/scikit-learn/out_of_bag_error_example.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import make_classification 2 | from sklearn.ensemble import RandomForestClassifier 3 | 4 | 5 | RANDOM_STATE = 123 6 | 7 | # Generate a binary classification dataset. 
8 | X, y = make_classification(n_samples=500, n_features=25, 9 | n_clusters_per_class=1, n_informative=15, 10 | random_state=RANDOM_STATE) 11 | 12 | model = RandomForestClassifier(oob_score=True, random_state=RANDOM_STATE) 13 | 14 | model.fit(X, y) 15 | 16 | print('Out of bag error:', model.oob_score_) 17 | -------------------------------------------------------------------------------- /Algorithms/random_forest/code/shap_feature_importance_example.py: -------------------------------------------------------------------------------- 1 | import shap 2 | from sklearn.ensemble import RandomForestClassifier 3 | from sklearn import datasets 4 | from sklearn.model_selection import train_test_split 5 | 6 | RANDOM_STATE = 0 7 | 8 | # Get Iris data 9 | iris = datasets.load_iris() 10 | X = iris.data 11 | y = iris.target 12 | 13 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE) 14 | 15 | # Create and train Random Forest 16 | model = RandomForestClassifier(random_state=RANDOM_STATE) 17 | model.fit(X_train, y_train) 18 | 19 | 20 | explainer = shap.TreeExplainer(model) 21 | shap_values = explainer.shap_values(X_test) 22 | 23 | shap.summary_plot(shap_values, X_test) -------------------------------------------------------------------------------- /Algorithms/random_forest/doc/bootstrapping_vertical.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/random_forest/doc/bootstrapping_vertical.png -------------------------------------------------------------------------------- /Algorithms/random_forest/doc/decision_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/random_forest/doc/decision_tree.png -------------------------------------------------------------------------------- /Algorithms/random_forest/doc/feature_importance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/random_forest/doc/feature_importance.png -------------------------------------------------------------------------------- /Algorithms/random_forest/doc/out_of_bag_set.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/random_forest/doc/out_of_bag_set.png -------------------------------------------------------------------------------- /Algorithms/random_forest/doc/random_forest_pipeline_horizontal_vertical.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/random_forest/doc/random_forest_pipeline_horizontal_vertical.png -------------------------------------------------------------------------------- /Algorithms/random_forest/doc/selecting_a_random_subset_of_variables_vertical.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Algorithms/random_forest/doc/selecting_a_random_subset_of_variables_vertical.png -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributing 2 | 3 | Thank you for wanting to contribute to Machine-Learning Explained. Machine-Learning Explained is an open-source repository containing explanations and implementations of machine learning algorithms and concepts, and as such any contributions that add to the current explanations or add new ones are more than welcome. 4 | 5 | ## Setup Machine-Learning-Explained and version control 6 | 7 | 1. Make a fork of this repository on GitHub. You will need an account with GitHub. This will allow you to make pull requests (PRs) later on. 8 | 2. Clone your fork. 9 | ```bash 10 | git clone <url-of-your-fork> 11 | cd Machine-Learning-Explained 12 | ``` 13 | 3. Make `git` aware of the Machine-Learning-Explained repo. 14 | ```bash 15 | git remote add upstream https://github.com/TannerGilbert/Machine-Learning-Explained.git 16 | git fetch upstream 17 | ``` 18 | 19 | ## Changing/Adding source code 20 | 21 | 1. Create a branch for your changes. 22 | ```bash 23 | git checkout -b <name-of-your-branch> 24 | ``` 25 | 2. Write some awesome code! (Make sure only to write code inside the `code` folders) 26 | 27 | ## Changing/Adding documentation 28 | 29 | 1. Create a branch for your changes. 30 | ```bash 31 | git checkout -b <name-of-your-branch> 32 | ``` 33 | 2. Make changes / add new documentation. 34 | > Note: Make sure to only work inside the `README.tex.md` files and not inside the `README.md` files. 35 | 3. Generate `README.md` file from `README.tex.md` 36 | 37 | 1. Install [`readme2tex`](https://github.com/leegao/readme2tex) 38 | ```bash 39 | pip install readme2tex 40 | ``` 41 | 2.
Convert `README.tex.md` to `README.md` 42 | ```bash 43 | python3 -m readme2tex --output README.md README.tex.md --svgdir tex --nocdn 44 | ``` -------------------------------------------------------------------------------- /Ensemble_Methods/code/averaging.py: -------------------------------------------------------------------------------- 1 | # based on https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard 2 | # and https://www.kaggle.com/eikedehling/trying-out-stacking-approaches 3 | from sklearn.base import BaseEstimator, TransformerMixin, clone, RegressorMixin 4 | import numpy as np 5 | 6 | 7 | class AveragedModels(BaseEstimator, RegressorMixin, TransformerMixin): 8 | def __init__(self, models): 9 | self.models = models 10 | 11 | def fit(self, X, y): 12 | self.models_ = [clone(x) for x in self.models] 13 | 14 | # Train cloned base models 15 | for model in self.models_: 16 | model.fit(X, y) 17 | 18 | return self 19 | 20 | def predict(self, X): 21 | predictions = np.column_stack([ 22 | model.predict(X) for model in self.models_ 23 | ]) 24 | return np.mean(predictions, axis=1) 25 | -------------------------------------------------------------------------------- /Ensemble_Methods/code/bagging.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator, TransformerMixin, clone, RegressorMixin 2 | import numpy as np 3 | 4 | 5 | class BaggingModels(BaseEstimator, RegressorMixin, TransformerMixin): 6 | def __init__(self, models, task_type='classification'): 7 | self.models = models 8 | self.task_type = task_type 9 | 10 | def fit(self, X, y): 11 | self.models_ = [clone(x) for x in self.models] 12 | 13 | for model in self.models_: 14 | X_tmp, y_tmp = self.subsample(X, y) 15 | model.fit(X_tmp, y_tmp) 16 | 17 | return self 18 | 19 | # Create a random subsample from the dataset with replacement 20 | @staticmethod 21 | def subsample(X, y, ratio=1.0): 22 | X_new, y_new = list(), list() 23 | n_sample = round(len(X) * ratio) 24 | while len(X_new) < n_sample: 25 | index = np.random.randint(len(X)) 26 | X_new.append(X[index]) 27 | y_new.append(y[index]) 28 | return X_new, y_new 29 | 30 | def predict(self, X): 31 | predictions_array = np.column_stack([ 32 | model.predict(X) for model in self.models_ 33 | ]) 34 | if self.task_type == 'classification': 35 | return np.array([np.argmax(np.bincount(predictions)) for predictions in predictions_array]) 36 | else: 37 | return np.mean(predictions_array, axis=1) 38 | 39 | def predict_proba(self, X): 40 | if self.task_type == 'classification': 41 | predictions = [] 42 | for x in X: 43 | prediction = np.row_stack([ 44 | model.predict_proba([x]) for model in self.models_ 45 | ]) 46 | predictions.append(np.mean(prediction, axis=0)) 47 | return np.array(predictions) 48 | return None -------------------------------------------------------------------------------- /Ensemble_Methods/code/blending.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator, TransformerMixin, clone, RegressorMixin 2 | from sklearn.model_selection import train_test_split 3 | import numpy as np 4 | 5 | 6 | class BlendingModels(BaseEstimator, RegressorMixin, TransformerMixin): 7 | def __init__(self, base_models, meta_model, holdout_pct=0.2, use_features_in_secondary=False): 8 | self.base_models = base_models 9 | self.meta_model = meta_model 10 | self.holdout_pct = holdout_pct 11 | self.use_features_in_secondary = use_features_in_secondary 12 | 
13 | def fit(self, X, y): 14 | """Fit all the models on the given dataset""" 15 | self.base_models_ = [clone(x) for x in self.base_models] 16 | self.meta_model_ = clone(self.meta_model) 17 | 18 | X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=self.holdout_pct) 19 | 20 | holdout_predictions = np.zeros((X_holdout.shape[0], len(self.base_models))) 21 | for i, model in enumerate(self.base_models_): 22 | model.fit(X_train, y_train) 23 | y_pred = model.predict(X_holdout) 24 | holdout_predictions[:, i] = y_pred 25 | if self.use_features_in_secondary: 26 | self.meta_model_.fit(np.hstack((X_holdout, holdout_predictions)), y_holdout) 27 | else: 28 | self.meta_model_.fit(holdout_predictions, y_holdout) 29 | 30 | return self 31 | 32 | def predict(self, X): 33 | meta_features = np.column_stack([ 34 | model.predict(X) for model in self.base_models_ 35 | ]) 36 | if self.use_features_in_secondary: 37 | return self.meta_model_.predict(np.hstack((X, meta_features))) 38 | else: 39 | return self.meta_model_.predict(meta_features) 40 | 41 | def predict_proba(self, X): 42 | meta_features = np.column_stack([ 43 | model.predict(X) for model in self.base_models_ 44 | ]) 45 | if self.use_features_in_secondary: 46 | return self.meta_model_.predict_proba(np.hstack((X, meta_features))) 47 | else: 48 | return self.meta_model_.predict_proba(meta_features) -------------------------------------------------------------------------------- /Ensemble_Methods/code/majority_vote.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator, TransformerMixin, clone, ClassifierMixin 2 | import numpy as np 3 | 4 | 5 | class MajorityVote(BaseEstimator, ClassifierMixin, TransformerMixin): 6 | def __init__(self, models): 7 | self.models = models 8 | 9 | def fit(self, X, y): 10 | self.models_ = [clone(x) for x in self.models] 11 | 12 | # Train cloned base models 13 | for model in self.models_: 14 | model.fit(X, y) 15 | 16 | return self 17 | 18 | def predict(self, X): 19 | predictions_array = np.column_stack([ 20 | model.predict(X) for model in self.models_ 21 | ]) 22 | return np.array([np.argmax(np.bincount(predictions)) for predictions in predictions_array]) -------------------------------------------------------------------------------- /Ensemble_Methods/code/stacking_retrained.py: -------------------------------------------------------------------------------- 1 | # based on https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard 2 | # and https://www.kaggle.com/eikedehling/trying-out-stacking-approaches 3 | from sklearn.base import BaseEstimator, TransformerMixin, clone, RegressorMixin 4 | from sklearn.model_selection import KFold 5 | import numpy as np 6 | 7 | 8 | class StackingModelsRetrained(BaseEstimator, RegressorMixin, TransformerMixin): 9 | def __init__(self, base_models, meta_model, n_folds=5, use_features_in_secondary=False): 10 | self.base_models = base_models 11 | self.meta_model = meta_model 12 | self.n_folds = n_folds 13 | self.use_features_in_secondary = use_features_in_secondary 14 | 15 | def fit(self, X, y): 16 | """Fit all the models on the given dataset""" 17 | self.base_models_ = [clone(x) for x in self.base_models] 18 | self.meta_model_ = clone(self.meta_model) 19 | kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=42) 20 | 21 | # Train cloned base models and create out-of-fold predictions 22 | out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models))) 23 | for i, model in 
enumerate(self.base_models): 24 | for train_index, holdout_index in kfold.split(X, y): 25 | instance = clone(model) 26 | instance.fit(X[train_index], y[train_index]) 27 | y_pred = instance.predict(X[holdout_index]) 28 | out_of_fold_predictions[holdout_index, i] = y_pred 29 | 30 | if self.use_features_in_secondary: 31 | self.meta_model_.fit(np.hstack((X, out_of_fold_predictions)), y) 32 | else: 33 | self.meta_model_.fit(out_of_fold_predictions, y) 34 | 35 | for model in self.base_models_: 36 | model.fit(X, y) 37 | 38 | return self 39 | 40 | def predict(self, X): 41 | meta_features = np.column_stack([ 42 | base_model.predict(X) for base_model in self.base_models_]) 43 | if self.use_features_in_secondary: 44 | return self.meta_model_.predict(np.hstack((X, meta_features))) 45 | else: 46 | return self.meta_model_.predict(meta_features) 47 | 48 | def predict_proba(self, X): 49 | meta_features = np.column_stack([ 50 | base_model.predict(X) for base_model in self.base_models_]) 51 | if self.use_features_in_secondary: 52 | return self.meta_model_.predict_proba(np.hstack((X, meta_features))) 53 | else: 54 | return self.meta_model_.predict_proba(meta_features) 55 | -------------------------------------------------------------------------------- /Ensemble_Methods/code/weighted_average.py: -------------------------------------------------------------------------------- 1 | # based on https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard 2 | # and https://www.kaggle.com/eikedehling/trying-out-stacking-approaches 3 | from sklearn.base import BaseEstimator, TransformerMixin, clone, RegressorMixin 4 | import numpy as np 5 | 6 | 7 | class WeightedAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin): 8 | def __init__(self, models, weights): 9 | self.models = models 10 | self.weights = weights 11 | assert sum(self.weights) == 1 12 | 13 | def fit(self, X, y): 14 | self.models_ = [clone(x) for x in self.models] 15 | 16 | # Train cloned base models 17 | for model in self.models_: 18 | model.fit(X, y) 19 | 20 | return self 21 | 22 | def predict(self, X): 23 | predictions = np.column_stack([ 24 | model.predict(X) for model in self.models_ 25 | ]) 26 | return np.sum(predictions * self.weights, axis=1) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Gilbert Tanner 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /Metrics/code/accuracy_score.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Accuracy: 5 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 6 | return self.loss(y, y_pred) 7 | 8 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 9 | return np.sum(y == y_pred) / y.shape[0] -------------------------------------------------------------------------------- /Metrics/code/binary_cross_entropy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class BinaryCrossentropy: 5 | def __init__(self): 6 | self.epsilon = 1e-15 7 | 8 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 9 | return self.loss(y, y_pred) 10 | 11 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 12 | # Avoid division by zero 13 | y_pred = np.clip(y_pred, self.epsilon, 1 - self.epsilon) 14 | return - y * np.log(y_pred) - (1 - y) * np.log(1 - y_pred) 15 | 16 | def gradient(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 17 | # Avoid division by zero 18 | y_pred = np.clip(y_pred, self.epsilon, 1 - self.epsilon) 19 | return - (y / y_pred) + (1 - y) / (1 - y_pred) 20 | -------------------------------------------------------------------------------- /Metrics/code/brier_score.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class BrierScore: 5 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 6 | return self.loss(y, y_pred) 7 | 8 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 9 | return np.sum(np.power(y - y_pred, 2)) / y.shape[0] 10 | -------------------------------------------------------------------------------- /Metrics/code/categorical_cross_entropy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class CategoricalCrossentropy: 5 | def __init__(self): 6 | self.epsilon = 1e-15 7 | 8 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 9 | return self.loss(y, y_pred) 10 | 11 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 12 | # Avoid division by zero 13 | y_pred = np.clip(y_pred, self.epsilon, 1 - self.epsilon) 14 | return - np.sum(y * np.log(y_pred)) / y.shape[0] 15 | -------------------------------------------------------------------------------- /Metrics/code/cosine_distance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class CosineDistance: 5 | 6 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 7 | return self.loss(y, y_pred) 8 | 9 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 10 | return np.dot(y, y_pred) / (np.linalg.norm(y) * np.linalg.norm(y_pred)) 11 | -------------------------------------------------------------------------------- /Metrics/code/d2_score.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tweedie_deviance import TweedieDeviance 3 | 4 | 5 
| class D2Score: 6 | def __init__(self, power: int) -> None: 7 | self.tweedie = TweedieDeviance(power) 8 | 9 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 10 | return self.loss(y, y_pred) 11 | 12 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 13 | return 1 - self.tweedie(y, y_pred) / self.tweedie(y, np.mean(y)) 14 | -------------------------------------------------------------------------------- /Metrics/code/f1_score.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from recall import Recall 3 | from precision import Precision 4 | 5 | 6 | class F1Score: 7 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 8 | return self.loss(y, y_pred) 9 | 10 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 11 | precision = Precision() 12 | recall = Recall() 13 | return 2 * (precision(y, y_pred) * recall(y, y_pred)) / (precision(y, y_pred) + recall(y, y_pred)) -------------------------------------------------------------------------------- /Metrics/code/fbeta_score.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from recall import Recall 3 | from precision import Precision 4 | 5 | 6 | class FBetaScore: 7 | def __init__(self, beta: float = 1.) -> None: 8 | self.beta = beta 9 | 10 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 11 | return self.loss(y, y_pred) 12 | 13 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 14 | precision = Precision() 15 | recall = Recall() 16 | return (1 + pow(self.beta, 2)) * (precision(y, y_pred) * recall(y, y_pred)) / ((pow(self.beta, 2) * precision(y, y_pred)) + recall(y, y_pred)) -------------------------------------------------------------------------------- /Metrics/code/hinge.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Hinge: 5 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 6 | return self.loss(y, y_pred) 7 | 8 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 9 | return np.sum(np.maximum(0, 1 - y * y_pred)) / len(y) 10 | -------------------------------------------------------------------------------- /Metrics/code/huber.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Huber: 5 | def __init__(self, delta: float = 1.) 
-> None: 6 | self.delta = delta 7 | 8 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 9 | return self.loss(y, y_pred) 10 | 11 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 12 | return np.where(np.abs(y - y_pred) < self.delta, 0.5 * (y - y_pred)**2, self.delta * (np.abs(y - y_pred)- 0.5 * self.delta)) 13 | -------------------------------------------------------------------------------- /Metrics/code/kl_divergence.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class KLDivergence: 5 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 6 | return self.loss(y, y_pred) 7 | 8 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 9 | return np.sum(np.where(y != 0, y * np.log(y / y_pred), 0)) -------------------------------------------------------------------------------- /Metrics/code/logcosh.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class LogCosh: 5 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 6 | return self.loss(y, y_pred) 7 | 8 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 9 | return np.sum(np.log(np.cosh(y_pred - y))) / y.shape[0] 10 | -------------------------------------------------------------------------------- /Metrics/code/mean_absolute_error.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class MeanAbsoluteError: 5 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 6 | return self.loss(y, y_pred) 7 | 8 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 9 | return 0.5 * np.sum(np.absolute(y - y_pred)) / y.shape[0] 10 | -------------------------------------------------------------------------------- /Metrics/code/mean_absolute_percentage_error.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class MeanAbsolutePercentageError: 5 | def __init__(self, eps: float = 1e-07): 6 | self.eps = eps 7 | 8 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 9 | return self.loss(y, y_pred) 10 | 11 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 12 | return np.sum(np.absolute((y - y_pred)) / np.maximum(self.eps, np.absolute(y))) / y.shape[0] 13 | -------------------------------------------------------------------------------- /Metrics/code/mean_squared_error.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class MeanSquaredError: 5 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 6 | return self.loss(y, y_pred) 7 | 8 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 9 | return 0.5 * np.linalg.norm(y_pred - y) ** 2 / y.shape[0] 10 | 11 | def gradient(self, y: np.ndarray, y_pred: np.ndarray) -> np.ndarray: 12 | return (y_pred - y) / y.shape[0] 13 | -------------------------------------------------------------------------------- /Metrics/code/mean_squared_log_error.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class MeanSquaredLogarithmicError: 5 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 6 | return self.loss(y, y_pred) 7 | 8 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 9 | return
np.sum(np.power(np.log(1 + y) - np.log(1 + y_pred), 2)) / y.shape[0] 10 | -------------------------------------------------------------------------------- /Metrics/code/median_absolute_error.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class MedianAbsoluteError: 5 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 6 | return self.loss(y, y_pred) 7 | 8 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 9 | return np.median(np.absolute(y - y_pred)) 10 | -------------------------------------------------------------------------------- /Metrics/code/poisson.py: -------------------------------------------------------------------------------- 1 | # based on https://keras.io/api/losses/probabilistic_losses/#poisson-class 2 | import numpy as np 3 | 4 | 5 | class Poisson: 6 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 7 | return self.loss(y, y_pred) 8 | 9 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 10 | return np.sum(y_pred - y * np.log(y_pred)) / y.shape[0] 11 | -------------------------------------------------------------------------------- /Metrics/code/precision.py: -------------------------------------------------------------------------------- 1 | # based on https://stats.stackexchange.com/questions/51296/how-do-you-calculate-precision-and-recall-for-multiclass-classification-using-co 2 | 3 | from sklearn.metrics import confusion_matrix 4 | import numpy as np 5 | 6 | 7 | class Precision: 8 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 9 | return self.loss(y, y_pred) 10 | 11 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 12 | cm = confusion_matrix(y, y_pred) 13 | return np.mean(np.diag(cm) / np.sum(cm, axis=0)) -------------------------------------------------------------------------------- /Metrics/code/r2_score.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class R2Score: 5 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 6 | return self.loss(y, y_pred) 7 | 8 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 9 | return 1 - (np.sum(np.power(y-y_pred, 2))) / (np.sum(np.power(y-np.mean(y), 2))) 10 | -------------------------------------------------------------------------------- /Metrics/code/recall.py: -------------------------------------------------------------------------------- 1 | # based on https://stats.stackexchange.com/questions/51296/how-do-you-calculate-precision-and-recall-for-multiclass-classification-using-co 2 | 3 | from sklearn.metrics import confusion_matrix 4 | import numpy as np 5 | 6 | 7 | class Recall: 8 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 9 | return self.loss(y, y_pred) 10 | 11 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 12 | cm = confusion_matrix(y, y_pred) 13 | return np.mean(np.diag(cm) / np.sum(cm, axis=1)) -------------------------------------------------------------------------------- /Metrics/code/tweedie_deviance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class TweedieDeviance: 5 | def __init__(self, power: int) -> None: 6 | self.power = power 7 | 8 | def __call__(self, y: np.ndarray, y_pred: np.ndarray) -> np.float64: 9 | return self.loss(y, y_pred) 10 | 11 | def loss(self, y: np.ndarray, y_pred: np.ndarray) -> 
np.float64: 12 | if self.power == 0: 13 | return np.sum(np.power(y - y_pred, 2)) / y.shape[0] 14 | elif self.power == 1: 15 | return np.sum(2 * (y * np.log(y / y_pred) + y_pred - y)) / y.shape[0] 16 | elif self.power == 2: 17 | return np.sum(2 * (np.log(y_pred / y) + y / y_pred - 1)) / y.shape[0] 18 | else: 19 | return np.sum(2 * (np.power(np.maximum(y, 0), 2-self.power) / ((1-self.power) * (2-self.power)) - (y * np.power(y_pred, 1 - self.power)) / (1 - self.power) + np.power(y_pred, 2 - self.power) / (2 - self.power))) / y.shape[0] 20 | -------------------------------------------------------------------------------- /Metrics/doc/binary_cross_entropy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Metrics/doc/binary_cross_entropy.png -------------------------------------------------------------------------------- /Metrics/doc/confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Metrics/doc/confusion_matrix.png -------------------------------------------------------------------------------- /Metrics/tex/36b5afebdba34564d884d347484ac0c7.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Metrics/tex/44bc9d542a92714cac84e01cbbb7fd61.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Metrics/tex/77a3b857d53fb44e33b53e4c8b68351a.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Metrics/tex/8217ed3c32a785f0b5aad4055f432ad8.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Metrics/tex/cf644cbd499c18ed6f22cee5950c0d75.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Metrics/tex/deceeaf6940a8c7a5a02373728002b0f.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Optimizers/adadelta/README.tex.md: -------------------------------------------------------------------------------- 1 | # ADADELTA: An Adaptive Learning Rate Method 2 | 3 | ![Adadelta Example](doc/adadelta_example.png) 4 | 5 | Adadelta is a stochastic gradient-based optimization algorithm that allows for per-dimension learning rates. Adadelta is an extension of Adagrad that seeks to reduce its aggressive, monotonically decreasing learning rate. Instead of accumulating all past squared gradients, Adadelta restricts the window of accumulated past gradients to a fixed size $\omega$. 
[1] 6 | 7 | Instead of inefficiently storing $\omega$ previous squared gradients, the sum of gradients is recursively defined as a decaying average of all past squared gradients. The running average $E\left[g^{2}\right]_{t}$ at time step $t$ therefore only depends on the previous average and the current gradient [2]: 8 | 9 | $$E\left[g^{2}\right]_{t} = \gamma{E}\left[g^{2}\right]_{t-1} + \left(1-\gamma\right)g^{2}_{t}$$ 10 | 11 | $\gamma$ is usually set to around 0.9. Rewriting SGD updates in terms of the parameter update vector: 12 | 13 | $$ \Delta\theta_{t} = -\eta\cdot{g_{t, i}}$$ 14 | 15 | $$\theta_{t+1} = \theta_{t} + \Delta\theta_{t}$$ 16 | 17 | Adadelta then takes the form: 18 | 19 | $$RMS[g]_{t}=\sqrt{E\left[g^{2}\right]_{t} + \epsilon}$$ 20 | 21 | $$ \Delta\theta_{t} = -\frac{\eta}{RMS[g]_{t}}g_{t} $$ 22 | 23 | The authors note that the units in the weight update don't match, i.e., the update should have the same hypothetical units as the parameters/weights. To realize this, they use the root mean squared error of parameter updates. 24 | 25 | $$E[\Delta \theta^2]_t = \gamma E[\Delta \theta^2]_{t-1} + (1 - \gamma) \Delta \theta^2_t$$ 26 | 27 | $$RMS[\Delta \theta]_{t} = \sqrt{E[\Delta \theta^2]_t + \epsilon}$$ 28 | 29 | Since $RMS[\Delta \theta]_{t}$ is unknown, it's approximated with the RMS of the parameter updates until the previous time step $RMS[\Delta \theta]_{t-1}$. 30 | 31 | $$\Delta \theta_t = - \dfrac{RMS[\Delta \theta]_{t-1}}{RMS[g]_{t}} g_{t}$$ 32 | $$\theta_{t+1} = \theta_t + \Delta \theta_t$$ 33 | 34 | For more information on how to derive this formula, take a look at '[An overview of gradient descent optimization algorithms](https://ruder.io/optimizing-gradient-descent/index.html#adadelta)' by [Sebastian Ruder](https://twitter.com/seb_ruder) and the [original Adadelta paper](https://arxiv.org/abs/1212.5701) by [Matthew D. Zeiler](https://arxiv.org/search/cs?searchtype=author&query=Zeiler%2C+M+D). 35 | 36 | Adadelta's main advantages over Adagrad are that it doesn't need a default learning rate and that it doesn't decrease the learning rate as aggressively and monotonically as Adagrad. 37 | 38 |

[1] Sebastian Ruder (2016). An overview of gradient descent optimization algorithms. arXiv preprint arXiv:1609.04747.

39 | 40 |

[2] https://paperswithcode.com/method/adadelta
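As a quick sanity check, here is a minimal usage sketch of the accompanying numpy implementation (`code/adadelta.py`) on a toy quadratic objective; the objective, starting point, and iteration count are assumptions made purely for illustration:

```python
# Minimal usage sketch (toy objective assumed, not part of the original repository):
# minimize f(w) = 0.5 * ||w||^2 with the Adadelta class from code/adadelta.py.
import numpy as np
from adadelta import Adadelta  # assumes code/adadelta.py is importable

def grad(w: np.ndarray) -> np.ndarray:
    return w  # gradient of f(w) = 0.5 * ||w||^2

optimizer = Adadelta(rho=0.95, epsilon=1e-7)
w = np.array([1.0, -2.0, 3.0])
for _ in range(1000):
    w = optimizer.update(w, grad(w))
print(w)  # drifts towards the minimum at the origin; Adadelta takes very small steps early on
```

Note that no learning rate is passed anywhere, which is exactly the point of the method.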

41 | 42 | ## Code 43 | 44 | * [Adadelta Numpy Implementation](code/adadelta.py) -------------------------------------------------------------------------------- /Optimizers/adadelta/code/adadelta.py: -------------------------------------------------------------------------------- 1 | # based on https://ruder.io/optimizing-gradient-descent/#adadelta 2 | # and https://github.com/eriklindernoren/ML-From-Scratch/blob/master/mlfromscratch/deep_learning/optimizers.py#L56 3 | 4 | import numpy as np 5 | 6 | 7 | class Adadelta: 8 | """Adadelta 9 | Parameters: 10 | ----------- 11 | rho: float = 0.95 12 | The decay rate. 13 | epsilon: float = 1e-07 14 | A small floating point value to avoid zero denominator. 15 | """ 16 | def __init__(self, rho: float = 0.95, epsilon: float = 1e-7) -> None: 17 | self.E_w_update = None # Running average of squared parameter updates 18 | self.E_grad = None # Running average of the squared gradient of w 19 | self.w_update = None # Parameter update 20 | self.epsilon = epsilon 21 | self.rho = rho 22 | 23 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray: 24 | if self.w_update is None: 25 | self.w_update = np.zeros(np.shape(w)) 26 | self.E_w_update = np.zeros(np.shape(w)) 27 | self.E_grad = np.zeros(np.shape(grad_wrt_w)) 28 | 29 | # Update average of gradients at w 30 | self.E_grad = self.rho * self.E_grad + (1 - self.rho) * np.power(grad_wrt_w, 2) 31 | 32 | # Calculate root mean squared error of the weight update and gradients 33 | RMS_delta_w = np.sqrt(self.E_w_update + self.epsilon) 34 | RMS_grad = np.sqrt(self.E_grad + self.epsilon) 35 | 36 | # Calculate adaptive learning rate 37 | adaptive_lr = RMS_delta_w / RMS_grad 38 | 39 | # Calculate the update 40 | self.w_update = adaptive_lr * grad_wrt_w 41 | 42 | # Update the running average of w updates 43 | self.E_w_update = self.rho * self.E_w_update + (1 - self.rho) * np.power(self.w_update, 2) 44 | 45 | return w - self.w_update 46 | -------------------------------------------------------------------------------- /Optimizers/adadelta/doc/adadelta_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/adadelta/doc/adadelta_example.png -------------------------------------------------------------------------------- /Optimizers/adadelta/tex/11c596de17c342edeed29f489aa4b274.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Optimizers/adadelta/tex/4f4f4e395762a3af4575de74c019ebb5.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Optimizers/adadelta/tex/ae4fb5973f393577570881fc24fc2054.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Optimizers/adagrad/README.tex.md: -------------------------------------------------------------------------------- 1 | # Adagrad 2 | 3 | ![Adagrad example gif](doc/adagrad_example.gif) 4 | 5 | Adagrad [1] is a gradient-based optimization algorithm that adaptively scales the learning rate to the parameters, performing smaller updates for parameters associated 
with frequently occurring features and larger updates for parameters associated with infrequent features, eliminating the need to tune the learning rate manually. The above-mentioned behavior makes Adagrad well-suited for dealing with sparse data, and Dean et al. [2] have found that Adagrad is much more robust than SGD. 6 | 7 | Reminder: The SGD update for each parameter $\theta_i$ looks as follows: 8 | 9 | $$\theta_{t+1, i} = \theta_{t, i} - \alpha \cdot \nabla_\theta J( \theta_{t, i} )$$ 10 | 11 | To scale the learning rate to each parameter, Adagrad modifies the learning rate $\alpha$ at each time step $t$ for every parameter $\theta_i$ based on the past gradients of $\theta_i$: 12 | 13 | $$\theta_{t+1, i} = \theta_{t, i} - \dfrac{\alpha}{\sqrt{G_{t, ii} + \epsilon}} \cdot \nabla_\theta J( \theta_{t, i} )$$ 14 | 15 | Here $G_{t} \in \mathbb{R}^{d \times d}$ is a diagonal matrix where each diagonal element $i, i$ is the sum of the squares of the gradients w.r.t. $\theta_i$ up to time step $t$, and $\epsilon$ is a smoothing term used to avoid division by zero. 16 | 17 | The above can be vectorized as follows: 18 | 19 | $$\theta_{t+1} = \theta_{t} - \dfrac{\alpha}{\sqrt{G_{t} + \epsilon}} \odot \nabla_\theta J( \theta_{t} )$$ 20 | 21 | Adagrad's most significant benefit is that it eliminates the need to tune the learning rate manually, but it still isn't perfect. Its main weakness is that it accumulates the squared gradients in the denominator. Since all the squared terms are positive, the accumulated sum keeps on growing during training. Therefore, the learning rate keeps shrinking as the training continues, and it eventually becomes infinitesimally small. Other algorithms like Adadelta, RMSprop, and Adam try to resolve this flaw. [3] 22 | 23 |

[1] Duchi, J., Hazan, E., & Singer, Y. (2011). Adaptive Subgradient Methods for Online Learning and Stochastic Optimization. Journal of Machine Learning Research, 12, 2121–2159. Retrieved from [http://jmlr.org/papers/v12/duchi11a.html](http://jmlr.org/papers/v12/duchi11a.html) 24 |

[2] Dean, J., Corrado, G. S., Monga, R., Chen, K., Devin, M., Le, Q. V, … Ng, A. Y. (2012). Large Scale Distributed Deep Networks. NIPS 2012: Neural Information Processing Systems, 1–11. [http://papers.nips.cc/paper/4687-large-scale-distributed-deep-networks.pdf](http://papers.nips.cc/paper/4687-large-scale-distributed-deep-networks.pdf)

25 |

[3] Sebastian Ruder (2016). An overview of gradient descent optimization algorithms. arXiv preprint arXiv:1609.04747.
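The per-parameter scaling can be made concrete with a short sketch; the gradient pattern below (one frequently updated parameter, one rarely updated parameter) is an assumption chosen only to expose the effect:

```python
# Illustrative sketch of Adagrad's per-parameter learning rate scaling (toy gradient pattern assumed).
import numpy as np

learning_rate, epsilon = 0.1, 1e-7
G = np.zeros(2)              # running sum of squared gradients, one entry per parameter
w = np.array([1.0, 1.0])

for step in range(1, 101):
    # parameter 0 receives a gradient every step ("frequent feature"),
    # parameter 1 only every 10th step ("infrequent feature")
    g = np.array([0.5, 0.5 if step % 10 == 0 else 0.0])
    G += g ** 2
    w -= learning_rate / np.sqrt(G + epsilon) * g

print(learning_rate / np.sqrt(G + epsilon))  # the infrequent parameter keeps a much larger effective step size
```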

26 | 27 | ## Code 28 | 29 | - [Adagrad Numpy Implementation](code/adagrad.py) -------------------------------------------------------------------------------- /Optimizers/adagrad/code/adagrad.py: -------------------------------------------------------------------------------- 1 | # based on https://ruder.io/optimizing-gradient-descent/#adagrad 2 | # and https://github.com/eriklindernoren/ML-From-Scratch/blob/master/mlfromscratch/deep_learning/optimizers.py#L41 3 | 4 | import numpy as np 5 | 6 | 7 | class Adagrad: 8 | """Adagrad 9 | Parameters: 10 | ----------- 11 | learning_rate: float = 0.001 12 | The step length used when following the negative gradient. 13 | initial_accumulator_value: float = 0.1 14 | Starting value for the accumulators, must be non-negative. 15 | epsilon: float = 1e-07 16 | A small floating point value to avoid zero denominator. 17 | """ 18 | def __init__(self, learning_rate: float = 0.001, initial_accumulator_value: float = 0.1, epsilon: float = 1e-07) -> None: 19 | self.learning_rate = learning_rate 20 | self.initial_accumulator_value = initial_accumulator_value 21 | self.epsilon = epsilon 22 | self.G = np.array([]) # Sum of squares of the gradients 23 | 24 | assert self.initial_accumulator_value > 0, "initial_accumulator_value must be non-negative" 25 | 26 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray: 27 | # Initialize w_update if not initialized yet 28 | if not self.G.any(): 29 | self.G = np.full(np.shape(w), self.initial_accumulator_value) 30 | # Add the square of the gradient of the loss function at w 31 | self.G += np.power(grad_wrt_w, 2) 32 | return w - self.learning_rate * grad_wrt_w / np.sqrt(self.G + self.epsilon) 33 | -------------------------------------------------------------------------------- /Optimizers/adagrad/doc/adagrad_example.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/adagrad/doc/adagrad_example.gif -------------------------------------------------------------------------------- /Optimizers/adagrad/tex/4f4f4e395762a3af4575de74c019ebb5.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Optimizers/adagrad/tex/7ccca27b5ccc533a2dd72dc6fa28ed84.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Optimizers/adagrad/tex/c745b9b57c145ec5577b82542b2df546.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Optimizers/adam/README.tex.md: -------------------------------------------------------------------------------- 1 | # Adaptive Moment Estimation (Adam) 2 | 3 | ![Adam Example](doc/adam_example.PNG) 4 | 5 | Adaptive Moment Estimation better known as Adam is another adaptive learning rate method first published in 2014 by Kingma et. al. [1] In addition to storing an exponentially decaying average of past squared gradients $v_t$ like Adadelta or RMSprop, Adam also keeps an exponentially decaying average of past gradients $m_t$, similar to SGD with momentum. 
[2] 6 | 7 | $$m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t$$ 8 | 9 | $$v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2$$ 10 | 11 | $m_t$ is an estimate of the first [moment](https://en.wikipedia.org/wiki/Moment_(mathematics)) (the mean) and $v_t$ is the estimate of the second moment (the uncentered variance) of the gradients, respectively. As $m_t$ and $v_t$ are initialized as vectors of 0's, the authors of Adam observe that they are biased towards zero, especially during the initial time steps, and especially when the decay rates are small (i.e., $\beta_1$ and $\beta_2$ are close to 1). [2] 12 | 13 | To counteract these biases, they calculate bias-corrected first and second moment estimates: 14 | 15 | $$\hat{m}_t = \dfrac{m_t}{1 - \beta^t_1}$$ 16 | 17 | $$\hat{v}_t = \dfrac{v_t}{1 - \beta^t_2}$$ 18 | 19 | $\hat{m}_t$ and $\hat{v}_t$ are then used to update the parameters as follows: 20 | 21 | $$\theta_{t+1} = \theta_{t} - \dfrac{\eta}{\sqrt{\hat{v}_t} + \epsilon} \hat{m}_t$$ 22 | 23 | As default values for $\beta_1$ and $\beta_2$, the authors propose $0.9$ for $\beta_1$ and $0.999$ for $\beta_2$. 24 | 25 |

[1] Diederik P. Kingma and Jimmy Ba (2014). Adam: A Method for Stochastic Optimization.

26 | 27 |

[2] Sebastian Ruder (2016). An overview of gradient descent optimization algorithms. arXiv preprint arXiv:1609.04747.
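To see what the bias correction buys at the very first time step, consider this small numeric sketch (the gradient values are made up for illustration): without the correction the moment estimates start out shrunk towards zero, with it they match the observed gradient exactly.

```python
# Numeric sketch of Adam's bias correction at t = 1 (illustrative gradient values).
import numpy as np

beta_1, beta_2 = 0.9, 0.999
g = np.array([0.5, -1.0])        # hypothetical first gradient

m = (1 - beta_1) * g             # raw first-moment estimate: [0.05, -0.1], biased towards 0
v = (1 - beta_2) * g ** 2        # raw second-moment estimate: [0.00025, 0.001]
m_hat = m / (1 - beta_1 ** 1)    # bias-corrected: recovers g exactly
v_hat = v / (1 - beta_2 ** 1)    # bias-corrected: recovers g ** 2 exactly

print(m, m_hat)
print(v, v_hat)
```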

28 | 29 | ## Code 30 | 31 | - [Adam Numpy Implementation](code/adam.py) 32 | 33 | ## Resources 34 | 35 | - [https://arxiv.org/abs/1412.6980](https://arxiv.org/abs/1412.6980) 36 | - [https://ruder.io/optimizing-gradient-descent/index.html#adam](https://ruder.io/optimizing-gradient-descent/index.html#adam) 37 | - [https://towardsdatascience.com/adam-latest-trends-in-deep-learning-optimization-6be9a291375c](https://towardsdatascience.com/adam-latest-trends-in-deep-learning-optimization-6be9a291375c) -------------------------------------------------------------------------------- /Optimizers/adam/code/adam.py: -------------------------------------------------------------------------------- 1 | # based on https://ruder.io/optimizing-gradient-descent/#adam 2 | # and https://github.com/eriklindernoren/ML-From-Scratch/blob/master/mlfromscratch/deep_learning/optimizers.py#L106 3 | 4 | import numpy as np 5 | 6 | 7 | class Adam: 8 | """Adam - Adaptive Moment Estimation 9 | Parameters: 10 | ----------- 11 | learning_rate: float = 0.001 12 | The step length used when following the negative gradient. 13 | beta_1: float = 0.9 14 | The exponential decay rate for the 1st moment estimates. 15 | beta_2: float = 0.999 16 | The exponential decay rate for the 2nd moment estimates. 17 | epsilon: float = 1e-07 18 | A small floating point value to avoid zero denominator. 19 | """ 20 | def __init__(self, learning_rate: float = 0.001, beta_1: float = 0.9, beta_2: float = 0.999, epsilon: float = 1e-7) -> None: 21 | self.learning_rate = learning_rate 22 | self.epsilon = epsilon 23 | self.beta_1 = beta_1 24 | self.beta_2 = beta_2 25 | 26 | self.t = 0 27 | self.m = None # Decaying averages of past gradients 28 | self.v = None # Decaying averages of past squared gradients 29 | 30 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray: 31 | self.t += 1 32 | if self.m is None: 33 | self.m = np.zeros(np.shape(grad_wrt_w)) 34 | self.v = np.zeros(np.shape(grad_wrt_w)) 35 | 36 | self.m = self.beta_1 * self.m + (1 - self.beta_1) * grad_wrt_w 37 | self.v = self.beta_2 * self.v + (1 - self.beta_2) * np.power(grad_wrt_w, 2) 38 | 39 | m_hat = self.m / (1 - self.beta_1**self.t) 40 | v_hat = self.v / (1 - self.beta_2**self.t) 41 | 42 | w_update = self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon) 43 | 44 | return w - w_update 45 | -------------------------------------------------------------------------------- /Optimizers/adam/doc/adam_example.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/adam/doc/adam_example.PNG -------------------------------------------------------------------------------- /Optimizers/adamax/README.tex.md: -------------------------------------------------------------------------------- 1 | # AdaMax 2 | 3 | ![AdaMax Example](doc/adamax_example.PNG) 4 | 5 | In [Adam](https://ml-explained.com/blog/adam-explained), the update rule for individual weights is scaling their gradients inversely proportional to the $\ell_2$ norm of the past and current gradients. 6 | 7 | $$v_t = \beta_2 v_{t-1} + (1 - \beta_2) |g_t|^2$$ 8 | 9 | The L2 norm can be generalized to the $\ell_p$ norm. 10 | 11 | $$v_t = \beta_2^p v_{t-1} + (1 - \beta_2^p) |g_t|^p$$ 12 | 13 | Such variants generally become numerically unstable for large $p$, which is why $\ell_1$ and $\ell_2$ norms are most common in practice. 
However, in the special case where we let $p \rightarrow \infty$, a surprisingly simple and stable algorithm emerges. 14 | 15 | To avoid confusion with Adam, we use $u_t$ to denote the infinity norm-constrained $v_t$: 16 | 17 | $$ 18 | u_t = \beta_2^\infty v_{t-1} + (1 - \beta_2^\infty) |g_t|^\infty 19 | = \max(\beta_2 \cdot v_{t-1}, |g_t|) 20 | $$ 21 | 22 | We can now plug $u_t$ into the Adam update equation replacing $\sqrt{\hat{v}_t} + \epsilon$ to obtain the AdaMax update rule: 23 | 24 | $$\theta_{t+1} = \theta_{t} - \dfrac{\eta}{u_t} \hat{m}_t$$ 25 | 26 | ## Code 27 | 28 | - [AdaMax Numpy Implementation](code/adamax.py) 29 | 30 | ## Resources 31 | 32 | - [https://arxiv.org/abs/1412.6980](https://arxiv.org/abs/1412.6980) 33 | - [https://ruder.io/optimizing-gradient-descent/index.html#adamax](https://ruder.io/optimizing-gradient-descent/index.html#adamax) 34 | - [https://keras.io/api/optimizers/adamax/](https://keras.io/api/optimizers/adamax/) -------------------------------------------------------------------------------- /Optimizers/adamax/code/adamax.py: -------------------------------------------------------------------------------- 1 | # based on https://ruder.io/optimizing-gradient-descent/#adamax 2 | 3 | import numpy as np 4 | 5 | 6 | class AdaMax: 7 | """AdaMax 8 | Parameters: 9 | ----------- 10 | learning_rate: float = 0.001 11 | The step length used when following the negative gradient. 12 | beta_1: float = 0.9 13 | The exponential decay rate for the 1st moment estimates. 14 | beta_2: float = 0.999 15 | The exponential decay rate for the 2nd moment estimates. 16 | epsilon: float = 1e-07 17 | A small floating point value to avoid zero denominator. 18 | """ 19 | def __init__(self, learning_rate: float = 0.001, beta_1: float = 0.9, beta_2: float = 0.999, epsilon: float = 1e-7) -> None: 20 | self.learning_rate = learning_rate 21 | self.epsilon = epsilon 22 | self.beta_1 = beta_1 23 | self.beta_2 = beta_2 24 | 25 | self.t = 0 26 | self.m = None # Decaying averages of past gradients 27 | self.v = None # Decaying averages of past squared gradients 28 | 29 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray: 30 | self.t += 1 31 | if self.m is None: 32 | self.m = np.zeros(np.shape(grad_wrt_w)) 33 | self.v = np.zeros(np.shape(grad_wrt_w)) 34 | 35 | self.m = self.beta_1 * self.m + (1 - self.beta_1) * grad_wrt_w 36 | self.v = np.maximum(self.beta_2 * self.v, np.abs(grad_wrt_w)) 37 | 38 | m_hat = self.m / (1 - self.beta_1**self.t) 39 | 40 | w_update = self.learning_rate * m_hat / (self.v + self.epsilon) 41 | 42 | return w - w_update 43 | -------------------------------------------------------------------------------- /Optimizers/adamax/doc/adamax_example.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/adamax/doc/adamax_example.PNG -------------------------------------------------------------------------------- /Optimizers/adamw/README.md: -------------------------------------------------------------------------------- 1 | # AdamW 2 | 3 | AdamW is a stochastic optimization method that modifies the typical implementation of weight decay in Adam to combat Adam's known convergence problems by decoupling the weight decay from the gradient updates. 
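The difference between the two weight decay flavours can be sketched in a few lines; the moment estimates below are simplified stand-ins (a single step with $\hat{m}_t = g_t$ and $\hat{v}_t = g_t^2$) rather than a faithful Adam run, so treat this purely as an illustration of where the decay term enters the update:

```python
# Sketch: L2 regularization inside the adaptive step vs. decoupled (AdamW-style) weight decay.
# All values and the simplified moment estimates are assumptions for illustration only.
import numpy as np

w = np.array([1.0, -2.0])
grad = np.array([0.1, 0.1])
lr, wd, eps = 0.01, 0.01, 1e-7

# Variant 1: decay folded into the gradient, so it gets rescaled by the adaptive denominator.
g_reg = grad + wd * w
step_l2 = lr * g_reg / (np.sqrt(g_reg ** 2) + eps)

# Variant 2: decoupled decay, applied to the weights outside the adaptive scaling.
step_decoupled = lr * grad / (np.sqrt(grad ** 2) + eps) + lr * wd * w

print(w - step_l2)         # the decay has almost no effect: it is normalized away with the gradient
print(w - step_decoupled)  # the extra lr * wd * w term pulls every weight towards zero on top of the gradient step
```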
4 | 5 | ![AdamW](doc/adamw.png) 6 | 7 | ## Code 8 | 9 | - [AdamW Numpy Implementation](code/adamw.py) 10 | 11 | ## Resources 12 | 13 | - [https://arxiv.org/abs/1711.05101](https://arxiv.org/abs/1711.05101) 14 | - [https://paperswithcode.com/method/adamw](https://paperswithcode.com/method/adamw) 15 | - [https://www.fast.ai/2018/07/02/adam-weight-decay/](https://www.fast.ai/2018/07/02/adam-weight-decay/) 16 | - [https://towardsdatascience.com/why-adamw-matters-736223f31b5d](https://towardsdatascience.com/why-adamw-matters-736223f31b5d) -------------------------------------------------------------------------------- /Optimizers/adamw/README.tex.md: -------------------------------------------------------------------------------- 1 | # AdamW 2 | 3 | AdamW is a stochastic optimization method that modifies the typical implementation of weight decay in Adam to combat Adam's known convergence problems by decoupling the weight decay from the gradient updates. 4 | 5 | ![AdamW](doc/adamw.png) 6 | 7 | ## Code 8 | 9 | - [AdamW Numpy Implementation](code/adamw.py) 10 | 11 | ## Resources 12 | 13 | - [https://arxiv.org/abs/1711.05101](https://arxiv.org/abs/1711.05101) 14 | - [https://paperswithcode.com/method/adamw](https://paperswithcode.com/method/adamw) 15 | - [https://www.fast.ai/2018/07/02/adam-weight-decay/](https://www.fast.ai/2018/07/02/adam-weight-decay/) 16 | - [https://towardsdatascience.com/why-adamw-matters-736223f31b5d](https://towardsdatascience.com/why-adamw-matters-736223f31b5d) -------------------------------------------------------------------------------- /Optimizers/adamw/code/adamw.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class AdamW: 5 | """AdamW 6 | Parameters: 7 | ----------- 8 | learning_rate: float = 0.001 9 | The step length used when following the negative gradient. 10 | beta_1: float = 0.9 11 | The exponential decay rate for the 1st moment estimates. 12 | beta_2: float = 0.999 13 | The exponential decay rate for the 2nd moment estimates. 14 | epsilon: float = 1e-07 15 | A small floating point value to avoid zero denominator. 16 | weight_decay: float = 0.01 17 | Amount of weight decay to be applied. 
18 | """ 19 | def __init__(self, learning_rate: float = 0.001, beta_1: float = 0.9, beta_2: float = 0.999, epsilon: float = 1e-7, weight_decay: float = 0.01) -> None: 20 | self.learning_rate = learning_rate 21 | self.epsilon = epsilon 22 | self.beta_1 = beta_1 23 | self.beta_2 = beta_2 24 | self.weight_decay = weight_decay 25 | 26 | self.t = 0 27 | self.m = None # Decaying averages of past gradients 28 | self.v = None # Decaying averages of past squared gradients 29 | 30 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray: 31 | self.t += 1 32 | if self.m is None: 33 | self.m = np.zeros(np.shape(grad_wrt_w)) 34 | self.v = np.zeros(np.shape(grad_wrt_w)) 35 | 36 | self.m = self.beta_1 * self.m + (1 - self.beta_1) * grad_wrt_w 37 | self.v = self.beta_2 * self.v + (1 - self.beta_2) * np.power(grad_wrt_w, 2) 38 | 39 | m_hat = self.m / (1 - self.beta_1**self.t) 40 | v_hat = self.v / (1 - self.beta_2**self.t) 41 | 42 | w_update = self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon) + self.weight_decay * grad_wrt_w 43 | 44 | return w - w_update 45 | -------------------------------------------------------------------------------- /Optimizers/adamw/doc/adamw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/adamw/doc/adamw.png -------------------------------------------------------------------------------- /Optimizers/amsgrad/README.md: -------------------------------------------------------------------------------- 1 | # AMSGrad 2 | 3 | ![AMSGrad Example](doc/amsgrad_example.png) 4 | 5 | The motivation for AMSGrad lies with the observation that [Adam](https://ml-explained.com/blog/adam-explained) fails to converge to an optimal solution for some data-sets and is outperformed by SDG with momentum. 6 | 7 | Reddi et al. (2018) [1] show that one cause of the issue described above is the use of the exponential moving average of the past squared gradients. 8 | 9 | To fix the above-described behavior, the authors propose a new algorithm called AMSGrad that keeps a running maximum of the squared gradients instead of an exponential moving average. 10 | 11 |

12 | 13 |

14 | 15 | For simplicity, the authors also removed the debiasing step, which leads to the following update rule: 16 | 17 |

18 | 19 | For more information, check out the paper '[On the Convergence of Adam and Beyond](https://arxiv.org/abs/1904.09237v1)' and the [AMSGrad section](https://ruder.io/optimizing-gradient-descent/index.html#amsgrad) of the '[An overview of gradient descent optimization algorithms](https://ruder.io/optimizing-gradient-descent/index.html)' article. 20 | 21 |

[1] Reddi, Sashank J., Kale, Satyen, & Kumar, Sanjiv. [On the Convergence of Adam and Beyond](https://arxiv.org/abs/1904.09237v1).

22 | 23 | ## Code 24 | 25 | - [AMSGrad Numpy Implementation](code/amsgrad.py) 26 | 27 | ## Resources 28 | 29 | - [https://arxiv.org/abs/1904.09237v1](https://arxiv.org/abs/1904.09237v1) 30 | - [https://paperswithcode.com/method/amsgrad](https://paperswithcode.com/method/amsgrad) 31 | - [https://ruder.io/optimizing-gradient-descent/index.html#amsgrad](https://ruder.io/optimizing-gradient-descent/index.html#amsgrad) -------------------------------------------------------------------------------- /Optimizers/amsgrad/README.tex.md: -------------------------------------------------------------------------------- 1 | # AMSGrad 2 | 3 | ![AMSGrad Example](doc/amsgrad_example.png) 4 | 5 | The motivation for AMSGrad lies with the observation that [Adam](https://ml-explained.com/blog/adam-explained) fails to converge to an optimal solution for some data-sets and is outperformed by SDG with momentum. 6 | 7 | Reddi et al. (2018) [1] show that one cause of the issue described above is the use of the exponential moving average of the past squared gradients. 8 | 9 | To fix the above-described behavior, the authors propose a new algorithm called AMSGrad that keeps a running maximum of the squared gradients instead of an exponential moving average. 10 | 11 | $$v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2$$ 12 | 13 | $$\hat{v}_t = \text{max}(\hat{v}_{t-1}, v_t)$$ 14 | 15 | For simplicity, the authors also removed the debiasing step, which leads to the following update rule: 16 | 17 | $$\begin{align} \begin{split} m_t &= \beta_1 m_{t-1} + (1 - \beta_1) g_t \\ v_t &= \beta_2 v_{t-1} + (1 - \beta_2) g_t^2\\ \hat{v}_t &= \text{max}(\hat{v}_{t-1}, v_t) \\ \theta_{t+1} &= \theta_{t} - \dfrac{\eta}{\sqrt{\hat{v}_t} + \epsilon} m_t \end{split} \end{align}$$ 18 | 19 | For more information, check out the paper '[On the Convergence of Adam and Beyond](https://arxiv.org/abs/1904.09237v1)' and the [AMSGrad section](https://ruder.io/optimizing-gradient-descent/index.html#amsgrad) of the '[An overview of gradient descent optimization algorithms](https://ruder.io/optimizing-gradient-descent/index.html)' article. 20 | 21 |

[1] Reddi, Sashank J., Kale, Satyen, & Kumar, Sanjiv. [On the Convergence of Adam and Beyond](https://arxiv.org/abs/1904.09237v1).
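A tiny sketch makes the role of the running maximum visible; the gradient sequence (a short burst of large gradients followed by small ones) is an assumption chosen to trigger the behaviour the paper describes:

```python
# Sketch of AMSGrad's running maximum of the second-moment estimate (toy gradient sequence assumed).
beta_2 = 0.999
v, v_hat = 0.0, 0.0
grads = [1.0] * 5 + [0.01] * 5   # burst of large gradients, then small ones

for g in grads:
    v = beta_2 * v + (1 - beta_2) * g ** 2   # Adam's moving average starts decaying again
    v_hat = max(v_hat, v)                    # AMSGrad never lets the denominator shrink
    print(f"v={v:.6f}  v_hat={v_hat:.6f}")
```

Keeping `v_hat` from shrinking prevents the effective learning rate from growing back after the large gradients have passed.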

22 | 23 | ## Code 24 | 25 | - [AMSGrad Numpy Implementation](code/amsgrad.py) 26 | 27 | ## Resources 28 | 29 | - [https://arxiv.org/abs/1904.09237v1](https://arxiv.org/abs/1904.09237v1) 30 | - [https://paperswithcode.com/method/amsgrad](https://paperswithcode.com/method/amsgrad) 31 | - [https://ruder.io/optimizing-gradient-descent/index.html#amsgrad](https://ruder.io/optimizing-gradient-descent/index.html#amsgrad) -------------------------------------------------------------------------------- /Optimizers/amsgrad/code/amsgrad.py: -------------------------------------------------------------------------------- 1 | # based on https://ruder.io/optimizing-gradient-descent/#amsgrad 2 | 3 | import numpy as np 4 | 5 | 6 | class AMSGrad: 7 | """AMSGrad 8 | Parameters: 9 | ----------- 10 | learning_rate: float = 0.001 11 | The step length used when following the negative gradient. 12 | beta_1: float = 0.9 13 | The exponential decay rate for the 1st moment estimates. 14 | beta_2: float = 0.999 15 | The exponential decay rate for the 2nd moment estimates. 16 | epsilon: float = 1e-07 17 | A small floating point value to avoid zero denominator. 18 | """ 19 | def __init__(self, learning_rate: float = 0.001, beta_1: float = 0.9, beta_2: float = 0.999, epsilon: float = 1e-7) -> None: 20 | self.learning_rate = learning_rate 21 | self.epsilon = epsilon 22 | self.beta_1 = beta_1 23 | self.beta_2 = beta_2 24 | 25 | self.m = None # Decaying averages of past gradients 26 | self.v = None # Decaying averages of past squared gradients 27 | 28 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray: 29 | if self.m is None: 30 | self.m = np.zeros(np.shape(grad_wrt_w)) 31 | self.v = np.zeros(np.shape(grad_wrt_w)) 32 | 33 | self.m = self.beta_1 * self.m + (1 - self.beta_1) * grad_wrt_w 34 | v_1 = self.v 35 | self.v = self.beta_2 * self.v + (1 - self.beta_2) * np.power(grad_wrt_w, 2) 36 | 37 | v_hat = np.maximum(v_1, self.v) 38 | 39 | w_update = self.learning_rate * self.m / (np.sqrt(v_hat) + self.epsilon) 40 | 41 | return w - w_update 42 | -------------------------------------------------------------------------------- /Optimizers/amsgrad/doc/amsgrad_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/amsgrad/doc/amsgrad_example.png -------------------------------------------------------------------------------- /Optimizers/gradient_descent/code/gradient_descent_with_momentum.py: -------------------------------------------------------------------------------- 1 | # based on https://ruder.io/optimizing-gradient-descent/ 2 | # and https://github.com/eriklindernoren/ML-From-Scratch/blob/master/mlfromscratch/deep_learning/optimizers.py#L9 3 | 4 | import numpy as np 5 | 6 | 7 | class GradientDescent: 8 | """Gradient Descent with Momentum 9 | Parameters: 10 | ----------- 11 | learning_rate: float = 0.01 12 | The step length used when following the negative gradient. 13 | momentum: float = 0.0 14 | Amount of momentum to use. 15 | Momentum accelerates gradient descent in the relevant direction and dampens oscillations. 
16 | """ 17 | def __init__(self, learning_rate: float = 0.01, momentum: float = 0.0) -> None: 18 | self.learning_rate = learning_rate 19 | self.momentum = momentum 20 | self.w_update = np.array([]) 21 | 22 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray: 23 | # Initialize w_update if not initialized yet 24 | if not self.w_update.any(): 25 | self.w_update = np.zeros(np.shape(w)) 26 | # Use momentum if set 27 | self.w_update = self.momentum * self.w_update + (1 - self.momentum) * grad_wrt_w 28 | # Move against the gradient to minimize loss 29 | return w - self.learning_rate * self.w_update 30 | -------------------------------------------------------------------------------- /Optimizers/gradient_descent/code/gradient_descent_with_nesterov_momentum.py: -------------------------------------------------------------------------------- 1 | # based on https://ruder.io/optimizing-gradient-descent/#nesterovacceleratedgradient 2 | # and https://github.com/eriklindernoren/ML-From-Scratch/blob/master/mlfromscratch/deep_learning/optimizers.py#L24 3 | 4 | from typing import Callable 5 | import numpy as np 6 | 7 | 8 | class NesterovAcceleratedGradientDescent: 9 | """Gradient Descent with Nesterov Momentum 10 | Parameters: 11 | ----------- 12 | learning_rate: float = 0.01 13 | The step length used when following the negative gradient. 14 | momentum: float = 0.0 15 | Amount of momentum to use. 16 | Momentum accelerates gradient descent in the relevant direction and dampens oscillations. 17 | """ 18 | def __init__(self, learning_rate: float = 0.01, momentum: float = 0.0) -> None: 19 | self.learning_rate = learning_rate 20 | self.momentum = momentum 21 | self.w_update = np.array([]) 22 | 23 | def update(self, w: np.ndarray, grad_func: Callable) -> np.ndarray: 24 | # Calculate the gradient of the loss a bit further down the slope from w 25 | approx_future_grad = np.clip(grad_func(w - self.momentum * self.w_update), -1, 1) 26 | # Initialize w_update if not initialized yet 27 | if not self.w_update.any(): 28 | self.w_update = np.zeros(np.shape(w)) 29 | 30 | self.w_update = self.momentum * self.w_update + self.learning_rate * approx_future_grad 31 | # Move against the gradient to minimize loss 32 | return w - self.w_update 33 | -------------------------------------------------------------------------------- /Optimizers/gradient_descent/doc/gradient_descent.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/gradient_descent/doc/gradient_descent.gif -------------------------------------------------------------------------------- /Optimizers/gradient_descent/doc/momentum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/gradient_descent/doc/momentum.png -------------------------------------------------------------------------------- /Optimizers/gradient_descent/doc/nesterov_accelerated_gradient.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/gradient_descent/doc/nesterov_accelerated_gradient.png -------------------------------------------------------------------------------- 
/Optimizers/gradient_descent/doc/pick_learning_rate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/gradient_descent/doc/pick_learning_rate.png -------------------------------------------------------------------------------- /Optimizers/gradient_descent/doc/variations_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/gradient_descent/doc/variations_comparison.png -------------------------------------------------------------------------------- /Optimizers/gradient_descent/tex/11c596de17c342edeed29f489aa4b274.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Optimizers/gradient_descent/tex/27e556cf3caa0673ac49a8f0de3c73ca.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Optimizers/gradient_descent/tex/c745b9b57c145ec5577b82542b2df546.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Optimizers/nadam/README.md: -------------------------------------------------------------------------------- 1 | # Nadam (Nesterov-accelerated Adaptive Moment Estimation) 2 | 3 | ![Nadam Training Example](doc/nadam_example.png) 4 | 5 | Nadam (Nesterov-accelerated Adaptive Moment Estimation) combines NAG (Nesterov accelerated gradient) and Adam. To do so, the momentum term needs to be updated. For more information, check out [the paper](http://cs229.stanford.edu/proj2015/054_report.pdf) or the [Nadam section](https://ruder.io/optimizing-gradient-descent/index.html#nadam) of ['An overview of gradient descent optimization algorithms'](https://ruder.io/optimizing-gradient-descent/index.html). 6 | 7 | The final update rule looks as follows: 8 | 9 |

10 | 11 | ## Code 12 | 13 | - [Nadam Numpy Implementation](code/nadam.py) 14 | 15 | ## Resources 16 | 17 | - [http://cs229.stanford.edu/proj2015/054_report.pdf](http://cs229.stanford.edu/proj2015/054_report.pdf) 18 | - [https://paperswithcode.com/method/nadam](https://paperswithcode.com/method/nadam) 19 | - [https://ruder.io/optimizing-gradient-descent/index.html#nadam](https://ruder.io/optimizing-gradient-descent/index.html#nadam) -------------------------------------------------------------------------------- /Optimizers/nadam/README.tex.md: -------------------------------------------------------------------------------- 1 | # Nadam (Nesterov-accelerated Adaptive Moment Estimation) 2 | 3 | ![Nadam Training Example](doc/nadam_example.png) 4 | 5 | Nadam (Nesterov-accelerated Adaptive Moment Estimation) combines NAG (Nesterov accelerated gradient) and Adam. To do so, the momentum term $m_t$ needs to be updated. For more information, check out [the paper](http://cs229.stanford.edu/proj2015/054_report.pdf) or the [Nadam section](https://ruder.io/optimizing-gradient-descent/index.html#nadam) of ['An overview of gradient descent optimization algorithms'](https://ruder.io/optimizing-gradient-descent/index.html). 6 | 7 | The final update rule looks as follows: 8 | 9 | $$\theta_{t+1} = \theta_{t} - \dfrac{\eta}{\sqrt{\hat{v}_t} + \epsilon} (\beta_1 \hat{m}_t + \dfrac{(1 - \beta_1) g_t}{1 - \beta^t_1})$$ 10 | 11 | ## Code 12 | 13 | - [Nadam Numpy Implementation](code/nadam.py) 14 | 15 | ## Resources 16 | 17 | - [http://cs229.stanford.edu/proj2015/054_report.pdf](http://cs229.stanford.edu/proj2015/054_report.pdf) 18 | - [https://paperswithcode.com/method/nadam](https://paperswithcode.com/method/nadam) 19 | - [https://ruder.io/optimizing-gradient-descent/index.html#nadam](https://ruder.io/optimizing-gradient-descent/index.html#nadam) -------------------------------------------------------------------------------- /Optimizers/nadam/code/nadam.py: -------------------------------------------------------------------------------- 1 | # based on https://ruder.io/optimizing-gradient-descent/#nadam 2 | 3 | import numpy as np 4 | 5 | 6 | class Nadam: 7 | """Nadam 8 | Parameters: 9 | ----------- 10 | learning_rate: float = 0.001 11 | The step length used when following the negative gradient. 12 | beta_1: float = 0.9 13 | The exponential decay rate for the 1st moment estimates. 14 | beta_2: float = 0.999 15 | The exponential decay rate for the 2nd moment estimates. 16 | epsilon: float = 1e-07 17 | A small floating point value to avoid zero denominator. 
18 | """ 19 | 20 | def __init__(self, learning_rate: float = 0.001, beta_1: float = 0.9, beta_2: float = 0.999, 21 | epsilon: float = 1e-7) -> None: 22 | self.learning_rate = learning_rate 23 | self.epsilon = epsilon 24 | self.beta_1 = beta_1 25 | self.beta_2 = beta_2 26 | 27 | self.t = 0 28 | self.m = None # Decaying averages of past gradients 29 | self.v = None # Decaying averages of past squared gradients 30 | 31 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray: 32 | self.t += 1 33 | if self.m is None: 34 | self.m = np.zeros(np.shape(grad_wrt_w)) 35 | self.v = np.zeros(np.shape(grad_wrt_w)) 36 | 37 | self.m = self.beta_1 * self.m + (1 - self.beta_1) * grad_wrt_w 38 | self.v = self.beta_2 * self.v + (1 - self.beta_2) * np.power(grad_wrt_w, 2) 39 | 40 | m_hat = self.m / (1 - self.beta_1**self.t) 41 | v_hat = self.v / (1 - self.beta_2**self.t) 42 | 43 | w_update = self.learning_rate / (np.sqrt(v_hat) + self.epsilon) * (self.beta_1 * m_hat + (1 - self.beta_1) 44 | * grad_wrt_w / (1 - self.beta_1**self.t)) 45 | 46 | return w - w_update 47 | -------------------------------------------------------------------------------- /Optimizers/nadam/doc/nadam_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/nadam/doc/nadam_example.png -------------------------------------------------------------------------------- /Optimizers/qhadam/README.md: -------------------------------------------------------------------------------- 1 | # QHAdam (Quasi-Hyperbolic Adam) 2 | 3 | ![QHAdam Example](doc/qhadam_example.png) 4 | 5 | **Quasi-Hyperbolic Momentum Algorithm (QHM)** is a simple alteration of [SGD with momentum](https://paperswithcode.com/method/sgd-with-momentum), averaging a plain SGD step with a momentum step. **QHAdam (Quasi-Hyperbolic Adam)** is a QH augmented version of [Adam](https://ml-explained.com/blog/adam-explained) that replaces both of Adam's moment estimators with quasi-hyperbolic terms. Namely, QHAdam decouples the momentum term from the current gradient when updating the weights, and decouples the mean squared gradients term from the current squared gradient when updating the weights. [1, 2, 3] 6 | 7 | Essentially, it's a weighted average of the momentum and plain SGD, weighting the current gradient with an immediate discount factor divided by a weighted average of the mean squared gradients and the current squared gradient, weighting the current squared gradient with an immediate discount factor . [2] 8 | 9 |

10 | 11 |

[1] Ma, J. and Yarats, D. Quasi-hyperbolic momentum and Adam for deep learning. arXiv preprint arXiv:1810.06801, 2018

12 | 13 |

[2] QHAdam Papers With Code

14 | 15 |

[3] John Chen. An updated overview of recent gradient descent algorithms

16 | 17 | ## Code 18 | 19 | - [QHAdam Numpy Implementation](code/qhadam.py) -------------------------------------------------------------------------------- /Optimizers/qhadam/README.tex.md: -------------------------------------------------------------------------------- 1 | # QHAdam (Quasi-Hyperbolic Adam) 2 | 3 | ![QHAdam Example](doc/qhadam_example.png) 4 | 5 | **Quasi-Hyperbolic Momentum Algorithm (QHM)** is a simple alteration of [SGD with momentum](https://paperswithcode.com/method/sgd-with-momentum), averaging a plain SGD step with a momentum step. **QHAdam (Quasi-Hyperbolic Adam)** is a QH augmented version of [Adam](https://ml-explained.com/blog/adam-explained) that replaces both of Adam's moment estimators with quasi-hyperbolic terms. Namely, QHAdam decouples the momentum term from the current gradient when updating the weights, and decouples the mean squared gradients term from the current squared gradient when updating the weights. [1, 2, 3] 6 | 7 | Essentially, it's a weighted average of the momentum and plain SGD, weighting the current gradient with an immediate discount factor $v_1$ divided by a weighted average of the mean squared gradients and the current squared gradient, weighting the current squared gradient with an immediate discount factor $v_2$. [2] 8 | 9 | $$ \theta_{t+1, i} = \theta_{t, i} - \eta\left[\frac{\left(1-v_{1}\right)\cdot{g_{t}} + v_{1}\cdot\hat{m}_{t}}{\sqrt{\left(1-v_{2}\right)g^{2}_{t} + v_{2}\cdot{\hat{v}_{t}}} + \epsilon}\right], \forall{t} $$ 10 | 11 |

[1] Ma, J. and Yarats, D. Quasi-hyperbolic momentum and Adam for deep learning. arXiv preprint arXiv:1810.06801, 2018

12 | 13 |

[2] QHAdam Papers With Code

14 | 15 |

[3] John Chen. An updated overview of recent gradient descent algorithms
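One way to sanity-check the implementation is to exploit the fact that QHAdam collapses to plain Adam when both discount factors are set to 1; the sketch below assumes `code/adam.py` and `code/qhadam.py` are importable and uses a made-up gradient:

```python
# Sketch: with v_1 = v_2 = 1.0, QHAdam's update reduces to Adam's update.
import numpy as np
from adam import Adam        # assumes code/adam.py is importable
from qhadam import QHAdam    # assumes code/qhadam.py is importable

w = np.array([1.0, -2.0, 0.5])
grad = np.array([0.3, -0.1, 0.2])

adam = Adam(learning_rate=0.001)
qhadam = QHAdam(learning_rate=0.001, v_1=1.0, v_2=1.0)

print(adam.update(w, grad))
print(qhadam.update(w, grad))  # identical up to floating point error
```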

16 | 17 | ## Code 18 | 19 | - [QHAdam Numpy Implementation](code/qhadam.py) -------------------------------------------------------------------------------- /Optimizers/qhadam/code/qhadam.py: -------------------------------------------------------------------------------- 1 | # based on https://arxiv.org/pdf/1810.06801.pdf 2 | 3 | import numpy as np 4 | 5 | 6 | class QHAdam: 7 | """QHAdam - Quasi-Hyperbolic Adam 8 | Parameters: 9 | ----------- 10 | learning_rate: float = 0.001 11 | The step length used when following the negative gradient. 12 | beta_1: float = 0.9 13 | The exponential decay rate for the 1st moment estimates. 14 | beta_2: float = 0.999 15 | The exponential decay rate for the 2nd moment estimates. 16 | epsilon: float = 1e-07 17 | A small floating point value to avoid zero denominator. 18 | v_1: float = 0.7 19 | Immediate discount factor 20 | v_2: float = 1.0 21 | Immediate discount factor 22 | """ 23 | def __init__(self, learning_rate: float = 0.001, beta_1: float = 0.9, beta_2: float = 0.999, epsilon: float = 1e-7, v_1: float = 0.7, v_2: float = 1.0) -> None: 24 | self.learning_rate = learning_rate 25 | self.epsilon = epsilon 26 | self.beta_1 = beta_1 27 | self.beta_2 = beta_2 28 | self.v_1 = v_1 29 | self.v_2 = v_2 30 | 31 | self.t = 0 32 | self.m = None # Decaying averages of past gradients 33 | self.v = None # Decaying averages of past squared gradients 34 | 35 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray: 36 | self.t += 1 37 | if self.m is None: 38 | self.m = np.zeros(np.shape(grad_wrt_w)) 39 | self.v = np.zeros(np.shape(grad_wrt_w)) 40 | 41 | self.m = self.beta_1 * self.m + (1 - self.beta_1) * grad_wrt_w 42 | self.v = self.beta_2 * self.v + (1 - self.beta_2) * np.power(grad_wrt_w, 2) 43 | 44 | m_hat = self.m / (1 - self.beta_1**self.t) 45 | v_hat = self.v / (1 - self.beta_2**self.t) 46 | 47 | w_update = self.learning_rate * ((1 - self.v_1) * grad_wrt_w + self.v_1 * m_hat) / (np.sqrt((1 - self.v_2) * np.power(grad_wrt_w, 2) + self.v_2 * v_hat) + self.epsilon) 48 | 49 | return w - w_update 50 | -------------------------------------------------------------------------------- /Optimizers/qhadam/doc/qhadam_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/qhadam/doc/qhadam_example.png -------------------------------------------------------------------------------- /Optimizers/qhadam/tex/41922e474070adc90e7c1379c28d22fe.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /Optimizers/qhm/README.md: -------------------------------------------------------------------------------- 1 | # QHM (Quasi-Hyperbolic Momentum) 2 | 3 | ![QHM Update rule](doc/qhm_update_rule.PNG) 4 | 5 | Quasi-Hyperbolic Momentum Algorithm (QHM) is a simple alteration of SGD with momentum, averaging a plain SGD step with a momentum step, thereby decoupling the momentum term from the current gradient when updating the weights. 6 | 7 |

8 | 9 |

10 | 11 | The authors recommend  and  as a good starting point. For more information about QHM, check out the resources below. 12 | 13 | ## Code 14 | 15 | - [QHM Numpy Implementation](code/qhm.py) 16 | 17 | ## Resources 18 | 19 | - [https://arxiv.org/pdf/1810.06801.pdf](https://arxiv.org/pdf/1810.06801.pdf) 20 | - [https://paperswithcode.com/method/qhadam](https://paperswithcode.com/method/qhadam) 21 | - [https://johnchenresearch.github.io/demon/](https://johnchenresearch.github.io/demon/) 22 | - [https://facebookresearch.github.io/qhoptim/](https://facebookresearch.github.io/qhoptim/) -------------------------------------------------------------------------------- /Optimizers/qhm/README.tex.md: -------------------------------------------------------------------------------- 1 | # QHM (Quasi-Hyperbolic Momentum) 2 | 3 | ![QHM Update rule](doc/qhm_update_rule.PNG) 4 | 5 | Quasi-Hyperbolic Momentum Algorithm (QHM) is a simple alteration of SGD with momentum, averaging a plain SGD step with a momentum step, thereby decoupling the momentum term $\beta$ from the current gradient $\nabla_t$ when updating the weights. 6 | 7 | $$g_{t + 1} \leftarrow \beta \cdot g_t + (1 - \beta) \cdot \nabla_t$$ 8 | 9 | $$\theta_{t + 1} \leftarrow \theta_t + \alpha \left[ (1 - \nu) \cdot \nabla_t + \nu \cdot g_{t + 1} \right]$$ 10 | 11 | The authors recommend $\nu=0.7$ and $\beta=0.999$ as a good starting point. For more information about QHM, check out the resources below. 12 | 13 | ## Code 14 | 15 | - [QHM Numpy Implementation](code/qhm.py) 16 | 17 | ## Resources 18 | 19 | - [https://arxiv.org/pdf/1810.06801.pdf](https://arxiv.org/pdf/1810.06801.pdf) 20 | - [https://paperswithcode.com/method/qhadam](https://paperswithcode.com/method/qhadam) 21 | - [https://johnchenresearch.github.io/demon/](https://johnchenresearch.github.io/demon/) 22 | - [https://facebookresearch.github.io/qhoptim/](https://facebookresearch.github.io/qhoptim/) -------------------------------------------------------------------------------- /Optimizers/qhm/code/qhm.py: -------------------------------------------------------------------------------- 1 | # based on https://arxiv.org/pdf/1810.06801.pdf 2 | 3 | import numpy as np 4 | 5 | 6 | class QHM: 7 | """QHM -Quasi-Hyperbolic Momentum 8 | Parameters: 9 | ----------- 10 | learning_rate: float = 0.001 11 | The step length used when following the negative gradient. 12 | beta: float = 0.999 13 | Momentum factor. 14 | v: float = 0.7 15 | Immediate discount factor. 
16 | """ 17 | def __init__(self, learning_rate: float = 0.001, beta: float = 0.999, v: float = 0.7) -> None: 18 | self.learning_rate = learning_rate 19 | self.beta = beta 20 | self.v = v 21 | 22 | self.g_t = np.array([]) 23 | 24 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray: 25 | if not self.g_t.any(): 26 | self.g_t = np.zeros(np.shape(w)) 27 | 28 | self.g_t = self.beta * self.g_t + (1 - self.beta) * grad_wrt_w 29 | 30 | return w - self.learning_rate * ((1 - self.v) * grad_wrt_w + self.v * self.g_t) 31 | -------------------------------------------------------------------------------- /Optimizers/qhm/doc/qhm_update_rule.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/qhm/doc/qhm_update_rule.PNG -------------------------------------------------------------------------------- /Optimizers/qhm/tex/8217ed3c32a785f0b5aad4055f432ad8.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Optimizers/qhm/tex/f9acdf2e58c905cd2502b16cd0f720c9.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /Optimizers/radam/README.md: -------------------------------------------------------------------------------- 1 | # RAdam - Rectified Adam 2 | 3 | ![RAdam Adam Comparison](doc/radam_adam_comparison.png) 4 | 5 | RAdam or "Rectified Adam" is a variant of the Adam optimizer that seeks to tackle Adam's bad convergence problem by introducing a term to rectify the variance of the adaptive learning rate. 6 | 7 | The authors argue that the root cause of Adam's bad convergence is that the adaptive learning rate has an undesirable large variance in the early stage of model training due to the limited amount of training samples being used. 8 | 9 | RAdam deals with the large variance of the adaptive learning rate by adding a rectifier term: 10 | 11 | ![RAdam Update Rule](doc/radam_update_rule.png) 12 | 13 | ## Code 14 | 15 | - [RAdam Numpy Implementation](code/radam.py) 16 | 17 | ## Resources 18 | 19 | - [https://arxiv.org/abs/1908.03265](https://arxiv.org/abs/1908.03265) 20 | - [https://paperswithcode.com/method/radam](https://paperswithcode.com/method/radam) 21 | - [https://github.com/LiyuanLucasLiu/RAdam](https://github.com/LiyuanLucasLiu/RAdam) 22 | - [https://lessw.medium.com/new-state-of-the-art-ai-optimizer-rectified-adam-radam-5d854730807b](https://lessw.medium.com/new-state-of-the-art-ai-optimizer-rectified-adam-radam-5d854730807b) -------------------------------------------------------------------------------- /Optimizers/radam/README.tex.md: -------------------------------------------------------------------------------- 1 | # RAdam - Rectified Adam 2 | 3 | ![RAdam Adam Comparison](doc/radam_adam_comparison.png) 4 | 5 | RAdam or "Rectified Adam" is a variant of the Adam optimizer that seeks to tackle Adam's bad convergence problem by introducing a term to rectify the variance of the adaptive learning rate. 
6 | 7 | The authors argue that the root cause of Adam's bad convergence is that the adaptive learning rate has an undesirable large variance in the early stage of model training due to the limited amount of training samples being used. 8 | 9 | RAdam deals with the large variance of the adaptive learning rate by adding a rectifier term: 10 | 11 | ![RAdam Update Rule](doc/radam_update_rule.png) 12 | 13 | ## Code 14 | 15 | - [RAdam Numpy Implementation](code/radam.py) 16 | 17 | ## Resources 18 | 19 | - [https://arxiv.org/abs/1908.03265](https://arxiv.org/abs/1908.03265) 20 | - [https://paperswithcode.com/method/radam](https://paperswithcode.com/method/radam) 21 | - [https://github.com/LiyuanLucasLiu/RAdam](https://github.com/LiyuanLucasLiu/RAdam) 22 | - [https://lessw.medium.com/new-state-of-the-art-ai-optimizer-rectified-adam-radam-5d854730807b](https://lessw.medium.com/new-state-of-the-art-ai-optimizer-rectified-adam-radam-5d854730807b) -------------------------------------------------------------------------------- /Optimizers/radam/code/radam.py: -------------------------------------------------------------------------------- 1 | # based on https://arxiv.org/pdf/1908.03265.pdf 2 | 3 | import numpy as np 4 | 5 | 6 | class RAdam: 7 | """RAdam 8 | Parameters: 9 | ----------- 10 | learning_rate: float = 0.001 11 | The step length used when following the negative gradient. 12 | beta_1: float = 0.9 13 | The exponential decay rate for the 1st moment estimates. 14 | beta_2: float = 0.999 15 | The exponential decay rate for the 2nd moment estimates. 16 | """ 17 | def __init__(self, learning_rate: float = 0.001, beta_1: float = 0.9, beta_2: float = 0.999) -> None: 18 | self.learning_rate = learning_rate 19 | self.beta_1 = beta_1 20 | self.beta_2 = beta_2 21 | 22 | self.p_max = 2 / (1 - self.beta_2) - 1 23 | 24 | self.t = 0 25 | self.m = None # Decaying averages of past gradients 26 | self.v = None # Decaying averages of past squared gradients 27 | 28 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray: 29 | self.t += 1 30 | if self.m is None: 31 | self.m = np.zeros(np.shape(grad_wrt_w)) 32 | self.v = np.zeros(np.shape(grad_wrt_w)) 33 | 34 | self.m = self.beta_1 * self.m + (1 - self.beta_1) * grad_wrt_w 35 | self.v = 1 / self.beta_2 * self.v + (1 - self.beta_2) * np.power(grad_wrt_w, 2) 36 | 37 | m_hat = self.m / (1 - self.beta_1**self.t) 38 | p_t = self.p_max - 2 * self.t * self.beta_2**self.t / (1 - self.beta_2**self.t) 39 | 40 | if p_t > 4: 41 | l_t = np.sqrt((1 - self.beta_2**self.t) / self.v) 42 | r_t = np.sqrt(((p_t - 4) * (p_t - 2) * self.p_max) / ((self.p_max - 4) * (self.p_max - 2) * p_t)) 43 | w_update = self.learning_rate * r_t * m_hat * l_t 44 | else: 45 | w_update = self.learning_rate * m_hat 46 | 47 | return w - w_update 48 | -------------------------------------------------------------------------------- /Optimizers/radam/doc/radam_adam_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/radam/doc/radam_adam_comparison.png -------------------------------------------------------------------------------- /Optimizers/radam/doc/radam_update_rule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/radam/doc/radam_update_rule.png 
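The warmup-like behaviour falls straight out of the rectification term; the short sketch below just evaluates the published formulas for the first few steps with the usual $\beta_2 = 0.999$, showing that the adaptive learning rate stays switched off while $\rho_t \leq 4$:

```python
# Sketch of RAdam's rectification term for the first few steps (beta_2 = 0.999 assumed).
import numpy as np

beta_2 = 0.999
rho_inf = 2 / (1 - beta_2) - 1   # asymptotic value of rho_t

for t in range(1, 8):
    rho_t = rho_inf - 2 * t * beta_2**t / (1 - beta_2**t)
    if rho_t > 4:
        r_t = np.sqrt(((rho_t - 4) * (rho_t - 2) * rho_inf)
                      / ((rho_inf - 4) * (rho_inf - 2) * rho_t))
        print(f"t={t}: rho_t={rho_t:.3f}, rectifier r_t={r_t:.3f}")
    else:
        print(f"t={t}: rho_t={rho_t:.3f} <= 4, fall back to the un-rectified (momentum-only) update")
```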
--------------------------------------------------------------------------------
/Optimizers/rmsprop/README.md:
--------------------------------------------------------------------------------
1 | # RMSprop
2 | 
3 | ![RMSprop Example](doc/rmsprop_example.PNG)
4 | 
5 | RMSprop is an unpublished, adaptive learning rate optimization algorithm first proposed by [Geoff Hinton](https://en.wikipedia.org/wiki/Geoffrey_Hinton) in lecture 6 of his online class "[Neural Networks for Machine Learning](http://www.cs.toronto.edu/~hinton/coursera/lecture6/lec6.pdf)". RMSprop and Adadelta were developed independently around the same time, and both try to resolve Adagrad's diminishing learning rate problem. [1]
6 | 
7 | $$E[g^2]_t = 0.9 E[g^2]_{t-1} + 0.1 g^2_t$$
8 | 
9 | $$\theta_{t+1} = \theta_{t} - \dfrac{\eta}{\sqrt{E[g^2]_t + \epsilon}} g_{t}$$
10 | The difference between Adadelta and RMSprop is that Adadelta removes the learning rate entirely and replaces it by the root mean squared error of parameter updates.
11 | 
12 | 

[1] Sebastian Ruder (2016). An overview of gradient descent optimization algorithms. arXiv preprint arXiv:1609.04747.

13 | 
14 | ## Code
15 | 
16 | * [RMSprop Numpy Implementation](code/rmsprop.py)
--------------------------------------------------------------------------------
/Optimizers/rmsprop/README.tex.md:
--------------------------------------------------------------------------------
1 | # RMSprop
2 | 
3 | ![RMSprop Example](doc/rmsprop_example.PNG)
4 | 
5 | RMSprop is an unpublished, adaptive learning rate optimization algorithm first proposed by [Geoff Hinton](https://en.wikipedia.org/wiki/Geoffrey_Hinton) in lecture 6 of his online class "[Neural Networks for Machine Learning](http://www.cs.toronto.edu/~hinton/coursera/lecture6/lec6.pdf)". RMSprop and Adadelta were developed independently around the same time, and both try to resolve Adagrad's diminishing learning rate problem. [1]
6 | 
7 | $$E[g^2]_t = 0.9 E[g^2]_{t-1} + 0.1 g^2_t$$
8 | 
9 | $$\theta_{t+1} = \theta_{t} - \dfrac{\eta}{\sqrt{E[g^2]_t + \epsilon}} g_{t}$$
10 | 
11 | The difference between Adadelta and RMSprop is that Adadelta removes the learning rate $\eta$ entirely and replaces it by the root mean squared error of parameter updates.
12 | 
13 | 

[1] Sebastian Ruder (2016). An overview of gradient descent optimization algorithms. arXiv preprint arXiv:1609.04747.

14 | 15 | ## Code 16 | 17 | * [RMSprop Numpy Implementation](code/rmsprop.py) -------------------------------------------------------------------------------- /Optimizers/rmsprop/code/rmsprop.py: -------------------------------------------------------------------------------- 1 | # based on https://ruder.io/optimizing-gradient-descent/#rmsprop 2 | # and https://github.com/eriklindernoren/ML-From-Scratch/blob/master/mlfromscratch/deep_learning/optimizers.py#L88 3 | 4 | import numpy as np 5 | 6 | 7 | class RMSprop: 8 | """RMSprop 9 | Parameters: 10 | ----------- 11 | learning_rate: float = 0.001 12 | The step length used when following the negative gradient. 13 | rho: float = 0.9 14 | Discounting factor for the history/coming gradient. 15 | epsilon: float = 1e-07 16 | A small floating point value to avoid zero denominator. 17 | """ 18 | def __init__(self, learning_rate: float = 0.001, rho: float = 0.9, epsilon: float = 1e-7) -> None: 19 | self.learning_rate = learning_rate 20 | self.rho = rho 21 | self.epsilon = epsilon 22 | self.E_grad = None # Running average of the square gradients at w 23 | 24 | def update(self, w: np.ndarray, grad_wrt_w: np.ndarray) -> np.ndarray: 25 | if self.E_grad is None: 26 | self.E_grad = np.zeros(np.shape(grad_wrt_w)) 27 | 28 | # Update average of gradients at w 29 | self.E_grad = self.rho * self.E_grad + (1 - self.rho) * np.power(grad_wrt_w, 2) 30 | 31 | return w - self.learning_rate * grad_wrt_w / np.sqrt(self.E_grad + self.epsilon) 32 | -------------------------------------------------------------------------------- /Optimizers/rmsprop/doc/rmsprop_example.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TannerGilbert/Machine-Learning-Explained/c671f4cdb68eb316f39a8f6142c8d3cffb838c11/Optimizers/rmsprop/doc/rmsprop_example.PNG -------------------------------------------------------------------------------- /Optimizers/rmsprop/tex/1d0496971a2775f4887d1df25cea4f7e.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | --------------------------------------------------------------------------------
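For completeness, here is a hypothetical usage sketch for the `RMSprop` class defined in `code/rmsprop.py` above. The `from rmsprop import RMSprop` import path, the toy least-squares problem, and the chosen learning rate are assumptions made for this example, not part of the repository:

```python
import numpy as np

from rmsprop import RMSprop  # assumes this script sits next to code/rmsprop.py

# Toy least-squares problem: recover true_w from y = X @ true_w
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
true_w = np.array([2.0, -1.0, 0.5])
y = X @ true_w

w = np.zeros(3)
optimizer = RMSprop(learning_rate=0.05)

for _ in range(500):
    grad = 2 * X.T @ (X @ w - y) / len(y)  # gradient of the mean squared error
    w = optimizer.update(w, grad)

print(w)  # ends up close to [2.0, -1.0, 0.5]
```

Because RMSprop divides each step by the running RMS of the gradients, the effective step size stays roughly at the learning rate even near the optimum, so the recovered weights hover around the true values within a noise floor set by `learning_rate`.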