├── .gitignore
├── README.md
├── cs224n
└── README.md
├── fastai
├── README.md
└── bird_or_plane.ipynb
├── kaggle
├── README.md
├── ensemble
│ ├── README.md
│ └── utils.py
├── metrics
│ ├── README.md
│ └── metrics.py
├── tuning
│ └── optuna_classification.py
├── validation
│ └── README.md
└── viz
│ └── eda.py
├── llms
├── README.md
├── fine_tuning
│ └── REAMDE.md
└── gpt_numpy
│ └── gpt.py
├── math_for_ml
├── README.md
└── linalg.ipynb
├── minitorch
└── README.md
├── ml_from_scratch
├── README.md
└── supervised
│ ├── lineargression.py
│ └── xgboost.py
├── nlp
├── BERT
│ └── README.md
├── classic
│ ├── LDA.py
│ ├── LSA.py
│ ├── PCA.py
│ ├── SVD.py
│ ├── TFIDF.py
│ ├── TSNE.py
│ └── UMAP.py
└── embeddings
│ ├── README.md
│ ├── berttokenize.ipynb
│ └── tfidf.ipynb
├── nn_zero_to_hero
├── README.md
└── micrograd
│ ├── derivatives.ipynb
│ ├── exercises.ipynb
│ ├── micrograd.ipynb
│ ├── micrograd.py
│ └── viz.py
├── roadmap.png
└── tensor_puzzles
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AI from scratch
2 |
3 | documenting what I learn
4 |
5 | from this [roadmap](https://medium.com/bitgrit-data-science-publication/a-roadmap-to-learn-ai-in-2024-cc30c6aa6e16) I made
6 |
7 | 
8 |
--------------------------------------------------------------------------------
/cs224n/README.md:
--------------------------------------------------------------------------------
1 | # CS224N: Natural Language Processing with Deep Learning
2 |
3 | [Course page](https://web.stanford.edu/class/archive/cs/cs224n/cs224n.1234/)
4 |
5 | Papers
6 |
7 | - [Efficient Estimation of Word Representations in Vector Space (original word2vec paper)](https://arxiv.org/pdf/1301.3781.pdf)
8 | - [Distributed Representations of Words and Phrases and their Compositionality (negative sampling paper)](https://proceedings.neurips.cc/paper_files/paper/2013/file/9aa42b31882ec039965f3c4923ce901b-Paper.pdf)
9 |
--------------------------------------------------------------------------------
/fastai/README.md:
--------------------------------------------------------------------------------
1 | # Fast AI
2 |
3 | - [Practical Deep Learning for Coders - Practical Deep Learning](https://course.fast.ai/)
4 | - [Lectures](https://www.youtube.com/playlist?list=PLfYUBJiXbdtSvpQjSnJJ_PmDQB_VyT5iU)
5 | - [Practical Deep Learning for Coders - Part 2 overview](https://course.fast.ai/Lessons/part2.html)
6 | - [Walk with fastai - Introduction](https://walkwithfastai.com/revisited/)
7 |
--------------------------------------------------------------------------------
/kaggle/README.md:
--------------------------------------------------------------------------------
1 | # Kaggle Notes
2 |
3 | Sources
4 |
5 | - [Kaggle Solutions](https://farid.one/kaggle-solutions/)
6 | - [The Kaggle Book](https://learning.oreilly.com/library/view/the-kaggle-book/9781801817479/)
7 | - [Winning Toolkit for Competitive ML](https://mlcontests.com/winning-toolkit/)
8 | - [[1811.12808] Model Evaluation, Model Selection, and Algorithm Selection in Machine Learning](https://arxiv.org/abs/1811.12808)
9 |
10 | ## Metrics
11 |
12 | - in real world, your model will be evaluated against multiple metrics, and some of the metrics won't even be related to how your predictions perform against the ground truth you are using for testing
13 | - ex: domain of knowledge you're working in, scope of project, number of features considered for model, overall memory usage, requirements for special hardware, latency of prediction process, complexity of prediction model, and many other aspects may count more than just predictive performance.
14 | - it is dominated by business and tech infrastructure concerns
15 |
16 | ### evaluation metrics and objective functions
17 |
18 | - objective functions: serves model during training, involved in process of error minimization (or score maximization)
19 | - evaluation metric: serves your model after it has been trained by providing a score
20 | - it cannot influence how model fits data, but it helps you select the best configurations within a model, and the best model among competing ones
21 | - analysis of evaluation metric should be your first act in competitions
22 |
23 | ### objective function, cost function and loss function
24 |
25 | - loss function: single data point (penalty = |pred - ground truth|)
26 | - cost function: on whole dataset (or a batch), sum or average over loss penalties. Can comprise further constraints, i.e. L1 or L2 penalties
27 | - objective function: related to scope of optimization during ML training, comprise cost function, but not limited to them. Can also take into account goals not related to target, ex: requiring sparse coefficients of estimated model or minimization of coefficients' values, i.e. L1 and L2 regularization.
28 |
29 | Loss & cost imply optimization based on minimization, objective function is neutral, it can be a maximization or minimization activity.
30 |
31 | Scoring function (higher score = better prediction, maximization process), error functions (smaller error = better prediction, minimization process)
32 |
33 | ### basic tasks
34 |
35 | - regression
36 | - a model that can predict a real number (often positive, sometimes negative)
37 | - evaluations: diff = dist(pred, true), square(diff) it to punish large errors / log(diff) to penalize predictions of the wrong scale
38 | - classification
39 | - binary: 0 or 1 / probabilities of class (in medical fields)
40 | - churn/not churn, cancer/not cancer (probability is important here)
41 | - !watch out for imbalance!, use eval metrics that take imbalance into account
42 | - multi-class: >2 classes
43 | - ex: leaf classification
44 | - ensure performance across class is comparable (model can underperform with respect to certain classes)
45 | - multi-label: predictions are not exclusive and you can predict multiple class ownership
46 | - ex: classify news articles with relevant topics
47 | - require further evaluations to control whether model is predicting correct classes, as well as the correct number and mix of classes
48 | - Ordinal
49 | - halfway between regression and classification
50 | - ex: magnitude of earthquake, customer preferences
51 | - as multiclass
52 | - get prediction as integer value, but not take into account the order of class
53 | - problem: probabilities distributed across entire range of possible values, depicting multi-modal and often asymmetric distribution (you should expect Gaussian around max probability class)
54 | - as regression
55 | - output as a float number, and results include full range of values between integers of ordinal distribution, and possibly outside of it
56 | - one solution is to crop the output values, cast into int by unit rounding, but may lead to inaccuracies, requiring more sophisticated post-processing
57 |
58 | ### common metrics
59 |
60 | Top Kaggle metrics
61 |
62 | - **AUC**: measures if your model's predicted probabilities tend to predict positive cases with high probabilities
63 | - **log loss**: how far your predicted probabilities are from the ground truth (as you optimize for log loss, you optimize for AUC metric)
64 | - **MAP@{K}** (mean average precision at K): common in recsys and search engines, used for information retrieval evaluations
65 | - ex: whale identification and having 5 possible guesses
66 | - ex2: quickdraw doodle recognition (guess the content of a drawn sketch in 3 attempts, score not just if you can guess correctly, but if your correct guess is among a certain number, the "K" in the name of the function, of other incorrect predictions)
67 | - RMSLE (root mean squared logarithmic error):
68 | - quadratic weighted kappa: for ordinal scale problems (problems that involve guessing a progressive integer number)
69 |
70 | Metrics for regression
71 |
72 | - MSE
73 | - mean of sum of squared errors (SSE)
74 | - cautions
75 | - sensitive to outliers
76 | - imbalanced errors
77 | - not robust to non-gaussian errors
78 | - lack of sensitivity to small errors
79 | - R squared (coefficient of determination)
80 |
--------------------------------------------------------------------------------
/kaggle/ensemble/README.md:
--------------------------------------------------------------------------------
1 | # Ensemble Learning
2 |
3 | ## What
4 |
5 | A technique that blends predictions from a diverse set of models.
6 |
7 | See [Wisdom of Crowds](https://arxiv.org/abs/1605.04074)
8 |
9 | ## Why they work
10 |
11 | - performance: ensemble reduce variance component of prediction error by adding bias
12 | - robustness: ensemble reduces reliance on any single model's prediction, making it better at handling noisy data.
13 |
14 | ## Diversity
15 |
16 | ensemble learning is based on concept of combining multiple weak learners, weak because individual models don't need to be very accurate, as long as they're better than a random model, combining them is beneficial.
17 |
18 | Diversity is a concept referring to the idea that individual models have to be **as different from each other as possible**. This is because different models are likely to make different types of errors. By combining predictions of a diverse set, we can reduce overall error of the ensemble.
19 |
20 | In order for accuracy of ensemble to be better than individual models, there needs to be diversity.
21 |
22 | ### how
23 |
24 | - train each model on different subset of data
25 | - bagging (w replacement)
26 | - pasting (w/out replacement)
27 | - ex:
28 | - random forest: achieves diversity using a random subset of features at each split
29 | - extremely randomized trees: a random split to lower correlation between trees
30 | - train each model with a different set of features
31 | - train each model using a different type of algorithm
32 | - voting and stacking meta-models
33 |
34 | ### good and bad
35 |
36 | good diversity: ensemble is already correct, low disagreement between classifier, several votes wasted
37 |
38 | bad diversity: ensemble is incorrect, any disagreement represent a wasted vote, as individual classifier did not contribute to correct decision.
39 |
40 | increase good diversity (where disagreements among classifiers contribute to correct decisions) and reduce bad diversity (where disagreements do not contribute to correct decisions).
41 |
42 | ### metrics
43 |
44 | - let f_1, f_2, f_3 be predictions of diff models in ensemble
45 |
46 | two types of measures
47 |
48 | - pairwise: computed for every f_i, f_j pair, represented by NxN matrix
49 | - global: computed on whole matrix of predictions, represented by a single value
50 |
51 | Measures
52 |
53 | - pearson correlation coefficient
54 | - disagreement
55 | - Yule's Q
56 | - entropy
57 |
58 | References
59 |
60 | - [Measures of Diversity in Classifier Ensembles and Their Relationship with the Ensemble Accuracy | Machine Learning](https://link.springer.com/article/10.1023/A:1022859003006)
61 | - [Understanding the Importance of Diversity in Ensemble Learning](https://towardsdatascience.com/understanding-the-importance-of-diversity-in-ensemble-learning-34fb58fd2ed0#:~:text=Ensemble%20learning%20is%20a%20powerful,of%20the%20ensemble%20also%20increased.)
62 |
63 | ## Methods
64 |
65 | 1. blending : averaging, weighted averaging, and rank averaging
66 | - average the outputs
67 | - weights given to model can be assigned explicitly or implicitly by [rank averaging](https://towardsdatascience.com/ensemble-averaging-improve-machine-learning-performance-by-voting-246106c753ee), which ranks models by performance and gives more accurate models greater weights
68 | - involves using optuna or hyperopt to find optimal blend by taking into account cross validation metrics
69 | 2. Voting : for classification
70 | - ex: majority voting: class that most models predict is chosen
71 | 3. classical trio: bagging, boosting and stacking
72 | - bagging: train multiple models on different subsets of training data and average prediction
73 | - boosting: sequentially training models, each new model focuses on errors made by predecessors.
74 | - Stacking: feed predictions of various models as input to higher-level model.
75 |
76 | ## Reality
77 |
78 | Building robust and highly accurate models is only half the solution. An equally challenging part is explainability and fairness.
79 |
80 | see: [On Transparency of Machine Learning Models: A Position Paper](https://crcs.seas.harvard.edu/sites/projects.iq.harvard.edu/files/crcs/files/ai4sg_2020_paper_62.pdf) and [[2105.06791] Agree to Disagree: When Deep Learning Models With Identical Architectures Produce Distinct Explanations](https://arxiv.org/abs/2105.06791)
81 |
82 | ## References
83 |
84 | - [Unreasonably Effective Ensemble Learning](https://www.kaggle.com/code/yeemeitsang/unreasonably-effective-ensemble-learning/notebook#Conclusion)
85 |
86 | ## implementations
87 |
88 | - [EnsemblesTutorial/ensemble_functions.py at main · PadraigC/EnsemblesTutorial](https://github.com/PadraigC/EnsemblesTutorial/blob/main/ensemble_functions.py)
89 |
--------------------------------------------------------------------------------
/kaggle/ensemble/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
def coefficients(preds):
    """Count the four joint outcomes of the first two columns of ``preds``.

    Columns 0 and 1 are cast to boolean. The counts follow the usual 2x2
    contingency layout, where a truthy entry is labelled "right"
    (presumably ``preds`` marks whether each classifier was correct on each
    sample -- confirm against callers).

    Args:
        preds: 2-D NumPy array with at least two columns.

    Returns:
        Tuple ``(a, b, c, d)``:
        ``a`` both truthy, ``b`` only column 1 truthy,
        ``c`` only column 0 truthy, ``d`` neither truthy.
    """
    first = np.asarray(preds[:, 0], dtype=bool)
    second = np.asarray(preds[:, 1], dtype=bool)
    first_wrong = np.logical_not(first)
    second_wrong = np.logical_not(second)

    both_right = np.sum(np.logical_and(first, second))
    only_second_right = np.sum(np.logical_and(first_wrong, second))
    only_first_right = np.sum(np.logical_and(first, second_wrong))
    both_wrong = np.sum(np.logical_and(first_wrong, second_wrong))

    return both_right, only_second_right, only_first_right, both_wrong
14 |
15 |
def disagreement(preds, i, j):
    """Return the disagreement measure between columns ``i`` and ``j``.

    The measure is the fraction of rows in which exactly one of the two
    columns is truthy: ``(b + c) / (a + b + c + d)`` using the counts
    returned by ``coefficients``.

    Args:
        preds: 2-D NumPy array, one column per classifier.
        i: Index of the first column.
        j: Index of the second column.

    Returns:
        The disagreement ratio as a float. With zero rows the denominator
        is zero and the result is not meaningful.
    """
    a, b, c, d = coefficients(preds[:, [i, j]])
    return float(b + c) / (a + b + c + d)
20 |
21 |
def paired_q(preds, i, j):
    """Return Yule's Q statistic for columns ``i`` and ``j`` of ``preds``.

    Computed as ``(a*d - b*c) / (a*d + b*c)`` from the counts returned by
    ``coefficients``.

    Args:
        preds: 2-D NumPy array, one column per classifier.
        i: Index of the first column.
        j: Index of the second column.

    Returns:
        The Q statistic as a float.
    """
    a, b, c, d = coefficients(preds[:, [i, j]])
    # Tiny constant keeps the denominator non-zero when a*d + b*c == 0.
    eps = 10e-24
    return float(a * d - b * c) / ((a * d + b * c) + eps)
27 |
28 |
def entropy(preds):
    """Return an entropy-style diversity score over all columns of ``preds``.

    For each row the number of truthy votes ``l`` is summed across the ``L``
    columns, ``min(l, L - l)`` is taken, scaled by ``1 / (L - ceil(L / 2))``
    and averaged over rows.

    Args:
        preds: 2-D NumPy array, one column per classifier (entries are
            summed directly, so they are presumably 0/1 -- confirm).

    Returns:
        The mean scaled value as a NumPy float.
    """
    n_columns = preds.shape[1]
    votes = np.sum(preds, axis=1)
    minority_votes = np.minimum(votes, n_columns - votes)
    scale = 1.0 / (n_columns - np.ceil(0.5 * n_columns))
    return np.mean(scale * minority_votes)
35 |
--------------------------------------------------------------------------------
/kaggle/metrics/README.md:
--------------------------------------------------------------------------------
1 | # metrics
2 |
3 | [ajitsingh98/Evaluation-Metrics-In-Machine-Learning-Problems-Python: evaluation metrics implementation in Python from scratch](https://github.com/ajitsingh98/Evaluation-Metrics-In-Machine-Learning-Problems-Python)
4 |
--------------------------------------------------------------------------------
/kaggle/metrics/metrics.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from numba import jit
3 |
4 |
def mse(y_pred, y_true) -> float:
    """Return the mean squared error between predictions and targets.

    Both inputs are converted to float arrays first, so plain lists are
    accepted and unsigned-integer arrays cannot wrap around when subtracted.

    Args:
        y_pred: Array-like of predicted values.
        y_true: Array-like of ground-truth values, broadcastable to ``y_pred``.

    Returns:
        Mean of the squared element-wise differences.
    """
    predictions = np.asarray(y_pred, dtype=float)
    targets = np.asarray(y_true, dtype=float)
    return np.mean((predictions - targets) ** 2)
9 |
10 |
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    The previous body called ``np.sqrt`` on the ``mse`` function object and
    never used its arguments; the error is now computed from the inputs.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predicted values, broadcastable to ``y_true``.

    Returns:
        Square root of the mean squared element-wise difference.
    """
    targets = np.asarray(y_true, dtype=float)
    predictions = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((predictions - targets) ** 2))
13 |
14 |
def r2(y_true, y_pred):
    """Return the coefficient of determination (R squared).

    Computed as ``1 - SS_res / SS_tot`` where ``SS_res`` is the sum of
    squared residuals and ``SS_tot`` the sum of squared deviations of
    ``y_true`` from its mean.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predicted values, same shape as ``y_true``.

    Returns:
        The R squared score. When ``y_true`` is constant (``SS_tot == 0``)
        the ratio is undefined; 1.0 is returned for perfect predictions and
        0.0 otherwise, mirroring scikit-learn's default finite fallback.
    """
    targets = np.asarray(y_true, dtype=float)
    predictions = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((targets - predictions) ** 2)
    ss_tot = np.sum((targets - np.mean(targets)) ** 2)
    if ss_tot == 0:
        return 1.0 if ss_res == 0 else 0.0
    return 1.0 - ss_res / ss_tot
17 |
18 |
@jit
def fast_auc(y_true, y_prob):
    """
    fast roc_auc computation: https://www.kaggle.com/c/microsoft-malware-prediction/discussion/76013

    Labels are reordered by ascending ``y_prob``. Walking that order, each
    label adds ``label * negatives_seen_so_far`` to a running total, which is
    finally divided by ``n_negative * (n - n_negative)``.

    Assumes ``y_true`` holds 0/1 labels (TODO confirm). Tied scores are
    processed in whatever order ``np.argsort`` yields them, and the final
    division has a zero denominator when only one class is present.
    """
    labels = np.asarray(y_true)
    labels = labels[np.argsort(y_prob)]
    total = len(labels)
    negatives_seen = 0
    pair_sum = 0
    for position in range(total):
        label = labels[position]
        negatives_seen += 1 - label
        pair_sum += label * negatives_seen
    pair_sum /= negatives_seen * (total - negatives_seen)
    return pair_sum
35 |
--------------------------------------------------------------------------------
/kaggle/tuning/optuna_classification.py:
--------------------------------------------------------------------------------
1 | # paper: https://arxiv.org/pdf/1907.10902.pdf
2 | # examples: https://github.com/optuna/optuna-examples
3 |
4 | """
5 | Optuna example that optimizes a classifier configuration for cancer dataset
6 | using XGBoost.
7 |
8 | In this example, we optimize the validation accuracy of cancer detection
9 | using XGBoost. We optimize both the choice of booster model and its
10 | hyperparameters.
11 |
12 | """
13 |
14 | import numpy as np
15 | import optuna
16 | import sklearn.datasets
17 | import sklearn.metrics
18 | import xgboost as xgb
19 | from sklearn.model_selection import train_test_split
20 |
21 |
def objective(trial):
    """Train one XGBoost model with trial-suggested settings; return accuracy.

    The breast-cancer dataset is reloaded and re-split (75/25, unseeded) on
    every call, a booster is trained with ``xgb.train`` and the rounded
    predictions on the held-out part are scored with ``accuracy_score``.

    Args:
        trial: Optuna trial used to sample hyperparameters.

    Returns:
        Validation accuracy of the trained booster.
    """
    features, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)
    x_fit, x_val, y_fit, y_val = train_test_split(features, labels, test_size=0.25)
    fit_matrix = xgb.DMatrix(x_fit, label=y_fit)
    val_matrix = xgb.DMatrix(x_val, label=y_val)

    # Suggestions are requested in the same order as before so a seeded
    # sampler produces the same sequence.
    booster = trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"])
    params = {
        "verbosity": 0,
        "objective": "binary:logistic",
        # use exact for small dataset.
        "tree_method": "exact",
        # defines booster, gblinear for linear functions.
        "booster": booster,
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    if booster in ("gbtree", "dart"):
        params.update(
            {
                # maximum depth of the tree, signifies complexity of the tree.
                "max_depth": trial.suggest_int("max_depth", 3, 9, step=2),
                # minimum child weight, larger the term more conservative the tree.
                "min_child_weight": trial.suggest_int("min_child_weight", 2, 10),
                "eta": trial.suggest_float("eta", 1e-8, 1.0, log=True),
                # defines how selective algorithm is.
                "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
                "grow_policy": trial.suggest_categorical(
                    "grow_policy", ["depthwise", "lossguide"]
                ),
            }
        )

    if booster == "dart":
        params.update(
            {
                "sample_type": trial.suggest_categorical(
                    "sample_type", ["uniform", "weighted"]
                ),
                "normalize_type": trial.suggest_categorical(
                    "normalize_type", ["tree", "forest"]
                ),
                "rate_drop": trial.suggest_float("rate_drop", 1e-8, 1.0, log=True),
                "skip_drop": trial.suggest_float("skip_drop", 1e-8, 1.0, log=True),
            }
        )

    model = xgb.train(params, fit_matrix)
    probabilities = model.predict(val_matrix)
    # Round the predicted probabilities to hard 0/1 labels.
    return sklearn.metrics.accuracy_score(y_val, np.rint(probabilities))
72 |
73 |
if __name__ == "__main__":
    # Accuracy is returned by `objective`, so the study maximizes it.
    # The search stops after 100 trials or 600 seconds.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100, timeout=600)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    best = study.best_trial

    print(" Value: {}".format(best.value))
    print(" Params: ")
    for name, setting in best.params.items():
        print(" {}: {}".format(name, setting))
86 |
87 | """
88 | Optuna example that optimizes a classifier configuration for cancer dataset using LightGBM.
89 |
90 | In this example, we optimize the validation accuracy of cancer detection using LightGBM.
91 | We optimize both the choice of booster model and their hyperparameters.
92 |
93 | """
94 |
95 | import lightgbm as lgb
96 | import numpy as np
97 | import optuna
98 | import sklearn.datasets
99 | import sklearn.metrics
100 | from sklearn.model_selection import train_test_split
101 |
102 |
103 | # FYI: Objective functions can take additional arguments
104 | # (https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args).
def objective(trial):
    """Train one LightGBM model with trial-suggested settings; return accuracy.

    The breast-cancer dataset is reloaded and re-split (75/25, unseeded) on
    every call, a booster is trained with ``lgb.train`` and the rounded
    predictions on the held-out part are scored with ``accuracy_score``.

    Args:
        trial: Optuna trial used to sample hyperparameters.

    Returns:
        Validation accuracy of the trained booster.
    """
    features, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)
    x_fit, x_val, y_fit, y_val = train_test_split(features, labels, test_size=0.25)
    fit_set = lgb.Dataset(x_fit, label=y_fit)

    fixed = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
    }
    # Suggestions are requested in the same order as before so a seeded
    # sampler produces the same sequence.
    searched = {
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    model = lgb.train({**fixed, **searched}, fit_set)
    probabilities = model.predict(x_val)
    # Round the predicted probabilities to hard 0/1 labels.
    return sklearn.metrics.accuracy_score(y_val, np.rint(probabilities))
129 |
130 |
if __name__ == "__main__":
    # Accuracy is returned by `objective`, so the study maximizes it.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    best = study.best_trial

    print(" Value: {}".format(best.value))

    print(" Params: ")
    for name, setting in best.params.items():
        print(" {}: {}".format(name, setting))
145 |
146 | """
147 | Optuna example that optimizes a classifier configuration for cancer dataset using
148 | Catboost.
149 |
150 | In this example, we optimize the validation accuracy of cancer detection using
151 | Catboost. We optimize both the choice of booster model and their hyperparameters.
152 |
153 | """
154 |
155 | import catboost as cb
156 | import numpy as np
157 | import optuna
158 | from sklearn.datasets import load_breast_cancer
159 | from sklearn.metrics import accuracy_score
160 | from sklearn.model_selection import train_test_split
161 |
162 |
def objective(trial):
    """Train one CatBoost classifier with trial-suggested settings; return accuracy.

    The breast-cancer dataset is reloaded and re-split (70/30, unseeded) on
    every call. The classifier is fitted with the held-out part as
    ``eval_set`` and ``early_stopping_rounds=100``, then scored on that same
    held-out part with ``accuracy_score``.

    Args:
        trial: Optuna trial used to sample hyperparameters.

    Returns:
        Validation accuracy of the fitted classifier.
    """
    features, labels = load_breast_cancer(return_X_y=True)
    x_fit, x_val, y_fit, y_val = train_test_split(features, labels, test_size=0.3)

    # Suggestions are requested in the same order as before so a seeded
    # sampler produces the same sequence.
    loss_name = trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"])
    level_fraction = trial.suggest_float("colsample_bylevel", 0.01, 0.1)
    tree_depth = trial.suggest_int("depth", 1, 12)
    boosting = trial.suggest_categorical("boosting_type", ["Ordered", "Plain"])
    bootstrap = trial.suggest_categorical(
        "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
    )
    settings = {
        "objective": loss_name,
        "colsample_bylevel": level_fraction,
        "depth": tree_depth,
        "boosting_type": boosting,
        "bootstrap_type": bootstrap,
        "used_ram_limit": "3gb",
    }

    # Only some bootstrap types get an extra tunable parameter.
    if bootstrap == "Bayesian":
        settings["bagging_temperature"] = trial.suggest_float(
            "bagging_temperature", 0, 10
        )
    elif bootstrap == "Bernoulli":
        settings["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    classifier = cb.CatBoostClassifier(**settings)
    classifier.fit(
        x_fit,
        y_fit,
        eval_set=[(x_val, y_val)],
        verbose=0,
        early_stopping_rounds=100,
    )

    predicted = classifier.predict(x_val)
    return accuracy_score(y_val, np.rint(predicted))
201 |
202 |
if __name__ == "__main__":
    # Accuracy is returned by `objective`, so the study maximizes it.
    # The search stops after 100 trials or 600 seconds.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100, timeout=600)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    best = study.best_trial

    print(" Value: {}".format(best.value))

    print(" Params: ")
    for name, setting in best.params.items():
        print(" {}: {}".format(name, setting))
217 |
--------------------------------------------------------------------------------
/kaggle/validation/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benthecoder/AI/b77a9c354386bdb681a80f5dd913b4b7ee4f640c/kaggle/validation/README.md
--------------------------------------------------------------------------------
/kaggle/viz/eda.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import seaborn as sns
4 | import matplotlib.pyplot as plt
# `%matplotlib inline` is IPython magic syntax and a SyntaxError in a plain
# .py module. Request the inline backend through the IPython API when an
# interactive shell is present, and do nothing otherwise.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    # Not running under IPython/Jupyter; keep matplotlib's default backend.
    pass

# Shared seaborn theme for every plot produced by this module.
sns.set_theme(style="whitegrid", palette="muted", context="talk", font_scale=1.2)

# Default figure size and font sizes.
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'axes.titlesize': 18,
    'axes.labelsize': 16,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'legend.fontsize': 12
})

import warnings
# NOTE: silences every warning process-wide, including deprecations.
warnings.filterwarnings("ignore")
20 |
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected. Integer columns are cast to the first of int8, int16,
    int32, int64 whose range contains the column's min and max; float columns
    are cast to float16 or float32 on the same basis, otherwise to float64.
    Range checks are inclusive, so a value sitting exactly on a dtype limit
    (e.g. 127 for int8) still qualifies for that dtype.

    Note that the float16/float32 casts only check the value range; they can
    reduce precision.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the final memory usage and the reduction.

    Returns:
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        is_int = str(col_type)[:3] == 'int'
        if is_int:
            candidates, limits_of = int_candidates, np.iinfo
        else:
            candidates, limits_of = float_candidates, np.finfo

        for candidate in candidates:
            limits = limits_of(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break
        else:
            # No narrower dtype fits (or min/max are NaN): floats fall back
            # to float64, integer columns are left untouched.
            if not is_int:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df
48 |
49 | ## EDA
50 |
def plot_categorical(data, column_name):
    """Show a pie chart and a count plot of one categorical column.

    The pie chart's ``explode`` list is sized to the number of distinct
    values, so the function no longer fails for columns that do not have
    exactly two categories. With two categories the result is unchanged
    (``[0, 0.1]``).

    Args:
        data: DataFrame holding the column.
        column_name: Name of the categorical column to plot.
    """
    counts = data[column_name].value_counts()
    # Keep the first slice in place and offset every other slice slightly.
    explode = [0 if position == 0 else 0.1 for position in range(len(counts))]

    f, ax = plt.subplots(1, 2, figsize=(18, 8))
    counts.plot.pie(explode=explode, autopct='%1.1f%%', ax=ax[0], shadow=True)
    ax[0].set_title(column_name)
    ax[0].set_ylabel('')
    sns.countplot(x=column_name, data=data, ax=ax[1])
    ax[1].set_title(column_name)
    plt.show()
59 |
def plot_correlation_heatmap(df):
    """Draw an annotated heatmap of the lower triangle of ``df``'s correlations.

    Only numeric columns are correlated (``numeric_only=True``), so frames
    that also contain text columns no longer raise on recent pandas. The mask
    is a boolean upper triangle (diagonal included) built from the matrix
    shape rather than from the correlation values, so a correlation of
    exactly zero is masked like any other upper-triangle cell.

    Args:
        df: DataFrame whose numeric columns are correlated.
    """
    corr = df.corr(numeric_only=True)
    mask = np.triu(np.ones_like(corr, dtype=bool))
    plt.figure(figsize=(15, 11))
    sns.heatmap(corr, mask=mask, annot=True, fmt=".3f")
    plt.show()
66 |
67 |
def plot_pairplot(data, numerical_cols, target_col):
    """Show a corner pair plot of the numerical columns, colored by target.

    Args:
        data: DataFrame holding the columns.
        numerical_cols: Iterable of numerical column names. Any iterable is
            accepted (list, tuple, pandas Index); it is copied into a new
            list instead of being concatenated with ``+``, which only worked
            reliably for lists.
        target_col: Column used for the hue.
    """
    selected = [*numerical_cols, target_col]
    pairplot = sns.pairplot(data=data[selected],
                            hue=target_col,
                            corner=True,
                            plot_kws={'alpha': 0.7, 's': 50, 'edgecolor': 'k'},
                            palette='Set1',
                            diag_kws={'edgecolor': 'k'})
    pairplot.fig.suptitle("Pairplot of Numerical Variables", y=1.02)
    plt.show()
77 |
def plot_boxplot(data, numerical_col, target_col):
    """Show a box plot of ``numerical_col`` grouped by ``target_col``.

    Args:
        data: DataFrame holding both columns.
        numerical_col: Column plotted on the y axis.
        target_col: Column whose values form the groups on the x axis.
    """
    axes = sns.boxplot(data=data, x=target_col, y=numerical_col)
    axes.set_title(f'Box Plot of {numerical_col} by {target_col}')
    plt.show()
82 |
83 |
def plot_violinplot(data, numerical_col, target_col):
    """Show a violin plot of ``numerical_col`` grouped by ``target_col``.

    Args:
        data: DataFrame holding both columns.
        numerical_col: Column plotted on the y axis.
        target_col: Column whose values form the groups on the x axis.
    """
    axes = sns.violinplot(data=data, x=target_col, y=numerical_col)
    axes.set_title(f'Violin Plot of {numerical_col} by {target_col}')
    plt.show()
88 |
def plot_histograms(data, continuous_vars, target_col):
    """Show one histogram with a KDE overlay per continuous column.

    float16 columns are plotted as float32 (presumably because the
    histogram/KDE step does not cope well with half precision -- confirm).
    The conversion is done on a shallow copy, so the caller's DataFrame is no
    longer modified as a side effect of plotting.

    Args:
        data: DataFrame holding the columns.
        continuous_vars: Iterable of column names to plot, one figure each.
        target_col: Column used for the hue.
    """
    for column in continuous_vars:
        plot_data = data
        if data[column].dtype == 'float16':
            # Replace the column on a shallow copy; `data` stays untouched.
            plot_data = data.copy(deep=False)
            plot_data[column] = data[column].astype('float32')

        fig, ax = plt.subplots(figsize=(18, 4))
        # Draw on the axes just created instead of relying on the current one.
        sns.histplot(data=plot_data, x=column, hue=target_col, bins=50, kde=True, ax=ax)
        plt.show()
97 |
def plot_countplot(data, column_name):
    """Show a count plot of the values in ``column_name``.

    Args:
        data: DataFrame holding the column.
        column_name: Column whose value counts are drawn.
    """
    axes = sns.countplot(data=data, x=column_name)
    axes.set_title(f'Count Plot of {column_name}')
    plt.show()
--------------------------------------------------------------------------------
/llms/README.md:
--------------------------------------------------------------------------------
1 | # LLMs
2 |
3 | Landscape
4 |
5 | - [Language Models Formulas](https://www.youtube.com/watch?v=KCXDr-UOb9A)
6 |
7 | From Scratch
8 |
9 | - [GPT Speed Optimization](https://www.dipkumar.dev/becoming-the-unbeatable/posts/gpt-kvcache/)
10 | - [GPT in 60 Lines of NumPy | Jay Mody](https://jaykmody.com/blog/gpt-from-scratch/)
11 |
12 | Mistral
13 |
14 | - [makeMoE: Implement a Sparse Mixture of Experts Language Model from Scratch](https://huggingface.co/blog/AviSoori1x/makemoe-from-scratch)
15 |
--------------------------------------------------------------------------------
/llms/fine_tuning/REAMDE.md:
--------------------------------------------------------------------------------
1 | # Fine Tuning
2 |
3 | - [Fine-Tune LLMs](https://www.philschmid.de/fine-tune-llms-in-2024-with-trl)
4 | - [Fine-Tuning — The GenAI Guidebook](https://ravinkumar.com/GenAiGuidebook/language_models/finetuning.html)
5 |
--------------------------------------------------------------------------------
/llms/gpt_numpy/gpt.py:
--------------------------------------------------------------------------------
1 | # https://jaykmody.com/blog/gpt-from-scratch/
2 |
--------------------------------------------------------------------------------
/math_for_ml/README.md:
--------------------------------------------------------------------------------
1 | # Math for ML
2 |
3 | - [Math for Machine Learning](https://www.youtube.com/playlist?list=PLD80i8An1OEGZ2tYimemzwC3xqkU0jKUg)
4 | - [edu/math-for-ml at main · wandb/edu](https://github.com/wandb/edu/tree/main/math-for-ml)
5 |
6 | - [The Matrix Calculus You Need For Deep Learning](https://arxiv.org/pdf/1802.01528.pdf)
7 | - [Essence of calculus](https://www.youtube.com/playlist?list=PLZHQObOWTQDMsr9K-rj53DwVRMYO3t5Yr)
8 |
9 | - [Computational Linear Algebra](https://www.fast.ai/posts/2017-07-17-num-lin-alg.html) ([video](https://www.youtube.com/playlist?list=PLtmWHNX-gukIc92m1K0P6bIOnZb-mg0hY), [code](https://github.com/fastai/numerical-linear-algebra))
10 | - [Introduction to Linear Algebra for Applied Machine Learning with Python](https://pabloinsente.github.io/intro-linear-algebra)
11 |
12 | Videos
13 |
14 | - [Essence of linear algebra](https://www.youtube.com/playlist?list=PLZHQObOWTQDPD3MizzM2xVFitgF8hE_ab)
15 |
--------------------------------------------------------------------------------
/math_for_ml/linalg.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": []
9 | }
10 | ],
11 | "metadata": {
12 | "kernelspec": {
13 | "display_name": "base",
14 | "language": "python",
15 | "name": "python3"
16 | },
17 | "language_info": {
18 | "name": "python",
19 | "version": "3.9.10"
20 | }
21 | },
22 | "nbformat": 4,
23 | "nbformat_minor": 2
24 | }
25 |
--------------------------------------------------------------------------------
/minitorch/README.md:
--------------------------------------------------------------------------------
1 | # MiniTorch
2 |
3 | Resources
4 |
5 | - [MiniTorch](https://minitorch.github.io/)
6 | - [minitorch](https://github.com/minitorch/)
7 | - [MiniTorch: A DIY Course on Machine Learning Engineering](https://www.youtube.com/playlist?list=PLO45-80-XKkQyROXXpn4PfjF1J2tH46w8)
8 |
--------------------------------------------------------------------------------
/ml_from_scratch/README.md:
--------------------------------------------------------------------------------
1 | # ML and DL from scratch
2 |
3 | - [Implement - YouTube](https://www.youtube.com/playlist?list=PLG8XxYPkVOUvVzz1ZKcGAJpIBK7GRrFYR)
4 | - [eriklindernoren/ML-From-Scratch](https://github.com/eriklindernoren/ML-From-Scratch)
5 | - [JeremyNixon/oracle](https://github.com/JeremyNixon/oracle)
6 | - [trekhleb/homemade-machine-learning](https://github.com/trekhleb/homemade-machine-learning)
7 | - [ethen8181/machine-learning](https://github.com/ethen8181/machine-learning)
8 |
9 | Specific ones
10 |
11 | - [Ekeany/XGBoost-From-Scratch](https://github.com/Ekeany/XGBoost-From-Scratch)
12 | - [HowUMAPWorks/HowUMAPWorks.ipynb](https://github.com/NikolayOskolkov/HowUMAPWorks/blob/c872b2feb1426992c7ef4528994aba7ad6fcc0d6/HowUMAPWorks.ipynb)
13 |
14 | Papers
15 |
16 | - [[2402.01502] Why do Random Forests Work? Understanding Tree Ensembles as Self-Regularizing Adaptive Smoothers](https://arxiv.org/abs/2402.01502)
17 |
--------------------------------------------------------------------------------
/ml_from_scratch/supervised/lineargression.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from dataclasses import dataclass
3 | import matplotlib.pyplot as plt
4 | from sklearn.metrics import accuracy_score, precision_recall_fscore_support
5 |
6 | plt.style.use("bmh")
7 |
8 |
9 | """
10 | f(x) = xW + b
11 |
12 | MSELoss = sum((actual - predicted)^2) / n_samples
13 |
14 | wrt weights
15 | d/dW (y - f(x))^2 = 2(y - f(x)) * d/dW (y - f(x))
16 | = 2(y - f(x)) * d/dW (y - xW - b)
17 | = -2x(y - f(x))
18 |
19 | wrt bias
20 | d/db (y - f(x))^2 = 2(y - f(x)) * d/db (y - f(x))
21 | = 2(y - f(x)) * d/db (y - xW - b)
22 | = -2(y - f(x))
23 | """
24 |
25 |
@dataclass
class LinearRegression:
    """Linear model ``f(x) = xW + b`` trained by batch gradient descent on MSE.

    The constructor stores the training data and hyper-parameters. ``fit``
    learns ``weights`` and ``bias`` (both created by ``fit``, so ``predict``
    can only be called afterwards).
    """

    # Training inputs; ``fit`` unpacks ``features.shape`` into
    # (n_samples, n_features), so this must be two-dimensional.
    features: np.ndarray
    # Training targets; presumably shape (n_samples,) -- verify against callers.
    labels: np.ndarray
    # Step size applied to both the weight and the bias gradient.
    learning_rate: float
    # Number of full passes of gradient descent over the training data.
    epochs: int
    # When true, print the MSE loss after every epoch.
    logging: bool

    def fit(
        self,
        features: "np.ndarray | None" = None,
        labels: "np.ndarray | None" = None,
    ) -> None:
        """Fit the model with batch gradient descent.

        Args:
            features: Training inputs. Defaults to the ``features`` given to
                the constructor when omitted.
            labels: Training targets. Defaults to the ``labels`` given to the
                constructor when omitted.
        """
        # The constructor already requires the training data; fall back to it
        # so callers do not have to pass the same arrays twice.
        if features is None:
            features = self.features
        if labels is None:
            labels = self.labels

        n_samples, n_features = features.shape
        self.weights, self.bias = np.zeros(n_features), 0.0

        for epoch in range(self.epochs):
            residuals = labels - self.predict(features)

            # Gradients of mean((y - f(x))^2) with respect to W and b.
            d_weights = -2 / n_samples * features.T.dot(residuals)
            d_bias = -2 / n_samples * residuals.sum()

            self.weights -= self.learning_rate * d_weights
            self.bias -= self.learning_rate * d_bias

            if self.logging:
                # Loss of the parameters used for this epoch's predictions
                # (i.e. before the update above); only computed when shown.
                mse_loss = np.mean(np.square(residuals))
                print(f"MSE loss [{epoch}] : {mse_loss:.15f}")

    def predict(self, features: np.ndarray) -> np.ndarray:
        """Return ``features . weights + bias`` for the given inputs.

        Requires ``fit`` to have been called, since that is where ``weights``
        and ``bias`` are created.
        """

        return features.dot(self.weights) + self.bias
58 |
59 |
if __name__ == "__main__":
    # Demo: learn f(x) = 2x from integer samples, then plot and score the
    # model on points outside the training range.

    # training data
    X_train = np.arange(0, 250).reshape(-1, 1)
    y_train = np.arange(0, 500, 2)

    # testing data
    X_test = np.arange(300, 400, 8).reshape(-1, 1)
    y_test = np.arange(600, 800, 16)

    # Train model
    LR = LinearRegression(X_train, y_train, learning_rate=1e-5, epochs=75, logging=True)

    LR.fit(X_train, y_train)

    # Round so predictions can be compared to the integer targets below.
    preds = LR.predict(X_test).round()

    # Plot the data
    fig, axs = plt.subplots(nrows=1, ncols=3)
    fig.suptitle("f(x) = 2x")
    fig.tight_layout()
    fig.set_size_inches(18, 8)

    axs[0].set_title("Visualization for f(x) = 2x")
    axs[0].set_xlabel("x")
    axs[0].set_ylabel("y")
    axs[0].plot(X_train, y_train)

    axs[1].set_title("Scatterplot for f(x) = 2x Data")
    axs[1].set_xlabel("x")
    axs[1].set_ylabel("y")
    axs[1].scatter(X_test, y_test, color="blue")

    axs[2].set_title("Visualization for Approximated f(x) = 2x")
    axs[2].set_xlabel("x")
    axs[2].set_ylabel("y")
    axs[2].scatter(X_test, y_test, color="blue")
    axs[2].plot(X_test, preds)

    plt.show()

    # NOTE(review): these are classification metrics; they treat every
    # distinct rounded value as its own class. Kept as in the original demo.
    # scikit-learn's convention is (y_true, y_pred).
    accuracy = accuracy_score(y_test, preds)
    precision, recall, fscore, _ = precision_recall_fscore_support(
        y_test, preds, average="macro"
    )

    print(f"Accuracy: {accuracy:.3f}")
    # The original printed recall under "Precision" and precision under
    # "Recall"; each label now reports its own metric.
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F-score: {fscore:.3f}")
109 |
--------------------------------------------------------------------------------
/ml_from_scratch/supervised/xgboost.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benthecoder/AI/b77a9c354386bdb681a80f5dd913b4b7ee4f640c/ml_from_scratch/supervised/xgboost.py
--------------------------------------------------------------------------------
/nlp/BERT/README.md:
--------------------------------------------------------------------------------
1 | # BERT
2 |
3 | - [BERT](https://huggingface.co/docs/transformers/model_doc/bert)
4 | - [DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)
5 | - [DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)
6 | - [RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)
7 |
8 | Applications
9 |
10 | [BERTopic for Topic Modeling - Maarten Grootendorst - Talking Language AI Ep#1](https://www.youtube.com/watch?v=uZxQz87lb84&list=PLLalUvky4CLJ9ZgtZguDJ7dAYuI1bfaYW&index=7&t=840s)
11 |
--------------------------------------------------------------------------------
/nlp/classic/LDA.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benthecoder/AI/b77a9c354386bdb681a80f5dd913b4b7ee4f640c/nlp/classic/LDA.py
--------------------------------------------------------------------------------
/nlp/classic/LSA.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benthecoder/AI/b77a9c354386bdb681a80f5dd913b4b7ee4f640c/nlp/classic/LSA.py
--------------------------------------------------------------------------------
/nlp/classic/PCA.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benthecoder/AI/b77a9c354386bdb681a80f5dd913b4b7ee4f640c/nlp/classic/PCA.py
--------------------------------------------------------------------------------
/nlp/classic/SVD.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benthecoder/AI/b77a9c354386bdb681a80f5dd913b4b7ee4f640c/nlp/classic/SVD.py
--------------------------------------------------------------------------------
/nlp/classic/TFIDF.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import Dict, List
3 |
4 | import matplotlib.pyplot as plt
5 | import numpy as np
6 | import pandas as pd
7 | import seaborn as sns
8 | from sklearn.feature_extraction.text import TfidfVectorizer
9 |
10 | pd.set_option("display.max_rows", None)
11 |
12 |
def tokenize(text: str) -> List[str]:
    """Return the lower-cased, whitespace-separated words of *text*.

    Every character that is neither a word character nor whitespace is
    removed first, so "don't" becomes the single token "dont".
    """
    without_punctuation = re.sub(r"[^\w\s]", "", text)
    return without_punctuation.lower().split()
18 |
19 |
def calculate_word_frequencies(document: List[str]) -> Dict[str, int]:
    """Count how many times each token occurs in *document*.

    Keys appear in order of first occurrence.
    """
    counts: Dict[str, int] = {}
    for token in document:
        counts.setdefault(token, 0)
        counts[token] += 1
    return counts
26 |
27 |
def calculate_tf(word_counts: Dict[str, int], document_length: int) -> Dict[str, float]:
    """Convert raw word counts into term frequencies.

    Each count is divided by ``document_length`` (the number of tokens in
    the document the counts came from).
    """
    term_frequencies: Dict[str, float] = {}
    for term, occurrences in word_counts.items():
        term_frequencies[term] = occurrences / float(document_length)
    return term_frequencies
36 |
37 |
def calculate_idf(documents_word_counts: List[Dict[str, int]]) -> Dict[str, float]:
    """Calculate the smoothed inverse document frequency of every word.

    Args:
        documents_word_counts: One word-count mapping per document. A word
            counts as present in a document when it is a key of that
            document's mapping.

    Returns:
        Mapping of word to ``log10((N + 1) / (df + 1))``, where ``N`` is the
        number of documents and ``df`` the number of documents containing the
        word. Keys are in order of first appearance across the documents.
    """
    n_documents = len(documents_word_counts)

    # Count document frequencies in a single pass instead of rescanning
    # every document for every vocabulary word.
    document_frequency: Dict[str, int] = {}
    for word_counts in documents_word_counts:
        for word in word_counts:
            document_frequency[word] = document_frequency.get(word, 0) + 1

    return {
        word: np.log10((n_documents + 1) / (containing + 1))
        for word, containing in document_frequency.items()
    }
53 |
54 |
def calculate_tfidf(
    tf_dict: Dict[str, float], idf_dict: Dict[str, float]
) -> Dict[str, float]:
    """Multiply each term frequency by that word's inverse document frequency.

    Every word in ``tf_dict`` must have an entry in ``idf_dict``; a missing
    word raises ``KeyError``.
    """
    scores: Dict[str, float] = {}
    for term, term_frequency in tf_dict.items():
        scores[term] = term_frequency * idf_dict[term]
    return scores
63 |
64 |
def visualize_tfidf(tfidf_matrix: pd.DataFrame):
    """Render *tfidf_matrix* as an annotated heatmap and show it."""
    figure_size = (10, 10)
    plt.figure(figsize=figure_size)
    sns.heatmap(tfidf_matrix, cmap="YlGnBu", annot=True)
    # Slant the column labels so they do not overlap.
    plt.xticks(ha="right", rotation=45)
    plt.tight_layout()
    plt.show()
72 |
73 |
def main():
    """Compute TF-IDF for two quotes by hand and with scikit-learn, and plot both."""
    # Document A is the Seneca quote, Document B the Steve Jobs quote.
    sentences = [
        "Life, if well lived, is long enough.",
        "Your time is limited, so don't waste it living someone else's life.",
    ]

    documents = [tokenize(sentence) for sentence in sentences]

    documents_word_counts = [calculate_word_frequencies(doc) for doc in documents]

    idf_dict = calculate_idf(documents_word_counts)

    tfidfs = []
    for doc, doc_word_counts in zip(documents, documents_word_counts):
        tf_dict = calculate_tf(doc_word_counts, len(doc))
        tfidf_dict = calculate_tfidf(tf_dict, idf_dict)
        tfidfs.append(tfidf_dict)

    # Each per-document dict only holds that document's own words, so the
    # DataFrame has NaN for words absent from a document. An absent word has
    # a term frequency of 0, hence a TF-IDF of 0: fill the gaps with 0.0.
    tfidf_matrix = (
        pd.DataFrame(tfidfs, index=["Document A", "Document B"]).fillna(0.0).T
    )
    visualize_tfidf(tfidf_matrix)

    # scikit-learn
    titles = ["seneca", "steve_jobs"]

    vectorizer = TfidfVectorizer()
    vector = vectorizer.fit_transform(sentences)
    # (The original built a dict(zip(feature_names, first_row)) here and
    # discarded it; that dead statement has been removed.)

    tfidf_df = pd.DataFrame(
        vector.toarray(), index=titles, columns=vectorizer.get_feature_names_out()
    )

    visualize_tfidf(tfidf_df.T)
108 |
109 |
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
112 |
--------------------------------------------------------------------------------
/nlp/classic/TSNE.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benthecoder/AI/b77a9c354386bdb681a80f5dd913b4b7ee4f640c/nlp/classic/TSNE.py
--------------------------------------------------------------------------------
/nlp/classic/UMAP.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benthecoder/AI/b77a9c354386bdb681a80f5dd913b4b7ee4f640c/nlp/classic/UMAP.py
--------------------------------------------------------------------------------
/nlp/embeddings/README.md:
--------------------------------------------------------------------------------
1 | # Embeddings
2 |
3 | - [What Are Embeddings - Vicki Boykis](https://github.com/veekaybee/what_are_embeddings/blob/main/embeddings.pdf)
4 | - [hackerllama - Sentence Embeddings. Introduction to Sentence Embeddings](https://osanseviero.github.io/hackerllama/blog/posts/sentence_embeddings/)
5 | - [Embeddings: What they are and why they matter](https://simonwillison.net/2023/Oct/23/embeddings/)
6 |
--------------------------------------------------------------------------------
/nlp/embeddings/tfidf.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 12,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# seneca\n",
10 | "sentence_a = \"\"\"Life, if well lived, is long enough.\"\"\"\n",
11 | "# steve jobs\n",
12 | "sentence_b = \"\"\"Your time is limited, so don't waste it living someone else's life.\"\"\""
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 26,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "from typing import List\n",
22 | "import re\n",
23 | "\n",
24 | "\n",
25 | "def tokenize(text: str) -> List[str]:\n",
26 | " # Remove punctuation using regex, keeping words and numbers\n",
27 | " cleaned_text = re.sub(r\"[^\\w\\s]\", \"\", text)\n",
28 | " # Split the cleaned text into words\n",
29 | " tokens = cleaned_text.lower().split()\n",
30 | " return tokens"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 27,
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "name": "stdout",
40 | "output_type": "stream",
41 | "text": [
42 | "7\n",
43 | "12\n"
44 | ]
45 | },
46 | {
47 | "data": {
48 | "text/plain": [
49 | "17"
50 | ]
51 | },
52 | "execution_count": 27,
53 | "metadata": {},
54 | "output_type": "execute_result"
55 | }
56 | ],
57 | "source": [
58 | "doc_a = tokenize(sentence_a)\n",
59 | "doc_b = tokenize(sentence_b)\n",
60 | "\n",
61 | "print(len(doc_a))\n",
62 | "print(len(doc_b))\n",
63 | "\n",
64 | "total_corpus = set(doc_a).union(set(doc_b))\n",
65 | "\n",
66 | "len(total_corpus)"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "### bag of words\n"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 28,
79 | "metadata": {},
80 | "outputs": [
81 | {
82 | "data": {
83 | "text/html": [
84 | "
\n",
85 | "\n",
98 | "
\n",
99 | " \n",
100 | " \n",
101 | " | \n",
102 | " 0 | \n",
103 | " 1 | \n",
104 | "
\n",
105 | " \n",
106 | " \n",
107 | " \n",
108 | " so | \n",
109 | " 0 | \n",
110 | " 1 | \n",
111 | "
\n",
112 | " \n",
113 | " lived | \n",
114 | " 1 | \n",
115 | " 0 | \n",
116 | "
\n",
117 | " \n",
118 | " it | \n",
119 | " 0 | \n",
120 | " 1 | \n",
121 | "
\n",
122 | " \n",
123 | " life | \n",
124 | " 1 | \n",
125 | " 1 | \n",
126 | "
\n",
127 | " \n",
128 | " elses | \n",
129 | " 0 | \n",
130 | " 1 | \n",
131 | "
\n",
132 | " \n",
133 | " time | \n",
134 | " 0 | \n",
135 | " 1 | \n",
136 | "
\n",
137 | " \n",
138 | " well | \n",
139 | " 1 | \n",
140 | " 0 | \n",
141 | "
\n",
142 | " \n",
143 | " long | \n",
144 | " 1 | \n",
145 | " 0 | \n",
146 | "
\n",
147 | " \n",
148 | " if | \n",
149 | " 1 | \n",
150 | " 0 | \n",
151 | "
\n",
152 | " \n",
153 | " living | \n",
154 | " 0 | \n",
155 | " 1 | \n",
156 | "
\n",
157 | " \n",
158 | " your | \n",
159 | " 0 | \n",
160 | " 1 | \n",
161 | "
\n",
162 | " \n",
163 | " is | \n",
164 | " 1 | \n",
165 | " 1 | \n",
166 | "
\n",
167 | " \n",
168 | " limited | \n",
169 | " 0 | \n",
170 | " 1 | \n",
171 | "
\n",
172 | " \n",
173 | " dont | \n",
174 | " 0 | \n",
175 | " 1 | \n",
176 | "
\n",
177 | " \n",
178 | " waste | \n",
179 | " 0 | \n",
180 | " 1 | \n",
181 | "
\n",
182 | " \n",
183 | " someone | \n",
184 | " 0 | \n",
185 | " 1 | \n",
186 | "
\n",
187 | " \n",
188 | " enough | \n",
189 | " 1 | \n",
190 | " 0 | \n",
191 | "
\n",
192 | " \n",
193 | "
\n",
194 | "
"
195 | ],
196 | "text/plain": [
197 | " 0 1\n",
198 | "so 0 1\n",
199 | "lived 1 0\n",
200 | "it 0 1\n",
201 | "life 1 1\n",
202 | "elses 0 1\n",
203 | "time 0 1\n",
204 | "well 1 0\n",
205 | "long 1 0\n",
206 | "if 1 0\n",
207 | "living 0 1\n",
208 | "your 0 1\n",
209 | "is 1 1\n",
210 | "limited 0 1\n",
211 | "dont 0 1\n",
212 | "waste 0 1\n",
213 | "someone 0 1\n",
214 | "enough 1 0"
215 | ]
216 | },
217 | "execution_count": 28,
218 | "metadata": {},
219 | "output_type": "execute_result"
220 | }
221 | ],
222 | "source": [
223 | "import pandas as pd\n",
224 | "\n",
225 | "\n",
226 | "word_count_a = dict.fromkeys(total_corpus, 0)\n",
227 | "word_count_b = dict.fromkeys(total_corpus, 0)\n",
228 | "\n",
229 | "for word in doc_a:\n",
230 | " word_count_a[word] += 1\n",
231 | "\n",
232 | "for word in doc_b:\n",
233 | " word_count_b[word] += 1\n",
234 | "\n",
235 | "pd.set_option(\"display.max_rows\", None)\n",
236 | "\n",
237 | "freq = pd.DataFrame([word_count_a, word_count_b])\n",
238 | "freq.T"
239 | ]
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "metadata": {},
244 | "source": [
245 | "### TF\n"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 29,
251 | "metadata": {},
252 | "outputs": [
253 | {
254 | "data": {
255 | "text/plain": [
256 | "{'so': 0.0,\n",
257 | " 'lived': 0.14285714285714285,\n",
258 | " 'it': 0.0,\n",
259 | " 'life': 0.14285714285714285,\n",
260 | " 'elses': 0.0,\n",
261 | " 'time': 0.0,\n",
262 | " 'well': 0.14285714285714285,\n",
263 | " 'long': 0.14285714285714285,\n",
264 | " 'if': 0.14285714285714285,\n",
265 | " 'living': 0.0,\n",
266 | " 'your': 0.0,\n",
267 | " 'is': 0.14285714285714285,\n",
268 | " 'limited': 0.0,\n",
269 | " 'dont': 0.0,\n",
270 | " 'waste': 0.0,\n",
271 | " 'someone': 0.0,\n",
272 | " 'enough': 0.14285714285714285}"
273 | ]
274 | },
275 | "execution_count": 29,
276 | "metadata": {},
277 | "output_type": "execute_result"
278 | }
279 | ],
280 | "source": [
281 | "def tf(word_counts: dict, document: list[str]) -> dict:\n",
282 | " \"\"\"Calculate term frequency of each word in a document.\"\"\"\n",
283 | "\n",
284 | " tf_dict = {}\n",
285 | " corpus_count = len(document)\n",
286 | "\n",
287 | " for word, count in word_counts.items():\n",
288 | " tf_dict[word] = count / float(corpus_count)\n",
289 | "\n",
290 | " return tf_dict\n",
291 | "\n",
292 | "\n",
293 | "tf(word_count_a, doc_a)"
294 | ]
295 | },
296 | {
297 | "cell_type": "markdown",
298 | "metadata": {},
299 | "source": [
300 | "### IDF\n"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": 30,
306 | "metadata": {},
307 | "outputs": [
308 | {
309 | "data": {
310 | "text/plain": [
311 | "{'so': 0.17609125905568124,\n",
312 | " 'lived': 0.17609125905568124,\n",
313 | " 'it': 0.17609125905568124,\n",
314 | " 'life': 0.0,\n",
315 | " 'elses': 0.17609125905568124,\n",
316 | " 'time': 0.17609125905568124,\n",
317 | " 'well': 0.17609125905568124,\n",
318 | " 'long': 0.17609125905568124,\n",
319 | " 'if': 0.17609125905568124,\n",
320 | " 'living': 0.17609125905568124,\n",
321 | " 'your': 0.17609125905568124,\n",
322 | " 'is': 0.0,\n",
323 | " 'limited': 0.17609125905568124,\n",
324 | " 'dont': 0.17609125905568124,\n",
325 | " 'waste': 0.17609125905568124,\n",
326 | " 'someone': 0.17609125905568124,\n",
327 | " 'enough': 0.17609125905568124}"
328 | ]
329 | },
330 | "execution_count": 30,
331 | "metadata": {},
332 | "output_type": "execute_result"
333 | }
334 | ],
335 | "source": [
336 | "import numpy as np\n",
337 | "\n",
338 | "\n",
339 | "def idf(word_counts: list[dict[str, int]]) -> dict:\n",
340 | " \"\"\"Given N documents, no. of documents in which the the term appears for each term\"\"\"\n",
341 | " idf_dict = {}\n",
342 | " N = len(word_counts)\n",
343 | "\n",
344 | " idf_dict = dict.fromkeys(word_counts[0].keys(), 0)\n",
345 | "\n",
346 | " for word in idf_dict.keys():\n",
347 | " idf_dict[word] = sum(doc[word] > 0 for doc in word_counts)\n",
348 | "\n",
349 | " for word, df in idf_dict.items():\n",
350 | " idf_dict[word] = np.log10((N + 1.0) / (df + 1.0))\n",
351 | "\n",
352 | " return idf_dict\n",
353 | "\n",
354 | "\n",
355 | "idfs = idf([word_count_a, word_count_b])\n",
356 | "idfs"
357 | ]
358 | },
359 | {
360 | "cell_type": "markdown",
361 | "metadata": {},
362 | "source": [
363 | "### TF-IDF\n"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": 31,
369 | "metadata": {},
370 | "outputs": [
371 | {
372 | "data": {
373 | "text/html": [
374 | "\n",
375 | "\n",
388 | "
\n",
389 | " \n",
390 | " \n",
391 | " | \n",
392 | " 0 | \n",
393 | " 1 | \n",
394 | "
\n",
395 | " \n",
396 | " \n",
397 | " \n",
398 | " so | \n",
399 | " 0.000000 | \n",
400 | " 0.014674 | \n",
401 | "
\n",
402 | " \n",
403 | " lived | \n",
404 | " 0.025156 | \n",
405 | " 0.000000 | \n",
406 | "
\n",
407 | " \n",
408 | " it | \n",
409 | " 0.000000 | \n",
410 | " 0.014674 | \n",
411 | "
\n",
412 | " \n",
413 | " life | \n",
414 | " 0.000000 | \n",
415 | " 0.000000 | \n",
416 | "
\n",
417 | " \n",
418 | " elses | \n",
419 | " 0.000000 | \n",
420 | " 0.014674 | \n",
421 | "
\n",
422 | " \n",
423 | " time | \n",
424 | " 0.000000 | \n",
425 | " 0.014674 | \n",
426 | "
\n",
427 | " \n",
428 | " well | \n",
429 | " 0.025156 | \n",
430 | " 0.000000 | \n",
431 | "
\n",
432 | " \n",
433 | " long | \n",
434 | " 0.025156 | \n",
435 | " 0.000000 | \n",
436 | "
\n",
437 | " \n",
438 | " if | \n",
439 | " 0.025156 | \n",
440 | " 0.000000 | \n",
441 | "
\n",
442 | " \n",
443 | " living | \n",
444 | " 0.000000 | \n",
445 | " 0.014674 | \n",
446 | "
\n",
447 | " \n",
448 | " your | \n",
449 | " 0.000000 | \n",
450 | " 0.014674 | \n",
451 | "
\n",
452 | " \n",
453 | " is | \n",
454 | " 0.000000 | \n",
455 | " 0.000000 | \n",
456 | "
\n",
457 | " \n",
458 | " limited | \n",
459 | " 0.000000 | \n",
460 | " 0.014674 | \n",
461 | "
\n",
462 | " \n",
463 | " dont | \n",
464 | " 0.000000 | \n",
465 | " 0.014674 | \n",
466 | "
\n",
467 | " \n",
468 | " waste | \n",
469 | " 0.000000 | \n",
470 | " 0.014674 | \n",
471 | "
\n",
472 | " \n",
473 | " someone | \n",
474 | " 0.000000 | \n",
475 | " 0.014674 | \n",
476 | "
\n",
477 | " \n",
478 | " enough | \n",
479 | " 0.025156 | \n",
480 | " 0.000000 | \n",
481 | "
\n",
482 | " \n",
483 | "
\n",
484 | "
"
485 | ],
486 | "text/plain": [
487 | " 0 1\n",
488 | "so 0.000000 0.014674\n",
489 | "lived 0.025156 0.000000\n",
490 | "it 0.000000 0.014674\n",
491 | "life 0.000000 0.000000\n",
492 | "elses 0.000000 0.014674\n",
493 | "time 0.000000 0.014674\n",
494 | "well 0.025156 0.000000\n",
495 | "long 0.025156 0.000000\n",
496 | "if 0.025156 0.000000\n",
497 | "living 0.000000 0.014674\n",
498 | "your 0.000000 0.014674\n",
499 | "is 0.000000 0.000000\n",
500 | "limited 0.000000 0.014674\n",
501 | "dont 0.000000 0.014674\n",
502 | "waste 0.000000 0.014674\n",
503 | "someone 0.000000 0.014674\n",
504 | "enough 0.025156 0.000000"
505 | ]
506 | },
507 | "execution_count": 31,
508 | "metadata": {},
509 | "output_type": "execute_result"
510 | }
511 | ],
512 | "source": [
513 | "def tfidf(doc_elements: dict[str, int], idfs: dict[str, int]) -> dict:\n",
514 | " \"\"\"TF * IDF per word given a single word in a single document\"\"\"\n",
515 | "\n",
516 | " tfidf_dict = {}\n",
517 | "\n",
518 | " for word, val in doc_elements.items():\n",
519 | " tfidf_dict[word] = val * idfs[word]\n",
520 | "\n",
521 | " return tfidf_dict\n",
522 | "\n",
523 | "\n",
524 | "# calculate term frequency for each document\n",
525 | "tf_a = tf(word_count_a, doc_a)\n",
526 | "tf_b = tf(word_count_b, doc_b)\n",
527 | "\n",
528 | "# calculate inverse document frequency for each document\n",
529 | "tfidf_a = tfidf(tf_a, idfs)\n",
530 | "tfidf_b = tfidf(tf_b, idfs)\n",
531 | "\n",
532 | "# return score\n",
533 | "document_tfidf = pd.DataFrame([tfidf_a, tfidf_b])\n",
534 | "document_tfidf.T"
535 | ]
536 | },
537 | {
538 | "cell_type": "code",
539 | "execution_count": 32,
540 | "metadata": {},
541 | "outputs": [
542 | {
543 | "data": {
544 | "image/png": "",
545 | "text/plain": [
546 | ""
547 | ]
548 | },
549 | "metadata": {},
550 | "output_type": "display_data"
551 | }
552 | ],
553 | "source": [
554 | "import matplotlib.pyplot as plt\n",
555 | "import seaborn as sns\n",
556 | "\n",
557 | "\n",
558 | "def visualize_tfidf(tfidf_matrix: pd.DataFrame):\n",
559 | " plt.figure(figsize=(10, 10))\n",
560 | " sns.heatmap(tfidf_matrix, annot=True, cmap=\"YlGnBu\")\n",
561 | " plt.xticks(rotation=45, ha=\"right\")\n",
562 | " plt.tight_layout()\n",
563 | " plt.show()\n",
564 | "\n",
565 | "\n",
566 | "# Prepare the TF-IDF matrix for visualization and EDA\n",
567 | "tfidf_matrix = pd.DataFrame([tfidf_a, tfidf_b], index=[\"Document A\", \"Document B\"]).T\n",
568 | "\n",
569 | "# Visualize the TF-IDF matrix\n",
570 | "visualize_tfidf(tfidf_matrix)"
571 | ]
572 | },
573 | {
574 | "cell_type": "markdown",
575 | "metadata": {},
576 | "source": [
577 | "The heatmap displays the TF-IDF scores of different words across two documents (A and B). The colors indicate the magnitude of the TF-IDF scores, with darker colors representing higher scores. Words that appear in both documents (\"life\" and \"is\") have an IDF of 0 under this formula, so their TF-IDF score is zero in both documents. Words that have a non-zero score in one document and a zero score in the other are unique to that document. For example, \"lived\" scores 0.025 in Document A and 0 in Document B, indicating it is distinctive of Document A within the context of these two documents.\n"
578 | ]
579 | },
580 | {
581 | "cell_type": "markdown",
582 | "metadata": {},
583 | "source": [
584 | "# scikit-learn\n"
585 | ]
586 | },
587 | {
588 | "cell_type": "code",
589 | "execution_count": 38,
590 | "metadata": {},
591 | "outputs": [],
592 | "source": [
593 | "from sklearn.feature_extraction.text import TfidfVectorizer\n",
594 | "\n",
595 | "corpus = [sentence_a, sentence_b]\n",
596 | "titles = [\"seneca\", \"steve_jobs\"]\n",
597 | "\n",
598 | "vectorizer = TfidfVectorizer()\n",
599 | "vector = vectorizer.fit_transform(corpus)\n",
600 | "dict(zip(vectorizer.get_feature_names_out(), vector.toarray()[0]))\n",
601 | "\n",
602 | "tfidf_df = pd.DataFrame(\n",
603 | " vector.toarray(), index=titles, columns=vectorizer.get_feature_names_out()\n",
604 | ")"
605 | ]
606 | },
607 | {
608 | "cell_type": "code",
609 | "execution_count": 39,
610 | "metadata": {},
611 | "outputs": [
612 | {
613 | "data": {
614 | "image/png": "",
615 | "text/plain": [
616 | ""
617 | ]
618 | },
619 | "metadata": {},
620 | "output_type": "display_data"
621 | }
622 | ],
623 | "source": [
624 | "visualize_tfidf(tfidf_df.T)"
625 | ]
626 | },
627 | {
628 | "cell_type": "code",
629 | "execution_count": null,
630 | "metadata": {},
631 | "outputs": [],
632 | "source": []
633 | }
634 | ],
635 | "metadata": {
636 | "kernelspec": {
637 | "display_name": "ai",
638 | "language": "python",
639 | "name": "python3"
640 | },
641 | "language_info": {
642 | "codemirror_mode": {
643 | "name": "ipython",
644 | "version": 3
645 | },
646 | "file_extension": ".py",
647 | "mimetype": "text/x-python",
648 | "name": "python",
649 | "nbconvert_exporter": "python",
650 | "pygments_lexer": "ipython3",
651 | "version": "3.12.1"
652 | }
653 | },
654 | "nbformat": 4,
655 | "nbformat_minor": 2
656 | }
657 |
--------------------------------------------------------------------------------
/nn_zero_to_hero/README.md:
--------------------------------------------------------------------------------
1 | # Neural Networks from scratch
2 |
3 | My code for Andrej Karpathy's NN from scratch series (~13 hours)
4 |
5 | ## installations
6 |
7 | ```bash
8 | brew install graphviz
9 | ```
10 |
11 | ## Building micrograd (currently doing)
12 |
13 | What? A tiny scalar-valued autograd engine and a neural net library on top of it with PyTorch-like API
14 |
15 | ## Source
16 |
17 | - [Neural Networks: Zero to Hero](https://www.youtube.com/playlist?list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ)
18 | - [karpathy/nn-zero-to-hero: Neural Networks: Zero to Hero](https://github.com/karpathy/nn-zero-to-hero)
19 |
--------------------------------------------------------------------------------
/nn_zero_to_hero/micrograd/derivatives.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import math\n",
10 | "import numpy as np\n",
11 | "import matplotlib.pyplot as plt\n",
12 | "\n",
13 | "%matplotlib inline"
14 | ]
15 | },
16 | {
17 | "attachments": {},
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Derivative of a function with single input\n"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {},
28 | "outputs": [
29 | {
30 | "data": {
31 | "text/plain": [
32 | "20.0"
33 | ]
34 | },
35 | "execution_count": 2,
36 | "metadata": {},
37 | "output_type": "execute_result"
38 | }
39 | ],
40 | "source": [
41 | "def f(x):\n",
42 | " return 3 * x**2 - 4 * x + 5\n",
43 | "\n",
44 | "\n",
45 | "f(3.0)"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 4,
51 | "metadata": {},
52 | "outputs": [
53 | {
54 | "data": {
55 | "image/png": "",
56 | "text/plain": [
57 | ""
58 | ]
59 | },
60 | "metadata": {
61 | "needs_background": "light"
62 | },
63 | "output_type": "display_data"
64 | }
65 | ],
66 | "source": [
67 | "xs = np.arange(-5, 5, 0.25)\n",
68 | "ys = f(xs) # apply f to each element of xs\n",
69 | "plt.plot(xs, ys);"
70 | ]
71 | },
72 | {
73 | "attachments": {},
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "what is derivative at every point of this function? in class you would do it by hand\n"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 5,
83 | "metadata": {},
84 | "outputs": [
85 | {
86 | "data": {
87 | "text/plain": [
88 | "20.014003000000002"
89 | ]
90 | },
91 | "execution_count": 5,
92 | "metadata": {},
93 | "output_type": "execute_result"
94 | }
95 | ],
96 | "source": [
97 | "h = 0.001\n",
98 | "x = 3.0\n",
99 | "f(x + h) # do you expect function to be greater or less after bumping h\n",
100 | "(f(x + h) - f(x)) / h # function responded in positive direction normalized by run"
101 | ]
102 | },
103 | {
104 | "attachments": {},
105 | "cell_type": "markdown",
106 | "metadata": {},
107 | "source": [
108 | "make h very small to converge to right amount\n"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 9,
114 | "metadata": {},
115 | "outputs": [
116 | {
117 | "data": {
118 | "text/plain": [
119 | "14.000001158365194"
120 | ]
121 | },
122 | "execution_count": 9,
123 | "metadata": {},
124 | "output_type": "execute_result"
125 | }
126 | ],
127 | "source": [
128 | "h = 0.0000000001\n",
129 | "x = 3.0\n",
130 | "(f(x + h) - f(x)) / h"
131 | ]
132 | },
133 | {
134 | "attachments": {},
135 | "cell_type": "markdown",
136 | "metadata": {},
137 | "source": [
138 | "doing this from function\n",
139 | "\n",
140 | "derivative of $f(x) = 3x^2 - 4x + 5$ is $f'(x) = 6x - 4$\n",
141 | "\n",
142 | "when $x = 3$, $6 \\cdot 3 - 4 = 14$\n"
143 | ]
144 | },
145 | {
146 | "attachments": {},
147 | "cell_type": "markdown",
148 | "metadata": {},
149 | "source": [
150 | "What if it's a negative number?\n",
151 | "\n",
152 | "looking at function, if we bump it, it'll go down, so it's going to be negative\n"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": 14,
158 | "metadata": {},
159 | "outputs": [
160 | {
161 | "data": {
162 | "text/plain": [
163 | "-21.999966293151374"
164 | ]
165 | },
166 | "execution_count": 14,
167 | "metadata": {},
168 | "output_type": "execute_result"
169 | }
170 | ],
171 | "source": [
172 | "h = 0.0000000001\n",
173 | "x = -3.0\n",
174 | "(f(x + h) - f(x)) / h"
175 | ]
176 | },
177 | {
178 | "attachments": {},
179 | "cell_type": "markdown",
180 | "metadata": {},
181 | "source": [
182 | "What if it's a number where slope is zero?\n",
183 | "\n",
184 | "nudging it doesn't change the value of the function\n"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 11,
190 | "metadata": {},
191 | "outputs": [
192 | {
193 | "data": {
194 | "text/plain": [
195 | "0.0"
196 | ]
197 | },
198 | "execution_count": 11,
199 | "metadata": {},
200 | "output_type": "execute_result"
201 | }
202 | ],
203 | "source": [
204 | "h = 0.0000000001\n",
205 | "x = 2 / 3 # slope is zero at this point\n",
206 | "(f(x + h) - f(x)) / h"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 8,
212 | "metadata": {},
213 | "outputs": [
214 | {
215 | "data": {
216 | "text/plain": [
217 | "0.0"
218 | ]
219 | },
220 | "execution_count": 8,
221 | "metadata": {},
222 | "output_type": "execute_result"
223 | }
224 | ],
225 | "source": [
226 | "h = 0.0000000000000001 # floating point arithmetic, representation is finite\n",
227 | "x = 3.0\n",
228 | "(f(x + h) - f(x)) / h"
229 | ]
230 | },
231 | {
232 | "attachments": {},
233 | "cell_type": "markdown",
234 | "metadata": {},
235 | "source": [
236 | "## Derivative of a function with multiple inputs\n"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": 12,
242 | "metadata": {},
243 | "outputs": [
244 | {
245 | "name": "stdout",
246 | "output_type": "stream",
247 | "text": [
248 | "4.0\n"
249 | ]
250 | }
251 | ],
252 | "source": [
253 | "a = 2.0\n",
254 | "b = -3.0\n",
255 | "c = 10.0\n",
256 | "d = a * b + c\n",
257 | "\n",
258 | "print(d)"
259 | ]
260 | },
261 | {
262 | "attachments": {},
263 | "cell_type": "markdown",
264 | "metadata": {},
265 | "source": [
266 | "what happens if we bump a?\n",
267 | "\n",
268 | "because b is negative, it's going to go down\n"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": 15,
274 | "metadata": {},
275 | "outputs": [
276 | {
277 | "name": "stdout",
278 | "output_type": "stream",
279 | "text": [
280 | "d1 4.0\n",
281 | "d2 3.99999997\n",
282 | "slope -2.999999981767587\n"
283 | ]
284 | }
285 | ],
286 | "source": [
287 | "h = 0.00000001\n",
288 | "\n",
289 | "# fix inputs at values of interest\n",
290 | "a = 2.0\n",
291 | "b = -3.0\n",
292 | "c = 10.0\n",
293 | "\n",
294 | "d1 = a * b + c\n",
295 | "a += h\n",
296 | "d2 = a * b + c\n",
297 | "\n",
298 | "print(\"d1\", d1)\n",
299 | "print(\"d2\", d2)\n",
300 | "print(\"slope\", (d2 - d1) / h)"
301 | ]
302 | },
303 | {
304 | "attachments": {},
305 | "cell_type": "markdown",
306 | "metadata": {},
307 | "source": [
308 | "mathematically:\n",
309 | "\n",
310 | "derivative of d with respect to a gives you b, and b is -3\n"
311 | ]
312 | },
313 | {
314 | "attachments": {},
315 | "cell_type": "markdown",
316 | "metadata": {},
317 | "source": [
318 | "what happens if we bump b?\n",
319 | "\n",
320 | "because a is positive, we'll be adding more to d.\n",
321 | "\n",
322 | "What is the sensitivity? the slope of the function? it's 2\n"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": 16,
328 | "metadata": {},
329 | "outputs": [
330 | {
331 | "name": "stdout",
332 | "output_type": "stream",
333 | "text": [
334 | "d1 4.0\n",
335 | "d2 4.00000002\n",
336 | "slope 1.999999987845058\n"
337 | ]
338 | }
339 | ],
340 | "source": [
341 | "h = 0.00000001\n",
342 | "\n",
343 | "# fix inputs at values of interest\n",
344 | "a = 2.0\n",
345 | "b = -3.0\n",
346 | "c = 10.0\n",
347 | "\n",
348 | "d1 = a * b + c\n",
349 | "b += h\n",
350 | "d2 = a * b + c\n",
351 | "\n",
352 | "print(\"d1\", d1)\n",
353 | "print(\"d2\", d2)\n",
354 | "print(\"slope\", (d2 - d1) / h)"
355 | ]
356 | },
357 | {
358 | "cell_type": "code",
359 | "execution_count": 17,
360 | "metadata": {},
361 | "outputs": [
362 | {
363 | "name": "stdout",
364 | "output_type": "stream",
365 | "text": [
366 | "d1 4.0\n",
367 | "d2 4.000000010000001\n",
368 | "slope 1.000000082740371\n"
369 | ]
370 | }
371 | ],
372 | "source": [
373 | "h = 0.00000001\n",
374 | "\n",
375 | "# fix inputs at values of interest\n",
376 | "a = 2.0\n",
377 | "b = -3.0\n",
378 | "c = 10.0\n",
379 | "\n",
380 | "d1 = a * b + c\n",
381 | "c += h\n",
382 | "d2 = a * b + c\n",
383 | "\n",
384 | "print(\"d1\", d1)\n",
385 | "print(\"d2\", d2)\n",
386 | "print(\"slope\", (d2 - d1) / h)"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": []
395 | }
396 | ],
397 | "metadata": {
398 | "kernelspec": {
399 | "display_name": "base",
400 | "language": "python",
401 | "name": "python3"
402 | },
403 | "language_info": {
404 | "codemirror_mode": {
405 | "name": "ipython",
406 | "version": 3
407 | },
408 | "file_extension": ".py",
409 | "mimetype": "text/x-python",
410 | "name": "python",
411 | "nbconvert_exporter": "python",
412 | "pygments_lexer": "ipython3",
413 | "version": "3.9.10"
414 | },
415 | "orig_nbformat": 4,
416 | "vscode": {
417 | "interpreter": {
418 | "hash": "0f1e841692445df6c0f476977380d4c26cc40d52508098a18c340919add514d9"
419 | }
420 | }
421 | },
422 | "nbformat": 4,
423 | "nbformat_minor": 2
424 | }
425 |
--------------------------------------------------------------------------------
/nn_zero_to_hero/micrograd/exercises.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "JnGHatCI51JP"
7 | },
8 | "source": [
9 | "# micrograd exercises\n",
10 | "\n",
11 | "1. watch the [micrograd video](https://www.youtube.com/watch?v=VMj-3S1tku0) on YouTube\n",
12 | "2. come back and complete these exercises to level up :)\n"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {
18 | "id": "OFt6NKOz6iBZ"
19 | },
20 | "source": [
21 | "## section 1: derivatives\n"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {
28 | "id": "3Jx9fCXl5xHd"
29 | },
30 | "outputs": [],
31 | "source": [
32 | "# here is a mathematical expression that takes 3 inputs and produces one output\n",
33 | "from math import sin, cos\n",
34 | "\n",
35 | "\n",
36 | "def f(a, b, c):\n",
37 | " return -(a**3) + sin(3 * b) - 1.0 / c + b**2.5 - a**0.5\n",
38 | "\n",
39 | "\n",
40 | "print(f(2, 3, 4))"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {
47 | "id": "qXaH59eL9zxf"
48 | },
49 | "outputs": [],
50 | "source": [
51 | "# write the function gradf that returns the analytical gradient of f\n",
52 | "# i.e. use your skills from calculus to take the derivative, then implement the formula\n",
53 | "# if you do not know calculus then feel free to ask wolframalpha, e.g.:\n",
54 | "# https://www.wolframalpha.com/input?i=d%2Fda%28sin%283*a%29%29%29\n",
55 | "\n",
56 | "\n",
57 | "def gradf(a, b, c):\n",
58 | " return [0, 0, 0] # todo, return [df/da, df/db, df/dc]\n",
59 | "\n",
60 | "\n",
61 | "# expected answer is the list of\n",
62 | "ans = [-12.353553390593273, 10.25699027111255, 0.0625]\n",
63 | "yours = gradf(2, 3, 4)\n",
64 | "for dim in range(3):\n",
65 | " ok = \"OK\" if abs(yours[dim] - ans[dim]) < 1e-5 else \"WRONG!\"\n",
66 | " print(f\"{ok} for dim {dim}: expected {ans[dim]}, yours returns {yours[dim]}\")"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {
73 | "id": "_27n-KTA9Qla"
74 | },
75 | "outputs": [],
76 | "source": [
77 | "# now estimate the gradient numerically without any calculus, using\n",
78 | "# the approximation we used in the video.\n",
79 | "# you should not call the function gradf from the last cell\n",
80 | "\n",
81 | "# -----------\n",
82 | "numerical_grad = [0, 0, 0] # TODO\n",
83 | "# -----------\n",
84 | "\n",
85 | "for dim in range(3):\n",
86 | " ok = \"OK\" if abs(numerical_grad[dim] - ans[dim]) < 1e-5 else \"WRONG!\"\n",
87 | " print(\n",
88 | " f\"{ok} for dim {dim}: expected {ans[dim]}, yours returns {numerical_grad[dim]}\"\n",
89 | " )"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {
96 | "id": "BUqsGb5o_h2P"
97 | },
98 | "outputs": [],
99 | "source": [
100 | "# there is an alternative formula that provides a much better numerical\n",
101 | "# approximation to the derivative of a function.\n",
102 | "# learn about it here: https://en.wikipedia.org/wiki/Symmetric_derivative\n",
103 | "# implement it. confirm that for the same step size h this version gives a\n",
104 | "# better approximation.\n",
105 | "\n",
106 | "# -----------\n",
107 | "numerical_grad2 = [0, 0, 0] # TODO\n",
108 | "# -----------\n",
109 | "\n",
110 | "for dim in range(3):\n",
111 | " ok = \"OK\" if abs(numerical_grad2[dim] - ans[dim]) < 1e-5 else \"WRONG!\"\n",
112 | " print(\n",
113 | " f\"{ok} for dim {dim}: expected {ans[dim]}, yours returns {numerical_grad2[dim]}\"\n",
114 | " )"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {
120 | "id": "tklF9s_4AtlI"
121 | },
122 | "source": [
123 | "## section 2: support for softmax\n"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "metadata": {
130 | "id": "nAPe_RVrCTeO"
131 | },
132 | "outputs": [],
133 | "source": [
134 | "# Value class starter code, with many functions taken out\n",
135 | "from math import exp, log\n",
136 | "\n",
137 | "\n",
138 | "class Value:\n",
139 | " def __init__(self, data, _children=(), _op=\"\", label=\"\"):\n",
140 | " self.data = data\n",
141 | " self.grad = 0.0\n",
142 | " self._backward = lambda: None\n",
143 | " self._prev = set(_children)\n",
144 | " self._op = _op\n",
145 | " self.label = label\n",
146 | "\n",
147 | " def __repr__(self):\n",
148 | " return f\"Value(data={self.data})\"\n",
149 | "\n",
150 | " def __add__(self, other): # exactly as in the video\n",
151 | " other = other if isinstance(other, Value) else Value(other)\n",
152 | " out = Value(self.data + other.data, (self, other), \"+\")\n",
153 | "\n",
154 | " def _backward():\n",
155 | " self.grad += 1.0 * out.grad\n",
156 | " other.grad += 1.0 * out.grad\n",
157 | "\n",
158 | " out._backward = _backward\n",
159 | "\n",
160 | " return out\n",
161 | "\n",
162 | " # ------\n",
163 | " # re-implement all the other functions needed for the exercises below\n",
164 | " # your code here\n",
165 | " # TODO\n",
166 | " # ------\n",
167 | "\n",
168 | " def backward(self): # exactly as in video\n",
169 | " topo = []\n",
170 | " visited = set()\n",
171 | "\n",
172 | " def build_topo(v):\n",
173 | " if v not in visited:\n",
174 | " visited.add(v)\n",
175 | " for child in v._prev:\n",
176 | " build_topo(child)\n",
177 | " topo.append(v)\n",
178 | "\n",
179 | " build_topo(self)\n",
180 | "\n",
181 | " self.grad = 1.0\n",
182 | " for node in reversed(topo):\n",
183 | " node._backward()"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {
190 | "id": "VgWvwVQNAvnI"
191 | },
192 | "outputs": [],
193 | "source": [
194 | "# without referencing our code/video __too__ much, make this cell work\n",
195 | "# you'll have to implement (in some cases re-implement) a number of functions\n",
196 | "# of the Value object, similar to what we've seen in the video.\n",
197 | "# instead of the squared error loss this implements the negative log likelihood\n",
198 | "# loss, which is very often used in classification.\n",
199 | "\n",
200 | "# this is the softmax function\n",
201 | "# https://en.wikipedia.org/wiki/Softmax_function\n",
202 | "def softmax(logits):\n",
203 | " counts = [logit.exp() for logit in logits]\n",
204 | " denominator = sum(counts)\n",
205 | " out = [c / denominator for c in counts]\n",
206 | " return out\n",
207 | "\n",
208 | "\n",
209 | "# this is the negative log likelihood loss function, pervasive in classification\n",
210 | "logits = [Value(0.0), Value(3.0), Value(-2.0), Value(1.0)]\n",
211 | "probs = softmax(logits)\n",
212 | "loss = -probs[3].log() # dim 3 acts as the label for this input example\n",
213 | "loss.backward()\n",
214 | "print(loss.data)\n",
215 | "\n",
216 | "ans = [\n",
217 | " 0.041772570515350445,\n",
218 | " 0.8390245074625319,\n",
219 | " 0.005653302662216329,\n",
220 | " -0.8864503806400986,\n",
221 | "]\n",
222 | "for dim in range(4):\n",
223 | " ok = \"OK\" if abs(logits[dim].grad - ans[dim]) < 1e-5 else \"WRONG!\"\n",
224 | " print(f\"{ok} for dim {dim}: expected {ans[dim]}, yours returns {logits[dim].grad}\")"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {
231 | "id": "q7ca1SVAGG1S"
232 | },
233 | "outputs": [],
234 | "source": [
235 | "# verify the gradient using the torch library\n",
236 | "# torch should give you the exact same gradient\n",
237 | "import torch"
238 | ]
239 | }
240 | ],
241 | "metadata": {
242 | "colab": {
243 | "provenance": []
244 | },
245 | "kernelspec": {
246 | "display_name": "Python 3",
247 | "language": "python",
248 | "name": "python3"
249 | },
250 | "language_info": {
251 | "name": "python",
252 | "version": "3.10.0"
253 | },
254 | "vscode": {
255 | "interpreter": {
256 | "hash": "50587d438b9934cf2712ee500622f7def3550698a6c70c07f7d3c00dd27cb653"
257 | }
258 | }
259 | },
260 | "nbformat": 4,
261 | "nbformat_minor": 0
262 | }
263 |
--------------------------------------------------------------------------------
/nn_zero_to_hero/micrograd/micrograd.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from viz import draw_dot"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 13,
15 | "metadata": {},
16 | "outputs": [
17 | {
18 | "name": "stdout",
19 | "output_type": "stream",
20 | "text": [
21 | "Value(data=-8.0)\n",
22 | "{Value(data=-2.0), Value(data=4.0)}\n",
23 | "*\n"
24 | ]
25 | }
26 | ],
27 | "source": [
28 | "class Value:\n",
29 | " def __init__(self, data, _children=(), _op=\"\", label=\"\"):\n",
30 | " self.data = data\n",
31 | " self.grad = 0.0\n",
32 | " self._prev = set(_children)\n",
33 | " self._op = _op\n",
34 | " self.label = label\n",
35 | "\n",
36 | " def __repr__(self):\n",
37 | " return f\"Value(data={self.data})\"\n",
38 | "\n",
39 | " def __add__(self, other):\n",
40 | " return Value(self.data + other.data, (self, other), \"+\")\n",
41 | "\n",
42 | " def __mul__(self, other):\n",
43 | " return Value(self.data * other.data, (self, other), \"*\")\n",
44 | "\n",
45 | "\n",
46 | "a = Value(2.0, label=\"a\")\n",
47 | "b = Value(-3.0, label=\"b\")\n",
48 | "c = Value(10.0, label=\"c\")\n",
49 | "e = a * b\n",
50 | "e.label = \"e\"\n",
51 | "d = e + c\n",
52 | "d.label = \"d\"\n",
53 | "f = Value(-2.0, label=\"f\")\n",
54 | "L = d * f\n",
55 | "L.label = \"L\"\n",
56 | "\n",
57 | "print(L)\n",
58 | "print(L._prev) # the children of the value\n",
59 | "print(L._op) # the operation that created the value"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 15,
65 | "metadata": {},
66 | "outputs": [
67 | {
68 | "data": {
69 | "image/svg+xml": [
70 | "\n",
71 | "\n",
73 | "\n",
75 | "\n",
76 | "\n"
224 | ],
225 | "text/plain": [
226 | ""
227 | ]
228 | },
229 | "execution_count": 15,
230 | "metadata": {},
231 | "output_type": "execute_result"
232 | }
233 | ],
234 | "source": [
235 | "draw_dot(L)"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": null,
241 | "metadata": {},
242 | "outputs": [],
243 | "source": []
244 | }
245 | ],
246 | "metadata": {
247 | "kernelspec": {
248 | "display_name": "base",
249 | "language": "python",
250 | "name": "python3"
251 | },
252 | "language_info": {
253 | "codemirror_mode": {
254 | "name": "ipython",
255 | "version": 3
256 | },
257 | "file_extension": ".py",
258 | "mimetype": "text/x-python",
259 | "name": "python",
260 | "nbconvert_exporter": "python",
261 | "pygments_lexer": "ipython3",
262 | "version": "3.9.10"
263 | },
264 | "orig_nbformat": 4,
265 | "vscode": {
266 | "interpreter": {
267 | "hash": "0f1e841692445df6c0f476977380d4c26cc40d52508098a18c340919add514d9"
268 | }
269 | }
270 | },
271 | "nbformat": 4,
272 | "nbformat_minor": 2
273 | }
274 |
--------------------------------------------------------------------------------
/nn_zero_to_hero/micrograd/micrograd.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benthecoder/AI/b77a9c354386bdb681a80f5dd913b4b7ee4f640c/nn_zero_to_hero/micrograd/micrograd.py
--------------------------------------------------------------------------------
/nn_zero_to_hero/micrograd/viz.py:
--------------------------------------------------------------------------------
1 | from graphviz import Digraph
2 |
3 |
def trace(root):
    """Collect every node and edge of the graph that ends at ``root``.

    Starting from ``root``, follows each node's ``_prev`` collection until no
    unseen nodes remain.

    Args:
        root: the node to start from; it and everything reachable from it must
            be hashable and expose an iterable ``_prev`` attribute.

    Returns:
        A ``(nodes, edges)`` tuple of sets. ``nodes`` holds every reachable
        node (including ``root``); ``edges`` holds one ``(child, parent)``
        pair for each entry of a visited node's ``_prev``.
    """
    nodes, edges = set(), set()

    # An explicit stack replaces recursion so that very deep graphs (long
    # chains of operations) cannot exhaust the interpreter's recursion limit.
    pending = [root]
    while pending:
        v = pending.pop()
        if v in nodes:
            continue  # already expanded via another path
        nodes.add(v)
        for child in v._prev:
            edges.add((child, v))
            pending.append(child)

    return nodes, edges
17 |
18 |
def draw_dot(root):
    """Render the graph that ends at ``root`` as a Graphviz ``Digraph``.

    Every value returned by ``trace(root)`` becomes a record-shaped node that
    shows its ``label``, ``data`` and ``grad``. A value with a non-empty
    ``_op`` also gets a separate operation node pointing at it, and each edge
    reported by ``trace`` is drawn into the operation node of its target.

    Returns:
        The populated ``Digraph`` (SVG format, laid out left to right).
    """
    graph = Digraph(format="svg", graph_attr={"rankdir": "LR"})  # LR = left to right

    values, links = trace(root)
    for value in values:
        value_id = str(id(value))
        # each value in the graph is drawn as a rectangular ('record') node
        caption = "{ %s | data %.4f | grad %.4f }" % (
            value.label,
            value.data,
            value.grad,
        )
        graph.node(name=value_id, label=caption, shape="record")

        if not value._op:
            continue  # no producing operation, so no op node is needed

        # the value came out of an operation: add an op node and feed it
        # into the value node
        op_id = value_id + value._op
        graph.node(name=op_id, label=value._op)
        graph.edge(op_id, value_id)

    for source, target in links:
        # route each input to the op node of the value it feeds
        graph.edge(str(id(source)), str(id(target)) + target._op)

    return graph
42 |
--------------------------------------------------------------------------------
/roadmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benthecoder/AI/b77a9c354386bdb681a80f5dd913b4b7ee4f640c/roadmap.png
--------------------------------------------------------------------------------
/tensor_puzzles/README.md:
--------------------------------------------------------------------------------
1 | # Tensor Puzzles
2 |
3 | [srush/Tensor-Puzzles: Solve puzzles. Improve your pytorch.](https://github.com/srush/Tensor-Puzzles)
4 |
--------------------------------------------------------------------------------