├── .github └── workflows │ └── manual.yml ├── CODEOWNERS ├── README.md ├── lessons ├── .DS_Store ├── DeepLearning │ ├── 1_IntroNNs │ │ ├── GradientDescent.ipynb │ │ ├── GradientDescentSolutions.ipynb │ │ ├── StudentAdmissions.ipynb │ │ ├── StudentAdmissionsSolutions.ipynb │ │ ├── data.csv │ │ └── student_data.csv │ ├── 2_Keras │ │ ├── IMDB_In_Keras.ipynb │ │ ├── IMDB_In_Keras_Solutions.ipynb │ │ ├── StudentAdmissionsKeras.ipynb │ │ └── student_data.csv │ ├── 3_PyTorch │ │ ├── .DS_Store │ │ ├── LICENSE │ │ ├── Part 1 - Tensors in PyTorch.ipynb │ │ ├── Part 2 - Neural Networks in PyTorch.ipynb │ │ ├── Part 3 - Training Neural Networks.ipynb │ │ ├── Part 4 - Fashion-MNIST Exercise.ipynb │ │ ├── Part 5 - Inference and Validation.ipynb │ │ ├── Part 6 - Saving and Loading Models.ipynb │ │ ├── Part 7 - Loading Image Data.ipynb │ │ ├── Part 8 - Transfer Learning.ipynb │ │ ├── README.md │ │ ├── assets │ │ │ ├── ImageNet_example.png │ │ │ ├── Pooling_Simple_max.png │ │ │ ├── activation.png │ │ │ ├── autoencoder_1.png │ │ │ ├── cat.70.jpg │ │ │ ├── cat_cropped.png │ │ │ ├── conv_net.jpg │ │ │ ├── dog.128.jpg │ │ │ ├── dog_cat.png │ │ │ ├── examples_new.png │ │ │ ├── fashion-mnist-sprite.png │ │ │ ├── full_padding_no_strides_transposed.gif │ │ │ ├── function_approx.png │ │ │ ├── gradient_descent.png │ │ │ ├── lenet.png │ │ │ ├── mlp_mnist.png │ │ │ ├── multilayer_diagram_weights.png │ │ │ ├── network_diagram.png │ │ │ ├── padding_strides.gif │ │ │ ├── simple_neuron.png │ │ │ ├── tensor_examples.svg │ │ │ ├── test_examples.png │ │ │ ├── train_examples.png │ │ │ └── w1_backprop_graph.png │ │ ├── fc_model.py │ │ └── helper.py │ └── new-intro-to-pytorch │ │ ├── Part 1 - Tensors in PyTorch (Exercises).ipynb │ │ ├── Part 1 - Tensors in PyTorch (Solution).ipynb │ │ ├── Part 2 - Neural Networks in PyTorch (Exercises).ipynb │ │ ├── Part 2 - Neural Networks in PyTorch (Solution).ipynb │ │ ├── Part 3 - Training Neural Networks (Exercises).ipynb │ │ ├── Part 3 - Training Neural Networks (Solution).ipynb │ │ ├── Part 4 - Fashion-MNIST (Exercises).ipynb │ │ ├── Part 4 - Fashion-MNIST (Solution).ipynb │ │ ├── Part 5 - Inference and Validation (Exercises).ipynb │ │ ├── Part 5 - Inference and Validation (Solution).ipynb │ │ ├── Part 6 - Saving and Loading Models.ipynb │ │ ├── Part 7 - Loading Image Data (Exercises).ipynb │ │ ├── Part 7 - Loading Image Data (Solution).ipynb │ │ ├── Part 8 - Transfer Learning (Exercises).ipynb │ │ ├── Part 8 - Transfer Learning (Solution).ipynb │ │ ├── README.md │ │ ├── assets │ │ ├── ImageNet_example.png │ │ ├── Pooling_Simple_max.png │ │ ├── activation.png │ │ ├── autoencoder_1.png │ │ ├── backprop_diagram.png │ │ ├── cat.70.jpg │ │ ├── cat_cropped.png │ │ ├── conv_net.jpg │ │ ├── dog.128.jpg │ │ ├── dog_cat.png │ │ ├── examples_new.png │ │ ├── fashion-mnist-sprite.png │ │ ├── full_padding_no_strides_transposed.gif │ │ ├── function_approx.png │ │ ├── gradient_descent.png │ │ ├── image_distribution.png │ │ ├── infographic.pdf │ │ ├── lenet.png │ │ ├── mlp_mnist.png │ │ ├── mnist.png │ │ ├── multilayer_diagram_weights.png │ │ ├── network_diagram.png │ │ ├── overfitting.png │ │ ├── padding_strides.gif │ │ ├── simple_neuron.png │ │ ├── tensor_examples.svg │ │ ├── test_examples.png │ │ ├── train_examples.png │ │ └── w1_backprop_graph.png │ │ ├── fc_model.py │ │ └── helper.py ├── Supervised │ ├── .DS_Store │ ├── 1_DecisionTrees │ │ ├── Solutions.ipynb │ │ ├── titanic_data.csv │ │ └── titanic_survival_exploration.ipynb │ ├── 2_NaiveBayes │ │ ├── .DS_Store │ │ ├── Bayesian_Inference.ipynb │ 
│ ├── Bayesian_Inference_solution.ipynb │ │ ├── images │ │ │ ├── bayes_formula.png │ │ │ ├── countvectorizer.png │ │ │ ├── dqnb.png │ │ │ ├── naivebayes.png │ │ │ └── tfidf.png │ │ └── smsspamcollection │ │ │ ├── SMSSpamCollection │ │ │ └── readme │ ├── 3_EnsembleMethods │ │ ├── .DS_Store │ │ ├── Spam_&_Ensembles.ipynb │ │ ├── Spam_&_Ensembles_Solution.ipynb │ │ ├── images │ │ │ ├── bayes_formula.png │ │ │ ├── countvectorizer.png │ │ │ ├── dqnb.png │ │ │ ├── naivebayes.png │ │ │ └── tfidf.png │ │ └── smsspamcollection │ │ │ ├── SMSSpamCollection │ │ │ └── readme │ ├── 4_ModelEvaluationMetrics │ │ ├── .DS_Store │ │ ├── Classification_Metrics.ipynb │ │ ├── Classification_Metrics_Solution.ipynb │ │ ├── Regression Metrics Solution.ipynb │ │ ├── Regression Metrics.ipynb │ │ ├── images │ │ │ ├── bayes_formula.png │ │ │ ├── countvectorizer.png │ │ │ ├── dqnb.png │ │ │ ├── naivebayes.png │ │ │ └── tfidf.png │ │ ├── smsspamcollection │ │ │ ├── SMSSpamCollection │ │ │ └── readme │ │ ├── tests.py │ │ └── tests2.py │ └── 5_TrainingTuning │ │ ├── Diabetes Case Study - Solution.ipynb │ │ ├── Diabetes Case Study.ipynb │ │ ├── Grid_Search_Lab.ipynb │ │ ├── Solution.ipynb │ │ ├── check_file.py │ │ ├── data.csv │ │ └── diabetes.csv └── Unsupervised │ ├── 1_Clustering │ ├── C18_FeatScalingEx_01.png │ ├── Changing K - Solution.ipynb │ ├── Changing K.ipynb │ ├── Feature Scaling - Solution.ipynb │ ├── Feature Scaling Example - Solution.ipynb │ ├── Feature Scaling Example.ipynb │ ├── Feature Scaling.ipynb │ ├── Identifying_Clusters.ipynb │ ├── Identifying_Clusters_Solution.ipynb │ ├── UL1_Clustering_Storyboard_Assets (1).ipynb │ ├── UL6_PCA_Storyboard_Assets (1).ipynb │ ├── giphy (1).gif │ ├── giphy.gif │ ├── helper.py │ ├── helper_functions.py │ ├── helpers2.py │ ├── k-means Clustering of Movie Ratings [SOLUTION].ipynb │ ├── k-means Clustering of Movie Ratings.ipynb │ ├── ml-latest-small │ │ ├── README.txt │ │ ├── links.csv │ │ ├── movies.csv │ │ ├── ratings.csv │ │ └── tags.csv │ ├── test_file.py │ ├── tests.py │ └── tests2.py │ ├── 2_HierarchcalDensityClustering │ ├── DBSCAN Notebook [SOLUTION].ipynb │ ├── DBSCAN Notebook.ipynb │ ├── blobs.csv │ ├── dbscan_lab_helper.py │ ├── images │ │ ├── high_epsilon_and_high_min_sample.png │ │ ├── high_epsilon_and_low_min_sample.png │ │ ├── low_epsilon_and_high_min_sample.png │ │ └── low_epsilon_and_low_min_sample.png │ └── varied.csv │ ├── 3_MixtureModels │ ├── GMM Clustering and Cluster Validation Lab [SOLUTION].ipynb │ └── GMM Clustering and Cluster Validation Lab.ipynb │ ├── 4_PCA │ ├── Interpret_PCA_Results.ipynb │ ├── Interpret_PCA_Results_SC.ipynb │ ├── Interpret_PCA_Results_Solution.ipynb │ ├── PCA_1.ipynb │ ├── PCA_1_Solution.ipynb │ ├── PCA_Mini_Project.ipynb │ ├── PCA_Mini_Project_Solution.ipynb │ ├── PCA_SC.ipynb │ ├── Screencast_Example_Code.ipynb │ ├── data │ │ ├── cars.csv │ │ └── train.csv │ ├── helper_functions.py │ ├── test_code.py │ └── test_code2.py │ └── 5_ICA │ ├── ICA mix 1.wav │ ├── ICA mix 2.wav │ ├── ICA mix 3.wav │ ├── Independent Component Analysis Lab [SOLUTION].ipynb │ └── Independent Component Analysis Lab.ipynb └── projects ├── .DS_Store ├── p1_charityml ├── .ipynb_checkpoints │ └── finding_donors-checkpoint.ipynb ├── README.md ├── census.csv ├── example_submission.csv ├── finding_donors.ipynb ├── test_census.csv └── visuals.py └── p2_image_classifier ├── Image Classifier Project-zh.ipynb ├── Image Classifier Project.ipynb ├── LICENSE ├── README.md ├── assets ├── Flowers.png └── inference_example.png ├── cat_to_name.json ├── predict.py └── 
train.py /.github/workflows/manual.yml: -------------------------------------------------------------------------------- 1 | # Workflow to ensure whenever a GitHub PR is submitted, 2 | # a JIRA ticket gets created automatically. 3 | name: Manual Workflow 4 | 5 | # Controls when the action will run. 6 | on: 7 | # Triggers the workflow on pull request events (opened or reopened) 8 | pull_request_target: 9 | types: [opened, reopened] 10 | 11 | # Allows you to run this workflow manually from the Actions tab 12 | workflow_dispatch: 13 | 14 | jobs: 15 | test-transition-issue: 16 | name: Convert GitHub Issue to Jira Issue 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@master 21 | 22 | - name: Login 23 | uses: atlassian/gajira-login@master 24 | env: 25 | JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }} 26 | JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }} 27 | JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }} 28 | 29 | - name: Create NEW JIRA ticket 30 | id: create 31 | uses: atlassian/gajira-create@master 32 | with: 33 | project: CONUPDATE 34 | issuetype: Task 35 | summary: | 36 | GitHub PR ND229 C1 | Repo: ${{ github.repository }} | PR# ${{github.event.number}} 37 | description: | 38 | Repo link: https://github.com/${{ github.repository }} 39 | PR no. ${{ github.event.pull_request.number }} 40 | PR title: ${{ github.event.pull_request.title }} 41 | PR description: ${{ github.event.pull_request.body }} 42 | In addition, please resolve other issues, if any. 43 | fields: '{"components": [{"name":"nd229 - Intro to Machine Learning with PyTorch"}], "customfield_16449":"https://classroom.udacity.com/", "customfield_16450":"Resolve the PR", "labels": ["github"], "priority":{"id": "4"}}' 44 | 45 | - name: Log created issue 46 | run: echo "Issue ${{ steps.create.outputs.issue }} was created" 47 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @udacity/active-public-content -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Intro to Machine Learning with PyTorch Nanodegree (nd229) 2 | 3 | Content for Udacity's Intro to Machine Learning with PyTorch Nanodegree curriculum, which includes project and lesson content. 4 | 5 | Creative Commons License<br />
This work is licensed under a Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License. Please refer to [Udacity Terms of Service](https://www.udacity.com/legal) for further information. 6 | -------------------------------------------------------------------------------- /lessons/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/.DS_Store -------------------------------------------------------------------------------- /lessons/DeepLearning/1_IntroNNs/GradientDescent.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Implementing the Gradient Descent Algorithm\n", 8 | "\n", 9 | "In this lab, we'll implement the basic functions of the Gradient Descent algorithm to find the boundary in a small dataset. First, we'll start with some functions that will help us plot and visualize the data." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "collapsed": true 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "import matplotlib.pyplot as plt\n", 21 | "import numpy as np\n", 22 | "import pandas as pd\n", 23 | "\n", 24 | "#Some helper functions for plotting and drawing lines\n", 25 | "\n", 26 | "def plot_points(X, y):\n", 27 | " admitted = X[np.argwhere(y==1)]\n", 28 | " rejected = X[np.argwhere(y==0)]\n", 29 | " plt.scatter([s[0][0] for s in rejected], [s[0][1] for s in rejected], s = 25, color = 'blue', edgecolor = 'k')\n", 30 | " plt.scatter([s[0][0] for s in admitted], [s[0][1] for s in admitted], s = 25, color = 'red', edgecolor = 'k')\n", 31 | "\n", 32 | "def display(m, b, color='g--'):\n", 33 | " plt.xlim(-0.05,1.05)\n", 34 | " plt.ylim(-0.05,1.05)\n", 35 | " x = np.arange(-10, 10, 0.1)\n", 36 | " plt.plot(x, m*x+b, color)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## Reading and plotting the data" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "collapsed": true 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "data = pd.read_csv('data.csv', header=None)\n", 55 | "X = np.array(data[[0,1]])\n", 56 | "y = np.array(data[2])\n", 57 | "plot_points(X,y)\n", 58 | "plt.show()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## TODO: Implementing the basic functions\n", 66 | "Here is your turn to shine. 
Implement the following formulas, as explained in the text.\n", 67 | "- Sigmoid activation function\n", 68 | "\n", 69 | "$$\\sigma(x) = \\frac{1}{1+e^{-x}}$$\n", 70 | "\n", 71 | "- Output (prediction) formula\n", 72 | "\n", 73 | "$$\\hat{y} = \\sigma(w_1 x_1 + w_2 x_2 + b)$$\n", 74 | "\n", 75 | "- Error function\n", 76 | "\n", 77 | "$$Error(y, \\hat{y}) = - y \\log(\\hat{y}) - (1-y) \\log(1-\\hat{y})$$\n", 78 | "\n", 79 | "- The function that updates the weights\n", 80 | "\n", 81 | "$$ w_i \\longrightarrow w_i + \\alpha (y - \\hat{y}) x_i$$\n", 82 | "\n", 83 | "$$ b \\longrightarrow b + \\alpha (y - \\hat{y})$$" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": true 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "# Implement the following functions\n", 95 | "\n", 96 | "# Activation (sigmoid) function\n", 97 | "def sigmoid(x):\n", 98 | " pass\n", 99 | "\n", 100 | "# Output (prediction) formula\n", 101 | "def output_formula(features, weights, bias):\n", 102 | " pass\n", 103 | "\n", 104 | "# Error (log-loss) formula\n", 105 | "def error_formula(y, output):\n", 106 | " pass\n", 107 | "\n", 108 | "# Gradient descent step\n", 109 | "def update_weights(x, y, weights, bias, learnrate):\n", 110 | " pass" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "## Training function\n", 118 | "This function will help us iterate the gradient descent algorithm through all the data, for a number of epochs. It will also plot the data, and some of the boundary lines obtained as we run the algorithm." 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "np.random.seed(44)\n", 130 | "\n", 131 | "epochs = 100\n", 132 | "learnrate = 0.01\n", 133 | "\n", 134 | "def train(features, targets, epochs, learnrate, graph_lines=False):\n", 135 | " \n", 136 | " errors = []\n", 137 | " n_records, n_features = features.shape\n", 138 | " last_loss = None\n", 139 | " weights = np.random.normal(scale=1 / n_features**.5, size=n_features)\n", 140 | " bias = 0\n", 141 | " for e in range(epochs):\n", 142 | " del_w = np.zeros(weights.shape)\n", 143 | " for x, y in zip(features, targets):\n", 144 | " output = output_formula(x, weights, bias)\n", 145 | " error = error_formula(y, output)\n", 146 | " weights, bias = update_weights(x, y, weights, bias, learnrate)\n", 147 | " \n", 148 | " # Printing out the log-loss error on the training set\n", 149 | " out = output_formula(features, weights, bias)\n", 150 | " loss = np.mean(error_formula(targets, out))\n", 151 | " errors.append(loss)\n", 152 | " if e % (epochs / 10) == 0:\n", 153 | " print(\"\\n========== Epoch\", e,\"==========\")\n", 154 | " if last_loss and last_loss < loss:\n", 155 | " print(\"Train loss: \", loss, \" WARNING - Loss Increasing\")\n", 156 | " else:\n", 157 | " print(\"Train loss: \", loss)\n", 158 | " last_loss = loss\n", 159 | " predictions = out > 0.5\n", 160 | " accuracy = np.mean(predictions == targets)\n", 161 | " print(\"Accuracy: \", accuracy)\n", 162 | " if graph_lines and e % (epochs / 100) == 0:\n", 163 | " display(-weights[0]/weights[1], -bias/weights[1])\n", 164 | " \n", 165 | "\n", 166 | " # Plotting the solution boundary\n", 167 | " plt.title(\"Solution boundary\")\n", 168 | " display(-weights[0]/weights[1], -bias/weights[1], 'black')\n", 169 | "\n", 170 | " # Plotting the data\n", 171 | " plot_points(features, 
targets)\n", 172 | " plt.show()\n", 173 | "\n", 174 | " # Plotting the error\n", 175 | " plt.title(\"Error Plot\")\n", 176 | " plt.xlabel('Number of epochs')\n", 177 | " plt.ylabel('Error')\n", 178 | " plt.plot(errors)\n", 179 | " plt.show()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "## Time to train the algorithm!\n", 187 | "When we run the function, we'll obtain the following:\n", 188 | "- 10 updates with the current training loss and accuracy\n", 189 | "- A plot of the data and some of the boundary lines obtained. The final one is in black. Notice how the lines get closer and closer to the best fit, as we go through more epochs.\n", 190 | "- A plot of the error function. Notice how it decreases as we go through more epochs." 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": { 197 | "collapsed": true 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "train(X, y, epochs, learnrate, True)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": { 208 | "collapsed": true 209 | }, 210 | "outputs": [], 211 | "source": [] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": true 218 | }, 219 | "outputs": [], 220 | "source": [] 221 | } 222 | ], 223 | "metadata": { 224 | "kernelspec": { 225 | "display_name": "Python 3", 226 | "language": "python", 227 | "name": "python3" 228 | }, 229 | "language_info": { 230 | "codemirror_mode": { 231 | "name": "ipython", 232 | "version": 3 233 | }, 234 | "file_extension": ".py", 235 | "mimetype": "text/x-python", 236 | "name": "python", 237 | "nbconvert_exporter": "python", 238 | "pygments_lexer": "ipython3", 239 | "version": "3.6.1" 240 | } 241 | }, 242 | "nbformat": 4, 243 | "nbformat_minor": 2 244 | } 245 | -------------------------------------------------------------------------------- /lessons/DeepLearning/1_IntroNNs/GradientDescentSolutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Solutions" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "# Activation (sigmoid) function\n", 19 | "def sigmoid(x):\n", 20 | " return 1 / (1 + np.exp(-x))\n", 21 | "\n", 22 | "def output_formula(features, weights, bias):\n", 23 | " return sigmoid(np.dot(features, weights) + bias)\n", 24 | "\n", 25 | "def error_formula(y, output):\n", 26 | " return - y*np.log(output) - (1 - y) * np.log(1-output)\n", 27 | "\n", 28 | "def update_weights(x, y, weights, bias, learnrate):\n", 29 | " output = output_formula(x, weights, bias)\n", 30 | " d_error = -(y - output)\n", 31 | " weights -= learnrate * d_error * x\n", 32 | " bias -= learnrate * d_error\n", 33 | " return weights, bias" 34 | ] 35 | } 36 | ], 37 | "metadata": { 38 | "kernelspec": { 39 | "display_name": "Python 3", 40 | "language": "python", 41 | "name": "python3" 42 | }, 43 | "language_info": { 44 | "codemirror_mode": { 45 | "name": "ipython", 46 | "version": 3 47 | }, 48 | "file_extension": ".py", 49 | "mimetype": "text/x-python", 50 | "name": "python", 51 | "nbconvert_exporter": "python", 52 | "pygments_lexer": "ipython3", 53 | "version": "3.6.1" 54 | } 55 | }, 56 | "nbformat": 4, 57 | "nbformat_minor": 2 58 | } 59 | 
-------------------------------------------------------------------------------- /lessons/DeepLearning/1_IntroNNs/StudentAdmissionsSolutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Solutions" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### One-hot encoding the rank" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "# Make dummy variables for rank\n", 26 | "one_hot_data = pd.concat([data, pd.get_dummies(data['rank'], prefix='rank')], axis=1)\n", 27 | "\n", 28 | "# Drop the previous rank column\n", 29 | "one_hot_data = one_hot_data.drop('rank', axis=1)\n", 30 | "\n", 31 | "# Print the first 10 rows of our data\n", 32 | "one_hot_data[:10]" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "### Scaling the data" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "# Copying our data\n", 51 | "processed_data = one_hot_data[:]\n", 52 | "\n", 53 | "# Scaling the columns\n", 54 | "processed_data['gre'] = processed_data['gre']/800\n", 55 | "processed_data['gpa'] = processed_data['gpa']/4.0\n", 56 | "processed_data[:10]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### Backpropagating the data" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "collapsed": true 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "def error_term_formula(y, output):\n", 75 | " return (y-output) * output * (1 - output)" 76 | ] 77 | } 78 | ], 79 | "metadata": { 80 | "kernelspec": { 81 | "display_name": "Python 3", 82 | "language": "python", 83 | "name": "python3" 84 | }, 85 | "language_info": { 86 | "codemirror_mode": { 87 | "name": "ipython", 88 | "version": 3 89 | }, 90 | "file_extension": ".py", 91 | "mimetype": "text/x-python", 92 | "name": "python", 93 | "nbconvert_exporter": "python", 94 | "pygments_lexer": "ipython3", 95 | "version": "3.6.1" 96 | } 97 | }, 98 | "nbformat": 4, 99 | "nbformat_minor": 2 100 | } 101 | -------------------------------------------------------------------------------- /lessons/DeepLearning/1_IntroNNs/data.csv: -------------------------------------------------------------------------------- 1 | 0.78051,-0.063669,1 2 | 0.28774,0.29139,1 3 | 0.40714,0.17878,1 4 | 0.2923,0.4217,1 5 | 0.50922,0.35256,1 6 | 0.27785,0.10802,1 7 | 0.27527,0.33223,1 8 | 0.43999,0.31245,1 9 | 0.33557,0.42984,1 10 | 0.23448,0.24986,1 11 | 0.0084492,0.13658,1 12 | 0.12419,0.33595,1 13 | 0.25644,0.42624,1 14 | 0.4591,0.40426,1 15 | 0.44547,0.45117,1 16 | 0.42218,0.20118,1 17 | 0.49563,0.21445,1 18 | 0.30848,0.24306,1 19 | 0.39707,0.44438,1 20 | 0.32945,0.39217,1 21 | 0.40739,0.40271,1 22 | 0.3106,0.50702,1 23 | 0.49638,0.45384,1 24 | 0.10073,0.32053,1 25 | 0.69907,0.37307,1 26 | 0.29767,0.69648,1 27 | 0.15099,0.57341,1 28 | 0.16427,0.27759,1 29 | 0.33259,0.055964,1 30 | 0.53741,0.28637,1 31 | 0.19503,0.36879,1 32 | 0.40278,0.035148,1 33 | 0.21296,0.55169,1 34 | 0.48447,0.56991,1 35 | 0.25476,0.34596,1 36 | 0.21726,0.28641,1 37 | 0.67078,0.46538,1 38 | 0.3815,0.4622,1 39 | 0.53838,0.32774,1 40 | 0.4849,0.26071,1 41 | 0.37095,0.38809,1 42 | 
0.54527,0.63911,1 43 | 0.32149,0.12007,1 44 | 0.42216,0.61666,1 45 | 0.10194,0.060408,1 46 | 0.15254,0.2168,1 47 | 0.45558,0.43769,1 48 | 0.28488,0.52142,1 49 | 0.27633,0.21264,1 50 | 0.39748,0.31902,1 51 | 0.5533,1,0 52 | 0.44274,0.59205,0 53 | 0.85176,0.6612,0 54 | 0.60436,0.86605,0 55 | 0.68243,0.48301,0 56 | 1,0.76815,0 57 | 0.72989,0.8107,0 58 | 0.67377,0.77975,0 59 | 0.78761,0.58177,0 60 | 0.71442,0.7668,0 61 | 0.49379,0.54226,0 62 | 0.78974,0.74233,0 63 | 0.67905,0.60921,0 64 | 0.6642,0.72519,0 65 | 0.79396,0.56789,0 66 | 0.70758,0.76022,0 67 | 0.59421,0.61857,0 68 | 0.49364,0.56224,0 69 | 0.77707,0.35025,0 70 | 0.79785,0.76921,0 71 | 0.70876,0.96764,0 72 | 0.69176,0.60865,0 73 | 0.66408,0.92075,0 74 | 0.65973,0.66666,0 75 | 0.64574,0.56845,0 76 | 0.89639,0.7085,0 77 | 0.85476,0.63167,0 78 | 0.62091,0.80424,0 79 | 0.79057,0.56108,0 80 | 0.58935,0.71582,0 81 | 0.56846,0.7406,0 82 | 0.65912,0.71548,0 83 | 0.70938,0.74041,0 84 | 0.59154,0.62927,0 85 | 0.45829,0.4641,0 86 | 0.79982,0.74847,0 87 | 0.60974,0.54757,0 88 | 0.68127,0.86985,0 89 | 0.76694,0.64736,0 90 | 0.69048,0.83058,0 91 | 0.68122,0.96541,0 92 | 0.73229,0.64245,0 93 | 0.76145,0.60138,0 94 | 0.58985,0.86955,0 95 | 0.73145,0.74516,0 96 | 0.77029,0.7014,0 97 | 0.73156,0.71782,0 98 | 0.44556,0.57991,0 99 | 0.85275,0.85987,0 100 | 0.51912,0.62359,0 101 | -------------------------------------------------------------------------------- /lessons/DeepLearning/1_IntroNNs/student_data.csv: -------------------------------------------------------------------------------- 1 | admit,gre,gpa,rank 2 | 0,380,3.61,3 3 | 1,660,3.67,3 4 | 1,800,4,1 5 | 1,640,3.19,4 6 | 0,520,2.93,4 7 | 1,760,3,2 8 | 1,560,2.98,1 9 | 0,400,3.08,2 10 | 1,540,3.39,3 11 | 0,700,3.92,2 12 | 0,800,4,4 13 | 0,440,3.22,1 14 | 1,760,4,1 15 | 0,700,3.08,2 16 | 1,700,4,1 17 | 0,480,3.44,3 18 | 0,780,3.87,4 19 | 0,360,2.56,3 20 | 0,800,3.75,2 21 | 1,540,3.81,1 22 | 0,500,3.17,3 23 | 1,660,3.63,2 24 | 0,600,2.82,4 25 | 0,680,3.19,4 26 | 1,760,3.35,2 27 | 1,800,3.66,1 28 | 1,620,3.61,1 29 | 1,520,3.74,4 30 | 1,780,3.22,2 31 | 0,520,3.29,1 32 | 0,540,3.78,4 33 | 0,760,3.35,3 34 | 0,600,3.4,3 35 | 1,800,4,3 36 | 0,360,3.14,1 37 | 0,400,3.05,2 38 | 0,580,3.25,1 39 | 0,520,2.9,3 40 | 1,500,3.13,2 41 | 1,520,2.68,3 42 | 0,560,2.42,2 43 | 1,580,3.32,2 44 | 1,600,3.15,2 45 | 0,500,3.31,3 46 | 0,700,2.94,2 47 | 1,460,3.45,3 48 | 1,580,3.46,2 49 | 0,500,2.97,4 50 | 0,440,2.48,4 51 | 0,400,3.35,3 52 | 0,640,3.86,3 53 | 0,440,3.13,4 54 | 0,740,3.37,4 55 | 1,680,3.27,2 56 | 0,660,3.34,3 57 | 1,740,4,3 58 | 0,560,3.19,3 59 | 0,380,2.94,3 60 | 0,400,3.65,2 61 | 0,600,2.82,4 62 | 1,620,3.18,2 63 | 0,560,3.32,4 64 | 0,640,3.67,3 65 | 1,680,3.85,3 66 | 0,580,4,3 67 | 0,600,3.59,2 68 | 0,740,3.62,4 69 | 0,620,3.3,1 70 | 0,580,3.69,1 71 | 0,800,3.73,1 72 | 0,640,4,3 73 | 0,300,2.92,4 74 | 0,480,3.39,4 75 | 0,580,4,2 76 | 0,720,3.45,4 77 | 0,720,4,3 78 | 0,560,3.36,3 79 | 1,800,4,3 80 | 0,540,3.12,1 81 | 1,620,4,1 82 | 0,700,2.9,4 83 | 0,620,3.07,2 84 | 0,500,2.71,2 85 | 0,380,2.91,4 86 | 1,500,3.6,3 87 | 0,520,2.98,2 88 | 0,600,3.32,2 89 | 0,600,3.48,2 90 | 0,700,3.28,1 91 | 1,660,4,2 92 | 0,700,3.83,2 93 | 1,720,3.64,1 94 | 0,800,3.9,2 95 | 0,580,2.93,2 96 | 1,660,3.44,2 97 | 0,660,3.33,2 98 | 0,640,3.52,4 99 | 0,480,3.57,2 100 | 0,700,2.88,2 101 | 0,400,3.31,3 102 | 0,340,3.15,3 103 | 0,580,3.57,3 104 | 0,380,3.33,4 105 | 0,540,3.94,3 106 | 1,660,3.95,2 107 | 1,740,2.97,2 108 | 1,700,3.56,1 109 | 0,480,3.13,2 110 | 0,400,2.93,3 111 | 0,480,3.45,2 112 | 0,680,3.08,4 113 | 0,420,3.41,4 114 | 
0,360,3,3 115 | 0,600,3.22,1 116 | 0,720,3.84,3 117 | 0,620,3.99,3 118 | 1,440,3.45,2 119 | 0,700,3.72,2 120 | 1,800,3.7,1 121 | 0,340,2.92,3 122 | 1,520,3.74,2 123 | 1,480,2.67,2 124 | 0,520,2.85,3 125 | 0,500,2.98,3 126 | 0,720,3.88,3 127 | 0,540,3.38,4 128 | 1,600,3.54,1 129 | 0,740,3.74,4 130 | 0,540,3.19,2 131 | 0,460,3.15,4 132 | 1,620,3.17,2 133 | 0,640,2.79,2 134 | 0,580,3.4,2 135 | 0,500,3.08,3 136 | 0,560,2.95,2 137 | 0,500,3.57,3 138 | 0,560,3.33,4 139 | 0,700,4,3 140 | 0,620,3.4,2 141 | 1,600,3.58,1 142 | 0,640,3.93,2 143 | 1,700,3.52,4 144 | 0,620,3.94,4 145 | 0,580,3.4,3 146 | 0,580,3.4,4 147 | 0,380,3.43,3 148 | 0,480,3.4,2 149 | 0,560,2.71,3 150 | 1,480,2.91,1 151 | 0,740,3.31,1 152 | 1,800,3.74,1 153 | 0,400,3.38,2 154 | 1,640,3.94,2 155 | 0,580,3.46,3 156 | 0,620,3.69,3 157 | 1,580,2.86,4 158 | 0,560,2.52,2 159 | 1,480,3.58,1 160 | 0,660,3.49,2 161 | 0,700,3.82,3 162 | 0,600,3.13,2 163 | 0,640,3.5,2 164 | 1,700,3.56,2 165 | 0,520,2.73,2 166 | 0,580,3.3,2 167 | 0,700,4,1 168 | 0,440,3.24,4 169 | 0,720,3.77,3 170 | 0,500,4,3 171 | 0,600,3.62,3 172 | 0,400,3.51,3 173 | 0,540,2.81,3 174 | 0,680,3.48,3 175 | 1,800,3.43,2 176 | 0,500,3.53,4 177 | 1,620,3.37,2 178 | 0,520,2.62,2 179 | 1,620,3.23,3 180 | 0,620,3.33,3 181 | 0,300,3.01,3 182 | 0,620,3.78,3 183 | 0,500,3.88,4 184 | 0,700,4,2 185 | 1,540,3.84,2 186 | 0,500,2.79,4 187 | 0,800,3.6,2 188 | 0,560,3.61,3 189 | 0,580,2.88,2 190 | 0,560,3.07,2 191 | 0,500,3.35,2 192 | 1,640,2.94,2 193 | 0,800,3.54,3 194 | 0,640,3.76,3 195 | 0,380,3.59,4 196 | 1,600,3.47,2 197 | 0,560,3.59,2 198 | 0,660,3.07,3 199 | 1,400,3.23,4 200 | 0,600,3.63,3 201 | 0,580,3.77,4 202 | 0,800,3.31,3 203 | 1,580,3.2,2 204 | 1,700,4,1 205 | 0,420,3.92,4 206 | 1,600,3.89,1 207 | 1,780,3.8,3 208 | 0,740,3.54,1 209 | 1,640,3.63,1 210 | 0,540,3.16,3 211 | 0,580,3.5,2 212 | 0,740,3.34,4 213 | 0,580,3.02,2 214 | 0,460,2.87,2 215 | 0,640,3.38,3 216 | 1,600,3.56,2 217 | 1,660,2.91,3 218 | 0,340,2.9,1 219 | 1,460,3.64,1 220 | 0,460,2.98,1 221 | 1,560,3.59,2 222 | 0,540,3.28,3 223 | 0,680,3.99,3 224 | 1,480,3.02,1 225 | 0,800,3.47,3 226 | 0,800,2.9,2 227 | 1,720,3.5,3 228 | 0,620,3.58,2 229 | 0,540,3.02,4 230 | 0,480,3.43,2 231 | 1,720,3.42,2 232 | 0,580,3.29,4 233 | 0,600,3.28,3 234 | 0,380,3.38,2 235 | 0,420,2.67,3 236 | 1,800,3.53,1 237 | 0,620,3.05,2 238 | 1,660,3.49,2 239 | 0,480,4,2 240 | 0,500,2.86,4 241 | 0,700,3.45,3 242 | 0,440,2.76,2 243 | 1,520,3.81,1 244 | 1,680,2.96,3 245 | 0,620,3.22,2 246 | 0,540,3.04,1 247 | 0,800,3.91,3 248 | 0,680,3.34,2 249 | 0,440,3.17,2 250 | 0,680,3.64,3 251 | 0,640,3.73,3 252 | 0,660,3.31,4 253 | 0,620,3.21,4 254 | 1,520,4,2 255 | 1,540,3.55,4 256 | 1,740,3.52,4 257 | 0,640,3.35,3 258 | 1,520,3.3,2 259 | 1,620,3.95,3 260 | 0,520,3.51,2 261 | 0,640,3.81,2 262 | 0,680,3.11,2 263 | 0,440,3.15,2 264 | 1,520,3.19,3 265 | 1,620,3.95,3 266 | 1,520,3.9,3 267 | 0,380,3.34,3 268 | 0,560,3.24,4 269 | 1,600,3.64,3 270 | 1,680,3.46,2 271 | 0,500,2.81,3 272 | 1,640,3.95,2 273 | 0,540,3.33,3 274 | 1,680,3.67,2 275 | 0,660,3.32,1 276 | 0,520,3.12,2 277 | 1,600,2.98,2 278 | 0,460,3.77,3 279 | 1,580,3.58,1 280 | 1,680,3,4 281 | 1,660,3.14,2 282 | 0,660,3.94,2 283 | 0,360,3.27,3 284 | 0,660,3.45,4 285 | 0,520,3.1,4 286 | 1,440,3.39,2 287 | 0,600,3.31,4 288 | 1,800,3.22,1 289 | 1,660,3.7,4 290 | 0,800,3.15,4 291 | 0,420,2.26,4 292 | 1,620,3.45,2 293 | 0,800,2.78,2 294 | 0,680,3.7,2 295 | 0,800,3.97,1 296 | 0,480,2.55,1 297 | 0,520,3.25,3 298 | 0,560,3.16,1 299 | 0,460,3.07,2 300 | 0,540,3.5,2 301 | 0,720,3.4,3 302 | 0,640,3.3,2 303 | 1,660,3.6,3 
304 | 1,400,3.15,2 305 | 1,680,3.98,2 306 | 0,220,2.83,3 307 | 0,580,3.46,4 308 | 1,540,3.17,1 309 | 0,580,3.51,2 310 | 0,540,3.13,2 311 | 0,440,2.98,3 312 | 0,560,4,3 313 | 0,660,3.67,2 314 | 0,660,3.77,3 315 | 1,520,3.65,4 316 | 0,540,3.46,4 317 | 1,300,2.84,2 318 | 1,340,3,2 319 | 1,780,3.63,4 320 | 1,480,3.71,4 321 | 0,540,3.28,1 322 | 0,460,3.14,3 323 | 0,460,3.58,2 324 | 0,500,3.01,4 325 | 0,420,2.69,2 326 | 0,520,2.7,3 327 | 0,680,3.9,1 328 | 0,680,3.31,2 329 | 1,560,3.48,2 330 | 0,580,3.34,2 331 | 0,500,2.93,4 332 | 0,740,4,3 333 | 0,660,3.59,3 334 | 0,420,2.96,1 335 | 0,560,3.43,3 336 | 1,460,3.64,3 337 | 1,620,3.71,1 338 | 0,520,3.15,3 339 | 0,620,3.09,4 340 | 0,540,3.2,1 341 | 1,660,3.47,3 342 | 0,500,3.23,4 343 | 1,560,2.65,3 344 | 0,500,3.95,4 345 | 0,580,3.06,2 346 | 0,520,3.35,3 347 | 0,500,3.03,3 348 | 0,600,3.35,2 349 | 0,580,3.8,2 350 | 0,400,3.36,2 351 | 0,620,2.85,2 352 | 1,780,4,2 353 | 0,620,3.43,3 354 | 1,580,3.12,3 355 | 0,700,3.52,2 356 | 1,540,3.78,2 357 | 1,760,2.81,1 358 | 0,700,3.27,2 359 | 0,720,3.31,1 360 | 1,560,3.69,3 361 | 0,720,3.94,3 362 | 1,520,4,1 363 | 1,540,3.49,1 364 | 0,680,3.14,2 365 | 0,460,3.44,2 366 | 1,560,3.36,1 367 | 0,480,2.78,3 368 | 0,460,2.93,3 369 | 0,620,3.63,3 370 | 0,580,4,1 371 | 0,800,3.89,2 372 | 1,540,3.77,2 373 | 1,680,3.76,3 374 | 1,680,2.42,1 375 | 1,620,3.37,1 376 | 0,560,3.78,2 377 | 0,560,3.49,4 378 | 0,620,3.63,2 379 | 1,800,4,2 380 | 0,640,3.12,3 381 | 0,540,2.7,2 382 | 0,700,3.65,2 383 | 1,540,3.49,2 384 | 0,540,3.51,2 385 | 0,660,4,1 386 | 1,480,2.62,2 387 | 0,420,3.02,1 388 | 1,740,3.86,2 389 | 0,580,3.36,2 390 | 0,640,3.17,2 391 | 0,640,3.51,2 392 | 1,800,3.05,2 393 | 1,660,3.88,2 394 | 1,600,3.38,3 395 | 1,620,3.75,2 396 | 1,460,3.99,3 397 | 0,620,4,2 398 | 0,560,3.04,3 399 | 0,460,2.63,2 400 | 0,700,3.65,2 401 | 0,600,3.89,3 402 | -------------------------------------------------------------------------------- /lessons/DeepLearning/2_Keras/IMDB_In_Keras.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Analyzing IMDB Data in Keras" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# Imports\n", 17 | "import numpy as np\n", 18 | "import keras\n", 19 | "from keras.datasets import imdb\n", 20 | "from keras.models import Sequential\n", 21 | "from keras.layers import Dense, Dropout, Activation\n", 22 | "from keras.preprocessing.text import Tokenizer\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "%matplotlib inline\n", 25 | "\n", 26 | "np.random.seed(42)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## 1. Loading the data\n", 34 | "This dataset comes preloaded with Keras, so one simple command will get us training and testing data. There is a parameter for how many words we want to look at. We've set it at 1000, but feel free to experiment." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# Loading the data (it's preloaded in Keras)\n", 44 | "(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=1000)\n", 45 | "\n", 46 | "print(x_train.shape)\n", 47 | "print(x_test.shape)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## 2. 
Examining the data\n", 55 | "Notice that the data has been already pre-processed, where all the words have numbers, and the reviews come in as a vector with the words that the review contains. For example, if the word 'the' is the first one in our dictionary, and a review contains the word 'the', then there is a 1 in the corresponding vector.\n", 56 | "\n", 57 | "The output comes as a vector of 1's and 0's, where 1 is a positive sentiment for the review, and 0 is negative." 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "print(x_train[0])\n", 67 | "print(y_train[0])" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "## 3. One-hot encoding the output\n", 75 | "Here, we'll turn the input vectors into (0,1)-vectors. For example, if the pre-processed vector contains the number 14, then in the processed vector, the 14th entry will be 1." 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "# One-hot encoding the output into vector mode, each of length 1000\n", 85 | "tokenizer = Tokenizer(num_words=1000)\n", 86 | "x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')\n", 87 | "x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')\n", 88 | "print(x_train[0])" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "And we'll also one-hot encode the output." 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "# One-hot encoding the output\n", 105 | "num_classes = 2\n", 106 | "y_train = keras.utils.to_categorical(y_train, num_classes)\n", 107 | "y_test = keras.utils.to_categorical(y_test, num_classes)\n", 108 | "print(y_train.shape)\n", 109 | "print(y_test.shape)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "## 4. Building the model architecture\n", 117 | "Build a model here using sequential. Feel free to experiment with different layers and sizes! Also, experiment adding dropout to reduce overfitting." 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "# TODO: Build the model architecture\n", 127 | "\n", 128 | "# TODO: Compile the model using a loss function and an optimizer.\n" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "## 5. Training the model\n", 136 | "Run the model here. Experiment with different batch_size, and number of epochs!" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# TODO: Run the model. Feel free to experiment with different batch sizes and number of epochs." 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "## 6. Evaluating the model\n", 153 | "This will give you the accuracy of the model, as evaluated on the testing set. Can you get something over 85%?" 
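As a reference for sections 4–6, one possible end-to-end setup is sketched below. It mirrors the solution notebook that follows; the layer size, dropout rate, batch size, and epoch count are illustrative choices rather than requirements, and it reuses the `x_train`, `y_train`, `x_test`, `y_test`, and `num_classes` variables defined above.

```python
from keras.models import Sequential
from keras.layers import Dense, Dropout

# Build: one ReLU hidden layer with dropout, softmax output over the two classes
model = Sequential()
model.add(Dense(512, activation='relu', input_dim=1000))  # 1000 matches num_words above
model.add(Dropout(0.5))                                   # dropout to reduce overfitting
model.add(Dense(num_classes, activation='softmax'))

# Compile with a loss function and an optimizer
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# Train, watching the test set as validation data
model.fit(x_train, y_train, batch_size=32, epochs=10,
          validation_data=(x_test, y_test), verbose=2)

# Evaluate on the held-out test set
score = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy:", score[1])
```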
154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "score = model.evaluate(x_test, y_test, verbose=0)\n", 163 | "print(\"Accuracy: \", score[1])" 164 | ] 165 | } 166 | ], 167 | "metadata": { 168 | "kernelspec": { 169 | "display_name": "Python 3", 170 | "language": "python", 171 | "name": "python3" 172 | }, 173 | "language_info": { 174 | "codemirror_mode": { 175 | "name": "ipython", 176 | "version": 3 177 | }, 178 | "file_extension": ".py", 179 | "mimetype": "text/x-python", 180 | "name": "python", 181 | "nbconvert_exporter": "python", 182 | "pygments_lexer": "ipython3", 183 | "version": "3.6.3" 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 2 188 | } 189 | -------------------------------------------------------------------------------- /lessons/DeepLearning/2_Keras/IMDB_In_Keras_Solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Analyzing IMDB Data in Keras - Solution" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 4. Building the model architecture\n", 15 | "Build a model here using sequential. Feel free to experiment with different layers and sizes! Also, experiment adding dropout to reduce overfitting." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "# Building the model architecture with one layer of length 100\n", 25 | "model = Sequential()\n", 26 | "model.add(Dense(512, activation='relu', input_dim=1000))\n", 27 | "model.add(Dropout(0.5))\n", 28 | "model.add(Dense(num_classes, activation='softmax'))\n", 29 | "model.summary()\n", 30 | "\n", 31 | "# Compiling the model using categorical_crossentropy loss, and rmsprop optimizer.\n", 32 | "model.compile(loss='categorical_crossentropy',\n", 33 | " optimizer='rmsprop',\n", 34 | " metrics=['accuracy'])" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "## 5. Training the model\n", 42 | "Run the model here. Experiment with different batch_size, and number of epochs!" 
43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "# Running and evaluating the model\n", 52 | "hist = model.fit(x_train, y_train,\n", 53 | " batch_size=32,\n", 54 | " epochs=10,\n", 55 | " validation_data=(x_test, y_test), \n", 56 | " verbose=2)" 57 | ] 58 | } 59 | ], 60 | "metadata": { 61 | "kernelspec": { 62 | "display_name": "Python 3", 63 | "language": "python", 64 | "name": "python3" 65 | }, 66 | "language_info": { 67 | "codemirror_mode": { 68 | "name": "ipython", 69 | "version": 3 70 | }, 71 | "file_extension": ".py", 72 | "mimetype": "text/x-python", 73 | "name": "python", 74 | "nbconvert_exporter": "python", 75 | "pygments_lexer": "ipython3", 76 | "version": "3.6.1" 77 | } 78 | }, 79 | "nbformat": 4, 80 | "nbformat_minor": 2 81 | } 82 | -------------------------------------------------------------------------------- /lessons/DeepLearning/2_Keras/student_data.csv: -------------------------------------------------------------------------------- 1 | admit,gre,gpa,rank 2 | 0,380,3.61,3 3 | 1,660,3.67,3 4 | 1,800,4,1 5 | 1,640,3.19,4 6 | 0,520,2.93,4 7 | 1,760,3,2 8 | 1,560,2.98,1 9 | 0,400,3.08,2 10 | 1,540,3.39,3 11 | 0,700,3.92,2 12 | 0,800,4,4 13 | 0,440,3.22,1 14 | 1,760,4,1 15 | 0,700,3.08,2 16 | 1,700,4,1 17 | 0,480,3.44,3 18 | 0,780,3.87,4 19 | 0,360,2.56,3 20 | 0,800,3.75,2 21 | 1,540,3.81,1 22 | 0,500,3.17,3 23 | 1,660,3.63,2 24 | 0,600,2.82,4 25 | 0,680,3.19,4 26 | 1,760,3.35,2 27 | 1,800,3.66,1 28 | 1,620,3.61,1 29 | 1,520,3.74,4 30 | 1,780,3.22,2 31 | 0,520,3.29,1 32 | 0,540,3.78,4 33 | 0,760,3.35,3 34 | 0,600,3.4,3 35 | 1,800,4,3 36 | 0,360,3.14,1 37 | 0,400,3.05,2 38 | 0,580,3.25,1 39 | 0,520,2.9,3 40 | 1,500,3.13,2 41 | 1,520,2.68,3 42 | 0,560,2.42,2 43 | 1,580,3.32,2 44 | 1,600,3.15,2 45 | 0,500,3.31,3 46 | 0,700,2.94,2 47 | 1,460,3.45,3 48 | 1,580,3.46,2 49 | 0,500,2.97,4 50 | 0,440,2.48,4 51 | 0,400,3.35,3 52 | 0,640,3.86,3 53 | 0,440,3.13,4 54 | 0,740,3.37,4 55 | 1,680,3.27,2 56 | 0,660,3.34,3 57 | 1,740,4,3 58 | 0,560,3.19,3 59 | 0,380,2.94,3 60 | 0,400,3.65,2 61 | 0,600,2.82,4 62 | 1,620,3.18,2 63 | 0,560,3.32,4 64 | 0,640,3.67,3 65 | 1,680,3.85,3 66 | 0,580,4,3 67 | 0,600,3.59,2 68 | 0,740,3.62,4 69 | 0,620,3.3,1 70 | 0,580,3.69,1 71 | 0,800,3.73,1 72 | 0,640,4,3 73 | 0,300,2.92,4 74 | 0,480,3.39,4 75 | 0,580,4,2 76 | 0,720,3.45,4 77 | 0,720,4,3 78 | 0,560,3.36,3 79 | 1,800,4,3 80 | 0,540,3.12,1 81 | 1,620,4,1 82 | 0,700,2.9,4 83 | 0,620,3.07,2 84 | 0,500,2.71,2 85 | 0,380,2.91,4 86 | 1,500,3.6,3 87 | 0,520,2.98,2 88 | 0,600,3.32,2 89 | 0,600,3.48,2 90 | 0,700,3.28,1 91 | 1,660,4,2 92 | 0,700,3.83,2 93 | 1,720,3.64,1 94 | 0,800,3.9,2 95 | 0,580,2.93,2 96 | 1,660,3.44,2 97 | 0,660,3.33,2 98 | 0,640,3.52,4 99 | 0,480,3.57,2 100 | 0,700,2.88,2 101 | 0,400,3.31,3 102 | 0,340,3.15,3 103 | 0,580,3.57,3 104 | 0,380,3.33,4 105 | 0,540,3.94,3 106 | 1,660,3.95,2 107 | 1,740,2.97,2 108 | 1,700,3.56,1 109 | 0,480,3.13,2 110 | 0,400,2.93,3 111 | 0,480,3.45,2 112 | 0,680,3.08,4 113 | 0,420,3.41,4 114 | 0,360,3,3 115 | 0,600,3.22,1 116 | 0,720,3.84,3 117 | 0,620,3.99,3 118 | 1,440,3.45,2 119 | 0,700,3.72,2 120 | 1,800,3.7,1 121 | 0,340,2.92,3 122 | 1,520,3.74,2 123 | 1,480,2.67,2 124 | 0,520,2.85,3 125 | 0,500,2.98,3 126 | 0,720,3.88,3 127 | 0,540,3.38,4 128 | 1,600,3.54,1 129 | 0,740,3.74,4 130 | 0,540,3.19,2 131 | 0,460,3.15,4 132 | 1,620,3.17,2 133 | 0,640,2.79,2 134 | 0,580,3.4,2 135 | 0,500,3.08,3 136 | 0,560,2.95,2 137 | 0,500,3.57,3 138 | 0,560,3.33,4 139 | 0,700,4,3 140 | 
0,620,3.4,2 141 | 1,600,3.58,1 142 | 0,640,3.93,2 143 | 1,700,3.52,4 144 | 0,620,3.94,4 145 | 0,580,3.4,3 146 | 0,580,3.4,4 147 | 0,380,3.43,3 148 | 0,480,3.4,2 149 | 0,560,2.71,3 150 | 1,480,2.91,1 151 | 0,740,3.31,1 152 | 1,800,3.74,1 153 | 0,400,3.38,2 154 | 1,640,3.94,2 155 | 0,580,3.46,3 156 | 0,620,3.69,3 157 | 1,580,2.86,4 158 | 0,560,2.52,2 159 | 1,480,3.58,1 160 | 0,660,3.49,2 161 | 0,700,3.82,3 162 | 0,600,3.13,2 163 | 0,640,3.5,2 164 | 1,700,3.56,2 165 | 0,520,2.73,2 166 | 0,580,3.3,2 167 | 0,700,4,1 168 | 0,440,3.24,4 169 | 0,720,3.77,3 170 | 0,500,4,3 171 | 0,600,3.62,3 172 | 0,400,3.51,3 173 | 0,540,2.81,3 174 | 0,680,3.48,3 175 | 1,800,3.43,2 176 | 0,500,3.53,4 177 | 1,620,3.37,2 178 | 0,520,2.62,2 179 | 1,620,3.23,3 180 | 0,620,3.33,3 181 | 0,300,3.01,3 182 | 0,620,3.78,3 183 | 0,500,3.88,4 184 | 0,700,4,2 185 | 1,540,3.84,2 186 | 0,500,2.79,4 187 | 0,800,3.6,2 188 | 0,560,3.61,3 189 | 0,580,2.88,2 190 | 0,560,3.07,2 191 | 0,500,3.35,2 192 | 1,640,2.94,2 193 | 0,800,3.54,3 194 | 0,640,3.76,3 195 | 0,380,3.59,4 196 | 1,600,3.47,2 197 | 0,560,3.59,2 198 | 0,660,3.07,3 199 | 1,400,3.23,4 200 | 0,600,3.63,3 201 | 0,580,3.77,4 202 | 0,800,3.31,3 203 | 1,580,3.2,2 204 | 1,700,4,1 205 | 0,420,3.92,4 206 | 1,600,3.89,1 207 | 1,780,3.8,3 208 | 0,740,3.54,1 209 | 1,640,3.63,1 210 | 0,540,3.16,3 211 | 0,580,3.5,2 212 | 0,740,3.34,4 213 | 0,580,3.02,2 214 | 0,460,2.87,2 215 | 0,640,3.38,3 216 | 1,600,3.56,2 217 | 1,660,2.91,3 218 | 0,340,2.9,1 219 | 1,460,3.64,1 220 | 0,460,2.98,1 221 | 1,560,3.59,2 222 | 0,540,3.28,3 223 | 0,680,3.99,3 224 | 1,480,3.02,1 225 | 0,800,3.47,3 226 | 0,800,2.9,2 227 | 1,720,3.5,3 228 | 0,620,3.58,2 229 | 0,540,3.02,4 230 | 0,480,3.43,2 231 | 1,720,3.42,2 232 | 0,580,3.29,4 233 | 0,600,3.28,3 234 | 0,380,3.38,2 235 | 0,420,2.67,3 236 | 1,800,3.53,1 237 | 0,620,3.05,2 238 | 1,660,3.49,2 239 | 0,480,4,2 240 | 0,500,2.86,4 241 | 0,700,3.45,3 242 | 0,440,2.76,2 243 | 1,520,3.81,1 244 | 1,680,2.96,3 245 | 0,620,3.22,2 246 | 0,540,3.04,1 247 | 0,800,3.91,3 248 | 0,680,3.34,2 249 | 0,440,3.17,2 250 | 0,680,3.64,3 251 | 0,640,3.73,3 252 | 0,660,3.31,4 253 | 0,620,3.21,4 254 | 1,520,4,2 255 | 1,540,3.55,4 256 | 1,740,3.52,4 257 | 0,640,3.35,3 258 | 1,520,3.3,2 259 | 1,620,3.95,3 260 | 0,520,3.51,2 261 | 0,640,3.81,2 262 | 0,680,3.11,2 263 | 0,440,3.15,2 264 | 1,520,3.19,3 265 | 1,620,3.95,3 266 | 1,520,3.9,3 267 | 0,380,3.34,3 268 | 0,560,3.24,4 269 | 1,600,3.64,3 270 | 1,680,3.46,2 271 | 0,500,2.81,3 272 | 1,640,3.95,2 273 | 0,540,3.33,3 274 | 1,680,3.67,2 275 | 0,660,3.32,1 276 | 0,520,3.12,2 277 | 1,600,2.98,2 278 | 0,460,3.77,3 279 | 1,580,3.58,1 280 | 1,680,3,4 281 | 1,660,3.14,2 282 | 0,660,3.94,2 283 | 0,360,3.27,3 284 | 0,660,3.45,4 285 | 0,520,3.1,4 286 | 1,440,3.39,2 287 | 0,600,3.31,4 288 | 1,800,3.22,1 289 | 1,660,3.7,4 290 | 0,800,3.15,4 291 | 0,420,2.26,4 292 | 1,620,3.45,2 293 | 0,800,2.78,2 294 | 0,680,3.7,2 295 | 0,800,3.97,1 296 | 0,480,2.55,1 297 | 0,520,3.25,3 298 | 0,560,3.16,1 299 | 0,460,3.07,2 300 | 0,540,3.5,2 301 | 0,720,3.4,3 302 | 0,640,3.3,2 303 | 1,660,3.6,3 304 | 1,400,3.15,2 305 | 1,680,3.98,2 306 | 0,220,2.83,3 307 | 0,580,3.46,4 308 | 1,540,3.17,1 309 | 0,580,3.51,2 310 | 0,540,3.13,2 311 | 0,440,2.98,3 312 | 0,560,4,3 313 | 0,660,3.67,2 314 | 0,660,3.77,3 315 | 1,520,3.65,4 316 | 0,540,3.46,4 317 | 1,300,2.84,2 318 | 1,340,3,2 319 | 1,780,3.63,4 320 | 1,480,3.71,4 321 | 0,540,3.28,1 322 | 0,460,3.14,3 323 | 0,460,3.58,2 324 | 0,500,3.01,4 325 | 0,420,2.69,2 326 | 0,520,2.7,3 327 | 0,680,3.9,1 328 | 0,680,3.31,2 329 | 1,560,3.48,2 
330 | 0,580,3.34,2 331 | 0,500,2.93,4 332 | 0,740,4,3 333 | 0,660,3.59,3 334 | 0,420,2.96,1 335 | 0,560,3.43,3 336 | 1,460,3.64,3 337 | 1,620,3.71,1 338 | 0,520,3.15,3 339 | 0,620,3.09,4 340 | 0,540,3.2,1 341 | 1,660,3.47,3 342 | 0,500,3.23,4 343 | 1,560,2.65,3 344 | 0,500,3.95,4 345 | 0,580,3.06,2 346 | 0,520,3.35,3 347 | 0,500,3.03,3 348 | 0,600,3.35,2 349 | 0,580,3.8,2 350 | 0,400,3.36,2 351 | 0,620,2.85,2 352 | 1,780,4,2 353 | 0,620,3.43,3 354 | 1,580,3.12,3 355 | 0,700,3.52,2 356 | 1,540,3.78,2 357 | 1,760,2.81,1 358 | 0,700,3.27,2 359 | 0,720,3.31,1 360 | 1,560,3.69,3 361 | 0,720,3.94,3 362 | 1,520,4,1 363 | 1,540,3.49,1 364 | 0,680,3.14,2 365 | 0,460,3.44,2 366 | 1,560,3.36,1 367 | 0,480,2.78,3 368 | 0,460,2.93,3 369 | 0,620,3.63,3 370 | 0,580,4,1 371 | 0,800,3.89,2 372 | 1,540,3.77,2 373 | 1,680,3.76,3 374 | 1,680,2.42,1 375 | 1,620,3.37,1 376 | 0,560,3.78,2 377 | 0,560,3.49,4 378 | 0,620,3.63,2 379 | 1,800,4,2 380 | 0,640,3.12,3 381 | 0,540,2.7,2 382 | 0,700,3.65,2 383 | 1,540,3.49,2 384 | 0,540,3.51,2 385 | 0,660,4,1 386 | 1,480,2.62,2 387 | 0,420,3.02,1 388 | 1,740,3.86,2 389 | 0,580,3.36,2 390 | 0,640,3.17,2 391 | 0,640,3.51,2 392 | 1,800,3.05,2 393 | 1,660,3.88,2 394 | 1,600,3.38,3 395 | 1,620,3.75,2 396 | 1,460,3.99,3 397 | 0,620,4,2 398 | 0,560,3.04,3 399 | 0,460,2.63,2 400 | 0,700,3.65,2 401 | 0,600,3.89,3 402 | -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/.DS_Store -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Udacity 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/Part 4 - Fashion-MNIST Exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Classifying Fashion-MNIST\n", 8 | "\n", 9 | "Now it's your turn to build and train a neural network. 
You'll be using the [Fashion-MNIST dataset](https://github.com/zalandoresearch/fashion-mnist), a drop-in replacement for the MNIST dataset. MNIST is actually quite trivial with neural networks where you can easily achieve better than 97% accuracy. Fashion-MNIST is a set of 28x28 greyscale images of clothes. It's more complex than MNIST, so it's a better representation of the actual performance of your network, and a better representation of datasets you'll use in the real world.\n", 10 | "\n", 11 | "\n", 12 | "\n", 13 | "In this notebook, you'll build your own neural network. For the most part, you could just copy and paste the code from Part 3, but you wouldn't be learning. It's important for you to write the code yourself and get it to work. Feel free to consult the previous notebook though as you work through this.\n", 14 | "\n", 15 | "First off, let's load the dataset through torchvision." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import torch\n", 25 | "from torchvision import datasets, transforms\n", 26 | "import helper\n", 27 | "\n", 28 | "# Define a transform to normalize the data\n", 29 | "transform = transforms.Compose([transforms.ToTensor(),\n", 30 | " transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])\n", 31 | "# Download and load the training data\n", 32 | "trainset = datasets.FashionMNIST('F_MNIST_data/', download=True, train=True, transform=transform)\n", 33 | "trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)\n", 34 | "\n", 35 | "# Download and load the test data\n", 36 | "testset = datasets.FashionMNIST('F_MNIST_data/', download=True, train=False, transform=transform)\n", 37 | "testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=True)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "Here we can see one of the images." 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "image, label = next(iter(trainloader))\n", 54 | "helper.imshow(image[0,:]);" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "With the data loaded, it's time to import the necessary packages." 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 1, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "%matplotlib inline\n", 71 | "%config InlineBackend.figure_format = 'retina'\n", 72 | "\n", 73 | "import matplotlib.pyplot as plt\n", 74 | "import numpy as np\n", 75 | "import time\n", 76 | "\n", 77 | "import torch\n", 78 | "from torch import nn\n", 79 | "from torch import optim\n", 80 | "import torch.nn.functional as F\n", 81 | "from torchvision import datasets, transforms\n", 82 | "\n", 83 | "import helper" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "## Building the network\n", 91 | "\n", 92 | "Here you should define your network. As with MNIST, each image is 28x28 which is a total of 784 pixels, and there are 10 classes. You should include at least one hidden layer. We suggest you use ReLU activations for the layers and to return the logits from the forward pass. It's up to you how many layers you add and the size of those layers." 
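One way to satisfy these requirements is sketched below; the specific hidden-layer sizes (256 and 128 units) and the class name `Classifier` are arbitrary choices for illustration, not part of the exercise.

```python
from torch import nn
import torch.nn.functional as F

class Classifier(nn.Module):
    # Example architecture: 784 -> 256 -> 128 -> 10, ReLU hidden layers, raw logits out
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        x = x.view(x.shape[0], -1)    # flatten each 28x28 image into a 784-long vector
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)            # logits; pair with nn.CrossEntropyLoss during training
```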
93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# TODO: Define your network architecture here" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "# Train the network\n", 109 | "\n", 110 | "Now you should create your network and train it. First you'll want to define [the criterion](http://pytorch.org/docs/master/nn.html#loss-functions) ( something like `nn.CrossEntropyLoss`) and [the optimizer](http://pytorch.org/docs/master/optim.html) (typically `optim.SGD` or `optim.Adam`).\n", 111 | "\n", 112 | "Then write the training code. Remember the training pass is a fairly straightforward process:\n", 113 | "\n", 114 | "* Make a forward pass through the network to get the logits \n", 115 | "* Use the logits to calculate the loss\n", 116 | "* Perform a backward pass through the network with `loss.backward()` to calculate the gradients\n", 117 | "* Take a step with the optimizer to update the weights\n", 118 | "\n", 119 | "By adjusting the hyperparameters (hidden units, learning rate, etc), you should be able to get the training loss below 0.4." 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "# TODO: Create the network, define the criterion and optimizer\n" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "# TODO: Train the network here\n" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "# Test out your network!\n", 147 | "\n", 148 | "dataiter = iter(testloader)\n", 149 | "images, labels = dataiter.next()\n", 150 | "img = images[0]\n", 151 | "# Convert 2D image to 1D vector\n", 152 | "img = img.resize_(1, 784)\n", 153 | "\n", 154 | "# TODO: Calculate the class probabilities (softmax) for img\n", 155 | "ps = \n", 156 | "\n", 157 | "# Plot the image and probabilities\n", 158 | "helper.view_classify(img.resize_(1, 28, 28), ps, version='Fashion')" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "Now that your network is trained, you'll want to save it to disk so you can load it later instead of training it again. Obviously, it's impractical to train a network every time you need one. In practice, you'll train it once, save the model, then reload it for further training or making predictions. In the next part, I'll show you how to save and load trained models." 
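Putting the four training steps listed above into a loop might look like the following sketch. It assumes the `trainloader` defined at the top of the notebook and a network like the hypothetical `Classifier` sketched earlier; the learning rate and epoch count are illustrative.

```python
from torch import nn, optim

model = Classifier()                       # hypothetical network from the earlier sketch
criterion = nn.CrossEntropyLoss()          # expects raw logits and integer class labels
optimizer = optim.Adam(model.parameters(), lr=0.003)

for epoch in range(5):
    running_loss = 0
    for images, labels in trainloader:
        optimizer.zero_grad()              # clear gradients from the previous step
        logits = model(images)             # 1) forward pass
        loss = criterion(logits, labels)   # 2) loss computed from the logits
        loss.backward()                    # 3) backward pass to compute gradients
        optimizer.step()                   # 4) update the weights
        running_loss += loss.item()
    print(f"Epoch {epoch + 1}: training loss {running_loss / len(trainloader):.3f}")
```

Because the network returns logits, the class probabilities requested in the test cell can be obtained with `torch.nn.functional.softmax(model(img), dim=1)`.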
166 | ] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 3", 172 | "language": "python", 173 | "name": "python3" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.6.4" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 2 190 | } 191 | -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/README.md: -------------------------------------------------------------------------------- 1 | # Deep Learning with PyTorch 2 | 3 | This repo contains notebooks and related code for Udacity's Deep Learning with PyTorch lesson. This lesson appears in our [AI Programming with Python Nanodegree program](https://www.udacity.com/course/ai-programming-python-nanodegree--nd089). 4 | 5 | * **Part 1:** Introduction to PyTorch and using tensors 6 | * **Part 2:** Building fully-connected neural networks with PyTorch 7 | * **Part 3:** How to train a fully-connected network with backpropagation on MNIST 8 | * **Part 4:** Exercise - train a neural network on Fashion-MNIST 9 | * **Part 5:** Using a trained network for making predictions and validating networks 10 | * **Part 6:** How to save and load trained models 11 | * **Part 7:** Load image data with torchvision, also data augmentation 12 | * **Part 8:** Use transfer learning to train a state-of-the-art image classifier for dogs and cats -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/ImageNet_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/ImageNet_example.png -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/Pooling_Simple_max.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/Pooling_Simple_max.png -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/activation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/activation.png -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/autoencoder_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/autoencoder_1.png -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/cat.70.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/cat.70.jpg 
-------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/cat_cropped.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/cat_cropped.png -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/conv_net.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/conv_net.jpg -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/dog.128.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/dog.128.jpg -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/dog_cat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/dog_cat.png -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/examples_new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/examples_new.png -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/fashion-mnist-sprite.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/fashion-mnist-sprite.png -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/full_padding_no_strides_transposed.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/full_padding_no_strides_transposed.gif -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/function_approx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/function_approx.png -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/gradient_descent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/gradient_descent.png -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/lenet.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/lenet.png -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/mlp_mnist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/mlp_mnist.png -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/multilayer_diagram_weights.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/multilayer_diagram_weights.png -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/network_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/network_diagram.png -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/padding_strides.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/padding_strides.gif -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/simple_neuron.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/simple_neuron.png -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/test_examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/test_examples.png -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/train_examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/train_examples.png -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/assets/w1_backprop_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/3_PyTorch/assets/w1_backprop_graph.png -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/fc_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Network(nn.Module): 7 | def 
__init__(self, input_size, output_size, hidden_layers, drop_p=0.5): 8 | ''' Builds a feedforward network with arbitrary hidden layers. 9 | 10 | Arguments 11 | --------- 12 | input_size: integer, size of the input layer 13 | output_size: integer, size of the output layer 14 | hidden_layers: list of integers, the sizes of the hidden layers 15 | 16 | ''' 17 | super().__init__() 18 | # Input to a hidden layer 19 | self.hidden_layers = nn.ModuleList([nn.Linear(input_size, hidden_layers[0])]) 20 | 21 | # Add a variable number of more hidden layers 22 | layer_sizes = zip(hidden_layers[:-1], hidden_layers[1:]) 23 | self.hidden_layers.extend([nn.Linear(h1, h2) for h1, h2 in layer_sizes]) 24 | 25 | self.output = nn.Linear(hidden_layers[-1], output_size) 26 | 27 | self.dropout = nn.Dropout(p=drop_p) 28 | 29 | def forward(self, x): 30 | ''' Forward pass through the network, returns the output logits ''' 31 | 32 | for each in self.hidden_layers: 33 | x = F.relu(each(x)) 34 | x = self.dropout(x) 35 | x = self.output(x) 36 | 37 | return F.log_softmax(x, dim=1) 38 | 39 | 40 | def validation(model, testloader, criterion): 41 | accuracy = 0 42 | test_loss = 0 43 | for images, labels in testloader: 44 | 45 | images = images.resize_(images.size()[0], 784) 46 | 47 | output = model.forward(images) 48 | test_loss += criterion(output, labels).item() 49 | 50 | ## Calculating the accuracy 51 | # Model's output is log-softmax, take exponential to get the probabilities 52 | ps = torch.exp(output) 53 | # Class with highest probability is our predicted class, compare with true label 54 | equality = (labels.data == ps.max(1)[1]) 55 | # Accuracy is number of correct predictions divided by all predictions, just take the mean 56 | accuracy += equality.type_as(torch.FloatTensor()).mean() 57 | 58 | return test_loss, accuracy 59 | 60 | 61 | def train(model, trainloader, testloader, criterion, optimizer, epochs=5, print_every=40): 62 | 63 | steps = 0 64 | running_loss = 0 65 | for e in range(epochs): 66 | # Model in training mode, dropout is on 67 | model.train() 68 | for images, labels in trainloader: 69 | steps += 1 70 | 71 | # Flatten images into a 784 long vector 72 | images.resize_(images.size()[0], 784) 73 | 74 | optimizer.zero_grad() 75 | 76 | output = model.forward(images) 77 | loss = criterion(output, labels) 78 | loss.backward() 79 | optimizer.step() 80 | 81 | running_loss += loss.item() 82 | 83 | if steps % print_every == 0: 84 | # Model in inference mode, dropout is off 85 | model.eval() 86 | 87 | # Turn off gradients for validation, will speed up inference 88 | with torch.no_grad(): 89 | test_loss, accuracy = validation(model, testloader, criterion) 90 | 91 | print("Epoch: {}/{}.. ".format(e+1, epochs), 92 | "Training Loss: {:.3f}.. ".format(running_loss/print_every), 93 | "Test Loss: {:.3f}.. 
".format(test_loss/len(testloader)), 94 | "Test Accuracy: {:.3f}".format(accuracy/len(testloader))) 95 | 96 | running_loss = 0 97 | 98 | # Make sure dropout and grads are on for training 99 | model.train() -------------------------------------------------------------------------------- /lessons/DeepLearning/3_PyTorch/helper.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from torch import nn, optim 4 | from torch.autograd import Variable 5 | 6 | 7 | def test_network(net, trainloader): 8 | 9 | criterion = nn.MSELoss() 10 | optimizer = optim.Adam(net.parameters(), lr=0.001) 11 | 12 | dataiter = iter(trainloader) 13 | images, labels = dataiter.next() 14 | 15 | # Create Variables for the inputs and targets 16 | inputs = Variable(images) 17 | targets = Variable(images) 18 | 19 | # Clear the gradients from all Variables 20 | optimizer.zero_grad() 21 | 22 | # Forward pass, then backward pass, then update weights 23 | output = net.forward(inputs) 24 | loss = criterion(output, targets) 25 | loss.backward() 26 | optimizer.step() 27 | 28 | return True 29 | 30 | 31 | def imshow(image, ax=None, title=None, normalize=True): 32 | """Imshow for Tensor.""" 33 | if ax is None: 34 | fig, ax = plt.subplots() 35 | image = image.numpy().transpose((1, 2, 0)) 36 | 37 | if normalize: 38 | mean = np.array([0.485, 0.456, 0.406]) 39 | std = np.array([0.229, 0.224, 0.225]) 40 | image = std * image + mean 41 | image = np.clip(image, 0, 1) 42 | 43 | ax.imshow(image) 44 | ax.spines['top'].set_visible(False) 45 | ax.spines['right'].set_visible(False) 46 | ax.spines['left'].set_visible(False) 47 | ax.spines['bottom'].set_visible(False) 48 | ax.tick_params(axis='both', length=0) 49 | ax.set_xticklabels('') 50 | ax.set_yticklabels('') 51 | 52 | return ax 53 | 54 | 55 | def view_recon(img, recon): 56 | ''' Function for displaying an image (as a PyTorch Tensor) and its 57 | reconstruction also a PyTorch Tensor 58 | ''' 59 | 60 | fig, axes = plt.subplots(ncols=2, sharex=True, sharey=True) 61 | axes[0].imshow(img.numpy().squeeze()) 62 | axes[1].imshow(recon.data.numpy().squeeze()) 63 | for ax in axes: 64 | ax.axis('off') 65 | ax.set_adjustable('box-forced') 66 | 67 | def view_classify(img, ps, version="MNIST"): 68 | ''' Function for viewing an image and it's predicted classes. 69 | ''' 70 | ps = ps.data.numpy().squeeze() 71 | 72 | fig, (ax1, ax2) = plt.subplots(figsize=(6,9), ncols=2) 73 | ax1.imshow(img.resize_(1, 28, 28).numpy().squeeze()) 74 | ax1.axis('off') 75 | ax2.barh(np.arange(10), ps) 76 | ax2.set_aspect(0.1) 77 | ax2.set_yticks(np.arange(10)) 78 | if version == "MNIST": 79 | ax2.set_yticklabels(np.arange(10)) 80 | elif version == "Fashion": 81 | ax2.set_yticklabels(['T-shirt/top', 82 | 'Trouser', 83 | 'Pullover', 84 | 'Dress', 85 | 'Coat', 86 | 'Sandal', 87 | 'Shirt', 88 | 'Sneaker', 89 | 'Bag', 90 | 'Ankle Boot'], size='small'); 91 | ax2.set_title('Class Probability') 92 | ax2.set_xlim(0, 1.1) 93 | 94 | plt.tight_layout() 95 | -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/README.md: -------------------------------------------------------------------------------- 1 | # Deep Learning with PyTorch 2 | 3 | This repo contains notebooks and related code for Udacity's Deep Learning with PyTorch lesson. This lesson appears in our [AI Programming with Python Nanodegree program](https://www.udacity.com/course/ai-programming-python-nanodegree--nd089). 
4 | 5 | * **Part 1:** Introduction to PyTorch and using tensors 6 | * **Part 2:** Building fully-connected neural networks with PyTorch 7 | * **Part 3:** How to train a fully-connected network with backpropagation on MNIST 8 | * **Part 4:** Exercise - train a neural network on Fashion-MNIST 9 | * **Part 5:** Using a trained network for making predictions and validating networks 10 | * **Part 6:** How to save and load trained models 11 | * **Part 7:** Load image data with torchvision, also data augmentation 12 | * **Part 8:** Use transfer learning to train a state-of-the-art image classifier for dogs and cats -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/ImageNet_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/ImageNet_example.png -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/Pooling_Simple_max.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/Pooling_Simple_max.png -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/activation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/activation.png -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/autoencoder_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/autoencoder_1.png -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/backprop_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/backprop_diagram.png -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/cat.70.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/cat.70.jpg -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/cat_cropped.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/cat_cropped.png -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/conv_net.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/conv_net.jpg -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/dog.128.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/dog.128.jpg -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/dog_cat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/dog_cat.png -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/examples_new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/examples_new.png -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/fashion-mnist-sprite.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/fashion-mnist-sprite.png -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/full_padding_no_strides_transposed.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/full_padding_no_strides_transposed.gif -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/function_approx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/function_approx.png -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/gradient_descent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/gradient_descent.png -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/image_distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/image_distribution.png -------------------------------------------------------------------------------- 
/lessons/DeepLearning/new-intro-to-pytorch/assets/infographic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/infographic.pdf -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/lenet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/lenet.png -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/mlp_mnist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/mlp_mnist.png -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/mnist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/mnist.png -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/multilayer_diagram_weights.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/multilayer_diagram_weights.png -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/network_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/network_diagram.png -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/overfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/overfitting.png -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/padding_strides.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/padding_strides.gif -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/simple_neuron.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/simple_neuron.png -------------------------------------------------------------------------------- 
/lessons/DeepLearning/new-intro-to-pytorch/assets/test_examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/test_examples.png -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/train_examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/train_examples.png -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/assets/w1_backprop_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/DeepLearning/new-intro-to-pytorch/assets/w1_backprop_graph.png -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/fc_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Network(nn.Module): 7 | def __init__(self, input_size, output_size, hidden_layers, drop_p=0.5): 8 | ''' Builds a feedforward network with arbitrary hidden layers. 9 | 10 | Arguments 11 | --------- 12 | input_size: integer, size of the input layer 13 | output_size: integer, size of the output layer 14 | hidden_layers: list of integers, the sizes of the hidden layers 15 | 16 | ''' 17 | super().__init__() 18 | # Input to a hidden layer 19 | self.hidden_layers = nn.ModuleList([nn.Linear(input_size, hidden_layers[0])]) 20 | 21 | # Add a variable number of more hidden layers 22 | layer_sizes = zip(hidden_layers[:-1], hidden_layers[1:]) 23 | self.hidden_layers.extend([nn.Linear(h1, h2) for h1, h2 in layer_sizes]) 24 | 25 | self.output = nn.Linear(hidden_layers[-1], output_size) 26 | 27 | self.dropout = nn.Dropout(p=drop_p) 28 | 29 | def forward(self, x): 30 | ''' Forward pass through the network, returns the output logits ''' 31 | 32 | for each in self.hidden_layers: 33 | x = F.relu(each(x)) 34 | x = self.dropout(x) 35 | x = self.output(x) 36 | 37 | return F.log_softmax(x, dim=1) 38 | 39 | 40 | def validation(model, testloader, criterion): 41 | accuracy = 0 42 | test_loss = 0 43 | for images, labels in testloader: 44 | 45 | images = images.resize_(images.size()[0], 784) 46 | 47 | output = model.forward(images) 48 | test_loss += criterion(output, labels).item() 49 | 50 | ## Calculating the accuracy 51 | # Model's output is log-softmax, take exponential to get the probabilities 52 | ps = torch.exp(output) 53 | # Class with highest probability is our predicted class, compare with true label 54 | equality = (labels.data == ps.max(1)[1]) 55 | # Accuracy is number of correct predictions divided by all predictions, just take the mean 56 | accuracy += equality.type_as(torch.FloatTensor()).mean() 57 | 58 | return test_loss, accuracy 59 | 60 | 61 | def train(model, trainloader, testloader, criterion, optimizer, epochs=5, print_every=40): 62 | 63 | steps = 0 64 | running_loss = 0 65 | for e in range(epochs): 66 | # Model in training mode, dropout is on 67 | model.train() 68 | for images, labels in trainloader: 69 | 
steps += 1 70 | 71 | # Flatten images into a 784 long vector 72 | images.resize_(images.size()[0], 784) 73 | 74 | optimizer.zero_grad() 75 | 76 | output = model.forward(images) 77 | loss = criterion(output, labels) 78 | loss.backward() 79 | optimizer.step() 80 | 81 | running_loss += loss.item() 82 | 83 | if steps % print_every == 0: 84 | # Model in inference mode, dropout is off 85 | model.eval() 86 | 87 | # Turn off gradients for validation, will speed up inference 88 | with torch.no_grad(): 89 | test_loss, accuracy = validation(model, testloader, criterion) 90 | 91 | print("Epoch: {}/{}.. ".format(e+1, epochs), 92 | "Training Loss: {:.3f}.. ".format(running_loss/print_every), 93 | "Test Loss: {:.3f}.. ".format(test_loss/len(testloader)), 94 | "Test Accuracy: {:.3f}".format(accuracy/len(testloader))) 95 | 96 | running_loss = 0 97 | 98 | # Make sure dropout and grads are on for training 99 | model.train() 100 | -------------------------------------------------------------------------------- /lessons/DeepLearning/new-intro-to-pytorch/helper.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from torch import nn, optim 4 | from torch.autograd import Variable 5 | 6 | 7 | def test_network(net, trainloader): 8 | 9 | criterion = nn.MSELoss() 10 | optimizer = optim.Adam(net.parameters(), lr=0.001) 11 | 12 | dataiter = iter(trainloader) 13 | images, labels = dataiter.next() 14 | 15 | # Create Variables for the inputs and targets 16 | inputs = Variable(images) 17 | targets = Variable(images) 18 | 19 | # Clear the gradients from all Variables 20 | optimizer.zero_grad() 21 | 22 | # Forward pass, then backward pass, then update weights 23 | output = net.forward(inputs) 24 | loss = criterion(output, targets) 25 | loss.backward() 26 | optimizer.step() 27 | 28 | return True 29 | 30 | 31 | def imshow(image, ax=None, title=None, normalize=True): 32 | """Imshow for Tensor.""" 33 | if ax is None: 34 | fig, ax = plt.subplots() 35 | image = image.numpy().transpose((1, 2, 0)) 36 | 37 | if normalize: 38 | mean = np.array([0.485, 0.456, 0.406]) 39 | std = np.array([0.229, 0.224, 0.225]) 40 | image = std * image + mean 41 | image = np.clip(image, 0, 1) 42 | 43 | ax.imshow(image) 44 | ax.spines['top'].set_visible(False) 45 | ax.spines['right'].set_visible(False) 46 | ax.spines['left'].set_visible(False) 47 | ax.spines['bottom'].set_visible(False) 48 | ax.tick_params(axis='both', length=0) 49 | ax.set_xticklabels('') 50 | ax.set_yticklabels('') 51 | 52 | return ax 53 | 54 | 55 | def view_recon(img, recon): 56 | ''' Function for displaying an image (as a PyTorch Tensor) and its 57 | reconstruction also a PyTorch Tensor 58 | ''' 59 | 60 | fig, axes = plt.subplots(ncols=2, sharex=True, sharey=True) 61 | axes[0].imshow(img.numpy().squeeze()) 62 | axes[1].imshow(recon.data.numpy().squeeze()) 63 | for ax in axes: 64 | ax.axis('off') 65 | ax.set_adjustable('box-forced') 66 | 67 | def view_classify(img, ps, version="MNIST"): 68 | ''' Function for viewing an image and it's predicted classes. 
69 | ''' 70 | ps = ps.data.numpy().squeeze() 71 | 72 | fig, (ax1, ax2) = plt.subplots(figsize=(6,9), ncols=2) 73 | ax1.imshow(img.resize_(1, 28, 28).numpy().squeeze()) 74 | ax1.axis('off') 75 | ax2.barh(np.arange(10), ps) 76 | ax2.set_aspect(0.1) 77 | ax2.set_yticks(np.arange(10)) 78 | if version == "MNIST": 79 | ax2.set_yticklabels(np.arange(10)) 80 | elif version == "Fashion": 81 | ax2.set_yticklabels(['T-shirt/top', 82 | 'Trouser', 83 | 'Pullover', 84 | 'Dress', 85 | 'Coat', 86 | 'Sandal', 87 | 'Shirt', 88 | 'Sneaker', 89 | 'Bag', 90 | 'Ankle Boot'], size='small'); 91 | ax2.set_title('Class Probability') 92 | ax2.set_xlim(0, 1.1) 93 | 94 | plt.tight_layout() 95 | -------------------------------------------------------------------------------- /lessons/Supervised/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/.DS_Store -------------------------------------------------------------------------------- /lessons/Supervised/1_DecisionTrees/titanic_survival_exploration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lab: Titanic Survival Exploration with Decision Trees" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Getting Started\n", 15 | "In this lab, you will see how decision trees work by implementing a decision tree in sklearn.\n", 16 | "\n", 17 | "We'll start by loading the dataset and displaying some of its rows." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "# Import libraries necessary for this project\n", 27 | "import numpy as np\n", 28 | "import pandas as pd\n", 29 | "from IPython.display import display # Allows the use of display() for DataFrames\n", 30 | "\n", 31 | "# Pretty display for notebooks\n", 32 | "%matplotlib inline\n", 33 | "\n", 34 | "# Set a random seed\n", 35 | "import random\n", 36 | "random.seed(42)\n", 37 | "\n", 38 | "# Load the dataset\n", 39 | "in_file = 'titanic_data.csv'\n", 40 | "full_data = pd.read_csv(in_file)\n", 41 | "\n", 42 | "# Print the first few entries of the RMS Titanic data\n", 43 | "display(full_data.head())" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "Recall that these are the various features present for each passenger on the ship:\n", 51 | "- **Survived**: Outcome of survival (0 = No; 1 = Yes)\n", 52 | "- **Pclass**: Socio-economic class (1 = Upper class; 2 = Middle class; 3 = Lower class)\n", 53 | "- **Name**: Name of passenger\n", 54 | "- **Sex**: Sex of the passenger\n", 55 | "- **Age**: Age of the passenger (Some entries contain `NaN`)\n", 56 | "- **SibSp**: Number of siblings and spouses of the passenger aboard\n", 57 | "- **Parch**: Number of parents and children of the passenger aboard\n", 58 | "- **Ticket**: Ticket number of the passenger\n", 59 | "- **Fare**: Fare paid by the passenger\n", 60 | "- **Cabin** Cabin number of the passenger (Some entries contain `NaN`)\n", 61 | "- **Embarked**: Port of embarkation of the passenger (C = Cherbourg; Q = Queenstown; S = Southampton)\n", 62 | "\n", 63 | "Since we're interested in the outcome of survival for each passenger or crew member, we can remove the **Survived** feature from this dataset and store it 
as its own separate variable `outcomes`. We will use these outcomes as our prediction targets. \n", 64 | "Run the code cell below to remove **Survived** as a feature of the dataset and store it in `outcomes`." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# Store the 'Survived' feature in a new variable and remove it from the dataset\n", 74 | "outcomes = full_data['Survived']\n", 75 | "features_raw = full_data.drop('Survived', axis = 1)\n", 76 | "\n", 77 | "# Show the new dataset with 'Survived' removed\n", 78 | "display(features_raw.head())" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "The very same sample of the RMS Titanic data now shows the **Survived** feature removed from the DataFrame. Note that `features_raw` (the passenger data) and `outcomes` (the outcomes of survival) are now *paired*. That means for any passenger `features_raw.loc[i]`, they have the survival outcome `outcomes[i]`.\n", 86 | "\n", 87 | "## Preprocessing the data\n", 88 | "\n", 89 | "Now, let's do some data preprocessing. First, we'll one-hot encode the features." 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "features = pd.get_dummies(features_raw)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "And now we'll fill in any blanks with zeroes." 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "features = features.fillna(0.0)\n", 115 | "display(features.head())" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "## (TODO) Training the model\n", 123 | "\n", 124 | "Now we're ready to train a model in sklearn. First, let's split the data into training and testing sets. Then we'll train the model on the training set." 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "from sklearn.model_selection import train_test_split\n", 134 | "X_train, X_test, y_train, y_test = train_test_split(features, outcomes, test_size=0.2, random_state=42)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "# Import the classifier from sklearn\n", 144 | "from sklearn.tree import DecisionTreeClassifier\n", 145 | "\n", 146 | "# TODO: Define the classifier, and fit it to the data\n", 147 | "model = None" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "## Testing the model\n", 155 | "Now let's see how our model does by calculating the accuracy over both the training and the testing set."
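One possible way to fill in the TODO cell above and the tuning exercise later in this notebook. This is a sketch rather than the lesson's solution, and the hyperparameter values are only illustrative.

```python
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Baseline: a default decision tree fit on the training split
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
print('Default tree test accuracy:', accuracy_score(y_test, model.predict(X_test)))

# For the "improve the model" exercise: constrain the tree so it cannot
# memorize the training set (the values below are illustrative, not the answer)
tuned = DecisionTreeClassifier(max_depth=6,
                               min_samples_leaf=6,
                               min_samples_split=10,
                               random_state=42)
tuned.fit(X_train, y_train)
print('Tuned tree train accuracy:', accuracy_score(y_train, tuned.predict(X_train)))
print('Tuned tree test accuracy: ', accuracy_score(y_test, tuned.predict(X_test)))

# Or search a small hyperparameter grid automatically
params = {'max_depth': [4, 6, 8, 10],
          'min_samples_leaf': [2, 4, 6, 8],
          'min_samples_split': [2, 6, 10]}
grid = GridSearchCV(DecisionTreeClassifier(random_state=42), params, cv=5)
grid.fit(X_train, y_train)
print('Best parameters:', grid.best_params_)
print('Grid search test accuracy:', accuracy_score(y_test, grid.best_estimator_.predict(X_test)))
```

Limiting `max_depth` and raising the minimum samples per split and per leaf restricts how finely the tree can partition the training data, which is the usual lever for closing the gap between training and testing accuracy noted in the exercise.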
156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "# Making predictions\n", 165 | "y_train_pred = model.predict(X_train)\n", 166 | "y_test_pred = model.predict(X_test)\n", 167 | "\n", 168 | "# Calculate the accuracy\n", 169 | "from sklearn.metrics import accuracy_score\n", 170 | "train_accuracy = accuracy_score(y_train, y_train_pred)\n", 171 | "test_accuracy = accuracy_score(y_test, y_test_pred)\n", 172 | "print('The training accuracy is', train_accuracy)\n", 173 | "print('The test accuracy is', test_accuracy)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "# Exercise: Improving the model\n", 181 | "\n", 182 | "Ok, high training accuracy and a lower testing accuracy. We may be overfitting a bit.\n", 183 | "\n", 184 | "So now it's your turn to shine! Train a new model, and try to specify some parameters in order to improve the testing accuracy, such as:\n", 185 | "- `max_depth`\n", 186 | "- `min_samples_leaf`\n", 187 | "- `min_samples_split`\n", 188 | "\n", 189 | "You can use your intuition, trial and error, or even better, feel free to use Grid Search!\n", 190 | "\n", 191 | "**Challenge:** Try to get to 85% accuracy on the testing set. If you'd like a hint, take a look at the solutions notebook next." 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "# TODO: Train the model\n", 201 | "\n", 202 | "# TODO: Make predictions\n", 203 | "\n", 204 | "# TODO: Calculate the accuracy" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [] 213 | } 214 | ], 215 | "metadata": { 216 | "kernelspec": { 217 | "display_name": "Python 3", 218 | "language": "python", 219 | "name": "python3" 220 | }, 221 | "language_info": { 222 | "codemirror_mode": { 223 | "name": "ipython", 224 | "version": 3 225 | }, 226 | "file_extension": ".py", 227 | "mimetype": "text/x-python", 228 | "name": "python", 229 | "nbconvert_exporter": "python", 230 | "pygments_lexer": "ipython3", 231 | "version": "3.6.3" 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 1 236 | } 237 | -------------------------------------------------------------------------------- /lessons/Supervised/2_NaiveBayes/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/2_NaiveBayes/.DS_Store -------------------------------------------------------------------------------- /lessons/Supervised/2_NaiveBayes/images/bayes_formula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/2_NaiveBayes/images/bayes_formula.png -------------------------------------------------------------------------------- /lessons/Supervised/2_NaiveBayes/images/countvectorizer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/2_NaiveBayes/images/countvectorizer.png -------------------------------------------------------------------------------- /lessons/Supervised/2_NaiveBayes/images/dqnb.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/2_NaiveBayes/images/dqnb.png -------------------------------------------------------------------------------- /lessons/Supervised/2_NaiveBayes/images/naivebayes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/2_NaiveBayes/images/naivebayes.png -------------------------------------------------------------------------------- /lessons/Supervised/2_NaiveBayes/images/tfidf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/2_NaiveBayes/images/tfidf.png -------------------------------------------------------------------------------- /lessons/Supervised/2_NaiveBayes/smsspamcollection/readme: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/2_NaiveBayes/smsspamcollection/readme -------------------------------------------------------------------------------- /lessons/Supervised/3_EnsembleMethods/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/3_EnsembleMethods/.DS_Store -------------------------------------------------------------------------------- /lessons/Supervised/3_EnsembleMethods/images/bayes_formula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/3_EnsembleMethods/images/bayes_formula.png -------------------------------------------------------------------------------- /lessons/Supervised/3_EnsembleMethods/images/countvectorizer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/3_EnsembleMethods/images/countvectorizer.png -------------------------------------------------------------------------------- /lessons/Supervised/3_EnsembleMethods/images/dqnb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/3_EnsembleMethods/images/dqnb.png -------------------------------------------------------------------------------- /lessons/Supervised/3_EnsembleMethods/images/naivebayes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/3_EnsembleMethods/images/naivebayes.png -------------------------------------------------------------------------------- /lessons/Supervised/3_EnsembleMethods/images/tfidf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/3_EnsembleMethods/images/tfidf.png 
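The quiz-checking scripts that follow (`tests.py`, `tests2.py`, and `check_file.py`) reason about which metric to prefer when classes are imbalanced. A small self-contained illustration of that reasoning; the counts below are made up for the example.

```python
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Hypothetical imbalanced problem: 90 negatives, 10 positives.
# The classifier predicts "negative" almost every time.
y_true = [0] * 90 + [1] * 10
y_pred = [0] * 90 + [0] * 8 + [1] * 2   # catches only 2 of the 10 positives

print('Accuracy :', accuracy_score(y_true, y_pred))   # 0.92, looks great but is misleading
print('Precision:', precision_score(y_true, y_pred))  # 1.00, every predicted positive is correct
print('Recall   :', recall_score(y_true, y_pred))     # 0.20, most positives were missed
print('F1 score :', f1_score(y_true, y_pred))         # ~0.33, balances precision and recall
```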
-------------------------------------------------------------------------------- /lessons/Supervised/3_EnsembleMethods/smsspamcollection/readme: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/3_EnsembleMethods/smsspamcollection/readme -------------------------------------------------------------------------------- /lessons/Supervised/4_ModelEvaluationMetrics/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/4_ModelEvaluationMetrics/.DS_Store -------------------------------------------------------------------------------- /lessons/Supervised/4_ModelEvaluationMetrics/images/bayes_formula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/4_ModelEvaluationMetrics/images/bayes_formula.png -------------------------------------------------------------------------------- /lessons/Supervised/4_ModelEvaluationMetrics/images/countvectorizer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/4_ModelEvaluationMetrics/images/countvectorizer.png -------------------------------------------------------------------------------- /lessons/Supervised/4_ModelEvaluationMetrics/images/dqnb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/4_ModelEvaluationMetrics/images/dqnb.png -------------------------------------------------------------------------------- /lessons/Supervised/4_ModelEvaluationMetrics/images/naivebayes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/4_ModelEvaluationMetrics/images/naivebayes.png -------------------------------------------------------------------------------- /lessons/Supervised/4_ModelEvaluationMetrics/images/tfidf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/4_ModelEvaluationMetrics/images/tfidf.png -------------------------------------------------------------------------------- /lessons/Supervised/4_ModelEvaluationMetrics/smsspamcollection/readme: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Supervised/4_ModelEvaluationMetrics/smsspamcollection/readme -------------------------------------------------------------------------------- /lessons/Supervised/4_ModelEvaluationMetrics/tests.py: -------------------------------------------------------------------------------- 1 | def test_one(mod_arg): 2 | ''' 3 | INPUT: 4 | mod_arg - a set of the strings pertaining to the objects that were passed in the fitting of our models 5 | 6 | OUTPUT: 7 | prints correctness of the set 8 | nothing returned 9 | ''' 10 | a = 
'X_train' 11 | b = 'X_test' 12 | c = 'y_train' 13 | d = 'y_test' 14 | e = 'training_data' 15 | f = 'testing_data' 16 | if mod_arg == {c, e}: 17 | print("That's right! You need to fit on both parts of the data pertaining to training data!") 18 | else: 19 | print("Oops! That doesn't look quite right! Remember you only want to fit your model to the training data! Notice that X_train hasn't had the data cleaned yet, so that won't work to pass to our fit method. Hint - there are two items you should be passing to your fit method.") 20 | 21 | 22 | def test_two(mod_arg): 23 | ''' 24 | INPUT: 25 | model_arg - a set of the strings pertaining to the objects that were passed in the predicting step 26 | 27 | OUTPUT: 28 | prints correctness of the set 29 | nothing returned 30 | ''' 31 | a = 'X_train' 32 | b = 'X_test' 33 | c = 'y_train' 34 | d = 'y_test' 35 | e = 'training_data' 36 | f = 'testing_data' 37 | if mod_arg == {f}: 38 | print("That's right! To see how well our models perform in a new setting, you will want to predict on the test set of data.") 39 | else: 40 | print("Oops! That doesn't look quite right! Remember you will want to predict on test data to know how well your model will do in a new situation. Hint - there is only one item that should be passed to the predict method of your model. Also notice that X_test has not been cleaned yet, so this cannot be passed to the predict method!") 41 | 42 | 43 | def sol_seven(seven_sol): 44 | ''' 45 | INPUT: dictionary with correct matching of metrics 46 | OUTPUT: nothing returned - prints statement related to correctness of dictionary 47 | ''' 48 | 49 | a = "recall" 50 | b = "precision" 51 | c = "accuracy" 52 | d = 'f1-score' 53 | 54 | 55 | seven_sol_1 = { 56 | 'We have imbalanced classes, which metric do we definitely not want to use?': c, 57 | 'We really want to make sure the positive cases are all caught even if that means we identify some negatives as positives': a, 'When we identify something as positive, we want to be sure it is truly positive': b, 58 | 'We care equally about identifying positive and negative cases': d 59 | } 60 | 61 | if seven_sol == seven_sol_1: 62 | print("That's right! It isn't really necessary to memorize these in practice, but it is important to know they exist and know why might use one metric over another for a particular situation.") 63 | 64 | if seven_sol['We have imbalanced classes, which metric do we definitely not want to use?'] != seven_sol_1['We have imbalanced classes, which metric do we definitely not want to use?']: 65 | print("Oops! The first one isn't right. If we do not have balanced classes, we probably want to stay away from using accuracy.") 66 | 67 | if seven_sol['We really want to make sure the positive cases are all caught even if that means we identify some negatives as positives'] != seven_sol_1['We really want to make sure the positive cases are all caught even if that means we identify some negatives as positives']: 68 | print("Oops! The second one isn't right. If we really want to be sure about catching positive cases, we should be closely watching recall, which has all of the positive clases in the denominator - so we are monitoring how many of them we get right with recall.") 69 | 70 | if seven_sol['When we identify something as positive, we want to be sure it is truly positive'] != seven_sol_1['When we identify something as positive, we want to be sure it is truly positive']: 71 | print("Oops! The third one isn't right. Using precision, we have the predicted positives in the denominator. 
Therefore, this will help us be sure the items we identify as positive are actually positive.") 72 | 73 | if seven_sol['We care equally about identifying positive and negative cases'] != seven_sol_1['We care equally about identifying positive and negative cases']: 74 | print("Oops! The last one isn't right. If we care equally about precision and recall, we should use f1 score.") 75 | 76 | 77 | def sol_eight(eight_sol): 78 | ''' 79 | INPUT: dictionary with correct matching of metrics 80 | OUTPUT: nothing returned - prints statement related to correctness of dictionary 81 | ''' 82 | a = "naive-bayes" 83 | b = "bagging" 84 | c = "random-forest" 85 | d = 'ada-boost' 86 | e = "svm" 87 | 88 | 89 | eight_sol_1 = { 90 | 'We have imbalanced classes, which metric do we definitely not want to use?': a, 91 | 'We really want to make sure the positive cases are all caught even if that means we identify some negatives as positives': a, 92 | 'When we identify something as positive, we want to be sure it is truly positive': c, 93 | 'We care equally about identifying positive and negative cases': a 94 | } 95 | 96 | if eight_sol_1 == eight_sol: 97 | print("That's right! Naive Bayes was the best model for all of our metrics except precision!") 98 | 99 | else: 100 | print("Oops! That doesn't look right. Make sure you are performing your predictions and matching on the test data. Hint: The naive bayes model actually performs best on all of the metrics except one. Try again!") 101 | -------------------------------------------------------------------------------- /lessons/Supervised/4_ModelEvaluationMetrics/tests2.py: -------------------------------------------------------------------------------- 1 | def q1_check(models_dict): 2 | ''' 3 | INPUT: 4 | models_dict - a dictionary with models and what types of problems the models can be used for 5 | 6 | OUTPUT: 7 | nothing returned 8 | prints statements related to the correctness of the dictionary 9 | ''' 10 | a = 'regression' 11 | b = 'classification' 12 | c = 'both regression and classification' 13 | 14 | models = { 15 | 'decision trees': c, 16 | 'random forest': c, 17 | 'adaptive boosting': c, 18 | 'logistic regression': b, 19 | 'linear regression': a, 20 | } 21 | 22 | if models == models_dict: 23 | print("That's right! All but logistic regression can be used for predicting numeric values. And linear regression is the only one of these that you should not use for predicting categories. Technically sklearn won't stop you from doing most of anything you want, but you probably want to treat cases in the way you found by answering this question!") 24 | 25 | if models['logistic regression'] != models_dict['logistic regression']: 26 | print("Oops! In most cases, you will only want to use logistic regression for classification problems.") 27 | 28 | if models['linear regression'] != models_dict['linear regression']: 29 | print("Oops! Linear regression should actually only be used in regression cases. Try again.") 30 | 31 | if (models['decision trees'] != models_dict['decision trees']) or (models['random forest'] != models_dict['random forest']) or (models['adaptive boosting'] != models_dict['adaptive boosting']): 32 | print("Oops! Actually random forests, decision trees, and adaptive boosting are all techniques that can be used for both regression and classification. 
Try again!") 33 | 34 | 35 | 36 | 37 | def q6_check(metrics): 38 | ''' 39 | INPUT: 40 | metrics - a dictionary with metrics and what types of problems the metrics can be used for 41 | 42 | OUTPUT: 43 | nothing returned 44 | prints statements related to the correctness of the dictionary 45 | ''' 46 | a = 'regression' 47 | b = 'classification' 48 | c = 'both regression and classification' 49 | 50 | # 51 | metrics_ch = { 52 | 'precision': b, 53 | 'recall': b, 54 | 'accuracy': b, 55 | 'r2_score': a, 56 | 'mean_squared_error': a, 57 | 'area_under_curve': b, 58 | 'mean_absolute_area': a 59 | } 60 | 61 | if metrics_ch == metrics: 62 | print("That's right! Looks like you know your metrics!") 63 | 64 | if (metrics['precision'] != metrics['precision']) or (metrics['recall'] != metrics['recall']) or (metrics['accuracy'] != metrics['accuracy']) or (metrics['area_under_curve'] != metrics['area_under_curve']): 65 | print("Oops! Actually, there are four metrics that are used for classification. Looks like you missed at least one of them.") 66 | 67 | if metrics != metrics_ch: 68 | print("Oops! Something doesn't look quite right. You should have three metrics for regression, and the others should be for classification. None of the metrics are used for both regression and classification.") 69 | 70 | 71 | def check_ten(best_fit): 72 | ''' 73 | INPUT: 74 | 75 | OUTPUT: 76 | 77 | ''' 78 | a = 'decision tree' 79 | b = 'random forest' 80 | c = 'adaptive boosting' 81 | d = 'linear regression' 82 | 83 | 84 | best_fitting = { 85 | 'mse': b, 86 | 'r2': b, 87 | 'mae': b 88 | } 89 | 90 | if best_fit == best_fitting: 91 | print("That's right! The random forest was best in terms of all the metrics this time!") 92 | 93 | else: 94 | print("Oops! Actually the best model was the same for all the metrics. Try again - all of your answers should be the same!") 95 | -------------------------------------------------------------------------------- /lessons/Supervised/5_TrainingTuning/check_file.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.datasets import load_diabetes 4 | from sklearn.model_selection import train_test_split, RandomizedSearchCV 5 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 6 | from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier 7 | from sklearn.naive_bayes import MultinomialNB 8 | import matplotlib.pyplot as plt 9 | from sklearn.svm import SVC 10 | import seaborn as sns 11 | 12 | 13 | def check_one(answers_one): 14 | ''' 15 | INPUT: 16 | answers_one - a dictionary with key-value pairs associated with question 1 17 | 18 | OUTPUT: 19 | nothing returned 20 | print a statement related to the correctness of the answers 21 | ''' 22 | a = '0.65' 23 | b = '0' 24 | c = 'Age' 25 | d = '0.35' 26 | e = 'Glucose' 27 | f = '0.5' 28 | g = "More than zero" 29 | 30 | answers_one_1 = { 31 | 'The proportion of diabetes outcomes in the dataset': d, 32 | 'The number of missing data points in the dataset': b, 33 | 'A dataset with a symmetric distribution': e, 34 | 'A dataset with a right-skewed distribution': c, 35 | 'This variable has the strongest correlation with the outcome': e 36 | } 37 | 38 | if answers_one == answers_one_1: 39 | print("Awesome! These all look great!") 40 | 41 | if answers_one['The proportion of diabetes outcomes in the dataset'] != answers_one_1['The proportion of diabetes outcomes in the dataset']: 42 | print("Oops! 
That doesn't look like the proportion of 1's in the outcomes column. I saw closer to 35% using the describe() method.") 43 | 44 | if answers_one['The number of missing data points in the dataset'] != answers_one_1['The number of missing data points in the dataset']: 45 | print("Oops! That doesn't look like the right number of missing values. I actually couldn't find any missing values") 46 | 47 | if answers_one['A dataset with a symmetric distribution'] != answers_one_1['A dataset with a symmetric distribution']: 48 | print("Oops! Of the two columns above, Glucose is actually the symmetric column. You can see this by running the .hist() method on your dataframe.") 49 | 50 | if answers_one['A dataset with a right-skewed distribution'] != answers_one_1['A dataset with a right-skewed distribution']: 51 | print("Oops! Of the two columns above, Age is actually the right-skewed column. You can see this by running the .hist() method on your dataframe.") 52 | 53 | if answers_one['This variable has the strongest correlation with the outcome'] != answers_one_1['This variable has the strongest correlation with the outcome']: 54 | print("Oops! Besides Outcome itself, the column that is most correlated with the Outcome variable is actually Glucose.") 55 | 56 | 57 | def print_metrics(y_true, preds, model_name=None): 58 | ''' 59 | INPUT: 60 | y_true - the y values that are actually true in the dataset (numpy array or pandas series) 61 | preds - the predictions for those values from some model (numpy array or pandas series) 62 | model_name - (str - optional) a name associated with the model if you would like to add it to the print statements 63 | 64 | OUTPUT: 65 | None - prints the accuracy, precision, recall, and F1 score 66 | ''' 67 | if model_name == None: 68 | print('Accuracy score: ', format(accuracy_score(y_true, preds))) 69 | print('Precision score: ', format(precision_score(y_true, preds))) 70 | print('Recall score: ', format(recall_score(y_true, preds))) 71 | print('F1 score: ', format(f1_score(y_true, preds))) 72 | print('\n\n') 73 | 74 | else: 75 | print('Accuracy score for ' + model_name + ' :' , format(accuracy_score(y_true, preds))) 76 | print('Precision score ' + model_name + ' :', format(precision_score(y_true, preds))) 77 | print('Recall score ' + model_name + ' :', format(recall_score(y_true, preds))) 78 | print('F1 score ' + model_name + ' :', format(f1_score(y_true, preds))) 79 | print('\n\n') 80 | 81 | 82 | def check_best(best_model): 83 | ''' 84 | INPUT: 85 | best_model - a string of the best model 86 | 87 | OUTPUT: 88 | print a statement related to if the best model matches what the solution found 89 | ''' 90 | a = 'randomforest' 91 | b = 'adaboost' 92 | c = 'supportvector' 93 | 94 | if best_model == b: 95 | print("Nice! It looks like your best model matches the best model I found as well! It makes sense to use f1 score to determine best in this case given the imbalance of classes. There might be justification for precision or recall being the best metric to use as well - precision showed to be best with adaboost again. With recall, SVMs proved to be the best for our models.") 96 | 97 | else: 98 | print("That wasn't the model I had in mind... It makes sense to use f1 score to determine best in this case given the imbalance of classes. There could also be justification for precision or recall being the best metric to use as well - precision showed to be best with adaboost. 
With recall, SVMs proved to be the best for our models.") 99 | 100 | 101 | def check_q_seven(sol_seven): 102 | ''' 103 | INPUT: 104 | solution dictionary for part seven 105 | OUTPUT: 106 | prints statement related to correctness of dictionary 107 | ''' 108 | a = 'Age' 109 | b = 'BloodPressure' 110 | c = 'BMI' 111 | d = 'DiabetesPedigreeFunction' 112 | e = 'Insulin' 113 | f = 'Glucose' 114 | g = 'Pregnancy' 115 | h = 'SkinThickness' 116 | 117 | 118 | 119 | sol_seven_1 = { 120 | 'The variable that is most related to the outcome of diabetes' : f, 121 | 'The second most related variable to the outcome of diabetes' : c, 122 | 'The third most related variable to the outcome of diabetes' : a, 123 | 'The fourth most related variable to the outcome of diabetes' : d 124 | } 125 | 126 | if sol_seven == sol_seven_1: 127 | print("That's right! Some of these were expected, but some were a bit unexpected too!") 128 | 129 | else: 130 | print("That doesn't look like what I expected, but maybe your feature importances were different - that can definitely happen. Take a look at the best_estimator_.feature_importances_ portion of your fitted model.") 131 | 132 | -------------------------------------------------------------------------------- /lessons/Supervised/5_TrainingTuning/data.csv: -------------------------------------------------------------------------------- 1 | 0.336493583877,-0.985950993354,0.0 2 | -0.0110425297266,-0.10552856162,0.0 3 | 0.238159509297,-0.61741666482,1.0 4 | -0.366782883496,-0.713818716912,1.0 5 | 1.22192307438,-1.03939898614,0.0 6 | -1.30456799971,0.59261847015,0.0 7 | -0.407809098981,-0.509110509763,1.0 8 | 0.893188941965,1.18285985648,0.0 9 | -0.00546337259365,-0.589551228864,1.0 10 | 0.406423768278,0.611062234636,1.0 11 | -0.145506766722,0.0365463997206,1.0 12 | -0.0404887876421,-0.0566500319512,1.0 13 | 1.60355997627,0.0908139379574,0.0 14 | -0.604838450284,-0.111340204903,1.0 15 | -0.534401237223,-1.04875779188,0.0 16 | 0.977706756346,-1.35281793296,1.0 17 | -0.422036924523,-0.274418973593,0.0 18 | 1.69051344717,-0.929766839195,0.0 19 | 0.655534595433,-0.244533046405,1.0 20 | 0.384609916121,-0.334328465856,1.0 21 | -0.109341027267,0.273694976361,1.0 22 | -1.28710021847,-0.406756443289,0.0 23 | 0.435217566287,-0.192221316649,1.0 24 | 0.0555208008113,1.024011876,0.0 25 | 1.5088217057,-0.799489053235,0.0 26 | 0.75932306599,0.775189603256,1.0 27 | 0.967078497167,-0.707726241999,0.0 28 | -0.0231301769156,1.34060202328,0.0 29 | -0.274591142835,-0.549682228079,1.0 30 | -1.2080749077,-1.41385342554,0.0 31 | 0.381259079564,-0.852947496234,1.0 32 | 0.404870623291,-0.38564643089,1.0 33 | 0.0173135930664,0.787433467901,1.0 34 | -0.650474497449,0.377281547969,1.0 35 | -0.175095703948,0.557524657143,1.0 36 | 0.090747012995,0.146764389396,1.0 37 | -0.23406335446,-1.14282728744,0.0 38 | -0.023240502157,0.0329251073349,1.0 39 | -0.98177853269,-0.614024199162,0.0 40 | 0.863118366276,0.626452589641,0.0 41 | -0.494201528321,-1.2458627184,0.0 42 | 0.560657440533,0.960463847964,0.0 43 | 0.517532460272,-1.015620433,0.0 44 | -1.07674778462,1.64110648889,0.0 45 | -0.40295146753,1.74395283754,0.0 46 | 1.26250128528,-0.0880456579187,0.0 47 | -1.13554604657,0.691274079866,0.0 48 | -1.88154070755,0.579520022541,0.0 49 | 1.61949373896,-1.16815366758,1.0 50 | -0.167382068846,0.318140979545,0.0 51 | -0.731515970032,-0.626052631824,1.0 52 | 0.14962052078,1.24000574432,0.0 53 | 1.16720084422,0.521580749715,0.0 54 | -0.436063303539,0.043680311306,1.0 55 | -0.827638902506,0.275166403707,1.0 56 | 
1.36953107467,0.971233523422,0.0 57 | 0.690612759144,-1.27804624607,0.0 58 | 1.26986688391,0.575808793135,0.0 59 | 0.208866020688,-0.146742455013,1.0 60 | -0.437203222578,0.52116507147,1.0 61 | -0.378363762158,-0.0769780148552,1.0 62 | -0.423820115256,-0.836137209863,1.0 63 | -0.560756181289,-0.41037775405,1.0 64 | 0.336052960763,-0.224802048045,1.0 65 | -1.33543072512,-0.990358481473,0.0 66 | -0.0289733996866,0.441010128386,1.0 67 | -1.3193906415,-0.37764709941,0.0 68 | -0.808411080806,1.2283790386,0.0 69 | 1.35995943884,1.12161870845,0.0 70 | -0.872069364163,-0.252522725967,1.0 71 | -1.88887517471,0.144098536459,0.0 72 | 1.60845822722,-0.774759253864,0.0 73 | -0.358639909549,0.784305199745,1.0 74 | 0.520332593218,-0.62185400704,1.0 75 | 0.306204273961,0.25448089669,1.0 76 | -1.51072939376,0.00594704976351,0.0 77 | 0.956067338203,-0.533023015577,0.0 78 | 0.288866739458,-0.725155662248,1.0 79 | 0.403468553933,-1.75945770781,0.0 80 | 0.0859415686163,-0.958846823471,1.0 81 | 0.381957047469,0.0124143718471,1.0 82 | 0.336004016976,-0.259620737798,1.0 83 | 1.02869639688,-0.785051442286,0.0 84 | 0.279139768315,0.148068778283,1.0 85 | -0.700587484192,0.118422440942,1.0 86 | -0.474343136475,-0.162548759675,1.0 87 | -1.29581526521,0.755926314388,0.0 88 | 0.140673267698,-1.60264376179,0.0 89 | 0.328196143279,0.444738575921,1.0 90 | -0.940761503292,-1.00437673463,0.0 91 | 0.4177654822,1.11423358886,1.0 92 | -0.802874871784,-1.27790346857,1.0 93 | -0.596842011584,0.593623894204,0.0 94 | -0.112331263254,0.174318514314,0.0 95 | -1.45753325136,-1.30679050369,1.0 96 | 1.63561447039,0.27394296313,1.0 97 | 0.113120402388,0.0204651461722,0.0 98 | 0.753405102224,0.1938301221,0.0 99 | -0.538129041247,-0.000723035827331,0.0 100 | -0.181058441906,0.00266871780379,1.0 101 | -------------------------------------------------------------------------------- /lessons/Unsupervised/1_Clustering/C18_FeatScalingEx_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Unsupervised/1_Clustering/C18_FeatScalingEx_01.png -------------------------------------------------------------------------------- /lessons/Unsupervised/1_Clustering/Feature Scaling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Feature Scaling\n", 8 | "\n", 9 | "With any distance based machine learning model (regularized regression methods, neural networks, and now kmeans), you will want to scale your data. \n", 10 | "\n", 11 | "If you have some features that are on completely different scales, this can greatly impact the clusters you get when using K-Means. \n", 12 | "\n", 13 | "In this notebook, you will get to see this first hand. To begin, let's read in the necessary libraries." 
14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np\n", 25 | "import pandas as pd\n", 26 | "import matplotlib.pyplot as plt\n", 27 | "from sklearn import preprocessing as p\n", 28 | "\n", 29 | "%matplotlib inline\n", 30 | "\n", 31 | "plt.rcParams['figure.figsize'] = (16, 9)\n", 32 | "import helpers2 as h\n", 33 | "import tests as t\n", 34 | "\n", 35 | "\n", 36 | "# Create the dataset for the notebook\n", 37 | "data = h.simulate_data(200, 2, 4)\n", 38 | "df = pd.DataFrame(data)\n", 39 | "df.columns = ['height', 'weight']\n", 40 | "df['height'] = np.abs(df['height']*100)\n", 41 | "df['weight'] = df['weight'] + np.random.normal(50, 10, 200)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "`1.` Next, take a look at the data to get familiar with it. The dataset has two columns, and it is stored in the **df** variable. It might be useful to get an idea of the spread in the current data, as well as a visual of the points. " 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "#Take a look at the data" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "collapsed": true 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "#use this cell if you would like as well" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "Now that we've got a dataset, let's look at some options for scaling the data. As well as how the data might be scaled. There are two very common types of feature scaling that we should discuss:\n", 78 | "\n", 79 | "\n", 80 | "**I. MinMaxScaler**\n", 81 | "\n", 82 | "In some cases it is useful to think of your data in terms of the percent they are as compared to the maximum value. In these cases, you will want to use **MinMaxScaler**.\n", 83 | "\n", 84 | "**II. StandardScaler**\n", 85 | "\n", 86 | "Another very popular type of scaling is to scale data so that it has mean 0 and variance 1. In these cases, you will want to use **StandardScaler**. \n", 87 | "\n", 88 | "It is probably more appropriate with this data to use **StandardScaler**. However, to get practice with feature scaling methods in python, we will perform both.\n", 89 | "\n", 90 | "`2.` First let's fit the **StandardScaler** transformation to this dataset. I will do this one so you can see how to apply preprocessing in sklearn." 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "collapsed": true 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "df_ss = p.StandardScaler().fit_transform(df) # Fit and transform the data" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "collapsed": true 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "df_ss = pd.DataFrame(df_ss) #create a dataframe\n", 113 | "df_ss.columns = ['height', 'weight'] #add column names again\n", 114 | "\n", 115 | "plt.scatter(df_ss['height'], df_ss['weight']); # create a plot" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "`3.` Now it's your turn. Try fitting the **MinMaxScaler** transformation to this dataset. You should be able to use the previous example to assist." 
123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "collapsed": true 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "# fit and transform" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "#create a dataframe\n", 145 | "#change the column names\n", 146 | "#plot the data" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "`4.` Now let's take a look at how kmeans divides the dataset into different groups for each of the different scalings of the data. Did you end up with different clusters when the data was scaled differently?" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "collapsed": true 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "def fit_kmeans(data, centers):\n", 165 | " '''\n", 166 | " INPUT:\n", 167 | " data = the dataset you would like to fit kmeans to (dataframe)\n", 168 | " centers = the number of centroids (int)\n", 169 | " OUTPUT:\n", 170 | " labels - the labels for each datapoint to which group it belongs (nparray)\n", 171 | " \n", 172 | " '''\n", 173 | " kmeans = KMeans(centers)\n", 174 | " labels = kmeans.fit_predict(data)\n", 175 | " return labels\n", 176 | "\n", 177 | "labels = fit_kmeans(df, 10) #fit kmeans to get the labels\n", 178 | " \n", 179 | "# Plot the original data with clusters\n", 180 | "plt.scatter(df['height'], df['weight'], c=labels, cmap='Set1');" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": { 187 | "collapsed": true 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "#plot each of the scaled datasets" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "#another plot of the other scaled dataset" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "Write your response here!" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "collapsed": true 217 | }, 218 | "outputs": [], 219 | "source": [] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Python 3", 225 | "language": "python", 226 | "name": "python3" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "ipython", 231 | "version": 3 232 | }, 233 | "file_extension": ".py", 234 | "mimetype": "text/x-python", 235 | "name": "python", 236 | "nbconvert_exporter": "python", 237 | "pygments_lexer": "ipython3", 238 | "version": "3.6.1" 239 | } 240 | }, 241 | "nbformat": 4, 242 | "nbformat_minor": 2 243 | } 244 | -------------------------------------------------------------------------------- /lessons/Unsupervised/1_Clustering/Identifying_Clusters.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Identifying Clusters\n", 8 | "\n", 9 | "Before we get too far along, let's take a look at some different sets of data to practice identifying clusters.\n", 10 | "\n", 11 | "Start by running the cell below." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import numpy as np\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "from mpl_toolkits.mplot3d import Axes3D\n", 25 | "from sklearn.cluster import KMeans\n", 26 | "from sklearn.datasets import make_blobs\n", 27 | "import helper_functions as h\n", 28 | "import test_file as t\n", 29 | "from IPython import display\n", 30 | "\n", 31 | "%matplotlib inline\n", 32 | "\n", 33 | "# Make the images larger\n", 34 | "plt.rcParams['figure.figsize'] = (16, 9)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "`1.` Run the cell below to generate a set of data. Then enter an integer next to **`question_1_clusters`** that identifies the number of clusters you think appear in the plot.\n", 42 | "\n", 43 | "If you think that there are 2 clusters in the plot, you should enter:\n", 44 | "\n", 45 | "```\n", 46 | "question_1_clusters = 2\n", 47 | "```" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "h.plot_q1_data()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "question_1_clusters = # Enter the number of clusters you see here as an integer\n", 66 | "\n", 67 | "#Then this will test your number against what we used to generate the data\n", 68 | "t.test_question_1(question_1_clusters)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "`2.` Run the cell below to generate a set of data. Then, similar to the first question, enter the number of clusters you think appear in the plot." 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "h.plot_q2_data()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "question_2_clusters = # Enter the number of clusters you see here as an integer\n", 94 | "\n", 95 | "#Then this will test your number against what we used to generate the data\n", 96 | "t.test_question_2(question_2_clusters)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "`3.` Run the cell below to generate a set of data. Then, similar to the previous questions, enter the number of clusters you think appear in the plot." 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "h.plot_q3_data()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "question_3_clusters = # Enter the number of clusters you see here as an integer\n", 122 | "\n", 123 | "#Then this will test your number against what we used to generate the data\n", 124 | "t.test_question_3(question_3_clusters)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "`4.` Now one final time, run the cell below, and identify the number of clusters you think are in the plot created. 
" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "h.plot_q4_data()" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "question_4_clusters = # Enter the number of clusters you see here as an integer\n", 150 | "\n", 151 | "#Then this will test your number against what we used to generate the data\n", 152 | "display.HTML(t.test_question_4(question_4_clusters))" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": { 158 | "collapsed": true 159 | }, 160 | "source": [ 161 | "**You can find a solution to this by clicking the orange jupyter image at the top of this notebook.**" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": { 168 | "collapsed": true 169 | }, 170 | "outputs": [], 171 | "source": [] 172 | } 173 | ], 174 | "metadata": { 175 | "kernelspec": { 176 | "display_name": "Python 3", 177 | "language": "python", 178 | "name": "python3" 179 | }, 180 | "language_info": { 181 | "codemirror_mode": { 182 | "name": "ipython", 183 | "version": 3 184 | }, 185 | "file_extension": ".py", 186 | "mimetype": "text/x-python", 187 | "name": "python", 188 | "nbconvert_exporter": "python", 189 | "pygments_lexer": "ipython3", 190 | "version": "3.6.3" 191 | } 192 | }, 193 | "nbformat": 4, 194 | "nbformat_minor": 2 195 | } 196 | -------------------------------------------------------------------------------- /lessons/Unsupervised/1_Clustering/Identifying_Clusters_Solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Identifying Clusters - Solution\n", 8 | "\n", 9 | "Before we get too far along, let's take a look at some different sets of data to practice identifying clusters.\n", 10 | "\n", 11 | "Start by running the cell below." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import numpy as np\n", 21 | "import matplotlib.pyplot as plt\n", 22 | "from mpl_toolkits.mplot3d import Axes3D\n", 23 | "from sklearn.cluster import KMeans\n", 24 | "from sklearn.datasets import make_blobs\n", 25 | "import helper_functions as h\n", 26 | "import test_file as t\n", 27 | "from IPython import display\n", 28 | "\n", 29 | "%matplotlib inline\n", 30 | "\n", 31 | "# Make the images larger\n", 32 | "plt.rcParams['figure.figsize'] = (16, 9)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "`1.` Run the cell below to generate a set of data. 
Then enter an integer next to **`question_1_clusters`** that identifies the number of clusters you think appear in the plot.\n", 40 | "\n", 41 | "If you think that there are 2 clusters in the plot, you should enter:\n", 42 | "\n", 43 | "```\n", 44 | "question_1_clusters = 3\n", 45 | "```" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "h.plot_q1_data()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "question_1_clusters = 4\n", 64 | "\n", 65 | "#Then this will test your number against what we used to generate the data\n", 66 | "t.test_question_1(question_1_clusters)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "`2.` Run the cell below to generate a set of data. Then, similar to the first question, enter the number of clusters you think appear in the plot." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "h.plot_q2_data()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "question_2_clusters = 2 \n", 92 | "\n", 93 | "#Then this will test your number against what we used to generate the data\n", 94 | "t.test_question_2(question_2_clusters)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "`3.` Run the cell below to generate a set of data. Then, similar to the previous questions, enter the number of clusters you think appear in the plot." 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "h.plot_q3_data()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "question_3_clusters = 5\n", 120 | "\n", 121 | "#Then this will test your number against what we used to generate the data\n", 122 | "t.test_question_3(question_3_clusters)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "`4.` Now one final time, run the cell below, and identify the number of clusters you think are in the plot created. 
" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "h.plot_q4_data()" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "question_4_clusters = 7\n", 148 | "\n", 149 | "#Then this will test your number against what we used to generate the data\n", 150 | "display.HTML(t.test_question_4(question_4_clusters))" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "**You can find a solution to this by clicking the orange jupyter image at the top of this notebook.**" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": { 164 | "collapsed": true 165 | }, 166 | "outputs": [], 167 | "source": [] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Python 3", 173 | "language": "python", 174 | "name": "python3" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.6.1" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 2 191 | } 192 | -------------------------------------------------------------------------------- /lessons/Unsupervised/1_Clustering/giphy (1).gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Unsupervised/1_Clustering/giphy (1).gif -------------------------------------------------------------------------------- /lessons/Unsupervised/1_Clustering/giphy.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Unsupervised/1_Clustering/giphy.gif -------------------------------------------------------------------------------- /lessons/Unsupervised/1_Clustering/helper_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from mpl_toolkits.mplot3d import Axes3D 4 | from sklearn.cluster import KMeans 5 | from sklearn.datasets import make_blobs 6 | 7 | # Generate Question 1 Data 8 | X, y = make_blobs(n_samples=500, n_features=3, centers=4, random_state=5) 9 | 10 | def plot_q1_data(): 11 | fig = plt.figure(); 12 | ax = Axes3D(fig) 13 | ax.scatter(X[:, 0], X[:, 1], X[:, 2]); 14 | 15 | 16 | # Generate Question 2 Data 17 | Z, y = make_blobs(n_samples=500, n_features=5, centers=2, random_state=42) 18 | 19 | def plot_q2_data(): 20 | fig = plt.figure() 21 | plt.scatter(Z[:, 0], Z[:, 1]); 22 | 23 | # Generate Question 3 Data 24 | T, y = make_blobs(n_samples=500, n_features=5, centers=8, random_state=5) 25 | 26 | def plot_q3_data(): 27 | fig = plt.figure(); 28 | ax = Axes3D(fig) 29 | ax.scatter(T[:, 1], T[:, 3], T[:, 4]); 30 | 31 | # Plot data for Question 4 32 | def plot_q4_data(): 33 | fig = plt.figure(); 34 | ax = Axes3D(fig) 35 | ax.scatter(T[:, 1], T[:, 2], T[:, 3]); 36 | -------------------------------------------------------------------------------- /lessons/Unsupervised/1_Clustering/helpers2.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from mpl_toolkits.mplot3d import Axes3D 4 | from sklearn.cluster import KMeans 5 | from sklearn.datasets import make_blobs 6 | 7 | def simulate_data(n = 500, features = 10, centroids = 3): 8 | ''' 9 | Simulates n data points, each with number of features equal to features, with a number of centers equal to centroids 10 | INPUT (defaults) 11 | n = number of rows (500) 12 | features = number of columns (10) 13 | centroids = number of centers (3) 14 | Output 15 | dataset = a dataset with the the specified characteristics 16 | ''' 17 | dataset, y = make_blobs(n_samples=n, n_features=features, centers=centroids, random_state=42) 18 | 19 | return dataset 20 | 21 | def plot_data(data, labels): 22 | ''' 23 | Plot data with colors associated with labels 24 | ''' 25 | fig = plt.figure(); 26 | ax = Axes3D(fig) 27 | ax.scatter(data[:, 0], data[:, 1], data[:, 2], c=labels, cmap='tab10'); 28 | 29 | data = simulate_data(200, 5, 4) 30 | 31 | def get_kmeans_score(data, center): 32 | ''' 33 | returns the kmeans score regarding SSE for points to centers 34 | INPUT: 35 | data - the dataset you want to fit kmeans to 36 | center - the number of centers you want (the k value) 37 | OUTPUT: 38 | score - the SSE score for the kmeans model fit to the data 39 | ''' 40 | #instantiate kmeans 41 | kmeans = KMeans(n_clusters=center) 42 | 43 | # Then fit the model to your data using the fit method 44 | model = kmeans.fit(data) 45 | 46 | # Obtain a score related to the model fit 47 | score = np.abs(model.score(data)) 48 | 49 | return score 50 | 51 | def fit_mods(): 52 | scores = [] 53 | centers = list(range(1,11)) 54 | 55 | for center in centers: 56 | scores.append(get_kmeans_score(data, center)) 57 | 58 | return centers, scores 59 | -------------------------------------------------------------------------------- /lessons/Unsupervised/1_Clustering/ml-latest-small/README.txt: -------------------------------------------------------------------------------- 1 | Summary 2 | ======= 3 | 4 | This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org), a movie recommendation service. It contains 100004 ratings and 1296 tag applications across 9125 movies. These data were created by 671 users between January 09, 1995 and October 16, 2016. This dataset was generated on October 17, 2016. 5 | 6 | Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided. 7 | 8 | The data are contained in the files `links.csv`, `movies.csv`, `ratings.csv` and `tags.csv`. More details about the contents and use of all these files follows. 9 | 10 | This is a *development* dataset. As such, it may change over time and is not an appropriate dataset for shared research results. See available *benchmark* datasets if that is your intent. 11 | 12 | This and other GroupLens data sets are publicly available for download at . 13 | 14 | 15 | Usage License 16 | ============= 17 | 18 | Neither the University of Minnesota nor any of the researchers involved can guarantee the correctness of the data, its suitability for any particular purpose, or the validity of results based on the use of the data set. 
The data set may be used for any research purposes under the following conditions: 19 | 20 | * The user may not state or imply any endorsement from the University of Minnesota or the GroupLens Research Group. 21 | * The user must acknowledge the use of the data set in publications resulting from the use of the data set (see below for citation information). 22 | * The user may redistribute the data set, including transformations, so long as it is distributed under these same license conditions. 23 | * The user may not use this information for any commercial or revenue-bearing purposes without first obtaining permission from a faculty member of the GroupLens Research Project at the University of Minnesota. 24 | * The executable software scripts are provided "as is" without warranty of any kind, either expressed or implied, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose. The entire risk as to the quality and performance of them is with you. Should the program prove defective, you assume the cost of all necessary servicing, repair or correction. 25 | 26 | In no event shall the University of Minnesota, its affiliates or employees be liable to you for any damages arising out of the use or inability to use these programs (including but not limited to loss of data or data being rendered inaccurate). 27 | 28 | If you have any further questions or comments, please email 29 | 30 | 31 | Citation 32 | ======== 33 | 34 | To acknowledge use of the dataset in publications, please cite the following paper: 35 | 36 | > F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages. DOI= 37 | 38 | 39 | Further Information About GroupLens 40 | =================================== 41 | 42 | GroupLens is a research group in the Department of Computer Science and Engineering at the University of Minnesota. Since its inception in 1992, GroupLens's research projects have explored a variety of fields including: 43 | 44 | * recommender systems 45 | * online communities 46 | * mobile and ubiquitious technologies 47 | * digital libraries 48 | * local geographic information systems 49 | 50 | GroupLens Research operates a movie recommender based on collaborative filtering, MovieLens, which is the source of these data. We encourage you to visit to try it out! If you have exciting ideas for experimental work to conduct on MovieLens, send us an email at - we are always interested in working with external collaborators. 51 | 52 | 53 | Content and Use of Files 54 | ======================== 55 | 56 | Formatting and Encoding 57 | ----------------------- 58 | 59 | The dataset files are written as [comma-separated values](http://en.wikipedia.org/wiki/Comma-separated_values) files with a single header row. Columns that contain commas (`,`) are escaped using double-quotes (`"`). These files are encoded as UTF-8. If accented characters in movie titles or tag values (e.g. Misérables, Les (1995)) display incorrectly, make sure that any program reading the data, such as a text editor, terminal, or script, is configured for UTF-8. 60 | 61 | User Ids 62 | -------- 63 | 64 | MovieLens users were selected at random for inclusion. Their ids have been anonymized. User ids are consistent between `ratings.csv` and `tags.csv` (i.e., the same id refers to the same user across the two files). 
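As a minimal sketch of reading these files (assuming pandas, the `ml-latest-small` directory bundled with this lesson, and the column names given in the file descriptions later in this document), the UTF-8 encoding and double-quote escaping described above are handled by the `pandas.read_csv` defaults:

    import pandas as pd

    # encoding='utf-8' keeps accented titles (e.g. Misérables, Les (1995)) intact
    ratings = pd.read_csv('ml-latest-small/ratings.csv', encoding='utf-8')
    tags = pd.read_csv('ml-latest-small/tags.csv', encoding='utf-8')

    # userId and movieId are consistent across the files, so they can be joined directly;
    # both files also carry a timestamp column, hence the suffixes
    rated_and_tagged = ratings.merge(tags, on=['userId', 'movieId'],
                                     how='inner', suffixes=('_rating', '_tag'))
    print(rated_and_tagged.head())
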
65 | 66 | Movie Ids 67 | --------- 68 | 69 | Only movies with at least one rating or tag are included in the dataset. These movie ids are consistent with those used on the MovieLens web site (e.g., id `1` corresponds to the URL ). Movie ids are consistent between `ratings.csv`, `tags.csv`, `movies.csv`, and `links.csv` (i.e., the same id refers to the same movie across these four data files). 70 | 71 | 72 | Ratings Data File Structure (ratings.csv) 73 | ----------------------------------------- 74 | 75 | All ratings are contained in the file `ratings.csv`. Each line of this file after the header row represents one rating of one movie by one user, and has the following format: 76 | 77 | userId,movieId,rating,timestamp 78 | 79 | The lines within this file are ordered first by userId, then, within user, by movieId. 80 | 81 | Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars). 82 | 83 | Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970. 84 | 85 | Tags Data File Structure (tags.csv) 86 | ----------------------------------- 87 | 88 | All tags are contained in the file `tags.csv`. Each line of this file after the header row represents one tag applied to one movie by one user, and has the following format: 89 | 90 | userId,movieId,tag,timestamp 91 | 92 | The lines within this file are ordered first by userId, then, within user, by movieId. 93 | 94 | Tags are user-generated metadata about movies. Each tag is typically a single word or short phrase. The meaning, value, and purpose of a particular tag is determined by each user. 95 | 96 | Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970. 97 | 98 | Movies Data File Structure (movies.csv) 99 | --------------------------------------- 100 | 101 | Movie information is contained in the file `movies.csv`. Each line of this file after the header row represents one movie, and has the following format: 102 | 103 | movieId,title,genres 104 | 105 | Movie titles are entered manually or imported from , and include the year of release in parentheses. Errors and inconsistencies may exist in these titles. 106 | 107 | Genres are a pipe-separated list, and are selected from the following: 108 | 109 | * Action 110 | * Adventure 111 | * Animation 112 | * Children's 113 | * Comedy 114 | * Crime 115 | * Documentary 116 | * Drama 117 | * Fantasy 118 | * Film-Noir 119 | * Horror 120 | * Musical 121 | * Mystery 122 | * Romance 123 | * Sci-Fi 124 | * Thriller 125 | * War 126 | * Western 127 | * (no genres listed) 128 | 129 | Links Data File Structure (links.csv) 130 | --------------------------------------- 131 | 132 | Identifiers that can be used to link to other sources of movie data are contained in the file `links.csv`. Each line of this file after the header row represents one movie, and has the following format: 133 | 134 | movieId,imdbId,tmdbId 135 | 136 | movieId is an identifier for movies used by . E.g., the movie Toy Story has the link . 137 | 138 | imdbId is an identifier for movies used by . E.g., the movie Toy Story has the link . 139 | 140 | tmdbId is an identifier for movies used by . E.g., the movie Toy Story has the link . 141 | 142 | Use of the resources listed above is subject to the terms of each provider. 143 | 144 | Cross-Validation 145 | ---------------- 146 | 147 | Prior versions of the MovieLens dataset included either pre-computed cross-folds or scripts to perform this computation. 
We no longer bundle either of these features with the dataset, since most modern toolkits provide this as a built-in feature. If you wish to learn about standard approaches to cross-fold computation in the context of recommender systems evaluation, see [LensKit](http://lenskit.org) for tools, documentation, and open-source code examples. 148 | -------------------------------------------------------------------------------- /lessons/Unsupervised/1_Clustering/test_file.py: -------------------------------------------------------------------------------- 1 | def display_gif(fn): 2 | return ''.format(fn) 3 | 4 | 5 | def test_question_1(clusters): 6 | if clusters == 4: 7 | print("That's right! There are 4 clusters in this dataset.") 8 | elif clusters < 4: 9 | print("Oops! We were thinking there were actually more clusters than what you suggested. Try again. A cluster is a group of points that are closer together and separated from other points in the dataset.") 10 | else: 11 | print("Oops! We were thinking there were fewer clusters than what you suggested. Try again. A cluster is a group of points that are closer together and separated from other points in the dataset.") 12 | 13 | 14 | def test_question_2(clusters): 15 | if clusters == 2: 16 | print("That's right! There are 2 clusters in this dataset.") 17 | else: 18 | print("Oops! That doesn't look like what we expected for the number of clusters. Try again. A cluster is a group of points that are closer together and separated from other points in the dataset.") 19 | 20 | def test_question_3(clusters): 21 | print("{} is a reasonable guess for a the number of clusters here. In the next question, you will see a different angle of this data.".format(clusters)) 22 | 23 | def test_question_4(clusters): 24 | print("This data is actually the same as the data used in question 3. Isn't it crazy how looking at data from a different angle can make us believe there are a different number of clusters in the data! We will look at how to address this in the upcoming parts of this lesson.") 25 | return display_gif('http://www.reactiongifs.com/wp-content/uploads/2013/03/mind-blown.gif') 26 | -------------------------------------------------------------------------------- /lessons/Unsupervised/1_Clustering/tests.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from mpl_toolkits.mplot3d import Axes3D 4 | from sklearn.cluster import KMeans 5 | from sklearn.datasets import make_blobs 6 | 7 | def display_gif(fn): 8 | return ''.format(fn) 9 | 10 | 11 | def test_question_1(data1): 12 | if data1.shape[0] == 200 and data1.shape[1] == 5: 13 | print("Looks good! Continue!") 14 | else: 15 | print("Oops, that looks different than what we expected! The first argument should be the number of rows, the second the number of columns, and the final should be the number of centers.") 16 | 17 | def test_question_2(k_value): 18 | if k_value == 4: 19 | print("That's right! The value of k is the same as the number of centroids used to create your dataset.") 20 | else: 21 | print("Oops! That doesn't seem right! The value of k should be the same as the number of centroids you used in your dataset. In this case, the value for k should be 4.") 22 | 23 | def test_question_7(k_value): 24 | if k_value == 4: 25 | print("That's right! We set up the data with 4 centers, and the plot is consistent! 
We can see a strong leveling off after 4 clusters, which suggests 4 clusters should be used.") 26 | 27 | return display_gif("https://media2.giphy.com/media/3ohzdIuqJoo8QdKlnW/giphy.gif") 28 | else: 29 | print("Oops! That doesn't seem right! The value of k should be where the 'elbow' can be found in the scree plot. You can see 4-10 all have similar SSE values, suggesting that 4 clusters is the minimum number of clusters to significantly reduce the SSE from centroids to each point.") 30 | -------------------------------------------------------------------------------- /lessons/Unsupervised/1_Clustering/tests2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | from sklearn.cluster import KMeans 6 | from IPython.display import Image 7 | from sklearn.datasets.samples_generator import make_blobs 8 | 9 | 10 | 11 | def check_q1(stuff): 12 | a = 0 13 | b = 60 14 | c = 22.9 15 | d = 4.53 16 | e = 511.7 17 | 18 | q1_dict = { 19 | 'number of missing values': a, 20 | 'the mean 5k time in minutes': c, 21 | 'the mean test score as a raw value': e, 22 | 'number of individuals in the dataset': b 23 | } 24 | 25 | if stuff == q1_dict: 26 | print("That looks right!") 27 | 28 | else: 29 | print("Oops! That doesn't look quite right! Try again.") 30 | 31 | 32 | def check_q5(stuff): 33 | a = 'We should always use normalizing' 34 | b = 'We should always scale our variables between 0 and 1.' 35 | c = 'Variable scale will frequently influence your results, so it is important to standardize for all of these algorithms.' 36 | d = 'Scaling will not change the results of your output.' 37 | 38 | if stuff == c: 39 | return Image(filename="./giphy.gif") 40 | else: 41 | print("Oops! That doesn't look quite right. 
Try again!") -------------------------------------------------------------------------------- /lessons/Unsupervised/2_HierarchcalDensityClustering/dbscan_lab_helper.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from itertools import cycle, islice 4 | from sklearn import cluster 5 | 6 | figsize = (10,10) 7 | point_size=150 8 | point_border=0.8 9 | 10 | 11 | def plot_dataset(dataset, xlim=(-15, 15), ylim=(-15, 15)): 12 | plt.figure(figsize=figsize) 13 | plt.scatter(dataset[:,0], dataset[:,1], s=point_size, color="#00B3E9", edgecolor='black', lw=point_border) 14 | plt.xlim(xlim) 15 | plt.ylim(ylim) 16 | plt.show() 17 | 18 | def plot_clustered_dataset(dataset, y_pred, xlim=(-15, 15), ylim=(-15, 15), neighborhood=False, epsilon=0.5): 19 | 20 | fig, ax = plt.subplots(figsize=figsize) 21 | 22 | colors = np.array(list(islice(cycle(['#df8efd', '#78c465', '#ff8e34', 23 | '#f65e97', '#a65628', '#984ea3', 24 | '#999999', '#e41a1c', '#dede00']), 25 | int(max(y_pred) + 1)))) 26 | colors = np.append(colors, '#BECBD6') 27 | 28 | 29 | if neighborhood: 30 | for point in dataset: 31 | circle1 = plt.Circle(point, epsilon, color='#666666', fill=False, zorder=0, alpha=0.3) 32 | ax.add_artist(circle1) 33 | 34 | ax.scatter(dataset[:, 0], dataset[:, 1], s=point_size, color=colors[y_pred], zorder=10, edgecolor='black', lw=point_border) 35 | plt.xlim(xlim) 36 | plt.ylim(ylim) 37 | plt.show() 38 | 39 | def plot_dbscan_grid(dataset, eps_values, min_samples_values): 40 | 41 | fig = plt.figure(figsize=(16, 20)) 42 | plt.subplots_adjust(left=.02, right=.98, bottom=0.001, top=.96, wspace=.05, 43 | hspace=0.25) 44 | 45 | 46 | plot_num = 1 47 | 48 | for i, min_samples in enumerate(min_samples_values): 49 | for j, eps in enumerate(eps_values): 50 | ax = fig.add_subplot( len(min_samples_values) , len(eps_values), plot_num) 51 | 52 | dbscan = cluster.DBSCAN(eps=eps, min_samples=min_samples) 53 | y_pred_2 = dbscan.fit_predict(dataset) 54 | 55 | colors = np.array(list(islice(cycle(['#df8efd', '#78c465', '#ff8e34', 56 | '#f65e97', '#a65628', '#984ea3', 57 | '#999999', '#e41a1c', '#dede00']), 58 | int(max(y_pred_2) + 1)))) 59 | colors = np.append(colors, '#BECBD6') 60 | 61 | 62 | for point in dataset: 63 | circle1 = plt.Circle(point, eps, color='#666666', fill=False, zorder=0, alpha=0.3) 64 | ax.add_artist(circle1) 65 | 66 | ax.text(0, -0.03, 'Epsilon: {} \nMin_samples: {}'.format(eps, min_samples), transform=ax.transAxes, fontsize=16, va='top') 67 | ax.scatter(dataset[:, 0], dataset[:, 1], s=50, color=colors[y_pred_2], zorder=10, edgecolor='black', lw=0.5) 68 | 69 | 70 | plt.xticks(()) 71 | plt.yticks(()) 72 | plt.xlim(-14, 5) 73 | plt.ylim(-12, 7) 74 | 75 | plot_num = plot_num + 1 76 | 77 | plt.show() -------------------------------------------------------------------------------- /lessons/Unsupervised/2_HierarchcalDensityClustering/images/high_epsilon_and_high_min_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Unsupervised/2_HierarchcalDensityClustering/images/high_epsilon_and_high_min_sample.png -------------------------------------------------------------------------------- /lessons/Unsupervised/2_HierarchcalDensityClustering/images/high_epsilon_and_low_min_sample.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Unsupervised/2_HierarchcalDensityClustering/images/high_epsilon_and_low_min_sample.png -------------------------------------------------------------------------------- /lessons/Unsupervised/2_HierarchcalDensityClustering/images/low_epsilon_and_high_min_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Unsupervised/2_HierarchcalDensityClustering/images/low_epsilon_and_high_min_sample.png -------------------------------------------------------------------------------- /lessons/Unsupervised/2_HierarchcalDensityClustering/images/low_epsilon_and_low_min_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Unsupervised/2_HierarchcalDensityClustering/images/low_epsilon_and_low_min_sample.png -------------------------------------------------------------------------------- /lessons/Unsupervised/4_PCA/Interpret_PCA_Results.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Your Turn!\n", 8 | "\n", 9 | "In the last video, you saw two of the main aspects of principal components:\n", 10 | "\n", 11 | "1. **The amount of variability captured by the component.**\n", 12 | "2. **The components themselves.**\n", 13 | "\n", 14 | "In this notebook, you will get a chance to explore these a bit more yourself. First, let's read in the necessary libraries, as well as the data." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import pandas as pd\n", 24 | "import numpy as np\n", 25 | "from sklearn.decomposition import PCA\n", 26 | "from sklearn.preprocessing import StandardScaler\n", 27 | "from sklearn.ensemble import RandomForestClassifier\n", 28 | "from sklearn.model_selection import train_test_split\n", 29 | "from sklearn.metrics import confusion_matrix, accuracy_score\n", 30 | "from helper_functions import show_images, do_pca, scree_plot, plot_component\n", 31 | "import test_code as t\n", 32 | "\n", 33 | "import matplotlib.image as mpimg\n", 34 | "import matplotlib.pyplot as plt\n", 35 | "import seaborn as sns\n", 36 | "\n", 37 | "%matplotlib inline\n", 38 | "\n", 39 | "#read in our dataset\n", 40 | "train = pd.read_csv('./data/train.csv')\n", 41 | "train.fillna(0, inplace=True)\n", 42 | "\n", 43 | "# save the labels to a Pandas series target\n", 44 | "y = train['label']\n", 45 | "# Drop the label feature\n", 46 | "X = train.drop(\"label\",axis=1)\n", 47 | "\n", 48 | "show_images(30)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "`1.` Perform PCA on the **X** matrix using on your own or using the **do_pca** function from the **helper_functions** module. Reduce the original more than 700 features to only 10 principal components." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "# Make sure to keep track of the resulting components and the pca object\n", 65 | "do_pca?" 
66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "source": [ 74 | "`2.` Now use the **scree_plot** function from the **helper_functions** module to take a closer look at the results of your analysis." 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "scree_plot? #Use the scree plot to answer the next question" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "`3.` Using the results of your scree plot, match each letter as the value to the correct key in the **solution_three** dictionary. Once you are confident in your solution run the next cell to see if your solution matches ours." 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "a = True\n", 100 | "b = False\n", 101 | "c = 6.13\n", 102 | "d = 'The total amount of variability in the data explained by the first two principal components'\n", 103 | "e = None\n", 104 | "\n", 105 | "solution_three = {\n", 106 | " '10.42' : #letter, \n", 107 | " 'The first component will ALWAYS have the most amount of variability explained.': #letter,\n", 108 | " 'The total amount of variability in the data explained by the first component': #letter,\n", 109 | " 'The sum of the variability explained by all the components can be greater than 100%': #letter\n", 110 | "}" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "#Run this cell to see if your solution matches ours\n", 120 | "t.question_3_check(solution_three)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "`4.` Use the **plot_component** function from the **helper_functions** module to look at each of the components (remember they are 0 indexed). Use the results to assist with question 5." 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "# Use the images of the component weights to answer the next question\n", 137 | "plot_component? " 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "`5.` Using the results from viewing each of your principal component weights in question 4, change the following values of the **solution_five** dictionary to the **number of the index** for the principal component that best matches the description. Once you are confident in your solution run the next cell to see if your solution matches ours." 
145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "solution_five = {\n", 154 | " 'This component looks like it will assist in identifying zero': #number 0-9,\n", 155 | " 'This component looks like it will assist in identifying three': #number 0-9\n", 156 | "}" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "#Run this cell to see if your solution matches ours\n", 166 | "t.question_5_check(solution_five)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "From this notebook, you have had an opportunity to look at the two major parts of PCA:\n", 174 | "\n", 175 | "`I.` The amount of **variance explained by each component**. This is called an **eigenvalue**.\n", 176 | "\n", 177 | "`II.` The principal components themselves, each component is a vector of weights. In this case, the principal components help us understand which pixels of the image are most helpful in identifying the difference between between digits. **Principal components** are also known as **eigenvectors**." 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [] 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "display_name": "Python 3", 191 | "language": "python", 192 | "name": "python3" 193 | }, 194 | "language_info": { 195 | "codemirror_mode": { 196 | "name": "ipython", 197 | "version": 3 198 | }, 199 | "file_extension": ".py", 200 | "mimetype": "text/x-python", 201 | "name": "python", 202 | "nbconvert_exporter": "python", 203 | "pygments_lexer": "ipython3", 204 | "version": "3.6.3" 205 | } 206 | }, 207 | "nbformat": 4, 208 | "nbformat_minor": 2 209 | } 210 | -------------------------------------------------------------------------------- /lessons/Unsupervised/4_PCA/Interpret_PCA_Results_Solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Your Turn! (Solution)\n", 8 | "\n", 9 | "In the last video, you saw two of the main aspects of principal components:\n", 10 | "\n", 11 | "1. **The amount of variability captured by the component.**\n", 12 | "2. **The components themselves.**\n", 13 | "\n", 14 | "In this notebook, you will get a chance to explore these a bit more yourself. First, let's read in the necessary libraries, as well as the data." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import pandas as pd\n", 24 | "import numpy as np\n", 25 | "from sklearn.decomposition import PCA\n", 26 | "from sklearn.preprocessing import StandardScaler\n", 27 | "from sklearn.ensemble import RandomForestClassifier\n", 28 | "from sklearn.model_selection import train_test_split\n", 29 | "from sklearn.metrics import confusion_matrix, accuracy_score\n", 30 | "from helper_functions import show_images, do_pca, scree_plot, plot_component\n", 31 | "import test_code as t\n", 32 | "\n", 33 | "import matplotlib.image as mpimg\n", 34 | "import matplotlib.pyplot as plt\n", 35 | "import seaborn as sns\n", 36 | "\n", 37 | "%matplotlib inline\n", 38 | "\n", 39 | "#read in our dataset\n", 40 | "train = pd.read_csv('./data/train.csv')\n", 41 | "train.fillna(0, inplace=True)\n", 42 | "\n", 43 | "# save the labels to a Pandas series target\n", 44 | "y = train['label']\n", 45 | "# Drop the label feature\n", 46 | "X = train.drop(\"label\",axis=1)\n", 47 | "\n", 48 | "show_images(30)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "`1.` Perform PCA on the **X** matrix using on your own or using the **do_pca** function from the **helper_functions** module. Reduce the original more than 700 features to only 10 principal components." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "pca, X_pca = do_pca(10, X)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "source": [ 75 | "`2.` Now use the **scree_plot** function from the **helper_functions** module to take a closer look at the results of your analysis." 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "scree_plot(pca)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "`3.` Using the results of your scree plot, match each letter as the value to the correct key in the **solution_three** dictionary. Once you are confident in your solution run the next cell to see if your solution matches ours." 
92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "a = True\n", 101 | "b = False\n", 102 | "c = 6.13\n", 103 | "d = 'The total amount of variability in the data explained by the first two principal components'\n", 104 | "e = None\n", 105 | "\n", 106 | "solution_three = {\n", 107 | " '10.42' : d, \n", 108 | " 'The first component will ALWAYS have the most amount of variability explained.': a,\n", 109 | " 'The total amount of variability in the data explained by the first component': c,\n", 110 | " 'The sum of the variability explained by all the components can be greater than 100%': b\n", 111 | "}" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "#Run this cell to see if your solution matches ours\n", 121 | "t.question_3_check(solution_three)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "`4.` Use the **plot_component** function from the **helper_functions** module to look at each of the components (remember they are 0 indexed). Use the results to assist with question 5." 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "plot_component(pca, 3)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "`5.` Using the results from viewing each of your principal component weights in question 4, change the following values of the **solution_five** dictionary to the **number of the index** for the principal component that best matches the description. Once you are confident in your solution run the next cell to see if your solution matches ours." 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "collapsed": true 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "solution_five = {\n", 156 | " 'This component looks like it will assist in identifying zero': 0,\n", 157 | " 'This component looks like it will assist in identifying three': 3\n", 158 | "}" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "#Run this cell to see if your solution matches ours\n", 168 | "t.question_5_check(solution_five)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "From this notebook, you have had an opportunity to look at the two major parts of PCA:\n", 176 | "\n", 177 | "`I.` The amount of **variance explained by each component**. This is called an **eigenvalue**.\n", 178 | "\n", 179 | "`II.` The principal components themselves; each component is a vector of weights. In this case, the principal components help us understand which pixels of the image are most helpful in identifying the difference between digits. **Principal components** are also known as **eigenvectors**." 
180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": { 186 | "collapsed": true 187 | }, 188 | "outputs": [], 189 | "source": [] 190 | } 191 | ], 192 | "metadata": { 193 | "kernelspec": { 194 | "display_name": "Python 3", 195 | "language": "python", 196 | "name": "python3" 197 | }, 198 | "language_info": { 199 | "codemirror_mode": { 200 | "name": "ipython", 201 | "version": 3 202 | }, 203 | "file_extension": ".py", 204 | "mimetype": "text/x-python", 205 | "name": "python", 206 | "nbconvert_exporter": "python", 207 | "pygments_lexer": "ipython3", 208 | "version": "3.6.3" 209 | } 210 | }, 211 | "nbformat": 4, 212 | "nbformat_minor": 2 213 | } 214 | -------------------------------------------------------------------------------- /lessons/Unsupervised/4_PCA/PCA_Mini_Project_Solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### PCA Mini Project - Solution\n", 8 | "\n", 9 | "In the lesson, you saw how you could use PCA to substantially reduce the dimensionality of the handwritten digits. In this mini-project, you will be using the **cars.csv** file. \n", 10 | "\n", 11 | "To begin, run the cell below to read in the necessary libraries and the dataset. I also read in the helper functions that you used throughout the lesson in case you might find them helpful in completing this project. Otherwise, you can always create functions of your own!" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "from sklearn.decomposition import PCA\n", 23 | "from sklearn.preprocessing import StandardScaler\n", 24 | "from sklearn.ensemble import RandomForestClassifier\n", 25 | "from sklearn.model_selection import train_test_split\n", 26 | "from sklearn.metrics import confusion_matrix, accuracy_score\n", 27 | "from helper_functions import do_pca, scree_plot, plot_components, pca_results\n", 28 | "from IPython import display\n", 29 | "import test_code2 as t\n", 30 | "\n", 31 | "import matplotlib.pyplot as plt\n", 32 | "import seaborn as sns\n", 33 | "\n", 34 | "%matplotlib inline\n", 35 | "\n", 36 | "df = pd.read_csv('./data/cars.csv')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "`1.` Now your data is stored in **df**. Use the cells below to take a look at your dataset. At the end of your exploration, use your findings to match the appropriate variable to each key in the dictionary below. 
" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "df.head()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "df.describe()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "df.shape" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "a = 7\n", 82 | "b = 66\n", 83 | "c = 387\n", 84 | "d = 18\n", 85 | "e = 0.23\n", 86 | "f = 0.05\n", 87 | "\n", 88 | "\n", 89 | "solution_1_dict = {\n", 90 | " 'The number of cars in the dataset': c,\n", 91 | " 'The number of car features in the dataset': d,\n", 92 | " 'The number of dummy variables in the dataset': a,\n", 93 | " 'The proportion of minivans in the dataset': f,\n", 94 | " 'The max highway mpg for any car': b\n", 95 | "}" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "# Check your solution against ours by running this cell\n", 105 | "display.HTML(t.check_question_one(solution_1_dict))" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "`2.` There are some particularly nice properties about PCA to keep in mind. Use the dictionary below to match the correct variable as the key to each statement. When you are ready, check your solution against ours by running the following cell." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "a = True\n", 122 | "b = False\n", 123 | "\n", 124 | "solution_2_dict = {\n", 125 | " 'The components span the directions of maximum variability.': a,\n", 126 | " 'The components are always orthogonal to one another.': a,\n", 127 | " 'Eigenvalues tell us the amount of information a component holds': a\n", 128 | "}" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "# Check your solution against ours by running this cell\n", 138 | "t.check_question_two(solution_2_dict)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "`3.` Fit PCA to reduce the current dimensionality of the dataset to 3 dimensions. You can use the helper functions, or perform the steps on your own. If you fit on your own, be sure to standardize your data. At the end of this process, you will want an **X** matrix with the dimensionality reduced to only 3 features. Additionally, you will want your **pca** object back that has been used to fit and transform your dataset. " 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "pca, X_pca = do_pca(3, df)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "`4.` Once you have your pca object, you can take a closer look at what comprises each of the principal components. Use the **pca_results** function from the **helper_functions** module to assist with taking a closer look at the results of your analysis. 
The function takes two arguments: the full dataset and the pca object you created." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "pca_results(df, pca)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "`5.` Use the results, to match each of the variables as the value to the most appropriate key in the dictionary below. When you are ready to check your answers, run the following cell to see if your solution matches ours!" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "a = 'car weight'\n", 189 | "b = 'sports cars'\n", 190 | "c = 'gas mileage'\n", 191 | "d = 0.4352\n", 192 | "e = 0.3061\n", 193 | "f = 0.1667\n", 194 | "g = 0.7053\n", 195 | "\n", 196 | "solution_5_dict = {\n", 197 | " 'The first component positively weights items related to': c, \n", 198 | " 'The amount of variability explained by the first component is': d,\n", 199 | " 'The largest weight of the second component is related to': b,\n", 200 | " 'The total amount of variability explained by the first three components': g\n", 201 | "}" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "# Run this cell to check if your solution matches ours.\n", 211 | "t.check_question_five(solution_5_dict)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "`6.` How many components need to be kept to explain at least 85% of the variability in the original dataset? When you think you have the answer, store it in the variable `num_comps`. Then run the following cell to see if your solution matches ours!" 
219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "for comp in range(3, df.shape[1]):\n", 228 | " pca, X_pca = do_pca(comp, df)\n", 229 | " comp_check = pca_results(df, pca)\n", 230 | " if comp_check['Explained Variance'].sum() > 0.85:\n", 231 | " break\n", 232 | " \n", 233 | "\n", 234 | "num_comps = comp_check.shape[0]\n", 235 | "print(\"Using {} components, we can explain {}% of the variability in the original data.\".format(comp_check.shape[0], round(comp_check['Explained Variance'].sum()*100, 2)))" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "# Now check your answer here to complete this mini project\n", 245 | "display.HTML(t.question_check_six(num_comps))" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [] 254 | } 255 | ], 256 | "metadata": { 257 | "kernelspec": { 258 | "display_name": "Python 3", 259 | "language": "python", 260 | "name": "python3" 261 | }, 262 | "language_info": { 263 | "codemirror_mode": { 264 | "name": "ipython", 265 | "version": 3 266 | }, 267 | "file_extension": ".py", 268 | "mimetype": "text/x-python", 269 | "name": "python", 270 | "nbconvert_exporter": "python", 271 | "pygments_lexer": "ipython3", 272 | "version": "3.6.1" 273 | } 274 | }, 275 | "nbformat": 4, 276 | "nbformat_minor": 2 277 | } 278 | -------------------------------------------------------------------------------- /lessons/Unsupervised/4_PCA/helper_functions.py: -------------------------------------------------------------------------------- 1 | ########################################### 2 | # Suppress matplotlib user warnings 3 | # Necessary for newer version of matplotlib 4 | import warnings 5 | warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib") 6 | # 7 | # Display inline matplotlib plots with IPython 8 | from IPython import get_ipython 9 | get_ipython().run_line_magic('matplotlib', 'inline') 10 | ########################################### 11 | 12 | import pandas as pd 13 | import numpy as np 14 | from sklearn.decomposition import PCA 15 | from sklearn.preprocessing import StandardScaler 16 | from sklearn.ensemble import RandomForestClassifier 17 | from sklearn.model_selection import train_test_split 18 | from sklearn.metrics import confusion_matrix, accuracy_score 19 | 20 | import matplotlib.image as mpimg 21 | import matplotlib.pyplot as plt 22 | import matplotlib.cm as cm 23 | import seaborn as sns 24 | 25 | train = pd.read_csv('./data/train.csv') 26 | 27 | # save the labels to a Pandas series target 28 | y = train['label'] 29 | 30 | # Drop the label feature 31 | X = train.drop("label",axis=1) 32 | 33 | def show_images(num_images): 34 | ''' 35 | This function plots the first num_images images of the MNIST dataset. 36 | 37 | INPUT: int - The number of images you would like to view. 38 | Mod 10 of num_images should be 0 and it should be fewer than 101 images. 39 | OUTPUT: A figure with the images shown for the training data. 
40 | ''' 41 | if num_images % 10 == 0 and num_images <= 100: 42 | for digit_num in range(0,num_images): 43 | plt.subplot(num_images/10,10,digit_num+1) #create subplots 44 | mat_data = X.iloc[digit_num].as_matrix().reshape(28,28) #reshape images 45 | plt.imshow(mat_data) #plot the data 46 | plt.xticks([]) #removes numbered labels on x-axis 47 | plt.yticks([]) #removes numbered labels on y-axis 48 | else: 49 | print('That is not the right input, please read the docstring before continuing.') 50 | 51 | 52 | def show_images_by_digit(digit_to_see): 53 | ''' 54 | This function plots 50 images all of the type digits_to_see of the MNIST dataset. 55 | 56 | INPUT: digits_to_see - int - A number between 0 and 9 of what you want to see. 57 | OUTPUT: A figure with the images shown for the training data. 58 | ''' 59 | if digit_to_see in list(range(10)): 60 | indices = np.where(y == digit_to_see) # pull indices for num of interest 61 | for digit_num in range(0,50): 62 | plt.subplot(5,10, digit_num+1) #create subplots 63 | mat_data = X.iloc[indices[0][digit_num]].as_matrix().reshape(28,28) #reshape images 64 | plt.imshow(mat_data) #plot the data 65 | plt.xticks([]) #removes numbered labels on x-axis 66 | plt.yticks([]) #removes numbered labels on y-axis 67 | else: 68 | print('That is not the right input, please read the docstring before continuing.') 69 | 70 | 71 | def fit_random_forest_classifier(X, y): 72 | ''' 73 | INPUT: names are pretty self explanatory 74 | OUTPUT: none - prints the confusion matrix and accuracy 75 | ''' 76 | #First let's create training and testing data 77 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) 78 | 79 | #We could grid search and tune, but let's just fit a simple model to see how it does 80 | #instantiate 81 | clf = RandomForestClassifier(n_estimators=100, max_depth=None) 82 | 83 | #fit 84 | clf.fit(X_train, y_train) 85 | 86 | #predict 87 | y_preds = clf.predict(X_test) 88 | 89 | #score 90 | print(confusion_matrix(y_test, y_preds)) 91 | acc = accuracy_score(y_test, y_preds) 92 | print(acc) 93 | return acc 94 | 95 | 96 | def fit_random_forest_classifier2(X, y): 97 | ''' 98 | INPUT: X - the x-matrix of input features 99 | y - the response column 100 | OUTPUT: none - prints the confusion matrix and accuracy 101 | ''' 102 | #First let's create training and testing data 103 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) 104 | 105 | #We could grid search and tune, but let's just fit a simple model to see how it does 106 | #instantiate 107 | clf = RandomForestClassifier(n_estimators=100, max_depth=None) 108 | 109 | #fit 110 | clf.fit(X_train, y_train) 111 | 112 | #predict 113 | y_preds = clf.predict(X_test) 114 | 115 | #score 116 | acc = accuracy_score(y_test, y_preds) 117 | return acc 118 | 119 | 120 | def do_pca(n_components, data): 121 | ''' 122 | Transforms data using PCA to create n_components, and provides back the results of the 123 | transformation. 
124 | 125 | INPUT: n_components - int - the number of principal components to create 126 | data - the data you would like to transform 127 | 128 | OUTPUT: pca - the pca object created after fitting the data 129 | X_pca - the transformed X matrix with new number of components 130 | ''' 131 | X = StandardScaler().fit_transform(data) 132 | pca = PCA(n_components) 133 | X_pca = pca.fit_transform(X) 134 | return pca, X_pca 135 | 136 | 137 | def plot_components(X, y): 138 | ''' 139 | plots the data in a 2 dimensional space to view separation 140 | INPUT: X - the x-matrix of input features 141 | y - the response column 142 | OUTPUT: none 143 | ''' 144 | x_min, x_max = np.min(X, 0), np.max(X, 0) 145 | X = (X - x_min) / (x_max - x_min) 146 | plt.figure(figsize=(10, 6)) 147 | for i in range(X.shape[0]): 148 | plt.text(X[i, 0], X[i, 1], str(y[i]), color=plt.cm.Set1(y[i]), fontdict={'size': 15}) 149 | 150 | plt.xticks([]), plt.yticks([]), plt.ylim([-0.1,1.1]), plt.xlim([-0.1,1.1]) 151 | 152 | 153 | def scree_plot(pca): 154 | ''' 155 | Creates a scree plot associated with the principal components 156 | 157 | INPUT: pca - the result of instantiating PCA in scikit-learn 158 | 159 | OUTPUT: 160 | None 161 | ''' 162 | num_components=len(pca.explained_variance_ratio_) 163 | ind = np.arange(num_components) 164 | vals = pca.explained_variance_ratio_ 165 | 166 | plt.figure(figsize=(10, 6)) 167 | ax = plt.subplot(111) 168 | cumvals = np.cumsum(vals) 169 | ax.bar(ind, vals) 170 | ax.plot(ind, cumvals) 171 | for i in range(num_components): 172 | ax.annotate(r"%s%%" % ((str(vals[i]*100)[:4])), (ind[i]+0.2, vals[i]), va="bottom", ha="center", fontsize=12) 173 | 174 | ax.xaxis.set_tick_params(width=0) 175 | ax.yaxis.set_tick_params(width=2, length=12) 176 | 177 | ax.set_xlabel("Principal Component") 178 | ax.set_ylabel("Variance Explained (%)") 179 | plt.title('Explained Variance Per Principal Component') 180 | 181 | 182 | def plot_component(pca, comp): 183 | ''' 184 | Plots an image associated with each component to understand how the weighting 185 | of the component maps onto the original pixels 186 | INPUT: 187 | pca - pca object created from PCA in sklearn 188 | comp - int - the component you want to see starting at 0 189 | OUTPUT 190 | None 191 | ''' 192 | if comp < len(pca.components_): 193 | mat_data = np.asmatrix(pca.components_[comp]).reshape(28,28) #reshape images 194 | plt.imshow(mat_data); #plot the data 195 | plt.xticks([]) #removes numbered labels on x-axis 196 | plt.yticks([]) #removes numbered labels on y-axis 197 | else: 198 | print('That is not the right input, please read the docstring before continuing.') 199 | 200 | 201 | def pca_results(full_dataset, pca): 202 | ''' 203 | Create a DataFrame of the PCA results 204 | Includes dimension feature weights and explained variance 205 | Visualizes the PCA results 206 | ''' 207 | 208 | # Dimension indexing 209 | dimensions = ['Dimension {}'.format(i) for i in range(1,len(pca.components_)+1)] 210 | 211 | # PCA components 212 | components = pd.DataFrame(np.round(pca.components_, 4), columns = full_dataset.keys()) 213 | components.index = dimensions 214 | 215 | # PCA explained variance 216 | ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1) 217 | variance_ratios = pd.DataFrame(np.round(ratios, 4), columns = ['Explained Variance']) 218 | variance_ratios.index = dimensions 219 | 220 | # Create a bar plot visualization 221 | fig, ax = plt.subplots(figsize = (14,8)) 222 | 223 | # Plot the feature weights as a function of the components 224 | 
components.plot(ax = ax, kind = 'bar'); 225 | ax.set_ylabel("Feature Weights") 226 | ax.set_xticklabels(dimensions, rotation=0) 227 | 228 | 229 | # Display the explained variance ratios 230 | for i, ev in enumerate(pca.explained_variance_ratio_): 231 | ax.text(i-0.40, ax.get_ylim()[1] + 0.05, "Explained Variance\n %.4f"%(ev)) 232 | 233 | # Return a concatenated DataFrame 234 | return pd.concat([variance_ratios, components], axis = 1) 235 | 236 | 237 | -------------------------------------------------------------------------------- /lessons/Unsupervised/4_PCA/test_code.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.decomposition import PCA 4 | from sklearn.preprocessing import StandardScaler 5 | from sklearn.ensemble import RandomForestClassifier 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.metrics import confusion_matrix, accuracy_score 8 | from helper_functions import show_images, show_images_by_digit 9 | from helper_functions import fit_random_forest_classifier, do_pca, plot_components 10 | 11 | import matplotlib.image as mpimg 12 | import matplotlib.pyplot as plt 13 | import seaborn as sns 14 | 15 | train = pd.read_csv('./data/train.csv') 16 | 17 | # save the labels to a Pandas series target 18 | y = train['label'] 19 | 20 | # Drop the label feature 21 | X = train.drop("label", axis=1) 22 | 23 | def question_two_check(input1, input2): 24 | match1 = (input1 == y) 25 | match2 = (input2 == X) 26 | if all(match1) and all(match2): 27 | print("That looks right!") 28 | else: 29 | print("Oops! That doesn't look like what was expected for X and y. X should be a matrix of only the pixels, and y should only hold the label column.") 30 | 31 | 32 | def question_3_check(solution_three): 33 | a = True 34 | b = False 35 | c = 6.13 36 | d = 'The total amount of variability in the data explained by the first two principal components' 37 | e = None 38 | my_sol = { 39 | '10.42' : d, 40 | 'The first component will ALWAYS have the most amount of variability explained.': a, 41 | 'The total amount of variability in the data explained by the first component': c, 42 | 'The sum of the variability explained by all the components can be greater than 100%': b 43 | } 44 | 45 | if my_sol == solution_three: 46 | print("Looks good! The amount of variability explained by each principal component gives us an idea of how much of the original variability in the original data is retained by each component. Nice job matching these up!") 47 | 48 | if my_sol['10.42'] != solution_three['10.42']: 49 | print("Oops! Looks like you missed the first one. Notice that 9.85 is the sum of the two bars shown in the plot. This means that the total amount of variability explained by the first two principal components is 9.85. 5.74% can be explained by the first component, and the rest is explained by the second component.\n\n") 50 | if my_sol['The first component will ALWAYS have the most amount of variability explained.'] != solution_three['The first component will ALWAYS have the most amount of variability explained.']: 51 | print("Oops! Looks like you missed the second one. It is actually the case that the first component will ALWAYS be the largest. This is because PCA tries to find the direction of maximum variance first. 
In fact, the components will always be in order from largest amount of variability explained first to smallest amount of variability explained as the last component.\n\n") 52 | if my_sol['The total amount of variability in the data explained by the first component'] != solution_three['The total amount of variability in the data explained by the first component']: 53 | print("Oops! Looks like you missed the third one. The amount of variability explained by each component is shown by the bars in the chart. The total amount of variability explained by the combined components up to each component is shown by the line. This gives us an idea of how much is explained so far, and how much each additional component is contributing.\n\n") 54 | if my_sol['The sum of the variability explained by all the components can be greater than 100%'] != solution_three['The sum of the variability explained by all the components can be greater than 100%']: 55 | print("Oops! The last answer doesn't look right. Your principal components are always reducing the original space of your features until you have as many principal components as you had original features. Therefore, the sum of the amount of variability explained by all the components can never exceed 100%") 56 | 57 | 58 | def question_5_check(solution_five): 59 | 60 | my_sol = { 61 | 'This component looks like it will assist in identifying zero': 0, 62 | 'This component looks like it will assist in identifying three': 3 63 | } 64 | 65 | if my_sol == solution_five: 66 | print("Nice job! That matches our solution as well! The index of the first principal component appears to have really high weights where a zero would appear. Alternatively, the fourth (third indexed component) appears to downweight where a three would appear to make it stand out.") 67 | else: 68 | print("Oops! That doesn't look quite right. Please use the indices as numbers for the values, so the first component should be 0, the second component would be 1, and so on.") 69 | -------------------------------------------------------------------------------- /lessons/Unsupervised/4_PCA/test_code2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.decomposition import PCA 4 | from sklearn.preprocessing import StandardScaler 5 | from sklearn.ensemble import RandomForestClassifier 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.metrics import confusion_matrix, accuracy_score 8 | from helper_functions import show_images, show_images_by_digit 9 | from helper_functions import fit_random_forest_classifier, do_pca, plot_components 10 | 11 | import matplotlib.image as mpimg 12 | import matplotlib.pyplot as plt 13 | import seaborn as sns 14 | 15 | def display_gif(fn): 16 | return '<img src="{}">'.format(fn) 17 | 18 | 19 | def check_question_one(solution_1_dict): 20 | a = 7 21 | b = 66 22 | c = 387 23 | d = 18 24 | e = 0.23 25 | f = 0.05 26 | 27 | my_sol = { 28 | 'The number of cars in the dataset': c, 29 | 'The number of car features in the dataset': d, 30 | 'The number of dummy variables in the dataset': a, 31 | 'The proportion of minivans in the dataset': f, 32 | 'The max highway mpg for any car': b 33 | } 34 | 35 | if my_sol == solution_1_dict: 36 | print("Nice job! 
Looks like your dataset matches what we found!") 37 | return display_gif('https://bit.ly/2K9X0gD') 38 | 39 | if my_sol['The number of cars in the dataset'] != solution_1_dict['The number of cars in the dataset']: 40 | print("Sorry, but it looks like you missed the first one. The number of cars in the dataset should match the number of rows in the dataset. Try again!\n\n") 41 | 42 | if my_sol['The number of car features in the dataset'] != solution_1_dict['The number of car features in the dataset']: 43 | print("Sorry, but it looks like you missed the second one. The number of car features in the dataset should match the number of columns in the dataset. Try again!\n\n") 44 | if my_sol['The number of dummy variables in the dataset'] != solution_1_dict['The number of dummy variables in the dataset']: 45 | print("Sorry, but it looks like you missed the third one. The dummy variables are columns with only 1 and 0 values in the dataset. Try again!") 46 | if my_sol['The proportion of minivans in the dataset'] != solution_1_dict['The proportion of minivans in the dataset']: 47 | print("Sorry, but it looks like you missed the fourth one. The proportion of minivans in the dataset can be found by using the describe method on your dataframe or directly on the minivans column of the dataset.") 48 | if my_sol['The max highway mpg for any car'] != solution_1_dict['The max highway mpg for any car']: 49 | print("Sorry, but it looks like you missed the last one. The max highway mpg in the dataset can be found by using the describe method on your dataframe or using the max function in numpy.") 50 | 51 | if my_sol != solution_1_dict: 52 | return display_gif('https://bit.ly/2Hog74V') 53 | 54 | 55 | def check_question_two(solution_2_dict): 56 | a = True 57 | b = False 58 | 59 | my_sol = { 60 | 'The components span the directions of maximum variability.': a, 61 | 'The components are always orthogonal to one another.': a, 62 | 'Eigenvalues tell us the amount of information a component holds': a 63 | } 64 | 65 | try: 66 | if my_sol == solution_2_dict: 67 | print("That's right these are all true. Principal components are orthogonal, span the directions of maximum variability, and the corresponding eigenvalues tell us how much of the original variability is explained by each component.") 68 | else: 69 | print("Oops! That doesn't look quite right! One or more of the statements you marked False is actually True. Try again!") 70 | 71 | except: 72 | print("Oops! That doesn't look quite right! One or more of the statements you marked False is actually True. Try again!") 73 | 74 | 75 | def check_question_five(solution_5_dict): 76 | a = 'car weight' 77 | b = 'sports cars' 78 | c = 'gas mileage' 79 | d = 0.4352 80 | e = 0.3061 81 | f = 0.1667 82 | g = 0.7053 83 | 84 | my_sol = { 85 | 'The first component positively weights items related to': c, 86 | 'The amount of variability explained by the first component is': d, 87 | 'The largest weight of the second component is related to': b, 88 | 'The total amount of variability explained by the first three components': g 89 | } 90 | 91 | if my_sol == solution_5_dict: 92 | print("That's right! Looks like you know a lot about PCA!") 93 | if my_sol['The first component positively weights items related to'] != solution_5_dict['The first component positively weights items related to']: 94 | print("Oops! Looks like you missed the first question. Notice that there are two bars that are large, positive, while the rest are mostly negative. 
What are the two bars related to?\n\n") 95 | 96 | if my_sol['The amount of variability explained by the first component is'] != solution_5_dict['The amount of variability explained by the first component is']: 97 | print("Oops! Looks like you missed the second question. If you look in the table, you will see the variance explained in the first column. Then the principal component is provided as the weights that show up in each associated row.\n\n") 98 | 99 | if my_sol['The largest weight of the second component is related to'] != solution_5_dict['The largest weight of the second component is related to']: 100 | print("Oops! Looks like you missed the third question. If you look in the table, you will see the variance explained in the first column. Looking at the bar chart, you can see that the largest weight for the second component is the blue bar on the far left. This is the first column in the original set of features.\n\n") 101 | 102 | if my_sol['The total amount of variability explained by the first three components'] != solution_5_dict['The total amount of variability explained by the first three components']: 103 | print("Oops! Looks like you missed the last question. If you add all of the variability explained in the first column of the table, how much is explained by the first component?") 104 | 105 | 106 | def question_check_six(num_comps): 107 | if num_comps == 6: 108 | print("Nice job! That's right! With 6 components, you can explain more than 85% of the variability in the original dataset.") 109 | return display_gif('https://bit.ly/2cKTiso') 110 | else: 111 | print("Oops! That doesn't look quite right. Try again.") 112 | return display_gif('https://bit.ly/2AC30ww') 113 | 114 | 115 | -------------------------------------------------------------------------------- /lessons/Unsupervised/5_ICA/ICA mix 1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Unsupervised/5_ICA/ICA mix 1.wav -------------------------------------------------------------------------------- /lessons/Unsupervised/5_ICA/ICA mix 2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Unsupervised/5_ICA/ICA mix 2.wav -------------------------------------------------------------------------------- /lessons/Unsupervised/5_ICA/ICA mix 3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/lessons/Unsupervised/5_ICA/ICA mix 3.wav -------------------------------------------------------------------------------- /projects/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/projects/.DS_Store -------------------------------------------------------------------------------- /projects/p1_charityml/README.md: -------------------------------------------------------------------------------- 1 | # Data Scientist Nanodegree 2 | # Supervised Learning 3 | ## Project: Finding Donors for CharityML 4 | 5 | ### Install 6 | 7 | This project requires **Python 3.x** and the following Python libraries installed: 8 | 9 | - [NumPy](http://www.numpy.org/) 10 | - [Pandas](http://pandas.pydata.org) 11 | - 
[matplotlib](http://matplotlib.org/) 12 | - [scikit-learn](http://scikit-learn.org/stable/) 13 | 14 | You will also need to have software installed to run and execute an [iPython Notebook](http://ipython.org/notebook.html) 15 | 16 | We recommend students install [Anaconda](https://www.continuum.io/downloads), a pre-packaged Python distribution that contains all of the necessary libraries and software for this project. 17 | 18 | ### Code 19 | 20 | Template code is provided in the `finding_donors.ipynb` notebook file. You will also be required to use the included `visuals.py` Python file and the `census.csv` dataset file to complete your work. While some code has already been implemented to get you started, you will need to implement additional functionality when requested to successfully complete the project. Note that the code included in `visuals.py` is meant to be used out-of-the-box and not intended for students to manipulate. If you are interested in how the visualizations are created in the notebook, please feel free to explore this Python file. 21 | 22 | ### Run 23 | 24 | In a terminal or command window, navigate to the top-level project directory `finding_donors/` (that contains this README) and run one of the following commands: 25 | 26 | ```bash 27 | ipython notebook finding_donors.ipynb 28 | ``` 29 | or 30 | ```bash 31 | jupyter notebook finding_donors.ipynb 32 | ``` 33 | 34 | This will open the iPython Notebook software and project file in your browser. 35 | 36 | ### Data 37 | 38 | The modified census dataset consists of approximately 32,000 data points, with each datapoint having 13 features. This dataset is a modified version of the dataset published in the paper *"Scaling Up the Accuracy of Naive-Bayes Classifiers: a Decision-Tree Hybrid",* by Ron Kohavi. You may find this paper [online](https://www.aaai.org/Papers/KDD/1996/KDD96-033.pdf), with the original dataset hosted on [UCI](https://archive.ics.uci.edu/ml/datasets/Census+Income). 
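For a quick orientation (a minimal sketch, not part of the provided project template), the dataset can be loaded and the `income` target separated as shown below. The relative path to `census.csv` is an assumption here; adjust it to wherever the file sits next to your notebook.

```python
import pandas as pd

# Load the modified census dataset described above
data = pd.read_csv('census.csv')

# Separate the target from the 13 feature columns
income = data['income']
features = data.drop('income', axis=1)

print(features.shape)                       # (n_records, 13)
print(income.value_counts(normalize=True))  # class balance of <=50K vs. >50K
```
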
39 | 40 | **Features** 41 | - `age`: Age 42 | - `workclass`: Working Class (Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked) 43 | - `education_level`: Level of Education (Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool) 44 | - `education-num`: Number of educational years completed 45 | - `marital-status`: Marital status (Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse) 46 | - `occupation`: Work Occupation (Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces) 47 | - `relationship`: Relationship Status (Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried) 48 | - `race`: Race (White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black) 49 | - `sex`: Sex (Female, Male) 50 | - `capital-gain`: Monetary Capital Gains 51 | - `capital-loss`: Monetary Capital Losses 52 | - `hours-per-week`: Average Hours Per Week Worked 53 | - `native-country`: Native Country (United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands) 54 | 55 | **Target Variable** 56 | - `income`: Income Class (<=50K, >50K) 57 | -------------------------------------------------------------------------------- /projects/p1_charityml/visuals.py: -------------------------------------------------------------------------------- 1 | ########################################### 2 | # Suppress matplotlib user warnings 3 | # Necessary for newer version of matplotlib 4 | import warnings 5 | warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib") 6 | # 7 | # Display inline matplotlib plots with IPython 8 | from IPython import get_ipython 9 | get_ipython().run_line_magic('matplotlib', 'inline') 10 | ########################################### 11 | 12 | import matplotlib.pyplot as pl 13 | import matplotlib.patches as mpatches 14 | import numpy as np 15 | import pandas as pd 16 | from time import time 17 | from sklearn.metrics import f1_score, accuracy_score 18 | 19 | 20 | def distribution(data, transformed = False): 21 | """ 22 | Visualization code for displaying skewed distributions of features 23 | """ 24 | 25 | # Create figure 26 | fig = pl.figure(figsize = (11,5)); 27 | 28 | # Skewed feature plotting 29 | for i, feature in enumerate(['capital-gain','capital-loss']): 30 | ax = fig.add_subplot(1, 2, i+1) 31 | ax.hist(data[feature], bins = 25, color = '#00A0A0') 32 | ax.set_title("'%s' Feature Distribution"%(feature), fontsize = 14) 33 | ax.set_xlabel("Value") 34 | ax.set_ylabel("Number of Records") 35 | ax.set_ylim((0, 2000)) 36 | ax.set_yticks([0, 500, 1000, 1500, 2000]) 37 | ax.set_yticklabels([0, 500, 1000, 1500, ">2000"]) 38 | 39 | # Plot aesthetics 40 | if transformed: 41 | fig.suptitle("Log-transformed Distributions of Continuous Census Data Features", \ 42 | fontsize = 16, y = 1.03) 43 | else: 44 | fig.suptitle("Skewed Distributions of Continuous Census Data Features", \ 45 | 
fontsize = 16, y = 1.03) 46 | 47 | fig.tight_layout() 48 | fig.show() 49 | 50 | 51 | def evaluate(results, accuracy, f1): 52 | """ 53 | Visualization code to display results of various learners. 54 | 55 | inputs: 56 | - learners: a list of supervised learners 57 | - stats: a list of dictionaries of the statistic results from 'train_predict()' 58 | - accuracy: The score for the naive predictor 59 | - f1: The score for the naive predictor 60 | """ 61 | 62 | # Create figure 63 | fig, ax = pl.subplots(2, 3, figsize = (11,8)) 64 | 65 | # Constants 66 | bar_width = 0.3 67 | colors = ['#A00000','#00A0A0','#00A000'] 68 | 69 | # Super loop to plot four panels of data 70 | for k, learner in enumerate(results.keys()): 71 | for j, metric in enumerate(['train_time', 'acc_train', 'f_train', 'pred_time', 'acc_test', 'f_test']): 72 | for i in np.arange(3): 73 | 74 | # Creative plot code 75 | ax[j//3, j%3].bar(i+k*bar_width, results[learner][i][metric], width = bar_width, color = colors[k]) 76 | ax[j//3, j%3].set_xticks([0.45, 1.45, 2.45]) 77 | ax[j//3, j%3].set_xticklabels(["1%", "10%", "100%"]) 78 | ax[j//3, j%3].set_xlabel("Training Set Size") 79 | ax[j//3, j%3].set_xlim((-0.1, 3.0)) 80 | 81 | # Add unique y-labels 82 | ax[0, 0].set_ylabel("Time (in seconds)") 83 | ax[0, 1].set_ylabel("Accuracy Score") 84 | ax[0, 2].set_ylabel("F-score") 85 | ax[1, 0].set_ylabel("Time (in seconds)") 86 | ax[1, 1].set_ylabel("Accuracy Score") 87 | ax[1, 2].set_ylabel("F-score") 88 | 89 | # Add titles 90 | ax[0, 0].set_title("Model Training") 91 | ax[0, 1].set_title("Accuracy Score on Training Subset") 92 | ax[0, 2].set_title("F-score on Training Subset") 93 | ax[1, 0].set_title("Model Predicting") 94 | ax[1, 1].set_title("Accuracy Score on Testing Set") 95 | ax[1, 2].set_title("F-score on Testing Set") 96 | 97 | # Add horizontal lines for naive predictors 98 | ax[0, 1].axhline(y = accuracy, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed') 99 | ax[1, 1].axhline(y = accuracy, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed') 100 | ax[0, 2].axhline(y = f1, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed') 101 | ax[1, 2].axhline(y = f1, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed') 102 | 103 | # Set y-limits for score panels 104 | ax[0, 1].set_ylim((0, 1)) 105 | ax[0, 2].set_ylim((0, 1)) 106 | ax[1, 1].set_ylim((0, 1)) 107 | ax[1, 2].set_ylim((0, 1)) 108 | 109 | # Create patches for the legend 110 | patches = [] 111 | for i, learner in enumerate(results.keys()): 112 | patches.append(mpatches.Patch(color = colors[i], label = learner)) 113 | pl.legend(handles = patches, bbox_to_anchor = (-.80, 2.53), \ 114 | loc = 'upper center', borderaxespad = 0., ncol = 3, fontsize = 'x-large') 115 | 116 | # Aesthetics 117 | pl.suptitle("Performance Metrics for Three Supervised Learning Models", fontsize = 16, x = 0.63, y = 1.05) 118 | # Tune the subplot layout 119 | # Refer - https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.subplots_adjust.html for more details on the arguments 120 | pl.subplots_adjust(left = 0.125, right = 1.2, bottom = 0.1, top = 0.9, wspace = 0.2, hspace = 0.3) 121 | pl.tight_layout() 122 | pl.show() 123 | 124 | 125 | def feature_plot(importances, X_train, y_train): 126 | 127 | # Display the five most important features 128 | indices = np.argsort(importances)[::-1] 129 | columns = X_train.columns.values[indices[:5]] 130 | values = importances[indices][:5] 131 | 132 | # Creat the plot 133 | fig = 
pl.figure(figsize = (9,5)) 134 | pl.title("Normalized Weights for First Five Most Predictive Features", fontsize = 16) 135 | pl.bar(np.arange(5), values, width = 0.6, align="center", color = '#00A000', \ 136 | label = "Feature Weight") 137 | pl.bar(np.arange(5) - 0.3, np.cumsum(values), width = 0.2, align = "center", color = '#00A0A0', \ 138 | label = "Cumulative Feature Weight") 139 | pl.xticks(np.arange(5), columns) 140 | pl.xlim((-0.5, 4.5)) 141 | pl.ylabel("Weight", fontsize = 12) 142 | pl.xlabel("Feature", fontsize = 12) 143 | 144 | pl.legend(loc = 'upper center') 145 | pl.tight_layout() 146 | pl.show() 147 | -------------------------------------------------------------------------------- /projects/p2_image_classifier/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Udacity 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /projects/p2_image_classifier/README.md: -------------------------------------------------------------------------------- 1 | # Data Scientist Project 2 | 3 | Project code for Udacity's Data Scientist Nanodegree program. In this project, you will first develop code for an image classifier built with PyTorch, then you will convert it into a command line application. 4 | 5 | In order to complete this project, you will need to use the GPU enabled workspaces within the classroom. The files are all available here for your convenience, but running on your local CPU will likely not work well. 6 | 7 | You should also only enable the GPU when you need it. If you are not using the GPU, please disable it so you do not run out of time! 8 | 9 | ### Data 10 | 11 | The data for this project is quite large - in fact, it is so large you cannot upload it onto Github. If you would like the data for this project, you will want to download it from the workspace in the classroom. Though actually completing the project is likely not possible on your local machine unless you have a GPU. You will be training using 102 different types of flowers, where there are ~20 images per flower to train on. Then you will use your trained classifier to see if you can predict the type for new images of the flowers. 
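For orientation only (this sketch is not part of the project files, and the `flowers/train` path, batch size, and transform choices are assumptions you should adapt to your own setup), a typical first step once the data has been pulled down from the workspace looks roughly like this:

```python
import torch
from torchvision import datasets, transforms

# Illustrative training transforms: most ImageNet-pretrained models expect 224x224 inputs
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

# Assumes the images are organized as flowers/train/<class_label>/<image>.jpg
train_data = datasets.ImageFolder('flowers/train', transform=train_transforms)
trainloader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)

images, labels = next(iter(trainloader))
print(images.shape)              # e.g. torch.Size([64, 3, 224, 224])
print(len(train_data.classes))   # should report the 102 flower categories
```

The `cat_to_name.json` file included in this repository can then be used to map those class labels to human-readable flower names.
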
12 | -------------------------------------------------------------------------------- /projects/p2_image_classifier/assets/Flowers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/projects/p2_image_classifier/assets/Flowers.png -------------------------------------------------------------------------------- /projects/p2_image_classifier/assets/inference_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/projects/p2_image_classifier/assets/inference_example.png -------------------------------------------------------------------------------- /projects/p2_image_classifier/cat_to_name.json: -------------------------------------------------------------------------------- 1 | {"21": "fire lily", "3": "canterbury bells", "45": "bolero deep blue", "1": "pink primrose", "34": "mexican aster", "27": "prince of wales feathers", "7": "moon orchid", "16": "globe-flower", "25": "grape hyacinth", "26": "corn poppy", "79": "toad lily", "39": "siam tulip", "24": "red ginger", "67": "spring crocus", "35": "alpine sea holly", "32": "garden phlox", "10": "globe thistle", "6": "tiger lily", "93": "ball moss", "33": "love in the mist", "9": "monkshood", "102": "blackberry lily", "14": "spear thistle", "19": "balloon flower", "100": "blanket flower", "13": "king protea", "49": "oxeye daisy", "15": "yellow iris", "61": "cautleya spicata", "31": "carnation", "64": "silverbush", "68": "bearded iris", "63": "black-eyed susan", "69": "windflower", "62": "japanese anemone", "20": "giant white arum lily", "38": "great masterwort", "4": "sweet pea", "86": "tree mallow", "101": "trumpet creeper", "42": "daffodil", "22": "pincushion flower", "2": "hard-leaved pocket orchid", "54": "sunflower", "66": "osteospermum", "70": "tree poppy", "85": "desert-rose", "99": "bromelia", "87": "magnolia", "5": "english marigold", "92": "bee balm", "28": "stemless gentian", "97": "mallow", "57": "gaura", "40": "lenten rose", "47": "marigold", "59": "orange dahlia", "48": "buttercup", "55": "pelargonium", "36": "ruby-lipped cattleya", "91": "hippeastrum", "29": "artichoke", "71": "gazania", "90": "canna lily", "18": "peruvian lily", "98": "mexican petunia", "8": "bird of paradise", "30": "sweet william", "17": "purple coneflower", "52": "wild pansy", "84": "columbine", "12": "colt's foot", "11": "snapdragon", "96": "camellia", "23": "fritillary", "50": "common dandelion", "44": "poinsettia", "53": "primula", "72": "azalea", "65": "californian poppy", "80": "anthurium", "76": "morning glory", "37": "cape flower", "56": "bishop of llandaff", "60": "pink-yellow dahlia", "82": "clematis", "58": "geranium", "75": "thorn apple", "41": "barbeton daisy", "95": "bougainvillea", "43": "sword lily", "83": "hibiscus", "78": "lotus lotus", "88": "cyclamen", "94": "foxglove", "81": "frangipani", "74": "rose", "89": "watercress", "73": "water lily", "46": "wallflower", "77": "passion flower", "51": "petunia"} -------------------------------------------------------------------------------- /projects/p2_image_classifier/predict.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/projects/p2_image_classifier/predict.py 
-------------------------------------------------------------------------------- /projects/p2_image_classifier/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/DSND_Term1/a652bace85b283fb7864f8d6ec6d3bd6f3cb8837/projects/p2_image_classifier/train.py --------------------------------------------------------------------------------