├── .gitignore ├── README.md └── projects ├── boston_housing ├── README.md ├── boston_housing.ipynb ├── housing.csv ├── project_description.md └── visuals.py ├── capstone ├── README.md ├── capstone_proposal_template.md ├── capstone_report_template.md ├── proposal_project_description.md ├── report-example-1.pdf └── report-example-3.pdf ├── customer_segments ├── README.md ├── customer_segments.ipynb ├── customers.csv ├── project_description.md └── visuals.py ├── digit_recognition ├── README.md ├── digit_recognition.ipynb └── project_description.md ├── finding_donors ├── README.md ├── census.csv ├── finding_donors.ipynb ├── project_description.md └── visuals.py ├── image-classification ├── ReadMe.md ├── helper.py ├── image_classification.ipynb └── problem_unittests.py ├── intro-to-tensorflow ├── environment.yml ├── environment_win.yml ├── image │ ├── Learn Rate Tune - Image.png │ ├── Mean Variance - Image.png │ ├── network_diagram.png │ └── notmnist.png ├── intro_to_tensorflow.ipynb └── intro_to_tensorflow_solution.ipynb ├── practice_projects ├── cnn │ ├── .gitignore │ ├── README.md │ ├── cifar10-augmentation │ │ ├── aug_model.weights.best.hdf5 │ │ └── cifar10_augmentation.ipynb │ ├── cifar10-classification │ │ ├── MLP.weights.best.hdf5 │ │ ├── cifar10_cnn.ipynb │ │ ├── cifar10_mlp.ipynb │ │ └── model.weights.best.hdf5 │ ├── conv-visualization │ │ ├── conv_visualization.ipynb │ │ └── images │ │ │ └── udacity_sdc.png │ ├── mnist-mlp │ │ ├── mnist.model.best.hdf5 │ │ └── mnist_mlp.ipynb │ ├── requirements │ │ ├── aind-dog-linux.yml │ │ ├── aind-dog-mac.yml │ │ ├── aind-dog-windows.yml │ │ └── requirements.txt │ └── transfer-learning │ │ ├── bottleneck_features.ipynb │ │ ├── bottleneck_features │ │ └── .gitignore │ │ ├── dogvgg16.weights.best.hdf5 │ │ ├── figures │ │ ├── vgg16.png │ │ └── vgg16_transfer.png │ │ ├── images │ │ ├── American_water_spaniel_00648.jpg │ │ ├── Brittany_02625.jpg │ │ ├── Curly-coated_retriever_03896.jpg │ │ ├── Labrador_retriever_06449.jpg │ │ 
├── Labrador_retriever_06455.jpg │ │ ├── Labrador_retriever_06457.jpg │ │ ├── Welsh_springer_spaniel_08203.jpg │ │ └── sopa.jpg │ │ └── transfer_learning.ipynb ├── imdb │ ├── .gitignore │ ├── IMDB_In_Keras.ipynb │ ├── IMDB_In_Keras_Solutions.ipynb │ ├── README.md │ ├── Student_Admissions.ipynb │ ├── requirements │ │ ├── aind-dl-mac-linux.yml │ │ ├── aind-dl-windows.yml │ │ └── requirements.txt │ └── student_data.csv └── naive_bayes_tutorial │ ├── Bayesian_Inference.ipynb │ ├── Bayesian_Inference_solution.ipynb │ ├── ReadMe.md │ ├── images │ ├── bayes_formula.png │ ├── countvectorizer.png │ ├── dqnb.png │ ├── naivebayes.png │ └── tfidf.png │ └── smsspamcollection │ ├── SMSSpamCollection │ └── readme ├── smartcab ├── README.md ├── project_description.md ├── smartcab.ipynb ├── smartcab │ ├── __init__.py │ ├── agent.py │ ├── environment.py │ ├── images │ │ ├── car-black.png │ │ ├── car-blue.png │ │ ├── car-cyan.png │ │ ├── car-green.png │ │ ├── car-magenta.png │ │ ├── car-orange.png │ │ ├── car-red.png │ │ ├── car-white.png │ │ ├── car-yellow.png │ │ ├── east-west.png │ │ ├── logo.png │ │ └── north-south.png │ ├── planner.py │ └── simulator.py └── visuals.py ├── student_intervention ├── README.md ├── project_description.md ├── student-data.csv └── student_intervention.ipynb └── titanic_survival_exploration ├── README.md ├── project_description.md ├── titanic_data.csv ├── titanic_survival_exploration.ipynb └── visuals.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Mac OS 2 | .DS_Store 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by 
a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | 58 | # Sphinx documentation 59 | docs/_build/ 60 | 61 | # PyBuilder 62 | target/ 63 | 64 | #Ipython Notebook 65 | .ipynb_checkpoints 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # machine-learning 2 | Content for Udacity's Machine Learning curriculum, which includes projects and their descriptions. 3 | 4 | Creative Commons License
This work is licensed under a Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License. Please refer to [Udacity Terms of Service](https://www.udacity.com/legal) for further information. 5 | -------------------------------------------------------------------------------- /projects/boston_housing/README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Engineer Nanodegree 2 | # Model Evaluation and Validation 3 | ## Project: Predicting Boston Housing Prices 4 | 5 | ### Install 6 | 7 | This project requires **Python** and the following Python libraries installed: 8 | 9 | - [NumPy](http://www.numpy.org/) 10 | - [Pandas](http://pandas.pydata.org/) 11 | - [matplotlib](http://matplotlib.org/) 12 | - [scikit-learn](http://scikit-learn.org/stable/) 13 | 14 | You will also need to have software installed to run and execute a [Jupyter Notebook](http://ipython.org/notebook.html) 15 | 16 | If you do not have Python installed yet, it is highly recommended that you install the [Anaconda](http://continuum.io/downloads) distribution of Python, which already has the above packages and more included. 17 | 18 | ### Code 19 | 20 | Template code is provided in the `boston_housing.ipynb` notebook file. You will also be required to use the included `visuals.py` Python file and the `housing.csv` dataset file to complete your work. While some code has already been implemented to get you started, you will need to implement additional functionality when requested to successfully complete the project. Note that the code included in `visuals.py` is meant to be used out-of-the-box and not intended for students to manipulate. If you are interested in how the visualizations are created in the notebook, please feel free to explore this Python file. 
21 | 22 | ### Run 23 | 24 | In a terminal or command window, navigate to the top-level project directory `boston_housing/` (that contains this README) and run one of the following commands: 25 | 26 | ```bash 27 | ipython notebook boston_housing.ipynb 28 | ``` 29 | or 30 | ```bash 31 | jupyter notebook boston_housing.ipynb 32 | ``` 33 | 34 | This will open the Jupyter Notebook software and project file in your browser. 35 | 36 | ### Data 37 | 38 | The modified Boston housing dataset consists of 489 data points, with each datapoint having 3 features. This dataset is a modified version of the Boston Housing dataset found on the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Housing). 39 | 40 | **Features** 41 | 1. `RM`: average number of rooms per dwelling 42 | 2. `LSTAT`: percentage of population considered lower status 43 | 3. `PTRATIO`: pupil-teacher ratio by town 44 | 45 | **Target Variable** 46 | 4. `MEDV`: median value of owner-occupied homes -------------------------------------------------------------------------------- /projects/boston_housing/project_description.md: -------------------------------------------------------------------------------- 1 | # Content: Model Evaluation and Validation 2 | ## Project: Predicting Boston Housing Prices 3 | 4 | ## Project Overview 5 | In this project, you will apply basic machine learning concepts on data collected for housing prices in the Boston, Massachusetts area to predict the selling price of a new home. You will first explore the data to obtain important features and descriptive statistics about the dataset. Next, you will properly split the data into testing and training subsets, and determine a suitable performance metric for this problem. You will then analyze performance graphs for a learning algorithm with varying parameters and training set sizes. This will enable you to pick the optimal model that best generalizes for unseen data. 
Finally, you will test this optimal model on a new sample and compare the predicted selling price to your statistics. 6 | 7 | ## Project Highlights 8 | This project is designed to get you acquainted with working with datasets in Python and applying basic machine learning techniques using NumPy and Scikit-Learn. Before being expected to use many of the available algorithms in the sklearn library, it will be helpful to first practice analyzing and interpreting the performance of your model. 9 | 10 | Things you will learn by completing this project: 11 | 12 | - How to use NumPy to investigate the latent features of a dataset. 13 | - How to analyze various learning performance plots for variance and bias. 14 | - How to determine the best-guess model for predictions from unseen data. 15 | - How to evaluate a model's performance on unseen data using previous data. 16 | 17 | ## Description 18 | The Boston housing market is highly competitive, and you want to be the best real estate agent in the area. To compete with your peers, you decide to leverage a few basic machine learning concepts to assist you and a client with finding the best selling price for their home. Luckily, you've come across the Boston Housing dataset, which contains aggregated data on various features for houses in Greater Boston communities, including the median value of homes for each of those areas. Your task is to build an optimal model based on a statistical analysis with the tools available. This model will then be used to estimate the best selling price for your clients' homes. 
19 | 20 | ## Software and Libraries 21 | This project uses the following software and Python libraries: 22 | 23 | - [Python](https://www.python.org/download/releases/3.0/) 24 | - [NumPy](http://www.numpy.org/) 25 | - [pandas](http://pandas.pydata.org/) 26 | - [scikit-learn](http://scikit-learn.org/stable/) 27 | - [matplotlib](http://matplotlib.org/) 28 | 29 | You will also need to have software installed to run and execute a [Jupyter Notebook](http://ipython.org/notebook.html). 30 | 31 | If you do not have Python installed yet, it is highly recommended that you install the [Anaconda](http://continuum.io/downloads) distribution of Python, which already has the above packages and more included. 32 | 33 | ## Starting the Project 34 | 35 | For this assignment, you can find the `boston_housing` folder containing the necessary project files on the [Machine Learning projects GitHub](https://github.com/udacity/machine-learning), under the `projects` folder. You may download all of the files for projects we'll use in this Nanodegree program directly from this repo. Please make sure that you use the most recent version of project files when completing a project! 36 | 37 | This project contains three files: 38 | 39 | - `boston_housing.ipynb`: This is the main file where you will be performing your work on the project. 40 | - `housing.csv`: The project dataset. You'll load this data in the notebook. 41 | - `visuals.py`: This Python script provides supplementary visualizations for the project. Do not modify. 42 | 43 | In the Terminal or Command Prompt, navigate to the folder containing the project files, and then use the command `jupyter notebook boston_housing.ipynb` to open up a browser window or tab to work with your notebook. Alternatively, you can use the command `jupyter notebook` or `ipython notebook` and navigate to the notebook file in the browser window that opens. 
Follow the instructions in the notebook and answer each question presented to successfully complete the project. A **README** file has also been provided with the project files which may contain additional necessary information or instruction for the project. 44 | 45 | ## Submitting the Project 46 | 47 | ### Evaluation 48 | Your project will be reviewed by a Udacity reviewer against the **Predicting Boston Housing Prices project rubric**. Be sure to review this rubric thoroughly and self-evaluate your project before submission. All criteria found in the rubric must be *meeting specifications* for you to pass. 49 | 50 | ### Submission Files 51 | When you are ready to submit your project, collect the following files and compress them into a single archive for upload. Alternatively, you may supply the following files on your GitHub Repo in a folder named `boston_housing` for ease of access: 52 | - The `boston_housing.ipynb` notebook file with all questions answered and all code cells executed and displaying output. 53 | - An **HTML** export of the project notebook with the name **report.html**. This file *must* be present for your project to be evaluated. 54 | 55 | Once you have collected these files and reviewed the project rubric, proceed to the project submission page. 56 | 57 | ### I'm Ready! 58 | When you're ready to submit your project, click on the **Submit Project** button at the bottom of the page. 59 | 60 | If you are having any problems submitting your project or wish to check on the status of your submission, please email us at **machine-support@udacity.com** or visit us in the discussion forums. 61 | 62 | ### What's Next? 63 | You will get an email as soon as your reviewer has feedback for you. In the meantime, review your next project and feel free to get started on it or the courses supporting it! 
64 | -------------------------------------------------------------------------------- /projects/boston_housing/visuals.py: -------------------------------------------------------------------------------- 1 | ########################################### 2 | # Suppress matplotlib user warnings 3 | # Necessary for newer version of matplotlib 4 | import warnings 5 | warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib") 6 | # 7 | # Display inline matplotlib plots with IPython 8 | from IPython import get_ipython 9 | get_ipython().run_line_magic('matplotlib', 'inline') 10 | ########################################### 11 | 12 | import matplotlib.pyplot as pl 13 | import numpy as np 14 | from sklearn.model_selection import learning_curve 15 | from sklearn.model_selection import validation_curve 16 | from sklearn.tree import DecisionTreeRegressor 17 | from sklearn.model_selection import ShuffleSplit, train_test_split 18 | 19 | def ModelLearning(X, y): 20 | """ Calculates the performance of several models with varying sizes of training data. 21 | The learning and testing scores for each model are then plotted. 
""" 22 | 23 | # Create 10 cross-validation sets for training and testing 24 | cv = ShuffleSplit(n_splits = 10, test_size = 0.2, random_state = 0) 25 | 26 | # Generate 9 evenly spaced training set sizes, from 1 up to just under 80% of the data 27 | train_sizes = np.rint(np.linspace(1, X.shape[0]*0.8 - 1, 9)).astype(int) 28 | 29 | # Create the figure window 30 | fig = pl.figure(figsize=(10,7)) 31 | 32 | # Create four different models based on max_depth (one subplot each) 33 | for k, depth in enumerate([1,3,6,10]): 34 | 35 | # Create a Decision tree regressor at max_depth = depth 36 | regressor = DecisionTreeRegressor(max_depth = depth) 37 | 38 | # Calculate the training and testing scores 39 | sizes, train_scores, test_scores = learning_curve(regressor, X, y, \ 40 | cv = cv, train_sizes = train_sizes, scoring = 'r2') 41 | 42 | # Find the mean and standard deviation for smoothing 43 | train_std = np.std(train_scores, axis = 1) 44 | train_mean = np.mean(train_scores, axis = 1) 45 | test_std = np.std(test_scores, axis = 1) 46 | test_mean = np.mean(test_scores, axis = 1) 47 | 48 | # Subplot the learning curve 49 | ax = fig.add_subplot(2, 2, k+1) 50 | ax.plot(sizes, train_mean, 'o-', color = 'r', label = 'Training Score') 51 | ax.plot(sizes, test_mean, 'o-', color = 'g', label = 'Testing Score') 52 | ax.fill_between(sizes, train_mean - train_std, \ 53 | train_mean + train_std, alpha = 0.15, color = 'r') 54 | ax.fill_between(sizes, test_mean - test_std, \ 55 | test_mean + test_std, alpha = 0.15, color = 'g') 56 | 57 | # Labels 58 | ax.set_title('max_depth = %s'%(depth)) 59 | ax.set_xlabel('Number of Training Points') 60 | ax.set_ylabel('Score') 61 | ax.set_xlim([0, X.shape[0]*0.8]) 62 | ax.set_ylim([-0.05, 1.05]) 63 | 64 | # Visual aesthetics 65 | ax.legend(bbox_to_anchor=(1.05, 2.05), loc='lower left', borderaxespad = 0.) 
66 | fig.suptitle('Decision Tree Regressor Learning Performances', fontsize = 16, y = 1.03) 67 | fig.tight_layout() 68 | fig.show() 69 | 70 | 71 | def ModelComplexity(X, y): 72 | """ Calculates the performance of the model as model complexity increases. 73 | The training and validation scores (R^2) are then plotted. """ 74 | 75 | # Create 10 cross-validation sets for training and testing 76 | cv = ShuffleSplit(n_splits = 10, test_size = 0.2, random_state = 0) 77 | 78 | # Vary the max_depth parameter from 1 to 10 79 | max_depth = np.arange(1,11) 80 | 81 | # Calculate the training and testing scores 82 | train_scores, test_scores = validation_curve(DecisionTreeRegressor(), X, y, \ 83 | param_name = "max_depth", param_range = max_depth, cv = cv, scoring = 'r2') 84 | 85 | # Find the mean and standard deviation for smoothing 86 | train_mean = np.mean(train_scores, axis=1) 87 | train_std = np.std(train_scores, axis=1) 88 | test_mean = np.mean(test_scores, axis=1) 89 | test_std = np.std(test_scores, axis=1) 90 | 91 | # Plot the validation curve 92 | pl.figure(figsize=(7, 5)) 93 | pl.title('Decision Tree Regressor Complexity Performance') 94 | pl.plot(max_depth, train_mean, 'o-', color = 'r', label = 'Training Score') 95 | pl.plot(max_depth, test_mean, 'o-', color = 'g', label = 'Validation Score') 96 | pl.fill_between(max_depth, train_mean - train_std, \ 97 | train_mean + train_std, alpha = 0.15, color = 'r') 98 | pl.fill_between(max_depth, test_mean - test_std, \ 99 | test_mean + test_std, alpha = 0.15, color = 'g') 100 | 101 | # Visual aesthetics 102 | pl.legend(loc = 'lower right') 103 | pl.xlabel('Maximum Depth') 104 | pl.ylabel('Score') 105 | pl.ylim([-0.05,1.05]) 106 | pl.show() 107 | 108 | 109 | def PredictTrials(X, y, fitter, data): 110 | """ Performs 10 trials of fitting on different train/test splits and predicting for the first sample in data. 
""" 111 | 112 | # Store the predicted prices 113 | prices = [] 114 | 115 | for k in range(10): 116 | # Split the data 117 | X_train, X_test, y_train, y_test = train_test_split(X, y, \ 118 | test_size = 0.2, random_state = k) 119 | 120 | # Fit the data 121 | reg = fitter(X_train, y_train) 122 | 123 | # Make a prediction for the first sample in data only 124 | pred = reg.predict([data[0]])[0] 125 | prices.append(pred) 126 | 127 | # Result 128 | print("Trial {}: ${:,.2f}".format(k+1, pred)) 129 | 130 | # Display price range 131 | print("\nRange in prices: ${:,.2f}".format(max(prices) - min(prices))) 132 | -------------------------------------------------------------------------------- /projects/capstone/README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Engineer Nanodegree 2 | ## Specializations 3 | ## Project: Capstone Proposal and Capstone Project 4 | 5 | **Note** 6 | 7 | The Capstone is a two-staged project. The first is the proposal component, where you can receive valuable feedback about your project idea, design, and proposed solution. This must be completed prior to your implementation and submitting for the capstone project. 8 | 9 | You can find the [capstone proposal rubric here](https://review.udacity.com/#!/rubrics/410/view), and the [capstone project rubric here](https://review.udacity.com/#!/rubrics/108/view). Please ensure that you are following directions correctly before submitting these two stages which encapsulate your capstone. 10 | 11 | Please email [machine-support@udacity.com](mailto:machine-support@udacity.com) if you have any questions. 12 | -------------------------------------------------------------------------------- /projects/capstone/capstone_proposal_template.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Engineer Nanodegree 2 | ## Capstone Proposal 3 | Joe Udacity 4 | December 31st, 2050 5 | 6 | ## Proposal 7 | _(approx. 
2-3 pages)_ 8 | 9 | ### Domain Background 10 | _(approx. 1-2 paragraphs)_ 11 | 12 | In this section, provide brief details on the background information of the domain from which the project is proposed. Historical information relevant to the project should be included. It should be clear how or why a problem in the domain can or should be solved. Related academic research should be appropriately cited in this section, including why that research is relevant. Additionally, a discussion of your personal motivation for investigating a particular problem in the domain is encouraged but not required. 13 | 14 | ### Problem Statement 15 | _(approx. 1 paragraph)_ 16 | 17 | In this section, clearly describe the problem that is to be solved. The problem described should be well defined and should have at least one relevant potential solution. Additionally, describe the problem thoroughly such that it is clear that the problem is quantifiable (the problem can be expressed in mathematical or logical terms), measurable (the problem can be measured by some metric and clearly observed), and replicable (the problem can be reproduced and occurs more than once). 18 | 19 | ### Datasets and Inputs 20 | _(approx. 2-3 paragraphs)_ 21 | 22 | In this section, the dataset(s) and/or input(s) being considered for the project should be thoroughly described, such as how they relate to the problem and why they should be used. Information such as how the dataset or input is (was) obtained, and the characteristics of the dataset or input, should be included with relevant references and citations as necessary. It should be clear how the dataset(s) or input(s) will be used in the project and whether their use is appropriate given the context of the problem. 23 | 24 | ### Solution Statement 25 | _(approx. 1 paragraph)_ 26 | 27 | In this section, clearly describe a solution to the problem. The solution should be applicable to the project domain and appropriate for the dataset(s) or input(s) given. 
Additionally, describe the solution thoroughly such that it is clear that the solution is quantifiable (the solution can be expressed in mathematical or logical terms), measurable (the solution can be measured by some metric and clearly observed), and replicable (the solution can be reproduced and occurs more than once). 28 | 29 | ### Benchmark Model 30 | _(approximately 1-2 paragraphs)_ 31 | 32 | In this section, provide the details for a benchmark model or result that relates to the domain, problem statement, and intended solution. Ideally, the benchmark model or result contextualizes existing methods or known information in the domain and problem given, which could then be objectively compared to the solution. Describe how the benchmark model or result is measurable (can be measured by some metric and clearly observed) with thorough detail. 33 | 34 | ### Evaluation Metrics 35 | _(approx. 1-2 paragraphs)_ 36 | 37 | In this section, propose at least one evaluation metric that can be used to quantify the performance of both the benchmark model and the solution model. The evaluation metric(s) you propose should be appropriate given the context of the data, the problem statement, and the intended solution. Describe how the evaluation metric(s) are derived and provide an example of their mathematical representations (if applicable). Complex evaluation metrics should be clearly defined and quantifiable (can be expressed in mathematical or logical terms). 38 | 39 | ### Project Design 40 | _(approx. 1 page)_ 41 | 42 | In this final section, summarize a theoretical workflow for approaching a solution given the problem. Provide thorough discussion for what strategies you may consider employing, what analysis of the data might be required before being used, or which algorithms will be considered for your implementation. The workflow and discussion that you provide should align with the qualities of the previous sections. 
Additionally, you are encouraged to include small visualizations, pseudocode, or diagrams to aid in describing the project design, but it is not required. The discussion should clearly outline your intended workflow of the capstone project. 43 | 44 | ----------- 45 | 46 | **Before submitting your proposal, ask yourself. . .** 47 | 48 | - Does the proposal you have written follow a well-organized structure similar to that of the project template? 49 | - Is each section (particularly **Solution Statement** and **Project Design**) written in a clear, concise and specific fashion? Are there any ambiguous terms or phrases that need clarification? 50 | - Would the intended audience of your project be able to understand your proposal? 51 | - Have you properly proofread your proposal to assure there are minimal grammatical and spelling mistakes? 52 | - Are all the resources used for this project correctly cited and referenced? 53 | -------------------------------------------------------------------------------- /projects/capstone/capstone_report_template.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Engineer Nanodegree 2 | ## Capstone Project 3 | Joe Udacity 4 | December 31st, 2050 5 | 6 | ## I. Definition 7 | _(approx. 1-2 pages)_ 8 | 9 | ### Project Overview 10 | In this section, look to provide a high-level overview of the project in layman’s terms. Questions to ask yourself when writing this section: 11 | - _Has an overview of the project been provided, such as the problem domain, project origin, and related datasets or input data?_ 12 | - _Has enough background information been given so that an uninformed reader would understand the problem domain and following problem statement?_ 13 | 14 | ### Problem Statement 15 | In this section, you will want to clearly define the problem that you are trying to solve, including the strategy (outline of tasks) you will use to achieve the desired solution. 
You should also thoroughly discuss what the intended solution will be for this problem. Questions to ask yourself when writing this section: 16 | - _Is the problem statement clearly defined? Will the reader understand what you are expecting to solve?_ 17 | - _Have you thoroughly discussed how you will attempt to solve the problem?_ 18 | - _Is an anticipated solution clearly defined? Will the reader understand what results you are looking for?_ 19 | 20 | ### Metrics 21 | In this section, you will need to clearly define the metrics or calculations you will use to measure performance of a model or result in your project. These calculations and metrics should be justified based on the characteristics of the problem and problem domain. Questions to ask yourself when writing this section: 22 | - _Are the metrics you’ve chosen to measure the performance of your models clearly discussed and defined?_ 23 | - _Have you provided reasonable justification for the metrics chosen based on the problem and solution?_ 24 | 25 | 26 | ## II. Analysis 27 | _(approx. 2-4 pages)_ 28 | 29 | ### Data Exploration 30 | In this section, you will be expected to analyze the data you are using for the problem. This data can either be in the form of a dataset (or datasets), input data (or input files), or even an environment. The type of data should be thoroughly described and, if possible, have basic statistics and information presented (such as discussion of input features or defining characteristics about the input or environment). Any abnormalities or interesting qualities about the data that may need to be addressed should be identified (such as features that need to be transformed or the possibility of outliers). Questions to ask yourself when writing this section: 31 | - _If a dataset is present for this problem, have you thoroughly discussed certain features about the dataset? 
Has a data sample been provided to the reader?_ 32 | - _If a dataset is present for this problem, are statistics about the dataset calculated and reported? Have any relevant results from this calculation been discussed?_ 33 | - _If a dataset is **not** present for this problem, has discussion been made about the input space or input data for your problem?_ 34 | - _Are there any abnormalities or characteristics about the input space or dataset that need to be addressed? (categorical variables, missing values, outliers, etc.)_ 35 | 36 | ### Exploratory Visualization 37 | In this section, you will need to provide some form of visualization that summarizes or extracts a relevant characteristic or feature about the data. The visualization should adequately support the data being used. Discuss why this visualization was chosen and how it is relevant. Questions to ask yourself when writing this section: 38 | - _Have you visualized a relevant characteristic or feature about the dataset or input data?_ 39 | - _Is the visualization thoroughly analyzed and discussed?_ 40 | - _If a plot is provided, are the axes, title, and datum clearly defined?_ 41 | 42 | ### Algorithms and Techniques 43 | In this section, you will need to discuss the algorithms and techniques you intend to use for solving the problem. You should justify the use of each one based on the characteristics of the problem and the problem domain. Questions to ask yourself when writing this section: 44 | - _Are the algorithms you will use, including any default variables/parameters in the project clearly defined?_ 45 | - _Are the techniques to be used thoroughly discussed and justified?_ 46 | - _Is it made clear how the input data or datasets will be handled by the algorithms and techniques chosen?_ 47 | 48 | ### Benchmark 49 | In this section, you will need to provide a clearly defined benchmark result or threshold for comparing across performances obtained by your solution. 
The reasoning behind the benchmark (in the case where it is not an established result) should be discussed. Questions to ask yourself when writing this section: 50 | - _Has some result or value been provided that acts as a benchmark for measuring performance?_ 51 | - _Is it clear how this result or value was obtained (whether by data or by hypothesis)?_ 52 | 53 | 54 | ## III. Methodology 55 | _(approx. 3-5 pages)_ 56 | 57 | ### Data Preprocessing 58 | In this section, all of your preprocessing steps will need to be clearly documented, if any were necessary. From the previous section, any of the abnormalities or characteristics that you identified about the dataset will be addressed and corrected here. Questions to ask yourself when writing this section: 59 | - _If the algorithms chosen require preprocessing steps like feature selection or feature transformations, have they been properly documented?_ 60 | - _Based on the **Data Exploration** section, if there were abnormalities or characteristics that needed to be addressed, have they been properly corrected?_ 61 | - _If no preprocessing is needed, has it been made clear why?_ 62 | 63 | ### Implementation 64 | In this section, the process for which metrics, algorithms, and techniques that you implemented for the given data will need to be clearly documented. It should be abundantly clear how the implementation was carried out, and discussion should be made regarding any complications that occurred during this process. 
Questions to ask yourself when writing this section: 65 | - _Is it made clear how the algorithms and techniques were implemented with the given datasets or input data?_ 66 | - _Were there any complications with the original metrics or techniques that required changing prior to acquiring a solution?_ 67 | - _Was there any part of the coding process (e.g., writing complicated functions) that should be documented?_ 68 | 69 | ### Refinement 70 | In this section, you will need to discuss the process of improvement you made upon the algorithms and techniques you used in your implementation. For example, adjusting parameters for certain models to acquire improved solutions would fall under the refinement category. Your initial and final solutions should be reported, as well as any significant intermediate results as necessary. Questions to ask yourself when writing this section: 71 | - _Has an initial solution been found and clearly reported?_ 72 | - _Is the process of improvement clearly documented, such as what techniques were used?_ 73 | - _Are intermediate and final solutions clearly reported as the process is improved?_ 74 | 75 | 76 | ## IV. Results 77 | _(approx. 2-3 pages)_ 78 | 79 | ### Model Evaluation and Validation 80 | In this section, the final model and any supporting qualities should be evaluated in detail. It should be clear how the final model was derived and why this model was chosen. In addition, some type of analysis should be used to validate the robustness of this model and its solution, such as manipulating the input data or environment to see how the model’s solution is affected (this is called sensitivity analysis). Questions to ask yourself when writing this section: 81 | - _Is the final model reasonable and aligning with solution expectations? 
Are the final parameters of the model appropriate?_ 82 | - _Has the final model been tested with various inputs to evaluate whether the model generalizes well to unseen data?_ 83 | - _Is the model robust enough for the problem? Do small perturbations (changes) in training data or the input space greatly affect the results?_ 84 | - _Can results found from the model be trusted?_ 85 | 86 | ### Justification 87 | In this section, your model’s final solution and its results should be compared to the benchmark you established earlier in the project using some type of statistical analysis. You should also justify whether these results and the solution are significant enough to have solved the problem posed in the project. Questions to ask yourself when writing this section: 88 | - _Are the final results found stronger than the benchmark result reported earlier?_ 89 | - _Have you thoroughly analyzed and discussed the final solution?_ 90 | - _Is the final solution significant enough to have solved the problem?_ 91 | 92 | 93 | ## V. Conclusion 94 | _(approx. 1-2 pages)_ 95 | 96 | ### Free-Form Visualization 97 | In this section, you will need to provide some form of visualization that emphasizes an important quality about the project. It is much more free-form, but should reasonably support a significant result or characteristic about the problem that you want to discuss. Questions to ask yourself when writing this section: 98 | - _Have you visualized a relevant or important quality about the problem, dataset, input data, or results?_ 99 | - _Is the visualization thoroughly analyzed and discussed?_ 100 | - _If a plot is provided, are the axes, title, and datum clearly defined?_ 101 | 102 | ### Reflection 103 | In this section, you will summarize the entire end-to-end problem solution and discuss one or two particular aspects of the project you found interesting or difficult. 
You are expected to reflect on the project as a whole to show that you have a firm understanding of the entire process employed in your work. Questions to ask yourself when writing this section: 104 | - _Have you thoroughly summarized the entire process you used for this project?_ 105 | - _Were there any interesting aspects of the project?_ 106 | - _Were there any difficult aspects of the project?_ 107 | - _Does the final model and solution fit your expectations for the problem, and should it be used in a general setting to solve these types of problems?_ 108 | 109 | ### Improvement 110 | In this section, you will need to provide discussion as to how one aspect of the implementation you designed could be improved. As an example, consider ways your implementation can be made more general, and what would need to be modified. You do not need to make this improvement, but the potential solutions resulting from these changes are considered and compared/contrasted to your current solution. Questions to ask yourself when writing this section: 111 | - _Are there further improvements that could be made on the algorithms or techniques you used in this project?_ 112 | - _Were there algorithms or techniques you researched that you did not know how to implement, but would consider using if you knew how?_ 113 | - _If you used your final solution as the new benchmark, do you think an even better solution exists?_ 114 | 115 | ----------- 116 | 117 | **Before submitting, ask yourself. . .** 118 | 119 | - Does the project report you’ve written follow a well-organized structure similar to that of the project template? 120 | - Is each section (particularly **Analysis** and **Methodology**) written in a clear, concise and specific fashion? Are there any ambiguous terms or phrases that need clarification? 121 | - Would the intended audience of your project be able to understand your analysis, methods, and results? 
122 | - Have you properly proof-read your project report to assure there are minimal grammatical and spelling mistakes? 123 | - Are all the resources used for this project correctly cited and referenced? 124 | - Is the code that implements your solution easily readable and properly commented? 125 | - Does the code execute without error and produce results similar to those reported? 126 | -------------------------------------------------------------------------------- /projects/capstone/proposal_project_description.md: -------------------------------------------------------------------------------- 1 | # Content: Specializations 2 | ## Project: Capstone Proposal and Capstone Project 3 | 4 | ## Capstone Proposal Overview 5 | In this capstone project proposal, prior to completing the following **Capstone Project**, you will leverage what you've learned throughout the Nanodegree program to author a proposal for solving a problem of your choice by applying machine learning algorithms and techniques. A project proposal encompasses seven key points: 6 | - The project's **domain background** : the field of research where the project is derived; 7 | - A **problem statement** : a problem being investigated for which a solution will be defined; 8 | - The **datasets and inputs** : data or inputs being used for the problem; 9 | - A **solution statement** : the solution proposed for the problem given; 10 | - A **benchmark model** : some simple or historical model or result to compare the defined solution to; 11 | - A set of **evaluation metrics** : functional representations for how the solution can be measured; 12 | - An outline of the **project design** : how the solution will be developed and results obtained. 13 | 14 | ## Capstone Proposal Highlights 15 | The capstone project proposal is designed to introduce you to writing proposals for major projects.
Typically, before you begin working on a solution to a problem, a proposal is written to your peers, advisor, manager, etc., to outline the details of the problem, your research, and your approach to a solution. 16 | 17 | Things you will learn by completing this project proposal: 18 | - How to research a real-world problem of interest. 19 | - How to author a technical proposal document. 20 | - How to organize a proposed workflow for designing a solution. 21 | 22 | ## Capstone Proposal Description 23 | 24 | Think about a technical field or domain that you are passionate about, such as robotics, virtual reality, finance, natural language processing, or even artificial intelligence (the possibilities are endless!). Then, choose an existing problem within that domain that you are interested in which you could solve by applying machine learning algorithms and techniques. Be sure that you have collected all of the resources needed (such as datasets, inputs, and research) to complete this project, and make the appropriate citations wherever necessary in your proposal. Below are a few suggested problem areas you could explore if you are unsure what your passion is: 25 | 26 | - [Robot Motion Planning](https://docs.google.com/document/d/1ZFCH6jS3A5At7_v5IUM5OpAXJYiutFuSIjTzV_E-vdE/pub) 27 | - [Healthcare](https://docs.google.com/document/d/1WzurKKa9AX2DnOH7KiB38mvozdOSemfkGpex8hdTy8c/pub) 28 | - [Computer Vision](https://docs.google.com/document/d/1y-XfjkPFgUQxFIQ9bBncUSjs4HOf5E-45FrLYNBsZb4/pub) 29 | - [Education](https://docs.google.com/document/d/1vjerjRQnWs1kLbZagDYT6rNqiwAG23Yj45oUY88IAxI/pub) 30 | - [Investment and Trading](https://docs.google.com/document/d/1ycGeb1QYKATG6jvz74SAMqxrlek9Ed4RYrzWNhWS-0Q/pub) 31 | 32 | In addition, you may find a technical domain (along with the problem and dataset) as *competitions* on platforms such as [Kaggle](http://kaggle.com), or [Devpost](http://devpost.com). 
This can be helpful for discovering a particular problem you may be interested in solving as an alternative to the suggested problem areas above. In many cases, some of the requirements for the capstone proposal are already defined for you when choosing from these platforms. 33 | 34 | To determine whether your project and the problem you want to solve fits Udacity's vision of a Machine Learning Capstone Project, please refer to the [capstone proposal rubric](https://review.udacity.com/#!/rubrics/410/view) and the [capstone project rubric](https://review.udacity.com/#!/rubrics/108/view) and make a note of each rubric criterion you will be evaluated on. A satisfactory project will have a proposal that clearly satisfies these requirements. 35 | 36 | ## Software Requirements 37 | **Your proposed project must be written in Python 2.7.** Given the free-form nature of the machine learning capstone, the software and libraries you will need to successfully complete your work will vary depending on the chosen application area and problem definition. Because of this, it is imperative that all necessary software and libraries you consider using in your capstone project are accessible and clearly documented. Please note that proprietary software, software that requires private licenses, or software behind a paywall or login account should be avoided. 38 | 39 | ## Data Requirements 40 | Every machine learning capstone project will most certainly require some form of dataset or input data structure (input text files, images, etc.). Similar to the software requirements above, the data you are considering must either be publicly accessible or provided by you during the submission process, and private or proprietary data should not be used without express permission.
Please take into consideration the file size of your data — while there is no strict upper limit, input files that are excessively large may require reviewers longer than an acceptable amount of time to acquire all of your project files. This can take away from the reviewer's time that could be put towards evaluating your proposal. If the data you are considering fits the criteria of being too large, consider whether you could work with a subset of the data instead, or provide a representative sample of the data. 41 | 42 | ## Ethics 43 | Udacity's A/B Testing course, as part of the Data Analyst Nanodegree, has a segment that discusses [the sensitivity of data](https://classroom.udacity.com/nanodegrees/nd002/parts/00213454013/modules/411033896375460/lessons/3998098714/concepts/39997087540923#) and the expectation of privacy from those whose information has been collected. While most data you find available to the public will not have any ethical complications, it is extremely important that you are considering where the data you are using came from, and whether that data contains any sensitive information. For example, if you worked for a bank and wanted to use customers' bank statements as part of your project, this would most likely be an unethical choice of data and should be avoided. 44 | 45 | ## Proposal Guidelines 46 | Your project submission will be evaluated on the written proposal that is submitted. Additionally, depending on the project you are proposing, other materials such as the data being used will be evaluated. It is expected that the proposal contains enough detail, documentation, analysis, and discussion to adequately reflect the work you intend to complete for the project. Because of this, it is extremely important that the proposal is written in a professional, standardized way, so those who review your project's proposal are able to clearly identify each component of your project in the report. 
Without a properly written proposal, your project cannot be sufficiently evaluated. A [project proposal template](https://github.com/udacity/machine-learning/blob/master/projects/capstone/capstone_proposal_template.md) is provided for you to understand how a project proposal should be structured. We strongly encourage students to have a proposal that is approximately **two to three pages in length**. 47 | 48 | The Machine Learning Capstone Project proposal should be treated no different than a written research paper for academics. Your goal is to ultimately present the research you've discovered into the respective problem domain you've chosen, and then clearly articulate your intended project to your peers. The narrative found in the [project proposal template](https://github.com/udacity/machine-learning/blob/master/projects/capstone/capstone_proposal_template.md) provides for a *"proposal checklist"* that will aid you in fully completing a documented proposal. Please make use of this resource! 49 | 50 | ## Submitting the Project 51 | 52 | ### Evaluation 53 | Your project will be reviewed by a Udacity reviewer against the **Capstone Project Proposal rubric**. Be sure to review this rubric thoroughly and self-evaluate your project before submission. All criteria found in the rubric must be *meeting specifications* for you to pass. 54 | 55 | ### Submission Files 56 | At minimum, your submission will be required to have the following files listed below. If your submission method of choice is uploading an archive (`*.zip`), please take into consideration the total file size. You will need to include 57 | - A project proposal, *in PDF format only*, with the name **proposal.pdf**, addressing each of the seven key points of a proposal. The recommended page length for a proposal is approximately *two to three pages*. 58 | - Any additional supporting material such as datasets, images, or input files that are necessary for your project and proposal. 
If these files are too large and you are uploading your submission, instead provide appropriate means of acquiring the necessary files in an included `README.md` file. 59 | 60 | Once you have collected these files and reviewed the project rubric, proceed to the project submission page. 61 | 62 | ### I'm Ready! 63 | When you're ready to submit your project, click on the **Submit Project** button at the bottom of the page. 64 | 65 | If you are having any problems submitting your project or wish to check on the status of your submission, please email us at **machine-support@udacity.com** or visit us in the discussion forums. 66 | 67 | ### What's Next? 68 | You will get an email as soon as your reviewer has feedback for you. In the meantime, review your next project and feel free to get started on it or the courses supporting it! 69 | -------------------------------------------------------------------------------- /projects/capstone/report-example-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/capstone/report-example-1.pdf -------------------------------------------------------------------------------- /projects/capstone/report-example-3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/capstone/report-example-3.pdf -------------------------------------------------------------------------------- /projects/customer_segments/README.md: -------------------------------------------------------------------------------- 1 | # Content: Unsupervised Learning 2 | ## Project: Creating Customer Segments 3 | 4 | ### Install 5 | 6 | This project requires **Python 2.7** and the following Python libraries installed: 7 | 8 | - [NumPy](http://www.numpy.org/) 9 | - 
[Pandas](http://pandas.pydata.org) 10 | - [matplotlib](http://matplotlib.org/) 11 | - [scikit-learn](http://scikit-learn.org/stable/) 12 | 13 | You will also need to have software installed to run and execute a [Jupyter Notebook](http://ipython.org/notebook.html) 14 | 15 | If you do not have Python installed yet, it is highly recommended that you install the [Anaconda](http://continuum.io/downloads) distribution of Python, which already has the above packages and more included. Make sure that you select the Python 2.7 installer and not the Python 3.x installer. 16 | 17 | ### Code 18 | 19 | Template code is provided in the `customer_segments.ipynb` notebook file. You will also be required to use the included `visuals.py` Python file and the `customers.csv` dataset file to complete your work. While some code has already been implemented to get you started, you will need to implement additional functionality when requested to successfully complete the project. Note that the code included in `visuals.py` is meant to be used out-of-the-box and not intended for students to manipulate. If you are interested in how the visualizations are created in the notebook, please feel free to explore this Python file. 20 | 21 | ### Run 22 | 23 | In a terminal or command window, navigate to the top-level project directory `customer_segments/` (that contains this README) and run one of the following commands: 24 | 25 | ```bash 26 | ipython notebook customer_segments.ipynb 27 | ``` 28 | or 29 | ```bash 30 | jupyter notebook customer_segments.ipynb 31 | ``` 32 | 33 | This will open the Jupyter Notebook software and project file in your browser. 34 | 35 | ## Data 36 | 37 | The customer segments data is included as a selection of 440 data points collected on data found from clients of a wholesale distributor in Lisbon, Portugal. More information can be found on the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Wholesale+customers). 38 | 39 | Note (m.u.) 
is shorthand for *monetary units*. 40 | 41 | **Features** 42 | 1) `Fresh`: annual spending (m.u.) on fresh products (Continuous); 43 | 2) `Milk`: annual spending (m.u.) on milk products (Continuous); 44 | 3) `Grocery`: annual spending (m.u.) on grocery products (Continuous); 45 | 4) `Frozen`: annual spending (m.u.) on frozen products (Continuous); 46 | 5) `Detergents_Paper`: annual spending (m.u.) on detergents and paper products (Continuous); 47 | 6) `Delicatessen`: annual spending (m.u.) on delicatessen products (Continuous); 48 | 7) `Channel`: {Hotel/Restaurant/Cafe - 1, Retail - 2} (Nominal) 49 | 8) `Region`: {Lisbon - 1, Oporto - 2, or Other - 3} (Nominal) -------------------------------------------------------------------------------- /projects/customer_segments/project_description.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Engineer Nanodegree 2 | # Unsupervised Learning 3 | ## Project: Creating Customer Segments 4 | 5 | ## Project Overview 6 | In this project you will apply unsupervised learning techniques on product spending data collected for customers of a wholesale distributor in Lisbon, Portugal to identify customer segments hidden in the data. You will first explore the data by selecting a small subset to sample and determine if any product categories highly correlate with one another. Afterwards, you will preprocess the data by scaling each product category and then identifying (and removing) unwanted outliers. With the good, clean customer spending data, you will apply PCA transformations to the data and implement clustering algorithms to segment the transformed customer data. Finally, you will compare the segmentation found with an additional labeling and consider ways this information could assist the wholesale distributor with future service changes.
7 | 8 | ## Project Highlights 9 | This project is designed to give you a hands-on experience with unsupervised learning and work towards developing conclusions for a potential client on a real-world dataset. Many companies today collect vast amounts of data on customers and clientele, and have a strong desire to understand the meaningful relationships hidden in their customer base. Being equipped with this information can assist a company engineer future products and services that best satisfy the demands or needs of their customers. 10 | 11 | Things you will learn by completing this project: 12 | 13 | - How to apply preprocessing techniques such as feature scaling and outlier detection. 14 | - How to interpret data points that have been scaled, transformed, or reduced from PCA. 15 | - How to analyze PCA dimensions and construct a new feature space. 16 | - How to optimally cluster a set of data to find hidden patterns in a dataset. 17 | - How to assess information given by cluster data and use it in a meaningful way. 18 | 19 | ## Description 20 | A wholesale distributor recently tested a change to their delivery method for some customers, by moving from a morning delivery service five days a week to a cheaper evening delivery service three days a week. Initial testing did not discover any significant unsatisfactory results, so they implemented the cheaper option for all customers. Almost immediately, the distributor began getting complaints about the delivery service change and customers were canceling deliveries, losing the distributor more money than what was being saved. You've been hired by the wholesale distributor to find what types of customers they have to help them make better, more informed business decisions in the future. Your task is to use unsupervised learning techniques to see if any similarities exist between customers, and how to best segment customers into distinct categories. 
21 | 22 | ## Software and Libraries 23 | This project uses the following software and Python libraries: 24 | 25 | - [Python 2.7](https://www.python.org/download/releases/2.7/) 26 | - [NumPy](http://www.numpy.org/) 27 | - [pandas](http://pandas.pydata.org/) 28 | - [scikit-learn](http://scikit-learn.org/stable/) 29 | - [matplotlib](http://matplotlib.org/) 30 | 31 | You will also need to have software installed to run and execute a [Jupyter Notebook](http://ipython.org/notebook.html). 32 | 33 | If you do not have Python installed yet, it is highly recommended that you install the [Anaconda](http://continuum.io/downloads) distribution of Python, which already has the above packages and more included. Make sure that you select the Python 2.7 installer and not the Python 3.x installer. 34 | 35 | ## Starting the Project 36 | 37 | For this assignment, you can find the `customer_segments` folder containing the necessary project files on the [Machine Learning projects GitHub](https://github.com/udacity/machine-learning), under the `projects` folder. You may download all of the files for projects we'll use in this Nanodegree program directly from this repo. Please make sure that you use the most recent version of project files when completing a project! 38 | 39 | This project contains three files: 40 | 41 | - `customer_segments.ipynb`: This is the main file where you will be performing your work on the project. 42 | - `customers.csv`: The project dataset. You'll load this data in the notebook. 43 | - `visuals.py`: This Python script provides supplementary visualizations for the project. Do not modify. 44 | 45 | In the Terminal or Command Prompt, navigate to the folder containing the project files, and then use the command `jupyter notebook customer_segments.ipynb` to open up a browser window or tab to work with your notebook. Alternatively, you can use the command `jupyter notebook` or `ipython notebook` and navigate to the notebook file in the browser window that opens. 
Follow the instructions in the notebook and answer each question presented to successfully complete the project. A **README** file has also been provided with the project files which may contain additional necessary information or instruction for the project. 46 | 47 | ## Submitting the Project 48 | 49 | ### Evaluation 50 | Your project will be reviewed by a Udacity reviewer against the **Creating Customer Segments project rubric**. Be sure to review this rubric thoroughly and self-evaluate your project before submission. All criteria found in the rubric must be *meeting specifications* for you to pass. 51 | 52 | ### Submission Files 53 | When you are ready to submit your project, collect the following files and compress them into a single archive for upload. Alternatively, you may supply the following files on your GitHub Repo in a folder named `customer_segments` for ease of access: 54 | - The `customer_segments.ipynb` notebook file with all questions answered and all code cells executed and displaying output. 55 | - An **HTML** export of the project notebook with the name **report.html**. This file *must* be present for your project to be evaluated. 56 | 57 | Once you have collected these files and reviewed the project rubric, proceed to the project submission page. 58 | 59 | ### I'm Ready! 60 | When you're ready to submit your project, click on the **Submit Project** button at the bottom of the page. 61 | 62 | If you are having any problems submitting your project or wish to check on the status of your submission, please email us at **machine-support@udacity.com** or visit us in the discussion forums. 63 | 64 | ### What's Next? 65 | You will get an email as soon as your reviewer has feedback for you. In the meantime, review your next project and feel free to get started on it or the courses supporting it! 
66 | -------------------------------------------------------------------------------- /projects/customer_segments/visuals.py: -------------------------------------------------------------------------------- 1 | ########################################### 2 | # Suppress matplotlib user warnings 3 | # Necessary for newer version of matplotlib 4 | import warnings 5 | warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib") 6 | # 7 | # Display inline matplotlib plots with IPython 8 | from IPython import get_ipython 9 | get_ipython().run_line_magic('matplotlib', 'inline') 10 | ########################################### 11 | 12 | import matplotlib.pyplot as plt 13 | import matplotlib.cm as cm 14 | import pandas as pd 15 | import numpy as np 16 | 17 | def pca_results(good_data, pca): 18 | ''' 19 | Create a DataFrame of the PCA results 20 | Includes dimension feature weights and explained variance 21 | Visualizes the PCA results 22 | ''' 23 | 24 | # Dimension indexing 25 | dimensions = dimensions = ['Dimension {}'.format(i) for i in range(1,len(pca.components_)+1)] 26 | 27 | # PCA components 28 | components = pd.DataFrame(np.round(pca.components_, 4), columns = list(good_data.keys())) 29 | components.index = dimensions 30 | 31 | # PCA explained variance 32 | ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1) 33 | variance_ratios = pd.DataFrame(np.round(ratios, 4), columns = ['Explained Variance']) 34 | variance_ratios.index = dimensions 35 | 36 | # Create a bar plot visualization 37 | fig, ax = plt.subplots(figsize = (14,8)) 38 | 39 | # Plot the feature weights as a function of the components 40 | components.plot(ax = ax, kind = 'bar'); 41 | ax.set_ylabel("Feature Weights") 42 | ax.set_xticklabels(dimensions, rotation=0) 43 | 44 | 45 | # Display the explained variance ratios 46 | for i, ev in enumerate(pca.explained_variance_ratio_): 47 | ax.text(i-0.40, ax.get_ylim()[1] + 0.05, "Explained Variance\n %.4f"%(ev)) 48 | 49 | # 
Return a concatenated DataFrame 50 | return pd.concat([variance_ratios, components], axis = 1) 51 | 52 | def cluster_results(reduced_data, preds, centers, pca_samples): 53 | ''' 54 | Visualizes the PCA-reduced cluster data in two dimensions 55 | Adds cues for cluster centers and student-selected sample data 56 | ''' 57 | 58 | predictions = pd.DataFrame(preds, columns = ['Cluster']) 59 | plot_data = pd.concat([predictions, reduced_data], axis = 1) 60 | 61 | # Generate the cluster plot 62 | fig, ax = plt.subplots(figsize = (14,8)) 63 | 64 | # Color map 65 | cmap = cm.get_cmap('gist_rainbow') 66 | 67 | # Color the points based on assigned cluster 68 | for i, cluster in plot_data.groupby('Cluster'): 69 | cluster.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2', \ 70 | color = cmap((i)*1.0/(len(centers)-1)), label = 'Cluster %i'%(i), s=30); 71 | 72 | # Plot centers with indicators 73 | for i, c in enumerate(centers): 74 | ax.scatter(x = c[0], y = c[1], color = 'white', edgecolors = 'black', \ 75 | alpha = 1, linewidth = 2, marker = 'o', s=200); 76 | ax.scatter(x = c[0], y = c[1], marker='$%d$'%(i), alpha = 1, s=100); 77 | 78 | # Plot transformed sample points 79 | ax.scatter(x = pca_samples[:,0], y = pca_samples[:,1], \ 80 | s = 150, linewidth = 4, color = 'black', marker = 'x'); 81 | 82 | # Set plot title 83 | ax.set_title("Cluster Learning on PCA-Reduced Data - Centroids Marked by Number\nTransformed Sample Data Marked by Black Cross"); 84 | 85 | 86 | def biplot(good_data, reduced_data, pca): 87 | ''' 88 | Produce a biplot that shows a scatterplot of the reduced 89 | data and the projections of the original features. 90 | 91 | good_data: original data, before transformation. 
92 | Needs to be a pandas dataframe with valid column names 93 | reduced_data: the reduced data (the first two dimensions are plotted) 94 | pca: pca object that contains the components_ attribute 95 | 96 | return: a matplotlib AxesSubplot object (for any additional customization) 97 | 98 | This procedure is inspired by the script: 99 | https://github.com/teddyroland/python-biplot 100 | ''' 101 | 102 | fig, ax = plt.subplots(figsize = (14,8)) 103 | # scatterplot of the reduced data 104 | ax.scatter(x=reduced_data.loc[:, 'Dimension 1'], y=reduced_data.loc[:, 'Dimension 2'], 105 | facecolors='b', edgecolors='b', s=70, alpha=0.5) 106 | 107 | feature_vectors = pca.components_.T 108 | 109 | # we use scaling factors to make the arrows easier to see 110 | arrow_size, text_pos = 7.0, 8.0, 111 | 112 | # projections of the original features 113 | for i, v in enumerate(feature_vectors): 114 | ax.arrow(0, 0, arrow_size*v[0], arrow_size*v[1], 115 | head_width=0.2, head_length=0.2, linewidth=2, color='red') 116 | ax.text(v[0]*text_pos, v[1]*text_pos, good_data.columns[i], color='black', 117 | ha='center', va='center', fontsize=18) 118 | 119 | ax.set_xlabel("Dimension 1", fontsize=14) 120 | ax.set_ylabel("Dimension 2", fontsize=14) 121 | ax.set_title("PC plane with original feature projections.", fontsize=16); 122 | return ax 123 | 124 | 125 | def channel_results(reduced_data, outliers, pca_samples): 126 | ''' 127 | Visualizes the PCA-reduced cluster data in two dimensions using the full dataset 128 | Data is labeled by "Channel" and cues added for student-selected sample data 129 | ''' 130 | 131 | # Check that the dataset is loadable 132 | try: 133 | full_data = pd.read_csv("customers.csv") 134 | except: 135 | print("Dataset could not be loaded. 
Is the file missing?") 136 | return False 137 | 138 | # Create the Channel DataFrame 139 | channel = pd.DataFrame(full_data['Channel'], columns = ['Channel']) 140 | channel = channel.drop(channel.index[outliers]).reset_index(drop = True) 141 | labeled = pd.concat([reduced_data, channel], axis = 1) 142 | 143 | # Generate the cluster plot 144 | fig, ax = plt.subplots(figsize = (14,8)) 145 | 146 | # Color map 147 | cmap = cm.get_cmap('gist_rainbow') 148 | 149 | # Color the points based on assigned Channel 150 | labels = ['Hotel/Restaurant/Cafe', 'Retailer'] 151 | grouped = labeled.groupby('Channel') 152 | for i, channel in grouped: 153 | channel.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2', \ 154 | color = cmap((i-1)*1.0/2), label = labels[i-1], s=30); 155 | 156 | # Plot transformed sample points 157 | for i, sample in enumerate(pca_samples): 158 | ax.scatter(x = sample[0], y = sample[1], \ 159 | s = 200, linewidth = 3, color = 'black', marker = 'o', facecolors = 'none'); 160 | ax.scatter(x = sample[0]+0.25, y = sample[1]+0.3, marker='$%d$'%(i), alpha = 1, s=125); 161 | 162 | # Set plot title 163 | ax.set_title("PCA-Reduced Data Labeled by 'Channel'\nTransformed Sample Data Circled"); -------------------------------------------------------------------------------- /projects/digit_recognition/README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Engineer Nanodegree 2 | # Deep Learning 3 | ## Project: Build a Digit Recognition Program 4 | 5 | ### Install 6 | 7 | This project requires **Python 2.x or Python 3.x** and the following Python libraries installed: 8 | 9 | - [NumPy](http://www.numpy.org/) 10 | - [SciPy](https://www.scipy.org/) 11 | - [scikit-learn](http://scikit-learn.org/0.17/install.html) (v0.17) 12 | - [TensorFlow](http://tensorflow.org) 13 | 14 | You will also need to have software installed to run and execute a [Jupyter Notebook](http://ipython.org/notebook.html). 
15 | 16 | In addition to the above, for those optionally seeking to use image processing software, you may need one of the following: 17 | - [PyGame](http://pygame.org/) 18 | - Helpful links for installing PyGame: 19 | - [Getting Started](https://www.pygame.org/wiki/GettingStarted) 20 | - [PyGame Information](http://www.pygame.org/wiki/info) 21 | - [Google Group](https://groups.google.com/forum/#!forum/pygame-mirror-on-google-groups) 22 | - [PyGame subreddit](https://www.reddit.com/r/pygame/) 23 | - [OpenCV](http://opencv.org/) 24 | 25 | For those optionally seeking to deploy an Android application: 26 | - Android SDK & NDK (see this [README](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/android/README.md)) 27 | 28 | If you do not have Python installed yet, it is highly recommended that you install the [Anaconda](http://continuum.io/downloads) distribution of Python, which already has the above packages and more included. Make sure that you select the Python 2.7 installer and not the Python 3.x installer. `pygame` and `OpenCV` can then be installed using one of the following commands: 29 | 30 | Mac: 31 | ```bash 32 | conda install -c https://conda.anaconda.org/quasiben pygame 33 | conda install -c menpo opencv=2.4.11 34 | ``` 35 | 36 | Windows & Linux: 37 | ```bash 38 | conda install -c https://conda.anaconda.org/tlatorre pygame 39 | conda install -c menpo opencv=2.4.11 40 | ``` 41 | 42 | ### Code 43 | 44 | A template notebook is provided as `digit_recognition.ipynb`. While no code is included in the notebook, you will be required to use the notebook to implement the basic functionality of your project and answer questions about your implementation and results. 
45 | 46 | ### Run 47 | 48 | In a terminal or command window, navigate to the top-level project directory `digit_recognition/` (that contains this README) and run one of the following commands: 49 | 50 | ```bash 51 | ipython notebook digit_recognition.ipynb 52 | ``` 53 | or 54 | ```bash 55 | jupyter notebook digit_recognition.ipynb 56 | ``` 57 | 58 | This will open the Jupyter Notebook software and notebook file in your browser. 59 | 60 | 61 | ### Data 62 | 63 | While no data is directly provided with the project, you will be required to download and use the [Street View House Numbers (SVHN) dataset](http://ufldl.stanford.edu/housenumbers/), along with either the [notMNIST](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html) or [MNIST](http://yann.lecun.com/exdb/mnist/) datasets. If you've completed the course material, the **notMNIST** dataset should already be available. 64 | -------------------------------------------------------------------------------- /projects/digit_recognition/project_description.md: -------------------------------------------------------------------------------- 1 | # Content: Deep Learning 2 | ## Project: Build a Digit Recognition Program 3 | 4 | ## Project Overview 5 | 6 | In this project, you will use what you've learned about deep neural networks and convolutional neural networks to create a live camera application or program that prints numbers it observes in real time from images it is given. First, you will design and test a model architecture that can identify sequences of digits in an image. Next, you will train that model so it can decode sequences of digits from natural images by using the [Street View House Numbers (SVHN) dataset](http://ufldl.stanford.edu/housenumbers/). After the model is properly trained, you will then test your model using a live camera application (optional) or program on newly-captured images.
Finally, once you obtain meaningful results, you will refine your implementation to also *localize where numbers are on the image*, and test this localization on newly-captured images. 7 | 8 | ## Software Requirements 9 | This project uses the following software and Python libraries: 10 | 11 | - [Python 2.7](https://www.python.org/download/releases/2.7/) 12 | - [NumPy](http://www.numpy.org/) 13 | - [SciPy](https://www.scipy.org/) 14 | - [scikit-learn](http://scikit-learn.org/0.17/install.html) (v0.17) 15 | - [TensorFlow](http://tensorflow.org) 16 | 17 | You will also need to have software installed to run and execute a [Jupyter Notebook](http://ipython.org/notebook.html). 18 | 19 | In addition to the above, for those optionally seeking to use image processing software, you may need one of the following: 20 | - [PyGame](http://pygame.org/) 21 | - Helpful links for installing PyGame: 22 | - [Getting Started](https://www.pygame.org/wiki/GettingStarted) 23 | - [PyGame Information](http://www.pygame.org/wiki/info) 24 | - [Google Group](https://groups.google.com/forum/#!forum/pygame-mirror-on-google-groups) 25 | - [PyGame subreddit](https://www.reddit.com/r/pygame/) 26 | - [OpenCV](http://opencv.org/) 27 | 28 | For those optionally seeking to deploy an Android application: 29 | - Android SDK & NDK (see this [README](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/android/README.md)) 30 | 31 | If you do not have Python installed yet, it is highly recommended that you install the [Anaconda](http://continuum.io/downloads) distribution of Python, which already has the above packages and more included. Make sure that you select the Python 2.7 installer and not the Python 3.x installer. 
`pygame` and `OpenCV` can then be installed using one of the following commands: 32 | 33 | **OpenCV:** 34 | `conda install -c menpo opencv=2.4.11` 35 | 36 | **PyGame:** 37 | Mac: `conda install -c https://conda.anaconda.org/quasiben pygame` 38 | Windows: `conda install -c https://conda.anaconda.org/tlatorre pygame` 39 | Linux: `conda install -c https://conda.anaconda.org/prkrekel pygame` 40 | 41 | ## Starting the Project 42 | 43 | For this assignment, you can find the `digit_recognition` folder containing the necessary project files on the [Machine Learning projects GitHub](https://github.com/udacity/machine-learning), under the `projects` folder. You may download all of the files for projects we'll use in this Nanodegree program directly from this repo. Please make sure that you use the most recent version of project files when completing a project! 44 | 45 | This project contains one file: 46 | 47 | - `digit_recognition.ipynb`: This is the main file where you will be performing your work on the project. 48 | 49 | In addition, you will need to download the [Street View House Numbers (SVHN) dataset](http://ufldl.stanford.edu/housenumbers/), along with either the [notMNIST](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html) or [MNIST](http://yann.lecun.com/exdb/mnist/) datasets. If you've completed the course material, the **notMNIST** dataset should already be available. 50 | 51 | In the Terminal or Command Prompt, navigate to the folder containing the project files, and then use the command `jupyter notebook digit_recognition.ipynb` to open up a browser window or tab to work with your notebook. Alternatively, you can use the command `jupyter notebook` or `ipython notebook` and navigate to the notebook file in the browser window that opens. Follow the instructions in the notebook and answer each question presented to successfully complete the project.
A **README** file has also been provided with the project files which may contain additional necessary information or instruction for the project. 52 | 53 | ## Tasks 54 | 55 | ### Project Report 56 | You will be required to answer questions about your implementation as part of your submission in the provided `digit_recognition.ipynb`. As you complete the tasks below, include thorough, detailed answers to each question *provided in italics*. 57 | 58 | ### Step 1: Design and Test a Model Architecture 59 | Design and implement a deep learning model that learns to recognize sequences of digits. Train the model using synthetic data generated by concatenating character images from [notMNIST](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html) or [MNIST](http://yann.lecun.com/exdb/mnist/). To produce a synthetic sequence of digits for testing, you can for example limit yourself to sequences up to five digits, and use five classifiers on top of your deep network. You would have to incorporate an additional "blank" character to account for shorter number sequences. 60 | 61 | There are various aspects to consider when thinking about this problem: 62 | - Your model can be derived from a deep neural net or a convolutional network. 63 | - You could experiment with sharing (or not sharing) the weights between the softmax classifiers. 64 | - You can also use a recurrent network in your deep neural net to replace the classification layers and directly emit the sequence of digits one-at-a-time. 65 | 66 | Here is an example of a [published baseline model on this problem](http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42241.pdf). ([video](https://www.youtube.com/watch?v=vGPI_JvLoN0)) 67 | 68 | ***QUESTION:*** _What approach did you take in coming up with a solution to this problem?_ 69 | 70 | ***QUESTION:*** _What does your final architecture look like?
(Type of model, layers, sizes, connectivity, etc.)_ 71 | 72 | ***QUESTION:*** _How did you train your model? How did you generate your synthetic dataset?_ 73 | 74 | ### Step 2: Train a Model on a Realistic Dataset 75 | Once you have settled on a good architecture, you can train your model on real data. In particular, the [Street View House Numbers (SVHN)](http://ufldl.stanford.edu/housenumbers/) dataset is a good large-scale dataset collected from house numbers in Google Street View. Training on this more challenging dataset, where the digits are not neatly lined-up and have various skews, fonts and colors, likely means you have to do some hyperparameter exploration to perform well. 76 | 77 | ***QUESTION:*** _Describe how you set up the training and testing data for your model. How does the model perform on a realistic dataset?_ 78 | 79 | ***QUESTION:*** _What changes did you have to make, if any, to achieve "good" results? Were there any options you explored that made the results worse?_ 80 | 81 | ***QUESTION:*** _What were your initial and final results with testing on a realistic dataset? Do you believe your model is doing a good enough job at classifying numbers correctly?_ 82 | 83 | ### Step 3: Test a Model on Newly-Captured Images 84 | 85 | Take several pictures of numbers that you find around you (at least five), and run them through your classifier on your computer to produce example results. Alternatively (optionally), you can try using OpenCV / SimpleCV / Pygame to capture live images from a webcam and run those through your classifier. 86 | 87 | ***QUESTION:*** _Choose five candidate images of numbers you took from around you and provide them in the report. 
Are there any particular qualities of the image(s) that might make classification difficult?_ 88 | 89 | ***QUESTION:*** _Is your model able to perform equally well on captured pictures or a live camera stream when compared to testing on the realistic dataset?_ 90 | 91 | ***QUESTION:*** _If necessary, provide documentation for how an interface was built for your model to load and classify newly-acquired images._ 92 | 93 | ### Step 4: Explore an Improvement for a Model 94 | 95 | There are many things you can do once you have the basic classifier in place. One example would be to also localize where the numbers are on the image. The SVHN dataset provides bounding boxes that you can tune to train a localizer. Train a regression loss to the coordinates of the bounding box, and then test it. 96 | 97 | ***QUESTION:*** _How well does your model localize numbers on the testing set from the realistic dataset? Do your classification results change at all with localization included?_ 98 | 99 | ***QUESTION:*** _Test the localization function on the images you captured in **Step 3**. Does the model accurately calculate a bounding box for the numbers in the images you found? If you did not use a graphical interface, you may need to investigate the bounding boxes by hand._ 100 | 101 | ### Step 5: Build an Application or Program for a Model (Optional) 102 | Take your project one step further. If you're interested, look to build an Android application or even a more robust Python program that can interface with input images and display the classified numbers and even the bounding boxes. You can for example try to build an augmented reality app by overlaying your answer on the image like the [Word Lens](https://en.wikipedia.org/wiki/Word_Lens) app does. 103 | 104 | Loading a TensorFlow model into a camera app on Android is demonstrated in the [TensorFlow Android demo app](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android), which you can simply modify. 
105 | 106 | If you decide to explore this optional route, be sure to document your interface and implementation, along with significant results you find. You can see the additional rubric items that you could be evaluated on by [following this link](https://review.udacity.com/#!/rubrics/413/view). 107 | 108 | ## Submitting the Project 109 | 110 | ### Evaluation 111 | Your project will be reviewed by a Udacity reviewer against the **Build a Digit Recognition Program project rubric**. Be sure to review this rubric thoroughly and self-evaluate your project before submission. All criteria found in the rubric must be *meeting specifications* for you to pass. 112 | 113 | ### Submission Files 114 | When you are ready to submit your project, collect the following files and compress them into a single archive for upload. Alternatively, you may supply the following files on your GitHub Repo in a folder named `digit_recognition` for ease of access: 115 | - The `digit_recognition.ipynb` notebook file with all questions answered and all code cells executed and displaying output. 116 | - An **HTML** export of the project notebook with the name **report.html**. This file *must* be present for your project to be evaluated. 117 | - Any additional datasets or images used for the project that are not from the SVHN, notMNIST, or MNIST datasets. 118 | - For the optional image recognition software component, any additional Python files necessary to run the code. 119 | - For the optional Android application component, documentation for accessing the application. This should be a PDF report with the name **documentation.pdf** 120 | 121 | Once you have collected these files and reviewed the project rubric, proceed to the project submission page. 122 | 123 | ### I'm Ready! 124 | When you're ready to submit your project, click on the **Submit Project** button at the bottom of the page. 
125 | 126 | If you are having any problems submitting your project or wish to check on the status of your submission, please email us at **machine-support@udacity.com** or visit us in the discussion forums. 127 | 128 | ### What's Next? 129 | You will get an email as soon as your reviewer has feedback for you. In the meantime, review your next project and feel free to get started on it or the courses supporting it! 130 | -------------------------------------------------------------------------------- /projects/finding_donors/README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Engineer Nanodegree 2 | # Supervised Learning 3 | ## Project: Finding Donors for CharityML 4 | 5 | ### Install 6 | 7 | This project requires **Python 2.7** and the following Python libraries installed: 8 | 9 | - [NumPy](http://www.numpy.org/) 10 | - [Pandas](http://pandas.pydata.org) 11 | - [matplotlib](http://matplotlib.org/) 12 | - [scikit-learn](http://scikit-learn.org/stable/) 13 | 14 | You will also need to have software installed to run and execute an [iPython Notebook](http://ipython.org/notebook.html) 15 | 16 | We recommend students install [Anaconda](https://www.continuum.io/downloads), a pre-packaged Python distribution that contains all of the necessary libraries and software for this project. 17 | 18 | ### Code 19 | 20 | Template code is provided in the `finding_donors.ipynb` notebook file. You will also be required to use the included `visuals.py` Python file and the `census.csv` dataset file to complete your work. While some code has already been implemented to get you started, you will need to implement additional functionality when requested to successfully complete the project. Note that the code included in `visuals.py` is meant to be used out-of-the-box and not intended for students to manipulate. 
If you are interested in how the visualizations are created in the notebook, please feel free to explore this Python file. 21 | 22 | ### Run 23 | 24 | In a terminal or command window, navigate to the top-level project directory `finding_donors/` (that contains this README) and run one of the following commands: 25 | 26 | ```bash 27 | ipython notebook finding_donors.ipynb 28 | ``` 29 | or 30 | ```bash 31 | jupyter notebook finding_donors.ipynb 32 | ``` 33 | 34 | This will open the iPython Notebook software and project file in your browser. 35 | 36 | ### Data 37 | 38 | The modified census dataset consists of approximately 32,000 data points, with each datapoint having 13 features. This dataset is a modified version of the dataset published in the paper *"Scaling Up the Accuracy of Naive-Bayes Classifiers: a Decision-Tree Hybrid",* by Ron Kohavi. You may find this paper [online](https://www.aaai.org/Papers/KDD/1996/KDD96-033.pdf), with the original dataset hosted on [UCI](https://archive.ics.uci.edu/ml/datasets/Census+Income). 
39 | 40 | **Features** 41 | - `age`: Age 42 | - `workclass`: Working Class (Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked) 43 | - `education_level`: Level of Education (Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool) 44 | - `education-num`: Number of educational years completed 45 | - `marital-status`: Marital status (Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse) 46 | - `occupation`: Work Occupation (Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces) 47 | - `relationship`: Relationship Status (Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried) 48 | - `race`: Race (White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black) 49 | - `sex`: Sex (Female, Male) 50 | - `capital-gain`: Monetary Capital Gains 51 | - `capital-loss`: Monetary Capital Losses 52 | - `hours-per-week`: Average Hours Per Week Worked 53 | - `native-country`: Native Country (United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands) 54 | 55 | **Target Variable** 56 | - `income`: Income Class (<=50K, >50K) 57 | -------------------------------------------------------------------------------- /projects/finding_donors/project_description.md: -------------------------------------------------------------------------------- 1 | # Content: Supervised Learning 
2 | ## Project: Finding Donors for CharityML 3 | 4 | ## Project Overview 5 | In this project, you will apply supervised learning techniques and an analytical mind on data collected for the U.S. census to help CharityML (a fictitious charity organization) identify people most likely to donate to their cause. You will first explore the data to learn how the census data is recorded. Next, you will apply a series of transformations and preprocessing techniques to manipulate the data into a workable format. You will then evaluate several supervised learners of your choice on the data, and consider which is best suited for the solution. Afterwards, you will optimize the model you've selected and present it as your solution to CharityML. Finally, you will explore the chosen model and its predictions under the hood, to see just how well it's performing when considering the data it's given. 6 | 7 | 8 | ## Project Highlights 9 | This project is designed to get you acquainted with the many supervised learning algorithms available in sklearn, and to also provide for a method of evaluating just how each model works and performs on a certain type of data. It is important in machine learning to understand exactly when and where a certain algorithm should be used, and when one should be avoided. 10 | 11 | Things you will learn by completing this project: 12 | - How to identify when preprocessing is needed, and how to apply it. 13 | - How to establish a benchmark for a solution to the problem. 14 | - What each of several supervised learning algorithms accomplishes given a specific dataset. 15 | - How to investigate whether a candidate solution model is adequate for the problem.
16 | 17 | ## Software Requirements 18 | 19 | This project uses the following software and Python libraries: 20 | 21 | - [Python 2.7](https://www.python.org/download/releases/2.7/) 22 | - [NumPy](http://www.numpy.org/) 23 | - [Pandas](http://pandas.pydata.org/) 24 | - [scikit-learn](http://scikit-learn.org/stable/) 25 | - [matplotlib](http://matplotlib.org/) 26 | 27 | You will also need to have software installed to run and execute a [Jupyter Notebook](http://ipython.org/notebook.html) 28 | 29 | If you do not have Python installed yet, it is highly recommended that you install the [Anaconda](http://continuum.io/downloads) distribution of Python, which already has the above packages and more included. Make sure that you select the Python 2.7 installer and not the Python 3.x installer. 30 | 31 | ## Starting the Project 32 | 33 | For this assignment, you can find the `finding_donors` folder containing the necessary project files on the [Machine Learning projects GitHub](https://github.com/udacity/machine-learning), under the `projects` folder. You may download all of the files for projects we'll use in this Nanodegree program directly from this repo. Please make sure that you use the most recent version of project files when completing a project! 34 | 35 | This project contains three files: 36 | 37 | - `finding_donors.ipynb`: This is the main file where you will be performing your work on the project. 38 | - `census.csv`: The project dataset. You'll load this data in the notebook. 39 | - `visuals.py`: A Python file containing visualization code that is run behind-the-scenes. Do not modify 40 | 41 | In the Terminal or Command Prompt, navigate to the folder containing the project files, and then use the command `jupyter notebook finding_donors.ipynb` to open up a browser window or tab to work with your notebook. Alternatively, you can use the command `jupyter notebook` or `ipython notebook` and navigate to the notebook file in the browser window that opens. 
Follow the instructions in the notebook and answer each question presented to successfully complete the project. A **README** file has also been provided with the project files which may contain additional necessary information or instruction for the project. 42 | 43 | ## Submitting the Project 44 | 45 | ### Evaluation 46 | Your project will be reviewed by a Udacity reviewer against the **Finding Donors for CharityML project rubric**. Be sure to review this rubric thoroughly and self-evaluate your project before submission. All criteria found in the rubric must be *meeting specifications* for you to pass. 47 | 48 | ### Submission Files 49 | When you are ready to submit your project, collect the following files and compress them into a single archive for upload. Alternatively, you may supply the following files on your GitHub Repo in a folder named `finding_donors` for ease of access: 50 | - The `finding_donors.ipynb` notebook file with all questions answered and all code cells executed and displaying output. 51 | - An **HTML** export of the project notebook with the name **report.html**. This file *must* be present for your project to be evaluated. 52 | 53 | Once you have collected these files and reviewed the project rubric, proceed to the project submission page. 54 | 55 | ### I'm Ready! 56 | When you're ready to submit your project, click on the **Submit Project** button at the bottom of the page. 57 | 58 | If you are having any problems submitting your project or wish to check on the status of your submission, please email us at **machine-support@udacity.com** or visit us in the discussion forums. 59 | 60 | ### What's Next? 61 | You will get an email as soon as your reviewer has feedback for you. In the meantime, review your next project and feel free to get started on it or the courses supporting it!
62 | -------------------------------------------------------------------------------- /projects/finding_donors/visuals.py: -------------------------------------------------------------------------------- 1 | ########################################### 2 | # Suppress matplotlib user warnings 3 | # Necessary for newer version of matplotlib 4 | import warnings 5 | warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib") 6 | # 7 | # Display inline matplotlib plots with IPython 8 | from IPython import get_ipython 9 | get_ipython().run_line_magic('matplotlib', 'inline') 10 | ########################################### 11 | 12 | import matplotlib.pyplot as pl 13 | import matplotlib.patches as mpatches 14 | import numpy as np 15 | import pandas as pd 16 | from time import time 17 | from sklearn.metrics import f1_score, accuracy_score 18 | 19 | 20 | def distribution(data, transformed = False): 21 | """ 22 | Visualization code for displaying skewed distributions of the 'capital-gain' and 'capital-loss' features; pass transformed = True to title the figure as log-transformed 23 | """ 24 | 25 | # Create figure 26 | fig = pl.figure(figsize = (11,5)); 27 | 28 | # Skewed feature plotting (the y-axis is capped at 2000 records and the top tick is relabeled ">2000") 29 | for i, feature in enumerate(['capital-gain','capital-loss']): 30 | ax = fig.add_subplot(1, 2, i+1) 31 | ax.hist(data[feature], bins = 25, color = '#00A0A0') 32 | ax.set_title("'%s' Feature Distribution"%(feature), fontsize = 14) 33 | ax.set_xlabel("Value") 34 | ax.set_ylabel("Number of Records") 35 | ax.set_ylim((0, 2000)) 36 | ax.set_yticks([0, 500, 1000, 1500, 2000]) 37 | ax.set_yticklabels([0, 500, 1000, 1500, ">2000"]) 38 | 39 | # Plot aesthetics 40 | if transformed: 41 | fig.suptitle("Log-transformed Distributions of Continuous Census Data Features", \ 42 | fontsize = 16, y = 1.03) 43 | else: 44 | fig.suptitle("Skewed Distributions of Continuous Census Data Features", \ 45 | fontsize = 16, y = 1.03) 46 | 47 | fig.tight_layout() 48 | fig.show() 49 | 50 | 51 | def evaluate(results, accuracy, f1): 52 | """ 53 | Visualization code to display results of various
learners. 54 | 55 | inputs: 56 | - results: a dictionary keyed by learner name 57 | - results[learner][i]: a dictionary of the statistic results from 'train_predict()' for the i-th of three training set sizes 58 | - accuracy: The accuracy score for the naive predictor 59 | - f1: The F-score for the naive predictor 60 | """ 61 | 62 | # Create figure 63 | fig, ax = pl.subplots(2, 4, figsize = (11,7)) 64 | 65 | # Constants (three colors are defined, so at most three learners can be plotted) 66 | bar_width = 0.3 67 | colors = ['#A00000','#00A0A0','#00A000'] 68 | 69 | # Super loop to plot six panels of data (one panel per metric) 70 | for k, learner in enumerate(results.keys()): 71 | for j, metric in enumerate(['train_time', 'acc_train', 'f_train', 'pred_time', 'acc_test', 'f_test']): 72 | for i in np.arange(3): 73 | 74 | # Creative plot code 75 | ax[j//3, j%3].bar(i+k*bar_width, results[learner][i][metric], width = bar_width, color = colors[k]) 76 | ax[j//3, j%3].set_xticks([0.45, 1.45, 2.45]) 77 | ax[j//3, j%3].set_xticklabels(["1%", "10%", "100%"]) 78 | ax[j//3, j%3].set_xlabel("Training Set Size") 79 | ax[j//3, j%3].set_xlim((-0.1, 3.0)) 80 | 81 | # Add unique y-labels 82 | ax[0, 0].set_ylabel("Time (in seconds)") 83 | ax[0, 1].set_ylabel("Accuracy Score") 84 | ax[0, 2].set_ylabel("F-score") 85 | ax[1, 0].set_ylabel("Time (in seconds)") 86 | ax[1, 1].set_ylabel("Accuracy Score") 87 | ax[1, 2].set_ylabel("F-score") 88 | 89 | # Add titles 90 | ax[0, 0].set_title("Model Training") 91 | ax[0, 1].set_title("Accuracy Score on Training Subset") 92 | ax[0, 2].set_title("F-score on Training Subset") 93 | ax[1, 0].set_title("Model Predicting") 94 | ax[1, 1].set_title("Accuracy Score on Testing Set") 95 | ax[1, 2].set_title("F-score on Testing Set") 96 | 97 | # Add horizontal lines for naive predictors 98 | ax[0, 1].axhline(y = accuracy, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed') 99 | ax[1, 1].axhline(y = accuracy, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed') 100 | ax[0, 2].axhline(y = f1, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
101 | ax[1, 2].axhline(y = f1, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed') 102 | 103 | # Set y-limits for score panels 104 | ax[0, 1].set_ylim((0, 1)) 105 | ax[0, 2].set_ylim((0, 1)) 106 | ax[1, 1].set_ylim((0, 1)) 107 | ax[1, 2].set_ylim((0, 1)) 108 | 109 | # Hide the unused fourth column of panels 110 | ax[0, 3].set_visible(False) 111 | ax[1, 3].axis('off') 112 | 113 | # Create legend (zero-size bars only supply the legend entries) 114 | for i, learner in enumerate(results.keys()): 115 | pl.bar(0, 0, color=colors[i], label=learner) 116 | pl.legend() 117 | 118 | # Aesthetics 119 | pl.suptitle("Performance Metrics for Three Supervised Learning Models", fontsize = 16, y = 1.10) 120 | pl.tight_layout() 121 | pl.show() 122 | 123 | 124 | def feature_plot(importances, X_train, y_train): 125 | # Bar chart of the five largest feature importances and their cumulative sum (y_train is not used) 126 | # Display the five most important features 127 | indices = np.argsort(importances)[::-1] 128 | columns = X_train.columns.values[indices[:5]] 129 | values = importances[indices][:5] 130 | 131 | # Create the plot 132 | fig = pl.figure(figsize = (9,5)) 133 | pl.title("Normalized Weights for First Five Most Predictive Features", fontsize = 16) 134 | pl.bar(np.arange(5), values, width = 0.6, align="center", color = '#00A000', \ 135 | label = "Feature Weight") 136 | pl.bar(np.arange(5) - 0.3, np.cumsum(values), width = 0.2, align = "center", color = '#00A0A0', \ 137 | label = "Cumulative Feature Weight") 138 | pl.xticks(np.arange(5), columns) 139 | pl.xlim((-0.5, 4.5)) 140 | pl.ylabel("Weight", fontsize = 12) 141 | pl.xlabel("Feature", fontsize = 12) 142 | 143 | pl.legend(loc = 'upper center') 144 | pl.tight_layout() 145 | pl.show() 146 | -------------------------------------------------------------------------------- /projects/image-classification/ReadMe.md: -------------------------------------------------------------------------------- 1 | This project has been written in Python 3.x.
def load_cfar10_batch(cifar10_dataset_folder_path, batch_id):
    """
    Read one training batch file and return its images and labels.

    The file `<folder>/data_batch_<batch_id>` is unpickled; its 'data' rows
    are reshaped to (N, 3, 32, 32) and then reordered to (N, 32, 32, 3).
    The 'labels' entry is returned unchanged.
    """
    # NOTE: pickle executes arbitrary code on load; only use trusted batch files.
    batch_path = cifar10_dataset_folder_path + '/data_batch_' + str(batch_id)
    with open(batch_path, mode='rb') as batch_file:
        raw_batch = pickle.load(batch_file, encoding='latin1')

    pixel_rows = raw_batch['data']
    channels_first = pixel_rows.reshape((len(pixel_rows), 3, 32, 32))
    channels_last = channels_first.transpose(0, 2, 3, 1)

    return channels_last, raw_batch['labels']
def _preprocess_and_save(normalize, one_hot_encode, features, labels, filename):
    """
    Preprocess data and save it to file

    : normalize: callable applied to `features`
    : one_hot_encode: callable applied to `labels`
    : features: raw features handed to `normalize`
    : labels: raw labels handed to `one_hot_encode`
    : filename: path of the pickle file to (over)write
    """
    features = normalize(features)
    labels = one_hot_encode(labels)

    # A context manager guarantees the pickle is flushed and the handle closed,
    # even if dumping raises; the bare open() used before leaked the handle.
    with open(filename, 'wb') as out_file:
        pickle.dump((features, labels), out_file)
def batch_features_labels(features, labels, batch_size):
    """
    Split features and labels into batches

    Yields (features, labels) slices of at most `batch_size` items each, in
    order; the final batch is shorter when the length is not a multiple.
    """
    for start in range(0, len(features), batch_size):
        end = min(start + batch_size, len(features))
        yield features[start:end], labels[start:end]


def load_preprocess_training_batch(batch_id, batch_size):
    """
    Load the Preprocessed Training data and return them in batches of <batch_size> or less

    Reads `preprocess_batch_<batch_id>.p` from the current working directory.
    """
    filename = 'preprocess_batch_' + str(batch_id) + '.p'
    # Close the file as soon as it has been read; the previous bare open()
    # left the handle to be reclaimed by the garbage collector.
    with open(filename, mode='rb') as batch_file:
        features, labels = pickle.load(batch_file)

    # Return the training data in batches of size <batch_size> or less
    return batch_features_labels(features, labels, batch_size)
* margin) / n_predictions 153 | 154 | for image_i, (feature, label_id, pred_indicies, pred_values) in enumerate(zip(features, label_ids, predictions.indices, predictions.values)): 155 | pred_names = [label_names[pred_i] for pred_i in pred_indicies] 156 | correct_name = label_names[label_id] 157 | 158 | axies[image_i][0].imshow(feature) 159 | axies[image_i][0].set_title(correct_name) 160 | axies[image_i][0].set_axis_off() 161 | 162 | axies[image_i][1].barh(ind + margin, pred_values[::-1], width) 163 | axies[image_i][1].set_yticks(ind + margin) 164 | axies[image_i][1].set_yticklabels(pred_names[::-1]) 165 | axies[image_i][1].set_xticks([0, 0.5, 1.0]) 166 | -------------------------------------------------------------------------------- /projects/image-classification/problem_unittests.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | from unittest.mock import MagicMock 6 | 7 | 8 | def _print_success_message(): 9 | print('Tests Passed') 10 | 11 | 12 | def test_folder_path(cifar10_dataset_folder_path): 13 | assert cifar10_dataset_folder_path is not None,\ 14 | 'Cifar-10 data folder not set.' 15 | assert cifar10_dataset_folder_path[-1] != '/',\ 16 | 'The "/" shouldn\'t be added to the end of the path.' 17 | assert os.path.exists(cifar10_dataset_folder_path),\ 18 | 'Path not found.' 
def test_normalize(normalize):
    """
    Check a `normalize` implementation on random image-like data.

    Feeds an array of integers in [0, 255] with shape (n, 32, 32, 3) and
    asserts that the result is a NumPy object of the same shape whose values
    all lie in [0, 1].
    """
    # Draw at least one sample: a batch size of 0 would produce an empty
    # array, and .max()/.min() on it raise ValueError instead of testing anything.
    test_shape = (np.random.choice(range(1, 1000)), 32, 32, 3)
    test_numbers = np.random.choice(range(256), test_shape)
    normalize_out = normalize(test_numbers)

    assert type(normalize_out).__module__ == np.__name__,\
        'Not Numpy Object'

    assert normalize_out.shape == test_shape,\
        'Incorrect Shape. {} shape found'.format(normalize_out.shape)

    assert normalize_out.max() <= 1 and normalize_out.min() >= 0,\
        'Incorect Range. {} to {} found'.format(normalize_out.min(), normalize_out.max())

    _print_success_message()


def test_one_hot_encode(one_hot_encode):
    """
    Check a `one_hot_encode` implementation on random labels in [0, 9].

    Asserts that the result is a NumPy object of shape (n, 10) and that
    encoding a random sample of the same labels a second time gives the same
    rows as the first call.
    """
    # At least one label is required: with zero labels the re-encoding sample
    # below (np.random.choice over an empty population) raises ValueError.
    test_shape = np.random.choice(range(1, 1000))
    test_numbers = np.random.choice(range(10), test_shape)
    one_hot_out = one_hot_encode(test_numbers)

    assert type(one_hot_out).__module__ == np.__name__,\
        'Not Numpy Object'

    assert one_hot_out.shape == (test_shape, 10),\
        'Incorrect Shape. {} shape found'.format(one_hot_out.shape)

    # Re-encode a handful of the labels and compare with the first encoding
    n_encode_tests = 5
    test_pairs = list(zip(test_numbers, one_hot_out))
    test_indices = np.random.choice(len(test_numbers), n_encode_tests)
    labels = [test_pairs[test_i][0] for test_i in test_indices]
    enc_labels = np.array([test_pairs[test_i][1] for test_i in test_indices])
    new_enc_labels = one_hot_encode(labels)

    assert np.array_equal(enc_labels, new_enc_labels),\
        'Encodings returned different results for the same numbers.\n' \
        'For the first call it returned:\n' \
        '{}\n' \
        'For the second call it returned\n' \
        '{}\n' \
        'Make sure you save the map of labels to encodings outside of the function.'.format(enc_labels, new_enc_labels)

    _print_success_message()
def test_con_pool(conv2d_maxpool):
    """
    Check the output shape of a `conv2d_maxpool` implementation.

    Calls it on a [None, 32, 32, 5] float32 placeholder with 10 outputs,
    a (2, 2) convolution kernel with (4, 4) strides and a (2, 2) pooling
    kernel with (2, 2) strides, and asserts the result is [None, 4, 4, 10].
    """
    input_tensor = tf.placeholder(tf.float32, [None, 32, 32, 5])
    n_outputs = 10
    conv_kernel, conv_strides = (2, 2), (4, 4)
    pool_kernel, pool_strides = (2, 2), (2, 2)

    layer_out = conv2d_maxpool(
        input_tensor, n_outputs, conv_kernel, conv_strides, pool_kernel, pool_strides)

    found_shape = layer_out.get_shape().as_list()
    assert found_shape == [None, 4, 4, 10],\
        'Incorrect Shape.  Found {} shape'.format(found_shape)

    _print_success_message()
def test_train_nn(train_neural_network):
    """
    Check that `train_neural_network` actually uses the session it is given.

    The session's `run` method is replaced by a MagicMock, the function is
    called with random features, labels and keep probability, and the test
    asserts that `run` was called.
    """
    mock_session = tf.Session()
    try:
        test_x = np.random.rand(128, 32, 32, 3)
        test_y = np.random.rand(128, 10)
        test_k = np.random.rand(1)
        test_optimizer = tf.train.AdamOptimizer()

        # Only `run` is stubbed; nothing is executed on the real session
        mock_session.run = MagicMock()
        train_neural_network(mock_session, test_optimizer, test_k, test_x, test_y)

        assert mock_session.run.called, 'Session not used'
    finally:
        # The session is real even though `run` is mocked, so release its
        # resources whether or not the check above passes.
        mock_session.close()

    _print_success_message()
ipywidgets==5.2.2 23 | - jinja2==2.8 24 | - jsonschema==2.5.1 25 | - jupyter==1.0.0 26 | - jupyter-client==4.4.0 27 | - jupyter-console==5.0.0 28 | - jupyter-core==4.2.0 29 | - markupsafe==0.23 30 | - matplotlib==1.5.3 31 | - mistune==0.7.3 32 | - nbconvert==4.2.0 33 | - nbformat==4.1.0 34 | - notebook==4.2.3 35 | - numpy==1.11.2 36 | - pexpect==4.2.1 37 | - pickleshare==0.7.4 38 | - pillow==3.4.2 39 | - prompt-toolkit==1.0.8 40 | - protobuf==3.1.0.post1 41 | - ptyprocess==0.5.1 42 | - pygments==2.1.3 43 | - pyparsing==2.1.10 44 | - python-dateutil==2.5.3 45 | - pytz==2016.7 46 | - pyzmq==16.0.0 47 | - qtconsole==4.2.1 48 | - scikit-learn==0.18 49 | - scipy==0.18.1 50 | - simplegeneric==0.8.1 51 | - six==1.10.0 52 | - sklearn==0.0 53 | - tensorflow>=0.12.1 54 | - terminado==0.6 55 | - tornado==4.4.2 56 | - tqdm==4.8.4 57 | - traitlets==4.3.1 58 | - wcwidth==0.1.7 59 | - widgetsnbextension==1.2.6 60 | -------------------------------------------------------------------------------- /projects/intro-to-tensorflow/environment_win.yml: -------------------------------------------------------------------------------- 1 | name: dlnd-tf-lab 2 | channels: !!python/tuple 3 | - defaults 4 | dependencies: 5 | - bleach=1.5.0=py35_0 6 | - bzip2=1.0.6=vc14_3 7 | - colorama=0.3.7=py35_0 8 | - cycler=0.10.0=py35_0 9 | - decorator=4.0.11=py35_0 10 | - entrypoints=0.2.2=py35_1 11 | - freetype=2.5.5=vc14_2 12 | - html5lib=0.999=py35_0 13 | - icu=57.1=vc14_0 14 | - ipykernel=4.5.2=py35_0 15 | - ipython=5.2.2=py35_0 16 | - ipython_genutils=0.1.0=py35_0 17 | - ipywidgets=5.2.2=py35_1 18 | - jinja2=2.9.4=py35_0 19 | - jpeg=9b=vc14_0 20 | - jsonschema=2.5.1=py35_0 21 | - jupyter=1.0.0=py35_3 22 | - jupyter_client=4.4.0=py35_0 23 | - jupyter_console=5.0.0=py35_0 24 | - jupyter_core=4.3.0=py35_0 25 | - libpng=1.6.27=vc14_0 26 | - libtiff=4.0.6=vc14_3 27 | - markupsafe=0.23=py35_2 28 | - matplotlib=2.0.0=np112py35_0 29 | - mistune=0.7.3=py35_0 30 | - mkl=2017.0.1=0 31 | - nbconvert=5.1.1=py35_0 
32 | - nbformat=4.2.0=py35_0 33 | - notebook=4.3.1=py35_1 34 | - numpy=1.12.0=py35_0 35 | - olefile=0.44=py35_0 36 | - openssl=1.0.2k=vc14_0 37 | - pandas=0.19.2=np112py35_1 38 | - pandocfilters=1.4.1=py35_0 39 | - path.py=10.1=py35_0 40 | - pickleshare=0.7.4=py35_0 41 | - pillow=4.0.0=py35_1 42 | - pip=9.0.1=py35_1 43 | - prompt_toolkit=1.0.9=py35_0 44 | - pygments=2.1.3=py35_0 45 | - pyparsing=2.1.4=py35_0 46 | - pyqt=5.6.0=py35_2 47 | - python=3.5.2=0 48 | - python-dateutil=2.6.0=py35_0 49 | - pytz=2016.10=py35_0 50 | - pyzmq=16.0.2=py35_0 51 | - qt=5.6.2=vc14_3 52 | - qtconsole=4.2.1=py35_2 53 | - scikit-learn=0.18.1=np112py35_1 54 | - scipy=0.18.1=np112py35_1 55 | - setuptools=27.2.0=py35_1 56 | - simplegeneric=0.8.1=py35_1 57 | - sip=4.18=py35_0 58 | - six=1.10.0=py35_0 59 | - testpath=0.3=py35_0 60 | - tk=8.5.18=vc14_0 61 | - tornado=4.4.2=py35_0 62 | - traitlets=4.3.1=py35_0 63 | - vs2015_runtime=14.0.25123=0 64 | - wcwidth=0.1.7=py35_0 65 | - wheel=0.29.0=py35_0 66 | - widgetsnbextension=1.2.6=py35_0 67 | - win_unicode_console=0.5=py35_0 68 | - zlib=1.2.8=vc14_3 69 | - pip: 70 | - ipython-genutils==0.1.0 71 | - jupyter-client==4.4.0 72 | - jupyter-console==5.0.0 73 | - jupyter-core==4.3.0 74 | - prompt-toolkit==1.0.9 75 | - protobuf==3.2.0 76 | - tensorflow==1.0.0 77 | - tqdm==4.11.2 78 | - win-unicode-console==0.5 79 | prefix: C:\Users\Mat\Anaconda3\envs\dlnd-tf-lab 80 | 81 | -------------------------------------------------------------------------------- /projects/intro-to-tensorflow/image/Learn Rate Tune - Image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/intro-to-tensorflow/image/Learn Rate Tune - Image.png -------------------------------------------------------------------------------- /projects/intro-to-tensorflow/image/Mean Variance - Image.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/intro-to-tensorflow/image/Mean Variance - Image.png -------------------------------------------------------------------------------- /projects/intro-to-tensorflow/image/network_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/intro-to-tensorflow/image/network_diagram.png -------------------------------------------------------------------------------- /projects/intro-to-tensorflow/image/notmnist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/intro-to-tensorflow/image/notmnist.png -------------------------------------------------------------------------------- /projects/intro-to-tensorflow/intro_to_tensorflow_solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Solutions\n", 10 | "## Problem 1\n", 11 | "Implement the Min-Max scaling function ($X'=a+{\\frac {\\left(X-X_{\\min }\\right)\\left(b-a\\right)}{X_{\\max }-X_{\\min }}}$) with the parameters:\n", 12 | "\n", 13 | "$X_{\\min }=0$\n", 14 | "\n", 15 | "$X_{\\max }=255$\n", 16 | "\n", 17 | "$a=0.1$\n", 18 | "\n", 19 | "$b=0.9$" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "# Problem 1 - Implement Min-Max scaling for grayscale image data\n", 31 | "def normalize_grayscale(image_data):\n", 32 | " \"\"\"\n", 33 | " 
Normalize the image data with Min-Max scaling to a range of [0.1, 0.9]\n", 34 | " :param image_data: The image data to be normalized\n", 35 | " :return: Normalized image data\n", 36 | " \"\"\"\n", 37 | " a = 0.1\n", 38 | " b = 0.9\n", 39 | " grayscale_min = 0\n", 40 | " grayscale_max = 255\n", 41 | " return a + ( ( (image_data - grayscale_min)*(b - a) )/( grayscale_max - grayscale_min ) )" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Problem 2\n", 49 | "- Use [tf.placeholder()](https://www.tensorflow.org/api_docs/python/io_ops.html#placeholder) for `features` and `labels` since they are the inputs to the model.\n", 50 | "- Any math operations must have the same type on both sides of the operator. The weights are float32, so the `features` and `labels` must also be float32.\n", 51 | "- Use [tf.Variable()](https://www.tensorflow.org/api_docs/python/state_ops.html#Variable) to allow `weights` and `biases` to be modified.\n", 52 | "- The `weights` must be the dimensions of features by labels. The number of features is the size of the image, 28*28=784. The size of labels is 10.\n", 53 | "- The `biases` must be the dimensions of the labels, which is 10." 
54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "features_count = 784\n", 65 | "labels_count = 10\n", 66 | "\n", 67 | "# Problem 2 - Set the features and labels tensors\n", 68 | "features = tf.placeholder(tf.float32)\n", 69 | "labels = tf.placeholder(tf.float32)\n", 70 | "\n", 71 | "# Problem 2 - Set the weights and biases tensors\n", 72 | "weights = tf.Variable(tf.truncated_normal((features_count, labels_count)))\n", 73 | "biases = tf.Variable(tf.zeros(labels_count))" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "# Problem 3\n", 81 | "Configuration 1\n", 82 | "* **Epochs:** 1\n", 83 | "* **Learning Rate:** 0.1\n", 84 | "\n", 85 | "Configuration 2\n", 86 | "* **Epochs:** 4 or 5\n", 87 | "* **Learning Rate:** 0.2" 88 | ] 89 | } 90 | ], 91 | "metadata": { 92 | "kernelspec": { 93 | "display_name": "Python 3", 94 | "language": "python", 95 | "name": "python3" 96 | }, 97 | "language_info": { 98 | "codemirror_mode": { 99 | "name": "ipython", 100 | "version": 3 101 | }, 102 | "file_extension": ".py", 103 | "mimetype": "text/x-python", 104 | "name": "python", 105 | "nbconvert_exporter": "python", 106 | "pygments_lexer": "ipython3", 107 | "version": "3.5.2" 108 | } 109 | }, 110 | "nbformat": 4, 111 | "nbformat_minor": 0 112 | } 113 | -------------------------------------------------------------------------------- /projects/practice_projects/cnn/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | mnist-mlp/.ipynb_checkpoints/ 3 | mnist-mlp/.DS_Store 4 | conv-visualization/.ipynb_checkpoints/ 5 | conv-visualization/.DS_Store 6 | cifar10-classification/.ipynb_checkpoints/ 7 | cifar10-classification/.DS_Store 8 | cifar10-augmentation/.ipynb_checkpoints/ 9 | cifar10-augmentation/.DS_Store 10 | transfer-learning/dogImages 11 | 
transfer-learning/bottleneck_features/DogVGG16Data.npz 12 | transfer-learning/.ipynb_checkpoints/ 13 | transfer-learning/.DS_Store 14 | -------------------------------------------------------------------------------- /projects/practice_projects/cnn/README.md: -------------------------------------------------------------------------------- 1 | # cnn practice projects -------------------------------------------------------------------------------- /projects/practice_projects/cnn/cifar10-augmentation/aug_model.weights.best.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/cnn/cifar10-augmentation/aug_model.weights.best.hdf5 -------------------------------------------------------------------------------- /projects/practice_projects/cnn/cifar10-classification/MLP.weights.best.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/cnn/cifar10-classification/MLP.weights.best.hdf5 -------------------------------------------------------------------------------- /projects/practice_projects/cnn/cifar10-classification/model.weights.best.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/cnn/cifar10-classification/model.weights.best.hdf5 -------------------------------------------------------------------------------- /projects/practice_projects/cnn/conv-visualization/images/udacity_sdc.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/cnn/conv-visualization/images/udacity_sdc.png -------------------------------------------------------------------------------- /projects/practice_projects/cnn/mnist-mlp/mnist.model.best.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/cnn/mnist-mlp/mnist.model.best.hdf5 -------------------------------------------------------------------------------- /projects/practice_projects/cnn/requirements/aind-dog-linux.yml: -------------------------------------------------------------------------------- 1 | name: aind-dog 2 | channels: 3 | - defaults 4 | dependencies: 5 | - openssl=1.0.2l=0 6 | - pip=9.0.1=py36_1 7 | - python=3.6.1=2 8 | - readline=6.2=2 9 | - setuptools=27.2.0=py36_0 10 | - sqlite=3.13.0=0 11 | - tk=8.5.18=0 12 | - wheel=0.29.0=py36_0 13 | - xz=5.2.2=1 14 | - zlib=1.2.8=3 15 | - pip: 16 | - bleach==2.0.0 17 | - cycler==0.10.0 18 | - decorator==4.0.11 19 | - entrypoints==0.2.3 20 | - h5py==2.6.0 21 | - html5lib==0.999999999 22 | - ipykernel==4.6.1 23 | - ipython==6.1.0 24 | - ipython-genutils==0.2.0 25 | - ipywidgets==6.0.0 26 | - jedi==0.10.2 27 | - jinja2==2.9.6 28 | - jsonschema==2.6.0 29 | - jupyter==1.0.0 30 | - jupyter-client==5.0.1 31 | - jupyter-console==5.1.0 32 | - jupyter-core==4.3.0 33 | - keras==2.0.2 34 | - markupsafe==1.0 35 | - matplotlib==2.0.0 36 | - mistune==0.7.4 37 | - nbconvert==5.2.1 38 | - nbformat==4.3.0 39 | - notebook==5.0.0 40 | - numpy==1.12.0 41 | - olefile==0.44 42 | - opencv-python==3.2.0.6 43 | - pandocfilters==1.4.1 44 | - pexpect==4.2.1 45 | - pickleshare==0.7.4 46 | - pillow==4.0.0 47 | - prompt-toolkit==1.0.14 48 | - protobuf==3.3.0 49 | - ptyprocess==0.5.1 50 | - pygments==2.2.0 51 | - pyparsing==2.2.0 52 | - 
python-dateutil==2.6.0 53 | - pytz==2017.2 54 | - pyyaml==3.12 55 | - pyzmq==16.0.2 56 | - qtconsole==4.3.0 57 | - scikit-learn==0.18.1 58 | - scipy==0.18.1 59 | - simplegeneric==0.8.1 60 | - six==1.10.0 61 | - tensorflow==1.0.0 62 | - terminado==0.6 63 | - testpath==0.3.1 64 | - theano==0.9.0 65 | - tornado==4.5.1 66 | - tqdm==4.11.2 67 | - traitlets==4.3.2 68 | - wcwidth==0.1.7 69 | - webencodings==0.5.1 70 | - widgetsnbextension==2.0.0 71 | -------------------------------------------------------------------------------- /projects/practice_projects/cnn/requirements/aind-dog-mac.yml: -------------------------------------------------------------------------------- 1 | name: aind-dog 2 | channels: 3 | - damianavila82 4 | - defaults 5 | dependencies: 6 | - rise=4.0.0b1=py35_0 7 | - _license=1.1=py35_1 8 | - alabaster=0.7.10=py35_0 9 | - anaconda-client=1.6.2=py35_0 10 | - anaconda=custom=py35_0 11 | - anaconda-navigator=1.5.0=py35_0 12 | - anaconda-project=0.4.1=py35_0 13 | - appnope=0.1.0=py35_0 14 | - appscript=1.0.1=py35_0 15 | - astroid=1.4.9=py35_0 16 | - astropy=1.3=np112py35_0 17 | - babel=2.3.4=py35_0 18 | - backports=1.0=py35_0 19 | - beautifulsoup4=4.5.3=py35_0 20 | - bitarray=0.8.1=py35_0 21 | - blaze=0.10.1=py35_0 22 | - bleach=1.5.0=py35_0 23 | - bokeh=0.12.4=py35_0 24 | - boto=2.46.1=py35_0 25 | - bottleneck=1.2.0=np112py35_0 26 | - cffi=1.9.1=py35_0 27 | - chardet=2.3.0=py35_0 28 | - chest=0.2.3=py35_0 29 | - click=6.7=py35_0 30 | - cloudpickle=0.2.2=py35_0 31 | - clyent=1.2.2=py35_0 32 | - colorama=0.3.7=py35_0 33 | - configobj=5.0.6=py35_0 34 | - contextlib2=0.5.4=py35_0 35 | - cryptography=1.7.1=py35_0 36 | - curl=7.52.1=0 37 | - cycler=0.10.0=py35_0 38 | - cython=0.25.2=py35_0 39 | - cytoolz=0.8.2=py35_0 40 | - dask=0.14.0=py35_0 41 | - datashape=0.5.4=py35_0 42 | - decorator=4.0.11=py35_0 43 | - dill=0.2.5=py35_0 44 | - docutils=0.13.1=py35_0 45 | - entrypoints=0.2.2=py35_1 46 | - et_xmlfile=1.0.1=py35_0 47 | - fastcache=1.0.2=py35_1 48 | - 
flask=0.12=py35_0 49 | - flask-cors=3.0.2=py35_0 50 | - freetype=2.5.5=2 51 | - get_terminal_size=1.0.0=py35_0 52 | - gevent=1.2.1=py35_0 53 | - greenlet=0.4.12=py35_0 54 | - h5py=2.6.0=np112py35_2 55 | - hdf5=1.8.17=1 56 | - heapdict=1.0.0=py35_1 57 | - html5lib=0.999=py35_0 58 | - icu=54.1=0 59 | - idna=2.2=py35_0 60 | - imagesize=0.7.1=py35_0 61 | - ipykernel=4.5.2=py35_0 62 | - ipython=5.3.0=py35_0 63 | - ipython_genutils=0.1.0=py35_0 64 | - ipywidgets=6.0.0=py35_0 65 | - isort=4.2.5=py35_0 66 | - itsdangerous=0.24=py35_0 67 | - jbig=2.1=0 68 | - jdcal=1.3=py35_0 69 | - jedi=0.9.0=py35_1 70 | - jinja2=2.9.5=py35_0 71 | - jpeg=9b=0 72 | - jsonschema=2.5.1=py35_0 73 | - jupyter=1.0.0=py35_3 74 | - jupyter_client=5.0.0=py35_0 75 | - jupyter_console=5.1.0=py35_0 76 | - jupyter_core=4.3.0=py35_0 77 | - lazy-object-proxy=1.2.2=py35_0 78 | - libiconv=1.14=0 79 | - libpng=1.6.27=0 80 | - libtiff=4.0.6=3 81 | - libxml2=2.9.4=0 82 | - libxslt=1.1.29=0 83 | - llvmlite=0.16.0=py35_0 84 | - locket=0.2.0=py35_1 85 | - lxml=3.7.3=py35_0 86 | - markupsafe=0.23=py35_2 87 | - matplotlib=2.0.0=np112py35_0 88 | - mistune=0.7.4=py35_0 89 | - mkl=2017.0.1=0 90 | - mkl-service=1.1.2=py35_3 91 | - mpmath=0.19=py35_1 92 | - multipledispatch=0.4.9=py35_0 93 | - nbconvert=5.1.1=py35_0 94 | - nbformat=4.3.0=py35_0 95 | - networkx=1.11=py35_0 96 | - nltk=3.2.2=py35_0 97 | - nose=1.3.7=py35_1 98 | - notebook=4.4.1=py35_0 99 | - numba=0.31.0=np112py35_0 100 | - numexpr=2.6.2=np112py35_0 101 | - numpy=1.12.0=py35_0 102 | - numpydoc=0.6.0=py35_0 103 | - odo=0.5.0=py35_1 104 | - olefile=0.44=py35_0 105 | - openpyxl=2.4.1=py35_0 106 | - openssl=1.0.2k=0 107 | - pandas=0.19.2=np112py35_1 108 | - pandocfilters=1.4.1=py35_0 109 | - partd=0.3.7=py35_0 110 | - path.py=10.1=py35_0 111 | - pathlib2=2.2.0=py35_0 112 | - patsy=0.4.1=py35_0 113 | - pep8=1.7.0=py35_0 114 | - pexpect=4.2.1=py35_0 115 | - pickleshare=0.7.4=py35_0 116 | - pillow=4.0.0=py35_1 117 | - pip=9.0.1=py35_1 118 | - ply=3.10=py35_0 
119 | - prompt_toolkit=1.0.13=py35_0 120 | - psutil=5.2.0=py35_0 121 | - ptyprocess=0.5.1=py35_0 122 | - py=1.4.32=py35_0 123 | - pyasn1=0.2.3=py35_0 124 | - pycosat=0.6.1=py35_1 125 | - pycparser=2.17=py35_0 126 | - pycrypto=2.6.1=py35_4 127 | - pycurl=7.43.0=py35_2 128 | - pyflakes=1.5.0=py35_0 129 | - pygments=2.2.0=py35_0 130 | - pylint=1.6.4=py35_1 131 | - pyopenssl=16.2.0=py35_0 132 | - pyparsing=2.1.4=py35_0 133 | - pyqt=5.6.0=py35_2 134 | - pytables=3.3.0=np112py35_0 135 | - pytest=3.0.6=py35_0 136 | - python=3.5.3=1 137 | - python-dateutil=2.6.0=py35_0 138 | - python.app=1.2=py35_4 139 | - pytz=2016.10=py35_0 140 | - pyyaml=3.12=py35_0 141 | - pyzmq=16.0.2=py35_0 142 | - qt=5.6.2=0 143 | - qtawesome=0.4.4=py35_0 144 | - qtconsole=4.2.1=py35_1 145 | - qtpy=1.2.1=py35_0 146 | - readline=6.2=2 147 | - redis=3.2.0=0 148 | - redis-py=2.10.5=py35_0 149 | - requests=2.13.0=py35_0 150 | - rope=0.9.4=py35_1 151 | - ruamel_yaml=0.11.14=py35_1 152 | - scikit-image=0.12.3=np112py35_1 153 | - scikit-learn=0.18.1=np112py35_1 154 | - scipy=0.19.0=np112py35_0 155 | - seaborn=0.7.1=py35_0 156 | - setuptools=27.2.0=py35_0 157 | - simplegeneric=0.8.1=py35_1 158 | - singledispatch=3.4.0.3=py35_0 159 | - sip=4.18=py35_0 160 | - six=1.10.0=py35_0 161 | - snowballstemmer=1.2.1=py35_0 162 | - sockjs-tornado=1.0.3=py35_0 163 | - sphinx=1.5.1=py35_0 164 | - spyder=3.1.3=py35_0 165 | - sqlalchemy=1.1.6=py35_0 166 | - sqlite=3.13.0=0 167 | - statsmodels=0.8.0=np112py35_0 168 | - sympy=1.0=py35_0 169 | - terminado=0.6=py35_0 170 | - testpath=0.3=py35_0 171 | - tk=8.5.18=0 172 | - toolz=0.8.2=py35_0 173 | - tornado=4.4.2=py35_0 174 | - traitlets=4.3.2=py35_0 175 | - unicodecsv=0.14.1=py35_0 176 | - wcwidth=0.1.7=py35_0 177 | - werkzeug=0.12=py35_0 178 | - wheel=0.29.0=py35_0 179 | - widgetsnbextension=2.0.0=py35_0 180 | - wrapt=1.10.8=py35_0 181 | - xlrd=1.0.0=py35_0 182 | - xlsxwriter=0.9.6=py35_0 183 | - xlwings=0.10.2=py35_0 184 | - xlwt=1.2.0=py35_0 185 | - xz=5.2.2=1 186 | - 
yaml=0.1.6=0 187 | - zlib=1.2.8=3 188 | - pip: 189 | - backports.shutil-get-terminal-size==1.0.0 190 | - cvxopt==1.1.9 191 | - et-xmlfile==1.0.1 192 | - ipython-genutils==0.1.0 193 | - jupyter-client==5.0.0 194 | - jupyter-console==5.1.0 195 | - jupyter-core==4.3.0 196 | - keras==2.0.0 197 | - opencv-python==3.2.0.6 198 | - prompt-toolkit==1.0.13 199 | - protobuf==3.2.0 200 | - rope-py3k==0.9.4.post1 201 | - tables==3.3.0 202 | - tensorflow==1.0.0 203 | - theano==0.8.2 204 | - tqdm==4.11.2 -------------------------------------------------------------------------------- /projects/practice_projects/cnn/requirements/aind-dog-windows.yml: -------------------------------------------------------------------------------- 1 | name: aind-dog 2 | channels: 3 | - defaults 4 | dependencies: 5 | - _nb_ext_conf=0.3.0=py35_0 6 | - anaconda-client=1.6.2=py35_0 7 | - bleach=1.5.0=py35_0 8 | - bzip2=1.0.6=vc14_3 9 | - clyent=1.2.2=py35_0 10 | - colorama=0.3.7=py35_0 11 | - cycler=0.10.0=py35_0 12 | - decorator=4.0.11=py35_0 13 | - entrypoints=0.2.2=py35_1 14 | - freetype=2.5.5=vc14_2 15 | - h5py=2.7.0=np112py35_0 16 | - hdf5=1.8.15.1=vc14_4 17 | - html5lib=0.999=py35_0 18 | - icu=57.1=vc14_0 19 | - ipykernel=4.5.2=py35_0 20 | - ipython=5.3.0=py35_0 21 | - ipython_genutils=0.1.0=py35_0 22 | - ipywidgets=6.0.0=py35_0 23 | - jinja2=2.9.5=py35_0 24 | - jpeg=9b=vc14_0 25 | - jsonschema=2.5.1=py35_0 26 | - jupyter=1.0.0=py35_3 27 | - jupyter_client=5.0.0=py35_0 28 | - jupyter_console=5.1.0=py35_0 29 | - jupyter_core=4.3.0=py35_0 30 | - libpng=1.6.27=vc14_0 31 | - libtiff=4.0.6=vc14_3 32 | - markupsafe=0.23=py35_2 33 | - matplotlib=2.0.0=np112py35_0 34 | - mistune=0.7.4=py35_0 35 | - mkl=2017.0.1=0 36 | - nb_anacondacloud=1.2.0=py35_0 37 | - nb_conda=2.0.0=py35_0 38 | - nb_conda_kernels=2.0.0=py35_0 39 | - nbconvert=5.1.1=py35_0 40 | - nbformat=4.3.0=py35_0 41 | - nbpresent=3.0.2=py35_0 42 | - notebook=4.4.1=py35_0 43 | - numpy=1.12.1=py35_0 44 | - olefile=0.44=py35_0 45 | - 
openssl=1.0.2k=vc14_0 46 | - pandocfilters=1.4.1=py35_0 47 | - path.py=10.1=py35_0 48 | - pickleshare=0.7.4=py35_0 49 | - pillow=4.0.0=py35_1 50 | - pip=9.0.1=py35_1 51 | - prompt_toolkit=1.0.13=py35_0 52 | - pygments=2.2.0=py35_0 53 | - pyparsing=2.1.4=py35_0 54 | - pyqt=5.6.0=py35_2 55 | - python=3.5.3=0 56 | - python-dateutil=2.6.0=py35_0 57 | - pytz=2016.10=py35_0 58 | - pyyaml=3.12=py35_0 59 | - pyzmq=16.0.2=py35_0 60 | - qt=5.6.2=vc14_3 61 | - qtconsole=4.2.1=py35_2 62 | - requests=2.13.0=py35_0 63 | - scikit-learn=0.18.1=np112py35_1 64 | - scipy=0.19.0=np112py35_0 65 | - setuptools=27.2.0=py35_1 66 | - simplegeneric=0.8.1=py35_1 67 | - sip=4.18=py35_0 68 | - six=1.10.0=py35_0 69 | - testpath=0.3=py35_0 70 | - tk=8.5.18=vc14_0 71 | - tornado=4.4.2=py35_0 72 | - traitlets=4.3.2=py35_0 73 | - vs2015_runtime=14.0.25123=0 74 | - wcwidth=0.1.7=py35_0 75 | - wheel=0.29.0=py35_0 76 | - widgetsnbextension=2.0.0=py35_0 77 | - win_unicode_console=0.5=py35_0 78 | - zlib=1.2.8=vc14_3 79 | - pip: 80 | - ipython-genutils==0.1.0 81 | - jupyter-client==5.0.0 82 | - jupyter-console==5.1.0 83 | - jupyter-core==4.3.0 84 | - keras==2.0.2 85 | - nb-anacondacloud==1.2.0 86 | - nb-conda==2.0.0 87 | - nb-conda-kernels==2.0.0 88 | - opencv-python==3.1.0.0 89 | - prompt-toolkit==1.0.13 90 | - protobuf==3.2.0 91 | - tensorflow==1.0.1 92 | - theano==0.9.0 93 | - tqdm==4.11.2 94 | - win-unicode-console==0.5 95 | 96 | 97 | -------------------------------------------------------------------------------- /projects/practice_projects/cnn/requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python==3.2.0.6 2 | h5py==2.6.0 3 | matplotlib==2.0.0 4 | numpy==1.12.0 5 | scipy==0.18.1 6 | tqdm==4.11.2 7 | keras==2.0.2 8 | scikit-learn==0.18.1 9 | pillow==4.0.0 10 | tensorflow==1.0.0 -------------------------------------------------------------------------------- 
/projects/practice_projects/cnn/transfer-learning/bottleneck_features/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/cnn/transfer-learning/bottleneck_features/.gitignore -------------------------------------------------------------------------------- /projects/practice_projects/cnn/transfer-learning/dogvgg16.weights.best.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/cnn/transfer-learning/dogvgg16.weights.best.hdf5 -------------------------------------------------------------------------------- /projects/practice_projects/cnn/transfer-learning/figures/vgg16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/cnn/transfer-learning/figures/vgg16.png -------------------------------------------------------------------------------- /projects/practice_projects/cnn/transfer-learning/figures/vgg16_transfer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/cnn/transfer-learning/figures/vgg16_transfer.png -------------------------------------------------------------------------------- /projects/practice_projects/cnn/transfer-learning/images/American_water_spaniel_00648.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/cnn/transfer-learning/images/American_water_spaniel_00648.jpg -------------------------------------------------------------------------------- /projects/practice_projects/cnn/transfer-learning/images/Brittany_02625.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/cnn/transfer-learning/images/Brittany_02625.jpg -------------------------------------------------------------------------------- /projects/practice_projects/cnn/transfer-learning/images/Curly-coated_retriever_03896.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/cnn/transfer-learning/images/Curly-coated_retriever_03896.jpg -------------------------------------------------------------------------------- /projects/practice_projects/cnn/transfer-learning/images/Labrador_retriever_06449.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/cnn/transfer-learning/images/Labrador_retriever_06449.jpg -------------------------------------------------------------------------------- /projects/practice_projects/cnn/transfer-learning/images/Labrador_retriever_06455.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/cnn/transfer-learning/images/Labrador_retriever_06455.jpg 
-------------------------------------------------------------------------------- /projects/practice_projects/cnn/transfer-learning/images/Labrador_retriever_06457.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/cnn/transfer-learning/images/Labrador_retriever_06457.jpg -------------------------------------------------------------------------------- /projects/practice_projects/cnn/transfer-learning/images/Welsh_springer_spaniel_08203.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/cnn/transfer-learning/images/Welsh_springer_spaniel_08203.jpg -------------------------------------------------------------------------------- /projects/practice_projects/cnn/transfer-learning/images/sopa.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/cnn/transfer-learning/images/sopa.jpg -------------------------------------------------------------------------------- /projects/practice_projects/imdb/.gitignore: -------------------------------------------------------------------------------- 1 | # Log files (e.g. 
for TensorBoard) 2 | logs/ 3 | 4 | # Mac 5 | .DS_Store 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | env/ 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | .hypothesis/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # dotenv 89 | .env 90 | 91 | # virtualenv 92 | .venv 93 | venv/ 94 | ENV/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | -------------------------------------------------------------------------------- /projects/practice_projects/imdb/IMDB_In_Keras.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": 
{}, 6 | "source": [ 7 | "# Analyzing IMDB Data in Keras" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": false, 15 | "deletable": true, 16 | "editable": true 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "# Imports\n", 21 | "import numpy as np\n", 22 | "import keras\n", 23 | "from keras.datasets import imdb\n", 24 | "from keras.models import Sequential\n", 25 | "from keras.layers import Dense, Dropout, Activation\n", 26 | "from keras.preprocessing.text import Tokenizer\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "%matplotlib inline\n", 29 | "\n", 30 | "np.random.seed(42)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## 1. Loading the data\n", 38 | "This dataset comes preloaded with Keras, so one simple command will get us training and testing data. There is a parameter for how many words we want to look at. We've set it at 1000, but feel free to experiment." 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "collapsed": false, 46 | "deletable": true, 47 | "editable": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "# Loading the data (it's preloaded in Keras)\n", 52 | "(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=1000)\n", 53 | "\n", 54 | "print(x_train.shape)\n", 55 | "print(x_test.shape)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## 2. Examining the data\n", 63 | "Notice that the data has been already pre-processed, where all the words have numbers, and the reviews come in as a vector with the words that the review contains. For example, if the word 'the' is the first one in our dictionary, and a review contains the word 'the', then there is a 1 in the corresponding vector.\n", 64 | "\n", 65 | "The output comes as a vector of 1's and 0's, where 1 is a positive sentiment for the review, and 0 is negative." 
66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": false 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "print(x_train[0])\n", 77 | "print(y_train[0])" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "## 3. One-hot encoding the input\n", 85 | "Here, we'll turn the input vectors into (0,1)-vectors. For example, if the pre-processed vector contains the number 14, then in the processed vector, the 14th entry will be 1." 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": false, 93 | "deletable": true, 94 | "editable": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "# One-hot encoding the input into vector mode, each of length 1000\n", 99 | "tokenizer = Tokenizer(num_words=1000)\n", 100 | "x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')\n", 101 | "x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')\n", 102 | "print(x_train[0])" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "And we'll also one-hot encode the output." 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": false, 117 | "deletable": true, 118 | "editable": true 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "# One-hot encoding the output\n", 123 | "num_classes = 2\n", 124 | "y_train = keras.utils.to_categorical(y_train, num_classes)\n", 125 | "y_test = keras.utils.to_categorical(y_test, num_classes)\n", 126 | "print(y_train.shape)\n", 127 | "print(y_test.shape)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## 4. Building the model architecture\n", 135 | "Build a model here using sequential. Feel free to experiment with different layers and sizes! Also, experiment adding dropout to reduce overfitting."
136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": { 142 | "collapsed": false, 143 | "deletable": true, 144 | "editable": true 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "# TODO: Build the model architecture\n", 149 | "\n", 150 | "# TODO: Compile the model using a loss function and an optimizer.\n" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "## 5. Training the model\n", 158 | "Run the model here. Experiment with different batch_size, and number of epochs!" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": { 165 | "collapsed": false, 166 | "deletable": true, 167 | "editable": true 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "# TODO: Run the model. Feel free to experiment with different batch sizes and number of epochs." 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "## 6. Evaluating the model\n", 179 | "This will give you the accuracy of the model, as evaluated on the testing set. Can you get something over 85%?" 
180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": { 186 | "collapsed": false, 187 | "deletable": true, 188 | "editable": true 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "score = model.evaluate(x_test, y_test, verbose=0)\n", 193 | "print(\"Accuracy: \", score[1])" 194 | ] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "Python 3", 200 | "language": "python", 201 | "name": "python3" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": { 205 | "name": "ipython", 206 | "version": 3 207 | }, 208 | "file_extension": ".py", 209 | "mimetype": "text/x-python", 210 | "name": "python", 211 | "nbconvert_exporter": "python", 212 | "pygments_lexer": "ipython3", 213 | "version": "3.5.2" 214 | } 215 | }, 216 | "nbformat": 4, 217 | "nbformat_minor": 2 218 | } 219 | -------------------------------------------------------------------------------- /projects/practice_projects/imdb/IMDB_In_Keras_Solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Analyzing IMDB Data in Keras - Solution" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 29, 16 | "metadata": { 17 | "collapsed": false, 18 | "deletable": true, 19 | "editable": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "# Imports\n", 24 | "import numpy as np\n", 25 | "import keras\n", 26 | "from keras.datasets import imdb\n", 27 | "from keras.models import Sequential\n", 28 | "from keras.layers import Dense, Dropout, Activation\n", 29 | "from keras.preprocessing.text import Tokenizer\n", 30 | "import matplotlib.pyplot as plt\n", 31 | "%matplotlib inline\n", 32 | "\n", 33 | "np.random.seed(42)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "deletable": true, 40 | "editable": true 41 | }, 
42 | "source": [ 43 | "## 1. Loading the data\n", 44 | "This dataset comes preloaded with Keras, so one simple command will get us training and testing data. There is a parameter for how many words we want to look at. We've set it at 1000, but feel free to experiment." 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 30, 50 | "metadata": { 51 | "collapsed": false, 52 | "deletable": true, 53 | "editable": true 54 | }, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "(25000,)\n", 61 | "(25000,)\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "# Loading the data (it's preloaded in Keras)\n", 67 | "(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=1000)\n", 68 | "\n", 69 | "print(x_train.shape)\n", 70 | "print(x_test.shape)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": { 76 | "deletable": true, 77 | "editable": true 78 | }, 79 | "source": [ 80 | "## 2. Examining the data\n", 81 | "Notice that the data has been already pre-processed, where all the words have numbers, and the reviews come in as a vector with the words that the review contains. For example, if the word 'the' is the first one in our dictionary, and a review contains the word 'the', then there is a 1 in the corresponding vector.\n", 82 | "\n", 83 | "The output comes as a vector of 1's and 0's, where 1 is a positive sentiment for the review, and 0 is negative." 
84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 31, 89 | "metadata": { 90 | "collapsed": false, 91 | "deletable": true, 92 | "editable": true 93 | }, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "[1, 14, 22, 16, 43, 530, 973, 2, 2, 65, 458, 2, 66, 2, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 2, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2, 19, 14, 22, 4, 2, 2, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 2, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2, 2, 16, 480, 66, 2, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 2, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 2, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 2, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 2, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]\n", 100 | "1\n" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "print(x_train[0])\n", 106 | "print(y_train[0])" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": { 112 | "deletable": true, 113 | "editable": true 114 | }, 115 | "source": [ 116 | "## 3. One-hot encoding the output\n", 117 | "Here, we'll turn the input vectors into (0,1)-vectors. For example, if the pre-processed vector contains the number 14, then in the processed vector, the 14th entry will be 1." 
118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 32, 123 | "metadata": { 124 | "collapsed": false, 125 | "deletable": true, 126 | "editable": true 127 | }, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "(25000, 1000)\n", 134 | "(25000, 1000)\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "# Turning the input into vector mode, each of length 1000\n", 140 | "tokenizer = Tokenizer(num_words=1000)\n", 141 | "x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')\n", 142 | "x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')\n", 143 | "print(x_train.shape)\n", 144 | "print(x_test.shape)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": { 150 | "deletable": true, 151 | "editable": true 152 | }, 153 | "source": [ 154 | "And we'll one-hot encode the output." 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 33, 160 | "metadata": { 161 | "collapsed": false, 162 | "deletable": true, 163 | "editable": true 164 | }, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "(25000, 2)\n", 171 | "(25000, 2)\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "# One-hot encoding the output\n", 177 | "num_classes = 2\n", 178 | "y_train = keras.utils.to_categorical(y_train, num_classes)\n", 179 | "y_test = keras.utils.to_categorical(y_test, num_classes)\n", 180 | "print(y_train.shape)\n", 181 | "print(y_test.shape)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": { 187 | "deletable": true, 188 | "editable": true 189 | }, 190 | "source": [ 191 | "## 4. Building the model architecture\n", 192 | "Build a model here using sequential. Feel free to experiment with different layers and sizes! Also, experiment adding dropout to reduce overfitting."
193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 34, 198 | "metadata": { 199 | "collapsed": false, 200 | "deletable": true, 201 | "editable": true 202 | }, 203 | "outputs": [ 204 | { 205 | "name": "stdout", 206 | "output_type": "stream", 207 | "text": [ 208 | "_________________________________________________________________\n", 209 | "Layer (type) Output Shape Param # \n", 210 | "=================================================================\n", 211 | "dense_3 (Dense) (None, 512) 512512 \n", 212 | "_________________________________________________________________\n", 213 | "dropout_2 (Dropout) (None, 512) 0 \n", 214 | "_________________________________________________________________\n", 215 | "dense_4 (Dense) (None, 2) 1026 \n", 216 | "=================================================================\n", 217 | "Total params: 513,538.0\n", 218 | "Trainable params: 513,538.0\n", 219 | "Non-trainable params: 0.0\n", 220 | "_________________________________________________________________\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "# Building the model architecture with one hidden layer of 512 units\n", 226 | "model = Sequential()\n", 227 | "model.add(Dense(512, activation='relu', input_dim=1000))\n", 228 | "model.add(Dropout(0.5))\n", 229 | "model.add(Dense(num_classes, activation='softmax'))\n", 230 | "model.summary()\n", 231 | "\n", 232 | "# Compiling the model using categorical_crossentropy loss, and rmsprop optimizer.\n", 233 | "model.compile(loss='categorical_crossentropy',\n", 234 | " optimizer='rmsprop',\n", 235 | " metrics=['accuracy'])" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": { 241 | "deletable": true, 242 | "editable": true 243 | }, 244 | "source": [ 245 | "## 5. Training the model\n", 246 | "Run the model here. Experiment with different batch_size, and number of epochs!"
247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 35, 252 | "metadata": { 253 | "collapsed": false, 254 | "deletable": true, 255 | "editable": true 256 | }, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "Train on 25000 samples, validate on 25000 samples\n", 263 | "Epoch 1/10\n", 264 | "9s - loss: 0.3969 - acc: 0.8260 - val_loss: 0.3429 - val_acc: 0.8568\n", 265 | "Epoch 2/10\n", 266 | "9s - loss: 0.3339 - acc: 0.8670 - val_loss: 0.3413 - val_acc: 0.8632\n", 267 | "Epoch 3/10\n", 268 | "9s - loss: 0.3219 - acc: 0.8778 - val_loss: 0.3552 - val_acc: 0.8614\n", 269 | "Epoch 4/10\n", 270 | "9s - loss: 0.3110 - acc: 0.8853 - val_loss: 0.3718 - val_acc: 0.8602\n", 271 | "Epoch 5/10\n", 272 | "9s - loss: 0.3056 - acc: 0.8920 - val_loss: 0.4086 - val_acc: 0.8542\n", 273 | "Epoch 6/10\n", 274 | "10s - loss: 0.2951 - acc: 0.8983 - val_loss: 0.3938 - val_acc: 0.8608\n", 275 | "Epoch 7/10\n", 276 | "9s - loss: 0.2864 - acc: 0.9037 - val_loss: 0.4258 - val_acc: 0.8566\n", 277 | "Epoch 8/10\n", 278 | "9s - loss: 0.2738 - acc: 0.9100 - val_loss: 0.4733 - val_acc: 0.8509\n", 279 | "Epoch 9/10\n", 280 | "8s - loss: 0.2622 - acc: 0.9162 - val_loss: 0.4658 - val_acc: 0.8536\n", 281 | "Epoch 10/10\n", 282 | "12s - loss: 0.2520 - acc: 0.9216 - val_loss: 0.4877 - val_acc: 0.8583\n" 283 | ] 284 | } 285 | ], 286 | "source": [ 287 | "# Running and evaluating the model\n", 288 | "hist = model.fit(x_train, y_train,\n", 289 | " batch_size=32,\n", 290 | " epochs=10,\n", 291 | " validation_data=(x_test, y_test), \n", 292 | " verbose=2)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": { 298 | "deletable": true, 299 | "editable": true 300 | }, 301 | "source": [ 302 | "## 6. Evaluating the model\n", 303 | "This will give you the accuracy of the model. Can you get something over 85%?" 
304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 36, 309 | "metadata": { 310 | "collapsed": false, 311 | "deletable": true, 312 | "editable": true 313 | }, 314 | "outputs": [ 315 | { 316 | "name": "stdout", 317 | "output_type": "stream", 318 | "text": [ 319 | "accuracy: 0.85828\n" 320 | ] 321 | } 322 | ], 323 | "source": [ 324 | "score = model.evaluate(x_test, y_test, verbose=0)\n", 325 | "print(\"accuracy: \", score[1])" 326 | ] 327 | } 328 | ], 329 | "metadata": { 330 | "kernelspec": { 331 | "display_name": "Python 3", 332 | "language": "python", 333 | "name": "python3" 334 | }, 335 | "language_info": { 336 | "codemirror_mode": { 337 | "name": "ipython", 338 | "version": 3 339 | }, 340 | "file_extension": ".py", 341 | "mimetype": "text/x-python", 342 | "name": "python", 343 | "nbconvert_exporter": "python", 344 | "pygments_lexer": "ipython3", 345 | "version": "3.5.2" 346 | } 347 | }, 348 | "nbformat": 4, 349 | "nbformat_minor": 2 350 | } 351 | -------------------------------------------------------------------------------- /projects/practice_projects/imdb/README.md: -------------------------------------------------------------------------------- 1 | # imdb practice project -------------------------------------------------------------------------------- /projects/practice_projects/imdb/requirements/aind-dl-mac-linux.yml: -------------------------------------------------------------------------------- 1 | name: aind-dl 2 | channels: 3 | - damianavila82 4 | - defaults 5 | dependencies: 6 | - rise=4.0.0b1=py35_0 7 | - _license=1.1=py35_1 8 | - alabaster=0.7.10=py35_0 9 | - anaconda-client=1.6.2=py35_0 10 | - anaconda=custom=py35_0 11 | - anaconda-navigator=1.5.0=py35_0 12 | - anaconda-project=0.4.1=py35_0 13 | - appnope=0.1.0=py35_0 14 | - appscript=1.0.1=py35_0 15 | - astroid=1.4.9=py35_0 16 | - astropy=1.3=np112py35_0 17 | - babel=2.3.4=py35_0 18 | - backports=1.0=py35_0 19 | - beautifulsoup4=4.5.3=py35_0 20 | - bitarray=0.8.1=py35_0 21 
| - blaze=0.10.1=py35_0 22 | - bleach=1.5.0=py35_0 23 | - bokeh=0.12.4=py35_0 24 | - boto=2.46.1=py35_0 25 | - bottleneck=1.2.0=np112py35_0 26 | - cffi=1.9.1=py35_0 27 | - chardet=2.3.0=py35_0 28 | - chest=0.2.3=py35_0 29 | - click=6.7=py35_0 30 | - cloudpickle=0.2.2=py35_0 31 | - clyent=1.2.2=py35_0 32 | - colorama=0.3.7=py35_0 33 | - configobj=5.0.6=py35_0 34 | - contextlib2=0.5.4=py35_0 35 | - cryptography=1.7.1=py35_0 36 | - curl=7.52.1=0 37 | - cycler=0.10.0=py35_0 38 | - cython=0.25.2=py35_0 39 | - cytoolz=0.8.2=py35_0 40 | - dask=0.14.0=py35_0 41 | - datashape=0.5.4=py35_0 42 | - decorator=4.0.11=py35_0 43 | - dill=0.2.5=py35_0 44 | - docutils=0.13.1=py35_0 45 | - entrypoints=0.2.2=py35_1 46 | - et_xmlfile=1.0.1=py35_0 47 | - fastcache=1.0.2=py35_1 48 | - flask=0.12=py35_0 49 | - flask-cors=3.0.2=py35_0 50 | - freetype=2.5.5=2 51 | - get_terminal_size=1.0.0=py35_0 52 | - gevent=1.2.1=py35_0 53 | - greenlet=0.4.12=py35_0 54 | - h5py=2.6.0=np112py35_2 55 | - hdf5=1.8.17=1 56 | - heapdict=1.0.0=py35_1 57 | - html5lib=0.999=py35_0 58 | - icu=54.1=0 59 | - idna=2.2=py35_0 60 | - imagesize=0.7.1=py35_0 61 | - ipykernel=4.5.2=py35_0 62 | - ipython=5.3.0=py35_0 63 | - ipython_genutils=0.1.0=py35_0 64 | - ipywidgets=6.0.0=py35_0 65 | - isort=4.2.5=py35_0 66 | - itsdangerous=0.24=py35_0 67 | - jbig=2.1=0 68 | - jdcal=1.3=py35_0 69 | - jedi=0.9.0=py35_1 70 | - jinja2=2.9.5=py35_0 71 | - jpeg=9b=0 72 | - jsonschema=2.5.1=py35_0 73 | - jupyter=1.0.0=py35_3 74 | - jupyter_client=5.0.0=py35_0 75 | - jupyter_console=5.1.0=py35_0 76 | - jupyter_core=4.3.0=py35_0 77 | - lazy-object-proxy=1.2.2=py35_0 78 | - libiconv=1.14=0 79 | - libpng=1.6.27=0 80 | - libtiff=4.0.6=3 81 | - libxml2=2.9.4=0 82 | - libxslt=1.1.29=0 83 | - llvmlite=0.16.0=py35_0 84 | - locket=0.2.0=py35_1 85 | - lxml=3.7.3=py35_0 86 | - markupsafe=0.23=py35_2 87 | - matplotlib=2.0.0=np112py35_0 88 | - mistune=0.7.4=py35_0 89 | - mkl=2017.0.1=0 90 | - mkl-service=1.1.2=py35_3 91 | - mpmath=0.19=py35_1 92 | - 
multipledispatch=0.4.9=py35_0 93 | - nbconvert=5.1.1=py35_0 94 | - nbformat=4.3.0=py35_0 95 | - networkx=1.11=py35_0 96 | - nltk=3.2.2=py35_0 97 | - nose=1.3.7=py35_1 98 | - notebook=4.4.1=py35_0 99 | - numba=0.31.0=np112py35_0 100 | - numexpr=2.6.2=np112py35_0 101 | - numpy=1.12.0=py35_0 102 | - numpydoc=0.6.0=py35_0 103 | - odo=0.5.0=py35_1 104 | - olefile=0.44=py35_0 105 | - openpyxl=2.4.1=py35_0 106 | - openssl=1.0.2k=0 107 | - pandas=0.19.2=np112py35_1 108 | - pandocfilters=1.4.1=py35_0 109 | - partd=0.3.7=py35_0 110 | - path.py=10.1=py35_0 111 | - pathlib2=2.2.0=py35_0 112 | - patsy=0.4.1=py35_0 113 | - pep8=1.7.0=py35_0 114 | - pexpect=4.2.1=py35_0 115 | - pickleshare=0.7.4=py35_0 116 | - pillow=4.0.0=py35_1 117 | - pip=9.0.1=py35_1 118 | - ply=3.10=py35_0 119 | - prompt_toolkit=1.0.13=py35_0 120 | - psutil=5.2.0=py35_0 121 | - ptyprocess=0.5.1=py35_0 122 | - py=1.4.32=py35_0 123 | - pyasn1=0.2.3=py35_0 124 | - pycosat=0.6.1=py35_1 125 | - pycparser=2.17=py35_0 126 | - pycrypto=2.6.1=py35_4 127 | - pycurl=7.43.0=py35_2 128 | - pyflakes=1.5.0=py35_0 129 | - pygments=2.2.0=py35_0 130 | - pylint=1.6.4=py35_1 131 | - pyopenssl=16.2.0=py35_0 132 | - pyparsing=2.1.4=py35_0 133 | - pyqt=5.6.0=py35_2 134 | - pytables=3.3.0=np112py35_0 135 | - pytest=3.0.6=py35_0 136 | - python=3.5.3=1 137 | - python-dateutil=2.6.0=py35_0 138 | - python.app=1.2=py35_4 139 | - pytz=2016.10=py35_0 140 | - pyyaml=3.12=py35_0 141 | - pyzmq=16.0.2=py35_0 142 | - qt=5.6.2=0 143 | - qtawesome=0.4.4=py35_0 144 | - qtconsole=4.2.1=py35_1 145 | - qtpy=1.2.1=py35_0 146 | - readline=6.2=2 147 | - redis=3.2.0=0 148 | - redis-py=2.10.5=py35_0 149 | - requests=2.13.0=py35_0 150 | - rope=0.9.4=py35_1 151 | - ruamel_yaml=0.11.14=py35_1 152 | - scikit-image=0.12.3=np112py35_1 153 | - scikit-learn=0.18.1=np112py35_1 154 | - scipy=0.19.0=np112py35_0 155 | - seaborn=0.7.1=py35_0 156 | - setuptools=27.2.0=py35_0 157 | - simplegeneric=0.8.1=py35_1 158 | - singledispatch=3.4.0.3=py35_0 159 | - 
sip=4.18=py35_0 160 | - six=1.10.0=py35_0 161 | - snowballstemmer=1.2.1=py35_0 162 | - sockjs-tornado=1.0.3=py35_0 163 | - sphinx=1.5.1=py35_0 164 | - spyder=3.1.3=py35_0 165 | - sqlalchemy=1.1.6=py35_0 166 | - sqlite=3.13.0=0 167 | - statsmodels=0.8.0=np112py35_0 168 | - sympy=1.0=py35_0 169 | - terminado=0.6=py35_0 170 | - testpath=0.3=py35_0 171 | - tk=8.5.18=0 172 | - toolz=0.8.2=py35_0 173 | - tornado=4.4.2=py35_0 174 | - traitlets=4.3.2=py35_0 175 | - unicodecsv=0.14.1=py35_0 176 | - wcwidth=0.1.7=py35_0 177 | - werkzeug=0.12=py35_0 178 | - wheel=0.29.0=py35_0 179 | - widgetsnbextension=2.0.0=py35_0 180 | - wrapt=1.10.8=py35_0 181 | - xlrd=1.0.0=py35_0 182 | - xlsxwriter=0.9.6=py35_0 183 | - xlwings=0.10.2=py35_0 184 | - xlwt=1.2.0=py35_0 185 | - xz=5.2.2=1 186 | - yaml=0.1.6=0 187 | - zlib=1.2.8=3 188 | - pip: 189 | - backports.shutil-get-terminal-size==1.0.0 190 | - cvxopt==1.1.9 191 | - et-xmlfile==1.0.1 192 | - ipython-genutils==0.1.0 193 | - jupyter-client==5.0.0 194 | - jupyter-console==5.1.0 195 | - jupyter-core==4.3.0 196 | - keras==2.0.0 197 | - opencv-python==3.2.0.6 198 | - prompt-toolkit==1.0.13 199 | - protobuf==3.2.0 200 | - rope-py3k==0.9.4.post1 201 | - tables==3.3.0 202 | - tensorflow==1.0.0 203 | - theano==0.8.2 204 | - tqdm==4.11.2 205 | -------------------------------------------------------------------------------- /projects/practice_projects/imdb/requirements/aind-dl-windows.yml: -------------------------------------------------------------------------------- 1 | name: aind-dl 2 | channels: 3 | - defaults 4 | dependencies: 5 | - _nb_ext_conf=0.3.0=py35_0 6 | - anaconda-client=1.6.2=py35_0 7 | - bleach=1.5.0=py35_0 8 | - bzip2=1.0.6=vc14_3 9 | - clyent=1.2.2=py35_0 10 | - colorama=0.3.7=py35_0 11 | - cycler=0.10.0=py35_0 12 | - decorator=4.0.11=py35_0 13 | - entrypoints=0.2.2=py35_1 14 | - freetype=2.5.5=vc14_2 15 | - h5py=2.7.0=np112py35_0 16 | - hdf5=1.8.15.1=vc14_4 17 | - html5lib=0.999=py35_0 18 | - icu=57.1=vc14_0 19 | - 
ipykernel=4.5.2=py35_0 20 | - ipython=5.3.0=py35_0 21 | - ipython_genutils=0.1.0=py35_0 22 | - ipywidgets=6.0.0=py35_0 23 | - jinja2=2.9.5=py35_0 24 | - jpeg=9b=vc14_0 25 | - jsonschema=2.5.1=py35_0 26 | - jupyter=1.0.0=py35_3 27 | - jupyter_client=5.0.0=py35_0 28 | - jupyter_console=5.1.0=py35_0 29 | - jupyter_core=4.3.0=py35_0 30 | - libpng=1.6.27=vc14_0 31 | - libtiff=4.0.6=vc14_3 32 | - markupsafe=0.23=py35_2 33 | - matplotlib=2.0.0=np112py35_0 34 | - mistune=0.7.4=py35_0 35 | - mkl=2017.0.1=0 36 | - nb_anacondacloud=1.2.0=py35_0 37 | - nb_conda=2.0.0=py35_0 38 | - nb_conda_kernels=2.0.0=py35_0 39 | - nbconvert=5.1.1=py35_0 40 | - nbformat=4.3.0=py35_0 41 | - nbpresent=3.0.2=py35_0 42 | - notebook=4.4.1=py35_0 43 | - numpy=1.12.1=py35_0 44 | - olefile=0.44=py35_0 45 | - openssl=1.0.2k=vc14_0 46 | - pandocfilters=1.4.1=py35_0 47 | - path.py=10.1=py35_0 48 | - pickleshare=0.7.4=py35_0 49 | - pillow=4.0.0=py35_1 50 | - pip=9.0.1=py35_1 51 | - prompt_toolkit=1.0.13=py35_0 52 | - pygments=2.2.0=py35_0 53 | - pyparsing=2.1.4=py35_0 54 | - pyqt=5.6.0=py35_2 55 | - python=3.5.3=0 56 | - python-dateutil=2.6.0=py35_0 57 | - pytz=2016.10=py35_0 58 | - pyyaml=3.12=py35_0 59 | - pyzmq=16.0.2=py35_0 60 | - qt=5.6.2=vc14_3 61 | - qtconsole=4.2.1=py35_2 62 | - requests=2.13.0=py35_0 63 | - scikit-learn=0.18.1=np112py35_1 64 | - scipy=0.19.0=np112py35_0 65 | - setuptools=27.2.0=py35_1 66 | - simplegeneric=0.8.1=py35_1 67 | - sip=4.18=py35_0 68 | - six=1.10.0=py35_0 69 | - testpath=0.3=py35_0 70 | - tk=8.5.18=vc14_0 71 | - tornado=4.4.2=py35_0 72 | - traitlets=4.3.2=py35_0 73 | - vs2015_runtime=14.0.25123=0 74 | - wcwidth=0.1.7=py35_0 75 | - wheel=0.29.0=py35_0 76 | - widgetsnbextension=2.0.0=py35_0 77 | - win_unicode_console=0.5=py35_0 78 | - zlib=1.2.8=vc14_3 79 | - pip: 80 | - ipython-genutils==0.1.0 81 | - jupyter-client==5.0.0 82 | - jupyter-console==5.1.0 83 | - jupyter-core==4.3.0 84 | - keras==2.0.2 85 | - nb-anacondacloud==1.2.0 86 | - nb-conda==2.0.0 87 | - 
nb-conda-kernels==2.0.0 88 | - opencv-python==3.1.0.0 89 | - prompt-toolkit==1.0.13 90 | - protobuf==3.2.0 91 | - tensorflow==1.0.1 92 | - theano==0.9.0 93 | - tqdm==4.11.2 94 | - win-unicode-console==0.5 95 | 96 | 97 | -------------------------------------------------------------------------------- /projects/practice_projects/imdb/requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python==3.2.0.6 2 | h5py==2.6.0 3 | matplotlib==2.0.0 4 | numpy==1.12.0 5 | scipy==0.18.1 6 | tqdm==4.11.2 7 | keras==2.0.2 8 | scikit-learn==0.18.1 9 | pillow==4.0.0 10 | tensorflow==1.0.0 11 | pandas==0.19.2 12 | -------------------------------------------------------------------------------- /projects/practice_projects/imdb/student_data.csv: -------------------------------------------------------------------------------- 1 | admit,gre,gpa,rank 2 | 0,380,3.61,3 3 | 1,660,3.67,3 4 | 1,800,4,1 5 | 1,640,3.19,4 6 | 0,520,2.93,4 7 | 1,760,3,2 8 | 1,560,2.98,1 9 | 0,400,3.08,2 10 | 1,540,3.39,3 11 | 0,700,3.92,2 12 | 0,800,4,4 13 | 0,440,3.22,1 14 | 1,760,4,1 15 | 0,700,3.08,2 16 | 1,700,4,1 17 | 0,480,3.44,3 18 | 0,780,3.87,4 19 | 0,360,2.56,3 20 | 0,800,3.75,2 21 | 1,540,3.81,1 22 | 0,500,3.17,3 23 | 1,660,3.63,2 24 | 0,600,2.82,4 25 | 0,680,3.19,4 26 | 1,760,3.35,2 27 | 1,800,3.66,1 28 | 1,620,3.61,1 29 | 1,520,3.74,4 30 | 1,780,3.22,2 31 | 0,520,3.29,1 32 | 0,540,3.78,4 33 | 0,760,3.35,3 34 | 0,600,3.4,3 35 | 1,800,4,3 36 | 0,360,3.14,1 37 | 0,400,3.05,2 38 | 0,580,3.25,1 39 | 0,520,2.9,3 40 | 1,500,3.13,2 41 | 1,520,2.68,3 42 | 0,560,2.42,2 43 | 1,580,3.32,2 44 | 1,600,3.15,2 45 | 0,500,3.31,3 46 | 0,700,2.94,2 47 | 1,460,3.45,3 48 | 1,580,3.46,2 49 | 0,500,2.97,4 50 | 0,440,2.48,4 51 | 0,400,3.35,3 52 | 0,640,3.86,3 53 | 0,440,3.13,4 54 | 0,740,3.37,4 55 | 1,680,3.27,2 56 | 0,660,3.34,3 57 | 1,740,4,3 58 | 0,560,3.19,3 59 | 0,380,2.94,3 60 | 0,400,3.65,2 61 | 0,600,2.82,4 62 | 1,620,3.18,2 63 | 0,560,3.32,4 64 | 
0,640,3.67,3 65 | 1,680,3.85,3 66 | 0,580,4,3 67 | 0,600,3.59,2 68 | 0,740,3.62,4 69 | 0,620,3.3,1 70 | 0,580,3.69,1 71 | 0,800,3.73,1 72 | 0,640,4,3 73 | 0,300,2.92,4 74 | 0,480,3.39,4 75 | 0,580,4,2 76 | 0,720,3.45,4 77 | 0,720,4,3 78 | 0,560,3.36,3 79 | 1,800,4,3 80 | 0,540,3.12,1 81 | 1,620,4,1 82 | 0,700,2.9,4 83 | 0,620,3.07,2 84 | 0,500,2.71,2 85 | 0,380,2.91,4 86 | 1,500,3.6,3 87 | 0,520,2.98,2 88 | 0,600,3.32,2 89 | 0,600,3.48,2 90 | 0,700,3.28,1 91 | 1,660,4,2 92 | 0,700,3.83,2 93 | 1,720,3.64,1 94 | 0,800,3.9,2 95 | 0,580,2.93,2 96 | 1,660,3.44,2 97 | 0,660,3.33,2 98 | 0,640,3.52,4 99 | 0,480,3.57,2 100 | 0,700,2.88,2 101 | 0,400,3.31,3 102 | 0,340,3.15,3 103 | 0,580,3.57,3 104 | 0,380,3.33,4 105 | 0,540,3.94,3 106 | 1,660,3.95,2 107 | 1,740,2.97,2 108 | 1,700,3.56,1 109 | 0,480,3.13,2 110 | 0,400,2.93,3 111 | 0,480,3.45,2 112 | 0,680,3.08,4 113 | 0,420,3.41,4 114 | 0,360,3,3 115 | 0,600,3.22,1 116 | 0,720,3.84,3 117 | 0,620,3.99,3 118 | 1,440,3.45,2 119 | 0,700,3.72,2 120 | 1,800,3.7,1 121 | 0,340,2.92,3 122 | 1,520,3.74,2 123 | 1,480,2.67,2 124 | 0,520,2.85,3 125 | 0,500,2.98,3 126 | 0,720,3.88,3 127 | 0,540,3.38,4 128 | 1,600,3.54,1 129 | 0,740,3.74,4 130 | 0,540,3.19,2 131 | 0,460,3.15,4 132 | 1,620,3.17,2 133 | 0,640,2.79,2 134 | 0,580,3.4,2 135 | 0,500,3.08,3 136 | 0,560,2.95,2 137 | 0,500,3.57,3 138 | 0,560,3.33,4 139 | 0,700,4,3 140 | 0,620,3.4,2 141 | 1,600,3.58,1 142 | 0,640,3.93,2 143 | 1,700,3.52,4 144 | 0,620,3.94,4 145 | 0,580,3.4,3 146 | 0,580,3.4,4 147 | 0,380,3.43,3 148 | 0,480,3.4,2 149 | 0,560,2.71,3 150 | 1,480,2.91,1 151 | 0,740,3.31,1 152 | 1,800,3.74,1 153 | 0,400,3.38,2 154 | 1,640,3.94,2 155 | 0,580,3.46,3 156 | 0,620,3.69,3 157 | 1,580,2.86,4 158 | 0,560,2.52,2 159 | 1,480,3.58,1 160 | 0,660,3.49,2 161 | 0,700,3.82,3 162 | 0,600,3.13,2 163 | 0,640,3.5,2 164 | 1,700,3.56,2 165 | 0,520,2.73,2 166 | 0,580,3.3,2 167 | 0,700,4,1 168 | 0,440,3.24,4 169 | 0,720,3.77,3 170 | 0,500,4,3 171 | 0,600,3.62,3 172 | 0,400,3.51,3 173 | 
0,540,2.81,3 174 | 0,680,3.48,3 175 | 1,800,3.43,2 176 | 0,500,3.53,4 177 | 1,620,3.37,2 178 | 0,520,2.62,2 179 | 1,620,3.23,3 180 | 0,620,3.33,3 181 | 0,300,3.01,3 182 | 0,620,3.78,3 183 | 0,500,3.88,4 184 | 0,700,4,2 185 | 1,540,3.84,2 186 | 0,500,2.79,4 187 | 0,800,3.6,2 188 | 0,560,3.61,3 189 | 0,,,2 190 | 0,560,3.07,2 191 | 0,500,3.35,2 192 | 1,640,2.94,2 193 | 0,800,3.54,3 194 | 0,640,3.76,3 195 | 0,380,3.59,4 196 | 1,600,3.47,2 197 | 0,560,3.59,2 198 | 0,660,3.07,3 199 | 1,400,3.23,4 200 | 0,600,3.63,3 201 | 0,580,3.77,4 202 | 0,800,3.31,3 203 | 1,580,3.2,2 204 | 1,700,4,1 205 | 0,420,3.92,4 206 | 1,600,3.89,1 207 | 1,780,3.8,3 208 | 0,740,3.54,1 209 | 1,640,3.63,1 210 | 0,540,3.16,3 211 | 0,580,3.5,2 212 | 0,740,3.34,4 213 | 0,580,3.02,2 214 | 0,,2.87,2 215 | 0,640,3.38,3 216 | 1,600,3.56,2 217 | 1,660,2.91,3 218 | 0,340,2.9,1 219 | 1,460,3.64,1 220 | 0,460,2.98,1 221 | 1,560,3.59,2 222 | 0,540,3.28,3 223 | 0,680,3.99,3 224 | 1,480,3.02,1 225 | 0,800,3.47,3 226 | 0,800,2.9,2 227 | 1,720,3.5,3 228 | 0,620,3.58,2 229 | 0,540,3.02,4 230 | 0,480,3.43,2 231 | 1,720,3.42,2 232 | 0,580,3.29,4 233 | 0,600,3.28,3 234 | 0,380,3.38,2 235 | 0,420,2.67,3 236 | 1,800,3.53,1 237 | 0,620,3.05,2 238 | 1,660,, 239 | 0,480,4,2 240 | 0,500,2.86,4 241 | 0,700,3.45,3 242 | 0,440,2.76,2 243 | 1,520,3.81,1 244 | 1,680,2.96,3 245 | 0,620,3.22,2 246 | 0,540,3.04,1 247 | 0,800,3.91,3 248 | 0,680,3.34,2 249 | 0,440,3.17,2 250 | 0,680,3.64,3 251 | 0,640,3.73,3 252 | 0,660,3.31,4 253 | 0,620,3.21,4 254 | 1,520,4,2 255 | 1,540,3.55,4 256 | 1,740,3.52,4 257 | 0,640,3.35,3 258 | 1,520,3.3,2 259 | 1,620,3.95,3 260 | 0,520,3.51,2 261 | 0,640,3.81,2 262 | 0,680,3.11,2 263 | 0,440,3.15,2 264 | 1,520,3.19,3 265 | 1,620,3.95,3 266 | 1,520,3.9,3 267 | 0,380,3.34,3 268 | 0,560,3.24,4 269 | 1,600,3.64,3 270 | 1,680,3.46,2 271 | 0,500,2.81,3 272 | 1,640,3.95,2 273 | 0,540,3.33,3 274 | 1,680,3.67,2 275 | 0,660,3.32,1 276 | 0,520,3.12,2 277 | 1,600,2.98,2 278 | 0,460,3.77,3 279 | 1,580,3.58,1 280 | 
1,680,3,4 281 | 1,660,3.14,2 282 | 0,660,3.94,2 283 | 0,360,3.27,3 284 | 0,660,3.45,4 285 | 0,520,3.1,4 286 | 1,440,3.39,2 287 | 0,600,3.31,4 288 | 1,800,3.22,1 289 | 1,660,3.7,4 290 | 0,800,3.15,4 291 | 0,420,2.26,4 292 | 1,620,3.45,2 293 | 0,800,2.78,2 294 | 0,680,3.7,2 295 | 0,800,3.97,1 296 | 0,480,2.55,1 297 | 0,520,3.25,3 298 | 0,560,3.16,1 299 | 0,460,3.07,2 300 | 0,540,3.5,2 301 | 0,720,3.4,3 302 | 0,640,3.3,2 303 | 1,660,3.6,3 304 | 1,400,3.15,2 305 | 1,680,3.98,2 306 | 0,220,2.83,3 307 | 0,580,3.46,4 308 | 1,540,3.17,1 309 | 0,580,3.51,2 310 | 0,540,3.13,2 311 | 0,440,2.98,3 312 | 0,560,4,3 313 | 0,660,3.67,2 314 | 0,660,3.77,3 315 | 1,520,3.65,4 316 | 0,540,3.46,4 317 | 1,300,2.84,2 318 | 1,340,3,2 319 | 1,780,3.63,4 320 | 1,480,3.71,4 321 | 0,540,3.28,1 322 | 0,460,3.14,3 323 | 0,460,3.58,2 324 | 0,500,3.01,4 325 | 0,420,2.69,2 326 | 0,520,2.7,3 327 | 0,680,3.9,1 328 | 0,680,3.31,2 329 | 1,560,3.48,2 330 | 0,580,3.34,2 331 | 0,500,2.93,4 332 | 0,740,4,3 333 | 0,660,3.59,3 334 | 0,420,2.96,1 335 | 0,560,3.43,3 336 | 1,460,3.64,3 337 | 1,620,3.71,1 338 | 0,520,3.15,3 339 | 0,620,3.09,4 340 | 0,540,3.2,1 341 | 1,660,3.47,3 342 | 0,500,3.23,4 343 | 1,560,2.65,3 344 | 0,500,3.95,4 345 | 0,580,3.06,2 346 | 0,520,3.35,3 347 | 0,500,3.03,3 348 | 0,600,3.35,2 349 | 0,580,3.8,2 350 | 0,400,3.36,2 351 | 0,620,2.85,2 352 | 1,780,4,2 353 | 0,620,3.43,3 354 | 1,580,3.12,3 355 | 0,700,3.52,2 356 | 1,540,3.78,2 357 | 1,760,2.81,1 358 | 0,700,3.27,2 359 | 0,720,3.31,1 360 | 1,560,3.69,3 361 | 0,720,3.94,3 362 | 1,520,4,1 363 | 1,540,3.49,1 364 | 0,680,3.14,2 365 | 0,460,3.44,2 366 | 1,560,3.36,1 367 | 0,480,2.78,3 368 | 0,460,2.93,3 369 | 0,620,3.63,3 370 | 0,580,4,1 371 | 0,800,3.89,2 372 | 1,540,3.77,2 373 | 1,680,3.76,3 374 | 1,680,2.42,1 375 | 1,620,3.37,1 376 | 0,560,3.78,2 377 | 0,560,3.49,4 378 | 0,620,3.63,2 379 | 1,800,4,2 380 | 0,640,3.12,3 381 | 0,540,2.7,2 382 | 0,700,3.65,2 383 | 1,540,3.49,2 384 | 0,540,3.51,2 385 | 0,660,4,1 386 | 1,480,2.62,2 387 | 
0,420,3.02,1 388 | 1,740,3.86,2 389 | 0,580,3.36,2 390 | 0,640,3.17,2 391 | 0,640,3.51,2 392 | 1,800,3.05,2 393 | 1,660,3.88,2 394 | 1,600,3.38,3 395 | 1,620,3.75,2 396 | 1,460,3.99,3 397 | 0,620,4,2 398 | 0,560,3.04,3 399 | 0,460,2.63,2 400 | 0,700,3.65,2 401 | 0,600,3.89,3 -------------------------------------------------------------------------------- /projects/practice_projects/naive_bayes_tutorial/ReadMe.md: -------------------------------------------------------------------------------- 1 | Naive Bayes tutorial from scratch. Can be viewed in the iPython notebook. Happy learning! -------------------------------------------------------------------------------- /projects/practice_projects/naive_bayes_tutorial/images/bayes_formula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/naive_bayes_tutorial/images/bayes_formula.png -------------------------------------------------------------------------------- /projects/practice_projects/naive_bayes_tutorial/images/countvectorizer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/naive_bayes_tutorial/images/countvectorizer.png -------------------------------------------------------------------------------- /projects/practice_projects/naive_bayes_tutorial/images/dqnb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/naive_bayes_tutorial/images/dqnb.png -------------------------------------------------------------------------------- /projects/practice_projects/naive_bayes_tutorial/images/naivebayes.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/naive_bayes_tutorial/images/naivebayes.png -------------------------------------------------------------------------------- /projects/practice_projects/naive_bayes_tutorial/images/tfidf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/naive_bayes_tutorial/images/tfidf.png -------------------------------------------------------------------------------- /projects/practice_projects/naive_bayes_tutorial/smsspamcollection/readme: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/practice_projects/naive_bayes_tutorial/smsspamcollection/readme -------------------------------------------------------------------------------- /projects/smartcab/README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Engineer Nanodegree 2 | # Reinforcement Learning 3 | ## Project: Train a Smartcab How to Drive 4 | 5 | ### Install 6 | 7 | This project requires **Python 2.7** with the [pygame](https://www.pygame.org/wiki/GettingStarted 8 | ) library installed 9 | 10 | ### Code 11 | 12 | Template code is provided in the `smartcab/agent.py` python file. Additional supporting python code can be found in `smartcab/environment.py`, `smartcab/planner.py`, and `smartcab/simulator.py`. Supporting images for the graphical user interface can be found in the `images` folder.
While some code has already been implemented to get you started, you will need to implement additional functionality for the `LearningAgent` class in `agent.py` when requested to successfully complete the project. 13 | 14 | ### Run 15 | 16 | In a terminal or command window, navigate to the top-level project directory `smartcab/` (that contains this README) and run one of the following commands: 17 | 18 | ```python smartcab/agent.py``` 19 | ```python -m smartcab.agent``` 20 | 21 | This will run the `agent.py` file and execute your agent code. 22 | -------------------------------------------------------------------------------- /projects/smartcab/project_description.md: -------------------------------------------------------------------------------- 1 | # Content: Reinforcement Learning 2 | ## Project: Train a Smartcab How to Drive 3 | 4 | ## Project Overview 5 | 6 | In this project you will apply reinforcement learning techniques for a self-driving agent in a simplified world to aid it in effectively reaching its destinations in the allotted time. You will first investigate the environment the agent operates in by constructing a very basic driving implementation. Once your agent is successful at operating within the environment, you will then identify each possible state the agent can be in when considering such things as traffic lights and oncoming traffic at each intersection. With states identified, you will then implement a Q-Learning algorithm for the self-driving agent to guide the agent towards its destination within the allotted time. Finally, you will improve upon the Q-Learning algorithm to find the best configuration of learning and exploration factors to ensure the self-driving agent is reaching its destinations with consistently positive results. 7 | 8 | ## Description 9 | In the not-so-distant future, taxicab companies across the United States no longer employ human drivers to operate their fleet of vehicles. 
Instead, the taxicabs are operated by self-driving agents, known as *smartcabs*, to transport people from one location to another within the cities those companies operate. In major metropolitan areas, such as Chicago, New York City, and San Francisco, an increasing number of people have come to depend on *smartcabs* to get to where they need to go as safely and reliably as possible. Although *smartcabs* have become the transport of choice, concerns have arisen that a self-driving agent might not be as safe or reliable as human drivers, particularly when considering city traffic lights and other vehicles. To alleviate these concerns, your task as an employee for a national taxicab company is to use reinforcement learning techniques to construct a demonstration of a *smartcab* operating in real-time to prove that both safety and reliability can be achieved. 10 | 11 | ## Software Requirements 12 | This project uses the following software and Python libraries: 13 | 14 | - [Python 2.7](https://www.python.org/download/releases/2.7/) 15 | - [NumPy](http://www.numpy.org/) 16 | - [pandas](http://pandas.pydata.org/) 17 | - [matplotlib](http://matplotlib.org/) 18 | - [PyGame](http://pygame.org/) 19 | 20 | If you do not have Python installed yet, it is highly recommended that you install the [Anaconda](http://continuum.io/downloads) distribution of Python, which already has the above packages and more included. Make sure that you select the Python 2.7 installer and not the Python 3.x installer. `pygame` can then be installed using one of the following commands: 21 | 22 | Mac: `conda install -c https://conda.anaconda.org/quasiben pygame` 23 | Windows: `conda install -c https://conda.anaconda.org/prkrekel pygame` 24 | Linux: `conda install -c https://conda.anaconda.org/tlatorre pygame` 25 | 26 | ## Fixing Common PyGame Problems 27 | 28 | The PyGame library can in some cases require a bit of troubleshooting to work correctly for this project.
While the PyGame aspect of the project is not required for a successful submission (you can complete the project without a visual simulation, although it is more difficult), it is very helpful to have it working! If you encounter an issue with PyGame, first see these helpful links below that are developed by communities of users working with the library: 29 | - [Getting Started](https://www.pygame.org/wiki/GettingStarted) 30 | - [PyGame Information](http://www.pygame.org/wiki/info) 31 | - [Google Group](https://groups.google.com/forum/#!forum/pygame-mirror-on-google-groups) 32 | - [PyGame subreddit](https://www.reddit.com/r/pygame/) 33 | 34 | ### Problems most often reported by students 35 | _"PyGame won't install on my machine; there was an issue with the installation."_ 36 | **Solution:** As has been recommended for previous projects, Udacity suggests that you use the Anaconda distribution of Python, which can then allow you to install PyGame through the `conda`-specific command. 37 | 38 | _"I'm seeing a black screen when running the code; output says that it can't load car images."_ 39 | **Solution:** The code will not operate correctly unless it is run from the top-level directory for `smartcab`. The top-level directory is the one that contains the **README** and the project notebook. 40 | 41 | If you continue to have problems with the project code with regard to PyGame, you can also [use the discussion forums](https://discussions.udacity.com/c/nd009-reinforcement-learning) to find posts from students that encountered issues that you may be experiencing. Additionally, you can seek help from a swath of students in the [MLND Student Slack Community](http://mlnd.slack.com). 42 | 43 | ## Starting the Project 44 | 45 | For this assignment, you can find the `smartcab` folder containing the necessary project files on the [Machine Learning projects GitHub](https://github.com/udacity/machine-learning), under the `projects` folder.
You may download all of the files for projects we'll use in this Nanodegree program directly from this repo. Please make sure that you use the most recent version of project files when completing a project! 46 | 47 | This project contains three directories: 48 | 49 | - `/logs/`: This folder will contain all log files that are given from the simulation when specific prerequisites are met. 50 | - `/images/`: This folder contains various images of cars to be used in the graphical user interface. You will not need to modify or create any files in this directory. 51 | - `/smartcab/`: This folder contains the Python scripts that create the environment, graphical user interface, the simulation, and the agents. You will not need to modify or create any files in this directory except for `agent.py`. 52 | 53 | It also contains two files: 54 | - `smartcab.ipynb`: This is the main file where you will answer questions and provide an analysis for your work. 55 | - `visuals.py`: This Python script provides supplementary visualizations for the analysis. Do not modify. 56 | 57 | Finally, in `/smartcab/` are the following four files: 58 | - **Modify:** 59 | - `agent.py`: This is the main Python file where you will be performing your work on the project. 60 | - **Do not modify:** 61 | - `environment.py`: This Python file will create the *smartcab* environment. 62 | - `planner.py`: This Python file creates a high-level planner for the agent to follow towards a set goal. 63 | - `simulator.py`: This Python file creates the simulation and graphical user interface. 64 | 65 | ### Running the Code 66 | In a terminal or command window, navigate to the top-level project directory `smartcab/` (that contains the two project directories) and run one of the following commands: 67 | 68 | `python smartcab/agent.py` or 69 | `python -m smartcab.agent` 70 | 71 | This will run the `agent.py` file and execute your implemented agent code into the environment.
Additionally, use the command `jupyter notebook smartcab.ipynb` from this same directory to open up a browser window or tab to work with your analysis notebook. Alternatively, you can use the command `jupyter notebook` or `ipython notebook` and navigate to the notebook file in the browser window that opens. Follow the instructions in the notebook and answer each question presented to successfully complete the implementation necessary for your `agent.py` agent file. A **README** file has also been provided with the project files which may contain additional necessary information or instruction for the project. 72 | 73 | ## Definitions 74 | 75 | ### Environment 76 | The *smartcab* operates in an ideal, grid-like city (similar to New York City), with roads going in the North-South and East-West directions. Other vehicles will certainly be present on the road, but there will be no pedestrians to be concerned with. At each intersection there is a traffic light that either allows traffic in the North-South direction or the East-West direction. U.S. Right-of-Way rules apply: 77 | - On a green light, a left turn is permitted if there is no oncoming traffic making a right turn or coming straight through the intersection. 78 | - On a red light, a right turn is permitted if no oncoming traffic is approaching from your left through the intersection. 79 | To understand how to correctly yield to oncoming traffic when turning left, you may refer to [this official drivers' education video](https://www.youtube.com/watch?v=TW0Eq2Q-9Ac), or [this passionate exposition](https://www.youtube.com/watch?v=0EdkxI6NeuA). 80 | 81 | ### Inputs and Outputs 82 | Assume that the *smartcab* is assigned a route plan based on the passengers' starting location and destination. The route is split at each intersection into waypoints, and you may assume that the *smartcab*, at any instant, is at some intersection in the world.
Therefore, the next waypoint to the destination, assuming the destination has not already been reached, is one intersection away in one direction (North, South, East, or West). The *smartcab* has only an egocentric view of the intersection it is at: It can determine the state of the traffic light for its direction of movement, and whether there is a vehicle at the intersection for each of the oncoming directions. For each action, the *smartcab* may either idle at the intersection, or drive to the next intersection to the left, right, or ahead of it. Finally, each trip has a time to reach the destination which decreases for each action taken (the passengers want to get there quickly). If the allotted time becomes zero before reaching the destination, the trip has failed. 83 | 84 | ### Rewards and Goal 85 | The *smartcab* will receive positive or negative rewards based on the action it has taken. Expectedly, the *smartcab* will receive a small positive reward when making a good action, and a varying amount of negative reward dependent on the severity of the traffic violation it would have committed. Based on the rewards and penalties the *smartcab* receives, the self-driving agent implementation should learn an optimal policy for driving on the city roads while obeying traffic rules, avoiding accidents, and reaching passengers' destinations in the allotted time. 86 | 87 | ## Submitting the Project 88 | 89 | ### Evaluation 90 | Your project will be reviewed by a Udacity reviewer against the **Train a Smartcab to Drive project rubric**. Be sure to review this rubric thoroughly and self-evaluate your project before submission. All criteria found in the rubric must be *meeting specifications* for you to pass. 91 | 92 | ### Submission Files 93 | When you are ready to submit your project, collect the following files and compress them into a single archive for upload.
Alternatively, you may supply the following files on your GitHub Repo in a folder named `smartcab` for ease of access: 94 | - The `agent.py` Python file with all code implemented as required in the instructed tasks. 95 | - The `/logs/` folder which should contain **five** log files that were produced from your simulation and used in the analysis. 96 | - The `smartcab.ipynb` notebook file with all questions answered and all visualization cells executed and displaying results. 97 | - An **HTML** export of the project notebook with the name **report.html**. This file *must* be present for your project to be evaluated. 98 | 99 | Once you have collected these files and reviewed the project rubric, proceed to the project submission page. 100 | 101 | ### I'm Ready! 102 | When you're ready to submit your project, click on the **Submit Project** button at the bottom of the page. 103 | 104 | If you are having any problems submitting your project or wish to check on the status of your submission, please email us at **machine-support@udacity.com** or visit us in the discussion forums. 105 | 106 | ### What's Next? 107 | You will get an email as soon as your reviewer has feedback for you. In the meantime, review your next project and feel free to get started on it or the courses supporting it! 
import random
import math
from environment import Agent, Environment
from planner import RoutePlanner
from simulator import Simulator


class LearningAgent(Agent):
    """ Smartcab driving agent that is expected to learn how to drive.

        This class is the project template: it only provides scaffolding,
        and every section marked TO DO is left for you to implement. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        """ Register the agent with the environment and store its settings.

            env      - the environment the agent is placed in
            learning - whether the agent is expected to learn
            epsilon  - random exploration factor
            alpha    - learning factor """

        super(LearningAgent, self).__init__(env)     # Place the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Route planner working for this agent
        self.valid_actions = self.env.valid_actions  # Actions the environment accepts

        # Learning configuration
        self.learning = learning
        self.Q = {}              # Q-table, a dictionary of tuples
        self.epsilon = epsilon
        self.alpha = alpha

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed

    def reset(self, destination=None, testing=False):
        """ Prepare the agent for a new trial; called at the start of each one.

            destination - location the planner should route to
            testing     - True once training trials have completed and
                          testing trials are being used """

        # Route towards the destination chosen for this trial
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        return None

    def build_state(self):
        """ Assemble the agent's state from what the environment exposes.

            The next waypoint, the intersection inputs and the remaining
            deadline are gathered here. The template returns None until the
            state tuple is implemented. """

        # Gather everything the agent is allowed to observe
        waypoint = self.planner.next_waypoint()  # Where the planner wants to go next
        inputs = self.env.sense(self)            # Intersection light and traffic
        deadline = self.env.get_deadline(self)   # Time remaining

        ###########
        ## TO DO ##
        ###########

        # NOTE : you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus
        # learn about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        state = None

        return state

    def get_maxQ(self, state):
        """ Report the largest Q-value over all actions for 'state'.

            The template returns None until the lookup is implemented. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ = None

        return maxQ

    def createQ(self, state):
        """ Make sure 'state' has an entry in the Q-table; called whenever
            the agent generates a state. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        return

    def choose_action(self, state):
        """ Pick the action to take from 'state'.

            Records the state and the planner's next waypoint on the agent.
            The template returns None until a policy is implemented. """

        # Remember where we are and where the planner points us
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you
        # randomly select between actions that "tie".
        return action

    def learn(self, state, action, reward):
        """ Incorporate the reward received for taking 'action' in 'state'.

            Called after the agent completes an action. Future rewards are
            not considered when learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')

        return

    def update(self):
        """ Run one time step of a trial: build the state, choose an action,
            collect the reward from the environment, and learn if enabled. """

        observed = self.build_state()         # Current state
        self.createQ(observed)                # Ensure the state is in the Q-table
        chosen = self.choose_action(observed)
        payoff = self.env.act(self, chosen)   # Reward for the chosen action
        self.learn(observed, chosen, payoff)  # Q-learn

        return


def run():
    """ Build the environment, agent and simulator, then run the simulation.
        Press ESC to close the simulation, or [SPACE] to pause the simulation. """

    ##############
    # Environment flags:
    #   verbose     - set to True to display additional output from the simulation
    #   num_dummies - discrete number of dummy agents in the environment, default is 100
    #   grid_size   - discrete number of intersections (columns, rows), default is (8, 6)
    world = Environment()

    ##############
    # Driving agent flags:
    #   learning   - set to True to force the driving agent to use Q-learning
    #    * epsilon - continuous value for the exploration factor, default is 1
    #    * alpha   - continuous value for the learning rate, default is 0.5
    driver = world.create_agent(LearningAgent)

    ##############
    # Primary agent flags:
    #   enforce_deadline - set to True to enforce a deadline metric
    world.set_primary_agent(driver)

    ##############
    # Simulator flags:
    #   update_delay - continuous time (in seconds) between actions, default is 2.0 seconds
    #   display      - set to False to disable the GUI if PyGame is enabled
    #   log_metrics  - set to True to log trial and simulation results to /logs
    #   optimized    - set to True to change the default log file name
    simulation = Simulator(world)

    ##############
    # Run flags:
    #   tolerance  - epsilon tolerance before beginning testing, default is 0.05
    #   n_test     - discrete number of testing trials to perform, default is 0
    simulation.run()


if __name__ == '__main__':
    run()
-------------------------------------------------------------------------------- /projects/smartcab/smartcab/images/car-orange.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/smartcab/smartcab/images/car-orange.png -------------------------------------------------------------------------------- /projects/smartcab/smartcab/images/car-red.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/smartcab/smartcab/images/car-red.png -------------------------------------------------------------------------------- /projects/smartcab/smartcab/images/car-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/smartcab/smartcab/images/car-white.png -------------------------------------------------------------------------------- /projects/smartcab/smartcab/images/car-yellow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/smartcab/smartcab/images/car-yellow.png -------------------------------------------------------------------------------- /projects/smartcab/smartcab/images/east-west.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clementmihailescu/machine-learning/7ba8bd66d491a31f1650f73f7a0d26ff619e20cd/projects/smartcab/smartcab/images/east-west.png -------------------------------------------------------------------------------- /projects/smartcab/smartcab/images/logo.png: 
import random


class RoutePlanner(object):
    """ Complex route planner that is meant for a perpendicular grid network.

        The planner reads three things from the environment it is given:
        'intersections' (a mapping whose keys are candidate destinations),
        'grid_size' (columns, rows) and 'agent_states', which holds the
        'location' and 'heading' of the agent being routed. """

    def __init__(self, env, agent):
        """ Remember the environment and the agent; no destination is set yet. """
        self.env = env
        self.agent = agent
        self.destination = None

    def route_to(self, destination=None):
        """ Select the destination if one is provided, otherwise choose a random intersection. """

        if destination is not None:
            self.destination = destination
        else:
            # keys() is a view on Python 3 and random.choice needs a sequence,
            # so materialize it first. On Python 2 this is the same list.
            self.destination = random.choice(list(self.env.intersections.keys()))

    @staticmethod
    def _shortest_delta(direct, size):
        """ Return the signed offset along one axis, taking world-wrap into account.

            'direct' is destination minus location; the alternative is the
            offset obtained by wrapping around a grid of 'size' cells. The
            direct offset is used only when it is strictly shorter. """

        wrapped = size + direct if direct <= 0 else direct - size
        return direct if abs(direct) < abs(wrapped) else wrapped

    def next_waypoint(self):
        """ Creates the next waypoint based on current heading, location,
            intended destination and L1 distance from destination.

            Returns 'forward', 'left' or 'right', or None when the agent is
            already at the destination. """

        # Collect global location details
        bounds = self.env.grid_size
        location = self.env.agent_states[self.agent]['location']
        heading = self.env.agent_states[self.agent]['heading']

        # True difference in location based on world-wrap.
        # This pre-determines the need for U-turns from improper headings.
        dx = self._shortest_delta(self.destination[0] - location[0], bounds[0])
        dy = self._shortest_delta(self.destination[1] - location[1], bounds[1])

        # First check if destination is at location
        if dx == 0 and dy == 0:
            return None

        # Next check if destination is cardinally East or West of location
        if dx != 0:
            along = dx * heading[0]
            if along > 0:  # Heading the correct East or West direction
                return 'forward'
            if along < 0:  # Heading directly away from the destination
                if heading[0] < 0:  # Heading West, destination East
                    return 'left' if dy > 0 else 'right'  # left if also to the South
                # Heading East, destination West
                return 'left' if dy < 0 else 'right'      # left if also to the North
            # Heading North or South (heading[0] == 0):
            # Heading North destination West; Heading South destination East
            return 'left' if dx * heading[1] > 0 else 'right'

        # Finally, destination is cardinally North or South of location.
        # dx is 0 here, so dy cannot be 0 as well.
        along = dy * heading[1]
        if along > 0:  # Heading the correct North or South direction
            return 'forward'
        if along < 0:
            # Heading directly away. The original code picked 'left' only when
            # the destination was also off to one side, which cannot happen
            # once dx == 0, so the answer is always 'right'.
            return 'right'
        # Heading East or West (heading[1] == 0):
        # Heading West destination North; Heading East destination South
        return 'right' if dy * heading[0] > 0 else 'left'
def calculate_safety(data):
    """ Calculates the safety rating of the smartcab during testing.

        'data' needs the columns 'good_actions', 'initial_deadline',
        'final_deadline' and 'actions'. Each 'actions' entry is a Python
        literal indexed 0-4: good actions, minor violations, major
        violations, minor accidents and major accidents.

        Returns a (letter grade, colour) tuple. """

    steps_taken = (data['initial_deadline'] - data['final_deadline']).sum()
    good_ratio = data['good_actions'].sum() * 1.0 / steps_taken

    if good_ratio == 1:  # Perfect driving
        return ("A+", "green")

    # Imperfect driving: parse every logged tally once, then grade by the
    # most severe kind of bad action that occurred at all.
    tallies = data['actions'].apply(ast.literal_eval)

    def occurrences(kind):
        return tallies.apply(lambda tally: tally[kind]).sum()

    if occurrences(4) > 0:  # Major accident
        return ("F", "red")
    if occurrences(3) > 0:  # Minor accident
        return ("D", "#EEC700")
    if occurrences(2) > 0:  # Major violation
        return ("C", "#EEC700")

    # Only minor violations remain. Divide by 2.0 so that "at least half"
    # is not floored by Python 2 integer division for odd trial counts.
    if occurrences(1) >= len(data) / 2.0:  # Minor violation in at least half of the trials
        return ("B", "green")
    return ("A", "green")


def calculate_reliability(data):
    """ Calculates the reliability rating of the smartcab during testing.

        'data' needs a 'success' column; the rating is based on the share
        of trials that succeeded. Returns a (letter grade, colour) tuple. """

    success_ratio = data['success'].sum() * 1.0 / len(data)

    if success_ratio == 1:  # Always meets deadline
        return ("A+", "green")

    # Lowest success ratio that still earns each grade, best grade first
    grade_floors = (
        (0.90, ("A", "green")),
        (0.80, ("B", "green")),
        (0.70, ("C", "#EEC700")),
        (0.60, ("D", "#EEC700")),
    )
    for floor, grade in grade_floors:
        if success_ratio >= floor:
            return grade
    return ("F", "red")


def _rolling_mean(series, window=10):
    """ Return the rolling mean of 'series' over 'window' consecutive trials. """
    return series.rolling(window=window, center=False).mean()


def plot_trials(csv):
    """ Plots the data from logged metrics during a simulation.

        'csv' is the name of a log file inside the "logs" directory. Nothing
        is plotted when the log holds fewer than 10 trials. """

    data = pd.read_csv(os.path.join("logs", csv))

    if len(data) < 10:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        # NOTE(review): the check uses 10 but the message says 20 - confirm
        # which threshold is intended.
        print("Not enough data collected to create a visualization.")
        print("At least 20 trials are required.")
        return

    # Create additional features
    steps = data['initial_deadline'] - data['final_deadline']  # actions taken per trial
    tallies = data['actions'].apply(ast.literal_eval)          # parse each tally once
    parameters = data['parameters'].apply(ast.literal_eval)

    data['average_reward'] = _rolling_mean(data['net_reward'] / steps)  # avg. reward per action, window=10
    data['reliability_rate'] = _rolling_mean(data['success'] * 100)     # success percentage, window=10
    data['good_actions'] = tallies.apply(lambda tally: tally[0])

    # Rolling relative frequency of each kind of action
    for column, kind in (('good', 0), ('minor', 1), ('major', 2),
                         ('minor_acc', 3), ('major_acc', 4)):
        counts = tallies.apply(lambda tally, kind=kind: tally[kind])
        data[column] = _rolling_mean(counts * 1.0 / steps)

    data['epsilon'] = parameters.apply(lambda params: params['e'])
    data['alpha'] = parameters.apply(lambda params: params['a'])

    # Create training and testing subsets (element-wise comparison is intended)
    training_data = data[data['testing'] == False]
    testing_data = data[data['testing'] == True]

    plt.figure(figsize=(12,8))

    ###############
    ### Average step reward plot
    ###############

    ax = plt.subplot2grid((6,6), (0,3), colspan=3, rowspan=2)
    ax.set_title("10-Trial Rolling Average Reward per Action")
    ax.set_ylabel("Reward per Action")
    ax.set_xlabel("Trial Number")
    ax.set_xlim((10, len(training_data)))

    # Create plot-specific data
    step = training_data[['trial','average_reward']].dropna()

    ax.axhline(xmin = 0, xmax = 1, y = 0, color = 'black', linestyle = 'dashed')
    ax.plot(step['trial'], step['average_reward'])

    ###############
    ### Parameters Plot
    ###############

    ax = plt.subplot2grid((6,6), (2,3), colspan=3, rowspan=2)

    # Check whether the agent was expected to learn
    if csv != 'sim_no-learning.csv':
        ax.set_ylabel("Parameter Value")
        ax.set_xlabel("Trial Number")
        ax.set_xlim((1, len(training_data)))
        ax.set_ylim((0, 1.05))

        ax.plot(training_data['trial'], training_data['epsilon'], color='blue', label='Exploration factor')
        ax.plot(training_data['trial'], training_data['alpha'], color='green', label='Learning factor')

        ax.legend(bbox_to_anchor=(0.5,1.19), fancybox=True, ncol=2, loc='upper center', fontsize=10)

    else:
        ax.axis('off')
        ax.text(0.52, 0.30, "Simulation completed\nwith learning disabled.", fontsize=24, ha='center', style='italic')

    ###############
    ### Bad Actions Plot
    ###############

    actions = training_data[['trial','good', 'minor','major','minor_acc','major_acc']].dropna()
    bad_share = 1 - actions['good']  # everything that was not a good action
    maximum = bad_share.values.max()

    ax = plt.subplot2grid((6,6), (0,0), colspan=3, rowspan=4)
    ax.set_title("10-Trial Rolling Relative Frequency of Bad Actions")
    ax.set_ylabel("Relative Frequency")
    ax.set_xlabel("Trial Number")

    ax.set_ylim((0, maximum + 0.01))
    ax.set_xlim((10, len(training_data)))

    ax.set_yticks(np.linspace(0, maximum+0.01, 10))

    ax.plot(actions['trial'], bad_share, color='black', label='Total Bad Actions', linestyle='dotted', linewidth=3)
    ax.plot(actions['trial'], actions['minor'], color='orange', label='Minor Violation', linestyle='dashed')
    ax.plot(actions['trial'], actions['major'], color='orange', label='Major Violation', linewidth=2)
    ax.plot(actions['trial'], actions['minor_acc'], color='red', label='Minor Accident', linestyle='dashed')
    ax.plot(actions['trial'], actions['major_acc'], color='red', label='Major Accident', linewidth=2)

    ax.legend(loc='upper right', fancybox=True, fontsize=10)

    ###############
    ### Rolling Success-Rate plot
    ###############

    ax = plt.subplot2grid((6,6), (4,0), colspan=4, rowspan=2)
    ax.set_title("10-Trial Rolling Rate of Reliability")
    ax.set_ylabel("Rate of Reliability")
    ax.set_xlabel("Trial Number")
    ax.set_xlim((10, len(training_data)))
    ax.set_ylim((-5, 105))
    ax.set_yticks(np.arange(0, 101, 20))
    ax.set_yticklabels(['0%', '20%', '40%', '60%', '80%', '100%'])

    # Create plot-specific data (rows with a complete rolling window only)
    complete = training_data.dropna()

    # Rolling success rate
    ax.plot(complete['trial'], complete['reliability_rate'], label="Reliability Rate", color='blue')

    ###############
    ### Test results
    ###############

    ax = plt.subplot2grid((6,6), (4,4), colspan=2, rowspan=2)
    ax.axis('off')

    if len(testing_data) > 0:
        safety_rating, safety_color = calculate_safety(testing_data)
        reliability_rating, reliability_color = calculate_reliability(testing_data)

        # Write success rate
        ax.text(0.40, .9, "{} testing trials simulated.".format(len(testing_data)), fontsize=14, ha='center')
        ax.text(0.40, 0.7, "Safety Rating:", fontsize=16, ha='center')
        ax.text(0.40, 0.42, "{}".format(safety_rating), fontsize=40, ha='center', color=safety_color)
        ax.text(0.40, 0.27, "Reliability Rating:", fontsize=16, ha='center')
        ax.text(0.40, 0, "{}".format(reliability_rating), fontsize=40, ha='center', color=reliability_color)

    else:
        ax.text(0.36, 0.30, "Simulation completed\nwith testing disabled.", fontsize=20, ha='center', style='italic')

    plt.tight_layout()
    plt.show()
34 | 35 | ### Data 36 | 37 | The dataset used in this project is included as `student-data.csv`. This dataset has the following attributes: 38 | 39 | - `school` : student's school (binary: "GP" or "MS") 40 | - `sex` : student's sex (binary: "F" - female or "M" - male) 41 | - `age` : student's age (numeric: from 15 to 22) 42 | - `address` : student's home address type (binary: "U" - urban or "R" - rural) 43 | - `famsize` : family size (binary: "LE3" - less or equal to 3 or "GT3" - greater than 3) 44 | - `Pstatus` : parent's cohabitation status (binary: "T" - living together or "A" - apart) 45 | - `Medu` : mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 - 5th to 9th grade, 3 - secondary education or 4 - higher education) 46 | - `Fedu` : father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 - 5th to 9th grade, 3 - secondary education or 4 - higher education) 47 | - `Mjob` : mother's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other") 48 | - `Fjob` : father's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other") 49 | - `reason` : reason to choose this school (nominal: close to "home", school "reputation", "course" preference or "other") 50 | - `guardian` : student's guardian (nominal: "mother", "father" or "other") 51 | - `traveltime` : home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. 
to 1 hour, or 4 - >1 hour) 52 | - `studytime` : weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours) 53 | - `failures` : number of past class failures (numeric: n if 1<=n<3, else 4) 54 | - `schoolsup` : extra educational support (binary: yes or no) 55 | - `famsup` : family educational support (binary: yes or no) 56 | - `paid` : extra paid classes within the course subject (Math or Portuguese) (binary: yes or no) 57 | - `activities` : extra-curricular activities (binary: yes or no) 58 | - `nursery` : attended nursery school (binary: yes or no) 59 | - `higher` : wants to take higher education (binary: yes or no) 60 | - `internet` : Internet access at home (binary: yes or no) 61 | - `romantic` : with a romantic relationship (binary: yes or no) 62 | - `famrel` : quality of family relationships (numeric: from 1 - very bad to 5 - excellent) 63 | - `freetime` : free time after school (numeric: from 1 - very low to 5 - very high) 64 | - `goout` : going out with friends (numeric: from 1 - very low to 5 - very high) 65 | - `Dalc` : workday alcohol consumption (numeric: from 1 - very low to 5 - very high) 66 | - `Walc` : weekend alcohol consumption (numeric: from 1 - very low to 5 - very high) 67 | - `health` : current health status (numeric: from 1 - very bad to 5 - very good) 68 | - `absences` : number of school absences (numeric: from 0 to 93) 69 | - `passed` : did the student pass the final exam (binary: yes or no) 70 | -------------------------------------------------------------------------------- /projects/student_intervention/project_description.md: -------------------------------------------------------------------------------- 1 | # Content: Supervised Learning 2 | ## Project: Creating a Student Intervention System 3 | 4 | ## Project Overview 5 | As education has grown to rely more on technology, vast amounts of data has become available for examination and prediction. 
Logs of student activities, grades, interactions with teachers and fellow students, and more, are now captured in real time through learning management systems like **Canvas** and **Edmodo**. This is especially true for online classrooms, which are becoming popular even at the primary and secondary school level. Within all levels of education, there exists a push to help increase the likelihood of student success, without watering down the education or engaging in behaviors that fail to improve the underlying issues. Graduation rates are often the criteria of choice, and educators seek new ways to predict the success and failure of students early enough to stage effective interventions. 6 | 7 | ## Description 8 | A local school district has a goal to reach a 95% graduation rate by the end of the decade by identifying students who need intervention before they drop out of school. As a software engineer contacted by the school district, your task is to model the factors that predict how likely a student is to pass their high school final exam, by constructing an intervention system that leverages supervised learning techniques. The board of supervisors has asked that you find the most effective model that uses the least amount of computation costs to save on the budget. You will need to analyze the dataset on students' performance and develop a model that will predict the likelihood that a given student will pass, quantifying whether an intervention is necessary. 9 | 10 | ## Software Requirements 11 | This project uses the following software and Python libraries: 12 | 13 | - [Python 2.7](https://www.python.org/download/releases/2.7/) 14 | - [NumPy](http://www.numpy.org/) 15 | - [pandas](http://pandas.pydata.org/) 16 | - [scikit-learn](http://scikit-learn.org/0.17/install.html) (v0.17) 17 | 18 | You will also need to have software installed to run and execute a [Jupyter Notebook](http://ipython.org/notebook.html). 
19 | 20 | If you do not have Python installed yet, it is highly recommended that you install the [Anaconda](http://continuum.io/downloads) distribution of Python, which already has the above packages and more included. Make sure that you select the Python 2.7 installer and not the Python 3.x installer. 21 | 22 | ## Starting the Project 23 | For this assignment, you can find the `student_intervention` folder containing the necessary project files on the [Machine Learning projects GitHub](https://github.com/udacity/machine-learning), under the `projects` folder. You may download all of the files for projects we'll use in this Nanodegree program directly from this repo. Please make sure that you use the most recent version of project files when completing a project! 24 | 25 | This project contains two files: 26 | 27 | - `student_intervention.ipynb`: This is the main file where you will be performing your work on the project. 28 | - `student-data.csv`: The project dataset. You'll load this data in the notebook. 29 | 30 | In the Terminal or Command Prompt, navigate to the folder containing the project files, and then use the command `jupyter notebook student_intervention.ipynb` to open up a browser window or tab to work with your notebook. Alternatively, you can use the command `jupyter notebook` or `ipython notebook` and navigate to the notebook file in the browser window that opens. Follow the instructions in the notebook and answer each question presented to successfully complete the project. A **README** file has also been provided with the project files which may contain additional necessary information or instruction for the project. 31 | 32 | ## Submitting the Project 33 | 34 | ### Evaluation 35 | Your project will be reviewed by a Udacity reviewer against the **Building a Student Intervention System project rubric**. Be sure to review this rubric thoroughly and self-evaluate your project before submission.
All criteria found in the rubric must be *meeting specifications* for you to pass. 36 | 37 | ### Submission Files 38 | When you are ready to submit your project, collect the following files and compress them into a single archive for upload. Alternatively, you may supply the following files on your GitHub Repo in a folder named `student_intervention` for ease of access: 39 | - The `student_intervention.ipynb` notebook file with all questions answered and all code cells executed and displaying output. 40 | - An **HTML** export of the project notebook with the name **report.html**. This file *must* be present for your project to be evaluated. 41 | 42 | Once you have collected these files and reviewed the project rubric, proceed to the project submission page. 43 | 44 | ### I'm Ready! 45 | When you're ready to submit your project, click on the **Submit Project** button at the bottom of the page. 46 | 47 | If you are having any problems submitting your project or wish to check on the status of your submission, please email us at **machine-support@udacity.com** or visit us in the discussion forums. 48 | 49 | ### What's Next? 50 | You will get an email as soon as your reviewer has feedback for you. In the meantime, review your next project and feel free to get started on it or the courses supporting it! 
51 | -------------------------------------------------------------------------------- /projects/titanic_survival_exploration/README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Engineer Nanodegree 2 | ## Introduction and Foundations 3 | ## Project: Titanic Survival Exploration 4 | 5 | ### Install 6 | 7 | This project requires **Python** and the following Python libraries installed: 8 | 9 | - [NumPy](http://www.numpy.org/) 10 | - [Pandas](http://pandas.pydata.org) 11 | - [matplotlib](http://matplotlib.org/) 12 | - [scikit-learn](http://scikit-learn.org/stable/) 13 | 14 | You will also need to have software installed to run and execute a [Jupyter Notebook](http://ipython.org/notebook.html) 15 | 16 | If you do not have Python installed yet, it is highly recommended that you install the [Anaconda](http://continuum.io/downloads) distribution of Python, which already has the above packages and more included 17 | 18 | ### Code 19 | 20 | Template code is provided in the notebook `titanic_survival_exploration.ipynb` notebook file. Additional supporting code can be found in `visuals.py`. While some code has already been implemented to get you started, you will need to implement additional functionality when requested to successfully complete the project. Note that the code included in `visuals.py` is meant to be used out-of-the-box and not intended for students to manipulate. If you are interested in how the visualizations are created in the notebook, please feel free to explore this Python file. 
21 | 22 | ### Run 23 | 24 | In a terminal or command window, navigate to the top-level project directory `titanic_survival_exploration/` (that contains this README) and run one of the following commands: 25 | 26 | ```bash 27 | jupyter notebook titanic_survival_exploration.ipynb 28 | ``` 29 | or 30 | ```bash 31 | ipython notebook titanic_survival_exploration.ipynb 32 | ``` 33 | 34 | This will open the Jupyter Notebook software and project file in your web browser. 35 | 36 | ### Data 37 | 38 | The dataset used in this project is included as `titanic_data.csv`. This dataset is provided by Udacity and contains the following attributes: 39 | 40 | **Features** 41 | - `pclass` : Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd) 42 | - `name` : Name 43 | - `sex` : Sex 44 | - `age` : Age 45 | - `sibsp` : Number of Siblings/Spouses Aboard 46 | - `parch` : Number of Parents/Children Aboard 47 | - `ticket` : Ticket Number 48 | - `fare` : Passenger Fare 49 | - `cabin` : Cabin 50 | - `embarked` : Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton) 51 | 52 | **Target Variable** 53 | - `survival` : Survival (0 = No; 1 = Yes) -------------------------------------------------------------------------------- /projects/titanic_survival_exploration/project_description.md: -------------------------------------------------------------------------------- 1 | ## Content: Introduction and Foundations 2 | ## Project: Titanic Survival Exploration 3 | 4 | ## Project Overview 5 | Welcome to the Machine Learning Engineer Nanodegree! 6 | 7 | In this ***optional*** project, you will create decision functions that attempt to predict survival outcomes from the 1912 Titanic disaster based on each passenger's features, such as sex and age. You will start with a simple algorithm and increase its complexity until you are able to accurately predict the outcomes for at least 80% of the passengers in the provided data. 
This project will introduce you to some of the concepts of machine learning as you start the Nanodegree program. 8 | 9 | In addition, you'll make sure Python is installed with the necessary packages to complete this project. There are two Python libraries, `numpy` and `pandas`, that we'll use a bit here in this project. Don't worry about how these libraries work for now -- we'll get to them in more detail in later projects. This project will also familiarize you with the submission process for the projects that you will be completing as part of the Nanodegree program. 10 | 11 | ## Software Requirements 12 | This project uses the following software and Python libraries: 13 | 14 | - [Python](https://www.python.org/downloads/) 15 | - [NumPy](http://www.numpy.org/) 16 | - [pandas](http://pandas.pydata.org/) 17 | - [matplotlib](http://matplotlib.org/) 18 | 19 | You will also need to have software installed to run and execute a [Jupyter Notebook](http://ipython.org/notebook.html). 20 | 21 | If you do not have Python installed yet, it is highly recommended that you install the [Anaconda](http://continuum.io/downloads) distribution of Python, which already has the above packages and more included. 22 | 23 | If you already have Python 2.7 installed on your computer, then you can install `numpy`, `pandas`, `matplotlib` and Jupyter Notebook (formerly known as "iPython") by using [pip](https://pip.pypa.io/en/stable/) on the command line. [This page](http://www.lfd.uci.edu/~gohlke/pythonlibs/) may also be of use for some packages for Windows users, if pip has trouble performing the installation. 
After installing pip, you can install all the packages with the following command: 24 | 25 | `sudo pip install numpy pandas matplotlib jupyter` 26 | 27 | ## Starting the Project 28 | 29 | For this assignment, you can find the `titanic_survival_exploration` folder containing the necessary project files on the [Machine Learning projects GitHub](https://github.com/udacity/machine-learning), under the `projects` folder. You may download all of the files for projects we'll use in this Nanodegree program directly from this repo. Please make sure that you use the most recent version of project files when completing a project! 30 | 31 | This project contains three files: 32 | 33 | - `titanic_survival_exploration.ipynb`: This is the main file where you will be performing your work on the project. 34 | - `titanic_data.csv`: The project dataset. You'll load this data in the notebook. 35 | - `visuals.py`: This Python script provides supplementary visualizations for the project. Do not modify. 36 | 37 | In the Terminal or Command Prompt, navigate to the folder containing the project files, and then use the command `jupyter notebook titanic_survival_exploration.ipynb` to open up a browser window or tab to work with your notebook. Alternatively, you can use the command `jupyter notebook` or `ipython notebook` and navigate to the notebook file in the browser window that opens. Follow the instructions in the notebook and answer each question presented to successfully complete the project. A **README** file has also been provided with the project files which may contain additional necessary information or instruction for the project. 
# /projects/titanic_survival_exploration/visuals.py
###########################################
# Suppress matplotlib user warnings
# Necessary for newer version of matplotlib
import warnings
warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib")
#
# Display inline matplotlib plots with IPython.
# Guarded so that the module can also be imported outside a notebook
# session (IPython not installed, or no interactive shell running).
try:
    from IPython import get_ipython
except ImportError:
    get_ipython = None
_shell = get_ipython() if get_ipython is not None else None
if _shell is not None:
    _shell.run_line_magic('matplotlib', 'inline')
###########################################

import operator

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Comparison operators accepted in a filter condition string
_COMPARATORS = {
    ">": operator.gt,
    "<": operator.lt,
    ">=": operator.ge,
    "<=": operator.le,
    "==": operator.eq,
    "!=": operator.ne,
}

# Features that are never plotted: too many unique categories
_UNPLOTTABLE_KEYS = ('Cabin', 'PassengerId', 'Ticket')

# Features plotted as histograms, mapped to the width of each bin.
# 'Fare' has a larger range of values than 'Age' so it uses wider bins.
_NUMERICAL_BIN_WIDTHS = {'Age': 10, 'Fare': 20}


def filter_data(data, condition):
    """
    Remove rows that do not match the condition provided.
    Takes a DataFrame and a single condition string as input and returns
    the filtered DataFrame with its index reset.
    The condition should be a string of the following format:
      '<field> <op> <value>'
    where the following operations are valid: >, <, >=, <=, ==, !=

    The value is compared as a number when it parses as a float; otherwise
    surrounding quotes are stripped and it is compared as a string.

    Example: "Sex == 'male'" or 'Age < 18'

    Raises ValueError if the condition does not have three parts or if the
    operator is not one of the valid operations.
    """

    # Split on the first two spaces only, so a quoted value may itself
    # contain spaces (e.g. "Name == 'Braund, Mr. Owen Harris'")
    parts = condition.split(" ", 2)
    if len(parts) != 3:
        raise ValueError("Invalid condition '{}'. Expected the format '<field> <op> <value>'.".format(condition))
    field, op, value = parts

    # convert value into number or strip excess quotes if string
    try:
        value = float(value)
    except ValueError:
        value = value.strip("\'\"")

    # catch invalid operation codes
    if op not in _COMPARATORS:
        raise ValueError("Invalid comparison operator. Only >, <, >=, <=, ==, != allowed.")

    # get booleans for filtering
    matches = _COMPARATORS[op](data[field], value)

    # filter data and outcomes
    return data[matches].reset_index(drop = True)


def _category_values(data, key):
    """
    Return the categories to display for a categorical feature, or None
    if no category list is defined for the feature.
    """
    if key == 'Pclass':
        return np.arange(1, 4)
    if key in ('Parch', 'SibSp'):
        # Use the unfiltered data so the axis is the same for every filter
        return np.arange(0, np.max(data[key]) + 1)
    if key == 'Embarked':
        return ['C', 'Q', 'S']
    if key == 'Sex':
        return ['male', 'female']
    return None


def survival_stats(data, outcomes, key, filters = None):
    """
    Print out selected statistics regarding survival, given a feature of
    interest and any number of filters (including no filters)

    data     : DataFrame of passenger features.
    outcomes : Series of survival outcomes; it is merged into the data and
               read back through the 'Survived' column.
    key      : name of the feature (column of data) to plot.
    filters  : optional list of condition strings, each in the format
               accepted by filter_data; applied one after another.

    'Age' and 'Fare' are drawn as overlaid histograms; 'Pclass', 'Parch',
    'SibSp', 'Embarked' and 'Sex' are drawn as paired bars per category.
    After plotting, the number of passengers whose value for the feature
    is missing is printed, if there are any.

    Returns False (after printing a message) if the feature cannot be
    plotted, otherwise returns None.
    """

    if filters is None:
        filters = []

    # Check that the key exists
    if key not in data.columns.values:
        print("'{}' is not a feature of the Titanic data. Did you spell something wrong?".format(key))
        return False

    # Return the function before visualizing if 'Cabin' or 'Ticket'
    # is selected: too many unique categories to display
    if key in _UNPLOTTABLE_KEYS:
        print("'{}' has too many unique categories to display! Try a different feature.".format(key))
        return False

    # Resolve the categories before any figure is created, so a feature
    # without a category list exits cleanly instead of failing mid-plot
    is_numerical = key in _NUMERICAL_BIN_WIDTHS
    values = None
    if not is_numerical:
        values = _category_values(data, key)
        if values is None:
            print("'{}' cannot be displayed as a set of categories. Try a different feature.".format(key))
            return False

    # Merge data and outcomes into single dataframe
    all_data = pd.concat([data, outcomes.to_frame()], axis = 1)

    # Apply filters to data
    for condition in filters:
        all_data = filter_data(all_data, condition)

    # Create outcomes DataFrame
    all_data = all_data[[key, 'Survived']]

    # Record the outcomes of passengers with a missing value *before* those
    # rows are dropped for plotting, so they can still be reported below
    missing = pd.isnull(all_data[key])
    nan_outcomes = all_data[missing]['Survived']
    plot_data = all_data[~missing]

    survived = plot_data['Survived'] == 1
    not_survived = plot_data['Survived'] == 0

    # Create plotting figure
    plt.figure(figsize=(8,6))

    # 'Numerical' features
    if is_numerical:

        # Divide the range of data into bins
        bin_width = _NUMERICAL_BIN_WIDTHS[key]
        bins = np.arange(0, plot_data[key].max() + bin_width, bin_width)

        # Overlay each bin's survival rates
        nonsurv_vals = plot_data[not_survived][key].reset_index(drop = True)
        surv_vals = plot_data[survived][key].reset_index(drop = True)
        plt.hist(nonsurv_vals, bins = bins, alpha = 0.6,
                 color = 'red', label = 'Did not survive')
        plt.hist(surv_vals, bins = bins, alpha = 0.6,
                 color = 'green', label = 'Survived')

        # Add legend to plot
        plt.xlim(0, bins.max())
        plt.legend(framealpha = 0.8)

    # 'Categorical' features
    else:

        # Count the passengers of each category by outcome
        surv_counts = [int((survived & (plot_data[key] == value)).sum()) for value in values]
        nonsurv_counts = [int((not_survived & (plot_data[key] == value)).sum()) for value in values]

        # Set the width of each bar
        bar_width = 0.4

        # Display each category's survival rates. The bars are aligned by
        # their left edge so each pair meets at its tick, whatever the
        # default bar alignment of the installed matplotlib is.
        positions = np.arange(len(values))
        nonsurv_bar = plt.bar(positions - bar_width, nonsurv_counts, width = bar_width,
                              color = 'r', align = 'edge')
        surv_bar = plt.bar(positions, surv_counts, width = bar_width,
                           color = 'g', align = 'edge')

        plt.xticks(positions, values)
        plt.legend((nonsurv_bar[0], surv_bar[0]),('Did not survive', 'Survived'), framealpha = 0.8)

    # Common attributes for plot formatting
    plt.xlabel(key)
    plt.ylabel('Number of Passengers')
    plt.title('Passenger Survival Statistics With \'%s\' Feature'%(key))
    plt.show()

    # Report number of passengers with missing values
    if len(nan_outcomes) > 0:
        print("Passengers with missing '{}' values: {} ({} survived, {} did not survive)".format(
            key, len(nan_outcomes), int((nan_outcomes == 1).sum()), int((nan_outcomes == 0).sum())))