├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── analyses ├── __init__.py ├── churn.ipynb └── churn_measurements.py ├── aws ├── __init__.py └── aws.ipynb ├── commands ├── __init__.py ├── linux.ipynb ├── misc.ipynb └── styles │ └── custom.css ├── data ├── churn.csv ├── confusion_matrix.png ├── ozone.csv ├── ozone_copy.csv └── titanic │ ├── genderclassmodel.csv │ ├── genderclassmodel.py │ ├── gendermodel.csv │ ├── gendermodel.py │ ├── myfirstforest.py │ ├── results-rf.csv │ ├── test.csv │ └── train.csv ├── deep-learning ├── deep-dream │ ├── dream.ipynb │ ├── flowers.jpg │ └── sky1024px.jpg ├── keras-tutorial │ ├── 0. Preamble.ipynb │ ├── 1.1 Introduction - Deep Learning and ANN.ipynb │ ├── 1.2 Introduction - Theano.ipynb │ ├── 1.3 Introduction - Keras.ipynb │ ├── 1.4 (Extra) A Simple Implementation of ANN for MNIST.ipynb │ ├── 2.1 Supervised Learning - ConvNets.ipynb │ ├── 2.2.1 Supervised Learning - ConvNet HandsOn Part I.ipynb │ ├── 2.2.2 Supervised Learning - ConvNet HandsOn Part II.ipynb │ ├── 2.3 Supervised Learning - Famous Models with Keras.ipynb │ ├── 3.1 Unsupervised Learning - AutoEncoders and Embeddings.ipynb │ ├── 3.2 RNN and LSTM.ipynb │ ├── 3.3 (Extra) LSTM for Sentence Generation.ipynb │ ├── 4. Conclusions.ipynb │ ├── LICENSE │ ├── data │ │ ├── female_blog_list.txt │ │ ├── intro_to_ann.csv │ │ ├── male_blog_list.txt │ │ ├── mnist.pkl.gz │ │ ├── rt-polarity.neg │ │ └── rt-polarity.pos │ ├── data_helpers.py │ ├── deep-learning-osx.yml │ ├── deep-learning.yml │ ├── deep_learning_models │ │ ├── LICENSE │ │ ├── README.md │ │ ├── imagenet_utils.py │ │ ├── resnet50.py │ │ ├── vgg16.py │ │ └── vgg19.py │ ├── imgs │ │ ├── ConvNet LeNet.png │ │ ├── LSTM3-chain.png │ │ ├── MLP.png │ │ ├── MaxPool.png │ │ ├── Perceptron and MLP.png │ │ ├── Perceptron.png │ │ ├── RNN-rolled.png │ │ ├── RNN-unrolled.png │ │ ├── autoencoder.png │ │ ├── backprop.png │ │ ├── cnn1.png │ │ ├── cnn2.png │ │ ├── cnn3.png │ │ ├── cnn4.png │ │ ├── cnn5.png │ │ ├── cnn6.png │ │ ├── conv.png │ │ ├── convnets_cover.png │ │ ├── euroscipy_2016_logo.png │ │ ├── gru.png │ │ ├── imagenet │ │ │ ├── apricot_565.jpeg │ │ │ ├── apricot_696.jpeg │ │ │ ├── apricot_787.jpeg │ │ │ ├── strawberry_1157.jpeg │ │ │ ├── strawberry_1174.jpeg │ │ │ └── strawberry_1189.jpeg │ │ ├── keDyv.png │ │ ├── keras-logo-small.jpg │ │ ├── keras_rank_1.jpg │ │ ├── keras_rank_2.jpg │ │ ├── mlp_details.png │ │ ├── overfitting.png │ │ ├── rnn.png │ │ ├── rnn2.png │ │ └── sprint.jpg │ ├── outline.md │ ├── solutions │ │ ├── sol_111.py │ │ └── sol_112.py │ └── w2v.py ├── tensor-flow-examples │ ├── Setup_TensorFlow.md │ ├── input_data.py │ ├── multigpu_basics.py │ └── notebooks │ │ ├── 1_intro │ │ └── basic_operations.ipynb │ │ ├── 2_basic_classifiers │ │ ├── linear_regression.ipynb │ │ ├── logistic_regression.ipynb │ │ └── nearest_neighbor.ipynb │ │ ├── 3_neural_networks │ │ ├── alexnet.ipynb │ │ ├── convolutional_network.ipynb │ │ ├── multilayer_perceptron.ipynb │ │ └── recurrent_network.ipynb │ │ ├── 4_multi_gpu │ │ └── multigpu_basics.ipynb │ │ └── 5_ui │ │ ├── graph_visualization.ipynb │ │ └── loss_visualization.ipynb ├── tensor-flow-exercises │ ├── 1_notmnist.ipynb │ ├── 2_fullyconnected.ipynb │ ├── 3_regularization.ipynb │ ├── 4_convolutions.ipynb │ ├── 5_word2vec.ipynb │ ├── 6_lstm.ipynb │ ├── Dockerfile │ └── README.md └── theano-tutorial │ ├── intro_theano │ ├── Makefile │ ├── intro_theano.ipynb │ ├── intro_theano.pdf │ ├── logistic_regression.ipynb │ └── utils.py │ ├── rnn_tutorial │ ├── Makefile │ ├── 
instruction.pdf │ ├── lstm_text.ipynb │ ├── lstm_text.py │ ├── rnn_lstm.pdf │ ├── rnn_precompile.py │ ├── simple_rnn.ipynb │ └── synthetic.py │ ├── scan_tutorial │ ├── scan_ex1_solution.py │ ├── scan_ex2_solution.py │ └── scan_tutorial.ipynb │ └── theano_mlp │ └── theano_mlp.ipynb ├── images ├── README.sketch ├── README_1200x800.gif ├── aws.png ├── commands.png ├── cover.png ├── coversmall.png ├── coversmall_alt.png ├── deep.png ├── k-means.gif ├── kaggle.png ├── keras.jpg ├── matplotlib.png ├── mrjob.png ├── numpy.png ├── pandas.png ├── python.png ├── regex-1.png ├── regex-2.png ├── scikitlearn.png ├── scipy.png ├── spark.png ├── svm.gif ├── tensorflow.png └── theano.png ├── kaggle ├── __init__.py └── titanic.ipynb ├── mapreduce ├── __init__.py ├── mapreduce-python.ipynb ├── mr_s3_log_parser.py └── test_mr_s3_log_parser.py ├── matplotlib ├── 04.00-Introduction-To-Matplotlib.ipynb ├── 04.01-Simple-Line-Plots.ipynb ├── 04.02-Simple-Scatter-Plots.ipynb ├── 04.03-Errorbars.ipynb ├── 04.04-Density-and-Contour-Plots.ipynb ├── 04.05-Histograms-and-Binnings.ipynb ├── 04.06-Customizing-Legends.ipynb ├── 04.07-Customizing-Colorbars.ipynb ├── 04.08-Multiple-Subplots.ipynb ├── 04.09-Text-and-Annotation.ipynb ├── 04.10-Customizing-Ticks.ipynb ├── 04.11-Settings-and-Stylesheets.ipynb ├── 04.12-Three-Dimensional-Plotting.ipynb ├── 04.13-Geographic-Data-With-Basemap.ipynb ├── 04.14-Visualization-With-Seaborn.ipynb ├── 04.15-Further-Resources.ipynb ├── __init__.py ├── matplotlib-applied.ipynb ├── matplotlib.ipynb └── tests │ └── __init__.py ├── misc ├── Algorithmia.ipynb └── regex.ipynb ├── numpy ├── 02.00-Introduction-to-NumPy.ipynb ├── 02.01-Understanding-Data-Types.ipynb ├── 02.02-The-Basics-Of-NumPy-Arrays.ipynb ├── 02.03-Computation-on-arrays-ufuncs.ipynb ├── 02.04-Computation-on-arrays-aggregates.ipynb ├── 02.05-Computation-on-arrays-broadcasting.ipynb ├── 02.06-Boolean-Arrays-and-Masks.ipynb ├── 02.07-Fancy-Indexing.ipynb ├── 02.08-Sorting.ipynb ├── 02.09-Structured-Data-NumPy.ipynb ├── __init__.py ├── figures │ ├── 02.05-broadcasting.png │ ├── PDSH-cover-small.png │ ├── array_vs_list.png │ └── cint_vs_pyint.png ├── numpy.ipynb └── tests │ └── __init__.py ├── pandas ├── 03.00-Introduction-to-Pandas.ipynb ├── 03.01-Introducing-Pandas-Objects.ipynb ├── 03.02-Data-Indexing-and-Selection.ipynb ├── 03.03-Operations-in-Pandas.ipynb ├── 03.04-Missing-Values.ipynb ├── 03.05-Hierarchical-Indexing.ipynb ├── 03.06-Concat-And-Append.ipynb ├── 03.07-Merge-and-Join.ipynb ├── 03.08-Aggregation-and-Grouping.ipynb ├── 03.09-Pivot-Tables.ipynb ├── 03.10-Working-With-Strings.ipynb ├── 03.11-Working-with-Time-Series.ipynb ├── 03.12-Performance-Eval-and-Query.ipynb ├── 03.13-Further-Resources.ipynb ├── __init__.py ├── pandas.ipynb └── tests │ └── __init__.py ├── python-data ├── __init__.py ├── datetime.ipynb ├── files.ipynb ├── functions.ipynb ├── hello_world.txt ├── logs.ipynb ├── pdb.ipynb ├── structs.ipynb ├── structs_utils.ipynb ├── tests │ ├── __init__.py │ ├── test_transform_util.py │ └── test_type_util.py ├── transform_util.py ├── type_util.py └── unit_tests.ipynb ├── scikit-learn ├── __init__.py ├── fig_code │ ├── ML_flow_chart.py │ ├── __init__.py │ ├── data.py │ ├── figures.py │ ├── helpers.py │ ├── linear_regression.py │ ├── scikit-learn.ipynb │ ├── sgd_separator.py │ └── svm_gui.py ├── scikit-learn-gmm.ipynb ├── scikit-learn-intro.ipynb ├── scikit-learn-k-means.ipynb ├── scikit-learn-linear-reg.ipynb ├── scikit-learn-pca.ipynb ├── scikit-learn-random-forest.ipynb ├── scikit-learn-svm.ipynb ├── 
scikit-learn-validation.ipynb └── tests │ └── __init__.py ├── scipy ├── 2002FemPreg.dat.gz ├── 2002FemPreg.dct ├── __init__.py ├── effect_size.ipynb ├── first.py ├── hypothesis.ipynb ├── nsfg.py ├── sampling.ipynb ├── tests │ └── __init__.py ├── thinkplot.py └── thinkstats2.py └── spark ├── __init__.py ├── hdfs.ipynb └── spark.ipynb /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-language=Python 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | 56 | # IPython notebook 57 | .ipynb_checkpoints 58 | 59 | # Repo scratch directory 60 | scratch/ 61 | 62 | # Misc 63 | .DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This repository contains a variety of content; some developed by Donne Martin, 2 | and some from third-parties. The third-party content is distributed under the 3 | license provided by those parties. 4 | 5 | The content developed by Donne Martin is distributed under the following license: 6 | 7 | I am providing code and resources in this repository to you under an open source 8 | license. Because this is my personal repository, the license you receive to my 9 | code and resources is from me and not my employer (Facebook). 10 | 11 | Copyright 2015 Donne Martin 12 | 13 | Licensed under the Apache License, Version 2.0 (the "License"); 14 | you may not use this file except in compliance with the License. 15 | You may obtain a copy of the License at 16 | 17 | http://www.apache.org/licenses/LICENSE-2.0 18 | 19 | Unless required by applicable law or agreed to in writing, software 20 | distributed under the License is distributed on an "AS IS" BASIS, 21 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 22 | See the License for the specific language governing permissions and 23 | limitations under the License. 
-------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/__init__.py -------------------------------------------------------------------------------- /analyses/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/analyses/__init__.py -------------------------------------------------------------------------------- /analyses/churn_measurements.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | 4 | __author__ = "Eric Chiang" 5 | __email__ = "eric[at]yhathq.com" 6 | 7 | """ 8 | 9 | Measurements inspired by Philip Tetlock's "Expert Political Judgment" 10 | 11 | Equations taken from Yaniv, Yates, & Smith (1991): 12 | "Measures of Discrimination Skill in Probabilistic Judgement" 13 | 14 | """ 15 | 16 | 17 | def calibration(prob,outcome,n_bins=10): 18 | """Calibration measurement for a set of predictions. 19 | 20 | When predicting events at a given probability, how far is the frequency 21 | of positive outcomes from that probability? 22 | NOTE: Lower scores are better 23 | 24 | prob: array_like, float 25 | Probability estimates for a set of events 26 | 27 | outcome: array_like, bool 28 | If event predicted occurred 29 | 30 | n_bins: int 31 | Number of judgement categories to perform the calculation over. 32 | Predictions are binned based on probability, since "discrete" 33 | probabilities aren't required. 34 | 35 | """ 36 | prob = np.array(prob) 37 | outcome = np.array(outcome) 38 | 39 | c = 0.0 40 | # Construct bins 41 | judgement_bins = np.arange(n_bins + 1) / n_bins 42 | # Which bin is each prediction in? 43 | bin_num = np.digitize(prob,judgement_bins) 44 | for j_bin in np.unique(bin_num): 45 | # Is event in bin 46 | in_bin = bin_num == j_bin 47 | # Predicted probability taken as average of preds in bin 48 | predicted_prob = np.mean(prob[in_bin]) 49 | # How often did events in this bin actually happen? 50 | true_bin_prob = np.mean(outcome[in_bin]) 51 | # Squared distance between predicted and true times num of obs 52 | c += np.sum(in_bin) * ((predicted_prob - true_bin_prob) ** 2) 53 | return c / len(prob) 54 | 55 | def discrimination(prob,outcome,n_bins=10): 56 | """Discrimination measurement for a set of predictions. 57 | 58 | For each judgement category, how far from the base probability 59 | is the true frequency of that bin? 60 | NOTE: High scores are better 61 | 62 | prob: array_like, float 63 | Probability estimates for a set of events 64 | 65 | outcome: array_like, bool 66 | If event predicted occurred 67 | 68 | n_bins: int 69 | Number of judgement categories to perform the calculation over. 70 | Predictions are binned based on probability, since "discrete" 71 | probabilities aren't required. 72 | 73 | """ 74 | prob = np.array(prob) 75 | outcome = np.array(outcome) 76 | 77 | d = 0.0 78 | # Base frequency of outcomes 79 | base_prob = np.mean(outcome) 80 | # Construct bins 81 | judgement_bins = np.arange(n_bins + 1) / n_bins 82 | # Which bin is each prediction in? 
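# [Illustrative aside, not part of the original churn_measurements.py -- a minimal
#  sketch of how the np.digitize call below bins predictions, assuming n_bins=10:
#    judgement_bins = np.arange(11) / 10    ->  [0.0, 0.1, ..., 1.0]
#    np.digitize(0.25, judgement_bins)      ->  3   (since 0.2 <= 0.25 < 0.3)
#    np.digitize(1.00, judgement_bins)      ->  11  (a probability of exactly 1.0
#                                                    falls past the last bin edge)
#  A perfectly calibrated model scores calibration() == 0, because the mean
#  predicted probability in every bin equals the observed frequency in that bin.]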
83 | bin_num = np.digitize(prob,judgement_bins) 84 | for j_bin in np.unique(bin_num): 85 | in_bin = bin_num == j_bin 86 | true_bin_prob = np.mean(outcome[in_bin]) 87 | # Squared distance between true and base times num of obs 88 | d += np.sum(in_bin) * ((true_bin_prob - base_prob) ** 2) 89 | return d / len(prob) 90 | -------------------------------------------------------------------------------- /aws/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/aws/__init__.py -------------------------------------------------------------------------------- /commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/commands/__init__.py -------------------------------------------------------------------------------- /commands/styles/custom.css: -------------------------------------------------------------------------------- 1 | 46 | -------------------------------------------------------------------------------- /data/confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/data/confusion_matrix.png -------------------------------------------------------------------------------- /data/ozone.csv: -------------------------------------------------------------------------------- 1 | "Ozone","Solar.R","Wind","Temp","Month","Day" 2 | 41,190,7.4,67,5,1 3 | 36,118,8,72,5,2 4 | 12,149,12.6,74,5,3 5 | 18,313,11.5,62,5,4 6 | NA,NA,14.3,56,5,5 7 | 28,NA,14.9,66,5,6 8 | 23,299,8.6,65,5,7 9 | 19,99,13.8,59,5,8 10 | 8,19,20.1,61,5,9 11 | NA,194,8.6,69,5,10 12 | 7,NA,6.9,74,5,11 13 | 16,256,9.7,69,5,12 14 | 11,290,9.2,66,5,13 15 | 14,274,10.9,68,5,14 16 | 18,65,13.2,58,5,15 17 | 14,334,11.5,64,5,16 18 | 34,307,12,66,5,17 19 | 6,78,18.4,57,5,18 20 | 30,322,11.5,68,5,19 21 | 11,44,9.7,62,5,20 22 | 1,8,9.7,59,5,21 23 | 11,320,16.6,73,5,22 24 | 4,25,9.7,61,5,23 25 | 32,92,12,61,5,24 26 | NA,66,16.6,57,5,25 27 | NA,266,14.9,58,5,26 28 | NA,NA,8,57,5,27 29 | 23,13,12,67,5,28 30 | 45,252,14.9,81,5,29 31 | 115,223,5.7,79,5,30 32 | 37,279,7.4,76,5,31 33 | NA,286,8.6,78,6,1 34 | NA,287,9.7,74,6,2 35 | NA,242,16.1,67,6,3 36 | NA,186,9.2,84,6,4 37 | NA,220,8.6,85,6,5 38 | NA,264,14.3,79,6,6 39 | 29,127,9.7,82,6,7 40 | NA,273,6.9,87,6,8 41 | 71,291,13.8,90,6,9 42 | 39,323,11.5,87,6,10 43 | NA,259,10.9,93,6,11 44 | NA,250,9.2,92,6,12 45 | 23,148,8,82,6,13 46 | NA,332,13.8,80,6,14 47 | NA,322,11.5,79,6,15 48 | 21,191,14.9,77,6,16 49 | 37,284,20.7,72,6,17 50 | 20,37,9.2,65,6,18 51 | 12,120,11.5,73,6,19 52 | 13,137,10.3,76,6,20 53 | NA,150,6.3,77,6,21 54 | NA,59,1.7,76,6,22 55 | NA,91,4.6,76,6,23 56 | NA,250,6.3,76,6,24 57 | NA,135,8,75,6,25 58 | NA,127,8,78,6,26 59 | NA,47,10.3,73,6,27 60 | NA,98,11.5,80,6,28 61 | NA,31,14.9,77,6,29 62 | NA,138,8,83,6,30 63 | 135,269,4.1,84,7,1 64 | 49,248,9.2,85,7,2 65 | 32,236,9.2,81,7,3 66 | NA,101,10.9,84,7,4 67 | 64,175,4.6,83,7,5 68 | 40,314,10.9,83,7,6 69 | 77,276,5.1,88,7,7 70 | 97,267,6.3,92,7,8 71 | 97,272,5.7,92,7,9 72 | 85,175,7.4,89,7,10 73 | NA,139,8.6,82,7,11 74 | 10,264,14.3,73,7,12 75 | 27,175,14.9,81,7,13 76 | NA,291,14.9,91,7,14 77 | 7,48,14.3,80,7,15 78 | 48,260,6.9,81,7,16 79 | 35,274,10.3,82,7,17 80 
| 61,285,6.3,84,7,18 81 | 79,187,5.1,87,7,19 82 | 63,220,11.5,85,7,20 83 | 16,7,6.9,74,7,21 84 | NA,258,9.7,81,7,22 85 | NA,295,11.5,82,7,23 86 | 80,294,8.6,86,7,24 87 | 108,223,8,85,7,25 88 | 20,81,8.6,82,7,26 89 | 52,82,12,86,7,27 90 | 82,213,7.4,88,7,28 91 | 50,275,7.4,86,7,29 92 | 64,253,7.4,83,7,30 93 | 59,254,9.2,81,7,31 94 | 39,83,6.9,81,8,1 95 | 9,24,13.8,81,8,2 96 | 16,77,7.4,82,8,3 97 | 78,NA,6.9,86,8,4 98 | 35,NA,7.4,85,8,5 99 | 66,NA,4.6,87,8,6 100 | 122,255,4,89,8,7 101 | 89,229,10.3,90,8,8 102 | 110,207,8,90,8,9 103 | NA,222,8.6,92,8,10 104 | NA,137,11.5,86,8,11 105 | 44,192,11.5,86,8,12 106 | 28,273,11.5,82,8,13 107 | 65,157,9.7,80,8,14 108 | NA,64,11.5,79,8,15 109 | 22,71,10.3,77,8,16 110 | 59,51,6.3,79,8,17 111 | 23,115,7.4,76,8,18 112 | 31,244,10.9,78,8,19 113 | 44,190,10.3,78,8,20 114 | 21,259,15.5,77,8,21 115 | 9,36,14.3,72,8,22 116 | NA,255,12.6,75,8,23 117 | 45,212,9.7,79,8,24 118 | 168,238,3.4,81,8,25 119 | 73,215,8,86,8,26 120 | NA,153,5.7,88,8,27 121 | 76,203,9.7,97,8,28 122 | 118,225,2.3,94,8,29 123 | 84,237,6.3,96,8,30 124 | 85,188,6.3,94,8,31 125 | 96,167,6.9,91,9,1 126 | 78,197,5.1,92,9,2 127 | 73,183,2.8,93,9,3 128 | 91,189,4.6,93,9,4 129 | 47,95,7.4,87,9,5 130 | 32,92,15.5,84,9,6 131 | 20,252,10.9,80,9,7 132 | 23,220,10.3,78,9,8 133 | 21,230,10.9,75,9,9 134 | 24,259,9.7,73,9,10 135 | 44,236,14.9,81,9,11 136 | 21,259,15.5,76,9,12 137 | 28,238,6.3,77,9,13 138 | 9,24,10.9,71,9,14 139 | 13,112,11.5,71,9,15 140 | 46,237,6.9,78,9,16 141 | 18,224,13.8,67,9,17 142 | 13,27,10.3,76,9,18 143 | 24,238,10.3,68,9,19 144 | 16,201,8,82,9,20 145 | 13,238,12.6,64,9,21 146 | 23,14,9.2,71,9,22 147 | 36,139,10.3,81,9,23 148 | 7,49,10.3,69,9,24 149 | 14,20,16.6,63,9,25 150 | 30,193,6.9,70,9,26 151 | NA,145,13.2,77,9,27 152 | 14,191,14.3,75,9,28 153 | 18,131,8,76,9,29 154 | 20,223,11.5,68,9,30 155 | -------------------------------------------------------------------------------- /data/ozone_copy.csv: -------------------------------------------------------------------------------- 1 | 41.0,190.0,7.4,67,5,1 2 | 36.0,118.0,8.0,72,5,2 3 | 12.0,149.0,12.6,74,5,3 4 | 18.0,313.0,11.5,62,5,4 5 | ,,14.3,56,5,5 6 | 28.0,,14.9,66,5,6 7 | 23.0,299.0,8.6,65,5,7 8 | 19.0,99.0,13.8,59,5,8 9 | 8.0,19.0,20.1,61,5,9 10 | ,194.0,8.6,69,5,10 11 | 7.0,,6.9,74,5,11 12 | 16.0,256.0,9.7,69,5,12 13 | 11.0,290.0,9.2,66,5,13 14 | 14.0,274.0,10.9,68,5,14 15 | 18.0,65.0,13.2,58,5,15 16 | 14.0,334.0,11.5,64,5,16 17 | 34.0,307.0,12.0,66,5,17 18 | 6.0,78.0,18.4,57,5,18 19 | 30.0,322.0,11.5,68,5,19 20 | 11.0,44.0,9.7,62,5,20 21 | 1.0,8.0,9.7,59,5,21 22 | 11.0,320.0,16.6,73,5,22 23 | 4.0,25.0,9.7,61,5,23 24 | 32.0,92.0,12.0,61,5,24 25 | ,66.0,16.6,57,5,25 26 | ,266.0,14.9,58,5,26 27 | ,,8.0,57,5,27 28 | 23.0,13.0,12.0,67,5,28 29 | 45.0,252.0,14.9,81,5,29 30 | 115.0,223.0,5.7,79,5,30 31 | 37.0,279.0,7.4,76,5,31 32 | ,286.0,8.6,78,6,1 33 | ,287.0,9.7,74,6,2 34 | ,242.0,16.1,67,6,3 35 | ,186.0,9.2,84,6,4 36 | ,220.0,8.6,85,6,5 37 | ,264.0,14.3,79,6,6 38 | 29.0,127.0,9.7,82,6,7 39 | ,273.0,6.9,87,6,8 40 | 71.0,291.0,13.8,90,6,9 41 | 39.0,323.0,11.5,87,6,10 42 | ,259.0,10.9,93,6,11 43 | ,250.0,9.2,92,6,12 44 | 23.0,148.0,8.0,82,6,13 45 | ,332.0,13.8,80,6,14 46 | ,322.0,11.5,79,6,15 47 | 21.0,191.0,14.9,77,6,16 48 | 37.0,284.0,20.7,72,6,17 49 | 20.0,37.0,9.2,65,6,18 50 | 12.0,120.0,11.5,73,6,19 51 | 13.0,137.0,10.3,76,6,20 52 | ,150.0,6.3,77,6,21 53 | ,59.0,1.7,76,6,22 54 | ,91.0,4.6,76,6,23 55 | ,250.0,6.3,76,6,24 56 | ,135.0,8.0,75,6,25 57 | ,127.0,8.0,78,6,26 58 | ,47.0,10.3,73,6,27 59 | ,98.0,11.5,80,6,28 60 | 
,31.0,14.9,77,6,29 61 | ,138.0,8.0,83,6,30 62 | 135.0,269.0,4.1,84,7,1 63 | 49.0,248.0,9.2,85,7,2 64 | 32.0,236.0,9.2,81,7,3 65 | ,101.0,10.9,84,7,4 66 | 64.0,175.0,4.6,83,7,5 67 | 40.0,314.0,10.9,83,7,6 68 | 77.0,276.0,5.1,88,7,7 69 | 97.0,267.0,6.3,92,7,8 70 | 97.0,272.0,5.7,92,7,9 71 | 85.0,175.0,7.4,89,7,10 72 | ,139.0,8.6,82,7,11 73 | 10.0,264.0,14.3,73,7,12 74 | 27.0,175.0,14.9,81,7,13 75 | ,291.0,14.9,91,7,14 76 | 7.0,48.0,14.3,80,7,15 77 | 48.0,260.0,6.9,81,7,16 78 | 35.0,274.0,10.3,82,7,17 79 | 61.0,285.0,6.3,84,7,18 80 | 79.0,187.0,5.1,87,7,19 81 | 63.0,220.0,11.5,85,7,20 82 | 16.0,7.0,6.9,74,7,21 83 | ,258.0,9.7,81,7,22 84 | ,295.0,11.5,82,7,23 85 | 80.0,294.0,8.6,86,7,24 86 | 108.0,223.0,8.0,85,7,25 87 | 20.0,81.0,8.6,82,7,26 88 | 52.0,82.0,12.0,86,7,27 89 | 82.0,213.0,7.4,88,7,28 90 | 50.0,275.0,7.4,86,7,29 91 | 64.0,253.0,7.4,83,7,30 92 | 59.0,254.0,9.2,81,7,31 93 | 39.0,83.0,6.9,81,8,1 94 | 9.0,24.0,13.8,81,8,2 95 | 16.0,77.0,7.4,82,8,3 96 | 78.0,,6.9,86,8,4 97 | 35.0,,7.4,85,8,5 98 | 66.0,,4.6,87,8,6 99 | 122.0,255.0,4.0,89,8,7 100 | 89.0,229.0,10.3,90,8,8 101 | 110.0,207.0,8.0,90,8,9 102 | ,222.0,8.6,92,8,10 103 | ,137.0,11.5,86,8,11 104 | 44.0,192.0,11.5,86,8,12 105 | 28.0,273.0,11.5,82,8,13 106 | 65.0,157.0,9.7,80,8,14 107 | ,64.0,11.5,79,8,15 108 | 22.0,71.0,10.3,77,8,16 109 | 59.0,51.0,6.3,79,8,17 110 | 23.0,115.0,7.4,76,8,18 111 | 31.0,244.0,10.9,78,8,19 112 | 44.0,190.0,10.3,78,8,20 113 | 21.0,259.0,15.5,77,8,21 114 | 9.0,36.0,14.3,72,8,22 115 | ,255.0,12.6,75,8,23 116 | 45.0,212.0,9.7,79,8,24 117 | 168.0,238.0,3.4,81,8,25 118 | 73.0,215.0,8.0,86,8,26 119 | ,153.0,5.7,88,8,27 120 | 76.0,203.0,9.7,97,8,28 121 | 118.0,225.0,2.3,94,8,29 122 | 84.0,237.0,6.3,96,8,30 123 | 85.0,188.0,6.3,94,8,31 124 | 96.0,167.0,6.9,91,9,1 125 | 78.0,197.0,5.1,92,9,2 126 | 73.0,183.0,2.8,93,9,3 127 | 91.0,189.0,4.6,93,9,4 128 | 47.0,95.0,7.4,87,9,5 129 | 32.0,92.0,15.5,84,9,6 130 | 20.0,252.0,10.9,80,9,7 131 | 23.0,220.0,10.3,78,9,8 132 | 21.0,230.0,10.9,75,9,9 133 | 24.0,259.0,9.7,73,9,10 134 | 44.0,236.0,14.9,81,9,11 135 | 21.0,259.0,15.5,76,9,12 136 | 28.0,238.0,6.3,77,9,13 137 | 9.0,24.0,10.9,71,9,14 138 | 13.0,112.0,11.5,71,9,15 139 | 46.0,237.0,6.9,78,9,16 140 | 18.0,224.0,13.8,67,9,17 141 | 13.0,27.0,10.3,76,9,18 142 | 24.0,238.0,10.3,68,9,19 143 | 16.0,201.0,8.0,82,9,20 144 | 13.0,238.0,12.6,64,9,21 145 | 23.0,14.0,9.2,71,9,22 146 | 36.0,139.0,10.3,81,9,23 147 | 7.0,49.0,10.3,69,9,24 148 | 14.0,20.0,16.6,63,9,25 149 | 30.0,193.0,6.9,70,9,26 150 | ,145.0,13.2,77,9,27 151 | 14.0,191.0,14.3,75,9,28 152 | 18.0,131.0,8.0,76,9,29 153 | 20.0,223.0,11.5,68,9,30 154 | -------------------------------------------------------------------------------- /data/titanic/genderclassmodel.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Survived 2 | 892,0 3 | 893,1 4 | 894,0 5 | 895,0 6 | 896,1 7 | 897,0 8 | 898,1 9 | 899,0 10 | 900,1 11 | 901,0 12 | 902,0 13 | 903,0 14 | 904,1 15 | 905,0 16 | 906,1 17 | 907,1 18 | 908,0 19 | 909,0 20 | 910,1 21 | 911,1 22 | 912,0 23 | 913,0 24 | 914,1 25 | 915,0 26 | 916,1 27 | 917,0 28 | 918,1 29 | 919,0 30 | 920,0 31 | 921,0 32 | 922,0 33 | 923,0 34 | 924,0 35 | 925,0 36 | 926,0 37 | 927,0 38 | 928,1 39 | 929,1 40 | 930,0 41 | 931,0 42 | 932,0 43 | 933,0 44 | 934,0 45 | 935,1 46 | 936,1 47 | 937,0 48 | 938,0 49 | 939,0 50 | 940,1 51 | 941,1 52 | 942,0 53 | 943,0 54 | 944,1 55 | 945,1 56 | 946,0 57 | 947,0 58 | 948,0 59 | 949,0 60 | 950,0 61 | 951,1 62 | 952,0 63 | 953,0 64 | 954,0 65 | 955,1 66 | 956,0 67 | 
957,1 68 | 958,1 69 | 959,0 70 | 960,0 71 | 961,1 72 | 962,1 73 | 963,0 74 | 964,1 75 | 965,0 76 | 966,1 77 | 967,0 78 | 968,0 79 | 969,1 80 | 970,0 81 | 971,1 82 | 972,0 83 | 973,0 84 | 974,0 85 | 975,0 86 | 976,0 87 | 977,0 88 | 978,1 89 | 979,1 90 | 980,1 91 | 981,0 92 | 982,1 93 | 983,0 94 | 984,1 95 | 985,0 96 | 986,0 97 | 987,0 98 | 988,1 99 | 989,0 100 | 990,1 101 | 991,0 102 | 992,1 103 | 993,0 104 | 994,0 105 | 995,0 106 | 996,1 107 | 997,0 108 | 998,0 109 | 999,0 110 | 1000,0 111 | 1001,0 112 | 1002,0 113 | 1003,1 114 | 1004,1 115 | 1005,1 116 | 1006,1 117 | 1007,0 118 | 1008,0 119 | 1009,1 120 | 1010,0 121 | 1011,1 122 | 1012,1 123 | 1013,0 124 | 1014,1 125 | 1015,0 126 | 1016,0 127 | 1017,1 128 | 1018,0 129 | 1019,0 130 | 1020,0 131 | 1021,0 132 | 1022,0 133 | 1023,0 134 | 1024,0 135 | 1025,0 136 | 1026,0 137 | 1027,0 138 | 1028,0 139 | 1029,0 140 | 1030,1 141 | 1031,0 142 | 1032,0 143 | 1033,1 144 | 1034,0 145 | 1035,0 146 | 1036,0 147 | 1037,0 148 | 1038,0 149 | 1039,0 150 | 1040,0 151 | 1041,0 152 | 1042,1 153 | 1043,0 154 | 1044,0 155 | 1045,1 156 | 1046,0 157 | 1047,0 158 | 1048,1 159 | 1049,1 160 | 1050,0 161 | 1051,1 162 | 1052,1 163 | 1053,0 164 | 1054,1 165 | 1055,0 166 | 1056,0 167 | 1057,0 168 | 1058,0 169 | 1059,0 170 | 1060,1 171 | 1061,1 172 | 1062,0 173 | 1063,0 174 | 1064,0 175 | 1065,0 176 | 1066,0 177 | 1067,1 178 | 1068,1 179 | 1069,0 180 | 1070,1 181 | 1071,1 182 | 1072,0 183 | 1073,0 184 | 1074,1 185 | 1075,0 186 | 1076,1 187 | 1077,0 188 | 1078,1 189 | 1079,0 190 | 1080,0 191 | 1081,0 192 | 1082,0 193 | 1083,0 194 | 1084,0 195 | 1085,0 196 | 1086,0 197 | 1087,0 198 | 1088,0 199 | 1089,1 200 | 1090,0 201 | 1091,1 202 | 1092,1 203 | 1093,0 204 | 1094,0 205 | 1095,1 206 | 1096,0 207 | 1097,0 208 | 1098,1 209 | 1099,0 210 | 1100,1 211 | 1101,0 212 | 1102,0 213 | 1103,0 214 | 1104,0 215 | 1105,1 216 | 1106,1 217 | 1107,0 218 | 1108,1 219 | 1109,0 220 | 1110,1 221 | 1111,0 222 | 1112,1 223 | 1113,0 224 | 1114,1 225 | 1115,0 226 | 1116,1 227 | 1117,1 228 | 1118,0 229 | 1119,1 230 | 1120,0 231 | 1121,0 232 | 1122,0 233 | 1123,1 234 | 1124,0 235 | 1125,0 236 | 1126,0 237 | 1127,0 238 | 1128,0 239 | 1129,0 240 | 1130,1 241 | 1131,1 242 | 1132,1 243 | 1133,1 244 | 1134,0 245 | 1135,0 246 | 1136,0 247 | 1137,0 248 | 1138,1 249 | 1139,0 250 | 1140,1 251 | 1141,1 252 | 1142,1 253 | 1143,0 254 | 1144,0 255 | 1145,0 256 | 1146,0 257 | 1147,0 258 | 1148,0 259 | 1149,0 260 | 1150,1 261 | 1151,0 262 | 1152,0 263 | 1153,0 264 | 1154,1 265 | 1155,1 266 | 1156,0 267 | 1157,0 268 | 1158,0 269 | 1159,0 270 | 1160,1 271 | 1161,0 272 | 1162,0 273 | 1163,0 274 | 1164,1 275 | 1165,1 276 | 1166,0 277 | 1167,1 278 | 1168,0 279 | 1169,0 280 | 1170,0 281 | 1171,0 282 | 1172,1 283 | 1173,0 284 | 1174,1 285 | 1175,1 286 | 1176,0 287 | 1177,0 288 | 1178,0 289 | 1179,0 290 | 1180,0 291 | 1181,0 292 | 1182,0 293 | 1183,1 294 | 1184,0 295 | 1185,0 296 | 1186,0 297 | 1187,0 298 | 1188,1 299 | 1189,0 300 | 1190,0 301 | 1191,0 302 | 1192,0 303 | 1193,0 304 | 1194,0 305 | 1195,0 306 | 1196,1 307 | 1197,1 308 | 1198,0 309 | 1199,0 310 | 1200,0 311 | 1201,1 312 | 1202,0 313 | 1203,0 314 | 1204,0 315 | 1205,1 316 | 1206,1 317 | 1207,1 318 | 1208,0 319 | 1209,0 320 | 1210,0 321 | 1211,0 322 | 1212,0 323 | 1213,0 324 | 1214,0 325 | 1215,0 326 | 1216,1 327 | 1217,0 328 | 1218,1 329 | 1219,0 330 | 1220,0 331 | 1221,0 332 | 1222,1 333 | 1223,0 334 | 1224,0 335 | 1225,1 336 | 1226,0 337 | 1227,0 338 | 1228,0 339 | 1229,0 340 | 1230,0 341 | 1231,0 342 | 1232,0 343 | 1233,0 344 | 1234,0 345 | 1235,1 346 | 
1236,0 347 | 1237,1 348 | 1238,0 349 | 1239,1 350 | 1240,0 351 | 1241,1 352 | 1242,1 353 | 1243,0 354 | 1244,0 355 | 1245,0 356 | 1246,0 357 | 1247,0 358 | 1248,1 359 | 1249,0 360 | 1250,0 361 | 1251,1 362 | 1252,0 363 | 1253,1 364 | 1254,1 365 | 1255,0 366 | 1256,1 367 | 1257,0 368 | 1258,0 369 | 1259,0 370 | 1260,1 371 | 1261,0 372 | 1262,0 373 | 1263,1 374 | 1264,0 375 | 1265,0 376 | 1266,1 377 | 1267,1 378 | 1268,1 379 | 1269,0 380 | 1270,0 381 | 1271,0 382 | 1272,0 383 | 1273,0 384 | 1274,1 385 | 1275,1 386 | 1276,0 387 | 1277,1 388 | 1278,0 389 | 1279,0 390 | 1280,0 391 | 1281,0 392 | 1282,0 393 | 1283,1 394 | 1284,0 395 | 1285,0 396 | 1286,0 397 | 1287,1 398 | 1288,0 399 | 1289,1 400 | 1290,0 401 | 1291,0 402 | 1292,1 403 | 1293,0 404 | 1294,1 405 | 1295,0 406 | 1296,0 407 | 1297,0 408 | 1298,0 409 | 1299,0 410 | 1300,1 411 | 1301,1 412 | 1302,1 413 | 1303,1 414 | 1304,1 415 | 1305,0 416 | 1306,1 417 | 1307,0 418 | 1308,0 419 | 1309,0 420 | -------------------------------------------------------------------------------- /data/titanic/genderclassmodel.py: -------------------------------------------------------------------------------- 1 | """ Now that the user can read in a file this creates a model which uses the price, class and gender 2 | Author : AstroDave 3 | Date : 18th September 2012 4 | Revised : 28 March 2014 5 | 6 | """ 7 | 8 | 9 | import csv as csv 10 | import numpy as np 11 | 12 | csv_file_object = csv.reader(open('train.csv', 'rb')) # Load in the csv file 13 | header = csv_file_object.next() # Skip the fist line as it is a header 14 | data=[] # Create a variable to hold the data 15 | 16 | for row in csv_file_object: # Skip through each row in the csv file 17 | data.append(row) # adding each row to the data variable 18 | data = np.array(data) # Then convert from a list to an array 19 | 20 | # In order to analyse the price column I need to bin up that data 21 | # here are my binning parameters, the problem we face is some of the fares are very large 22 | # So we can either have a lot of bins with nothing in them or we can just lose some 23 | # information by just considering that anythng over 39 is simply in the last bin. 24 | # So we add a ceiling 25 | fare_ceiling = 40 26 | # then modify the data in the Fare column to = 39, if it is greater or equal to the ceiling 27 | data[ data[0::,9].astype(np.float) >= fare_ceiling, 9 ] = fare_ceiling - 1.0 28 | 29 | fare_bracket_size = 10 30 | number_of_price_brackets = fare_ceiling / fare_bracket_size 31 | number_of_classes = 3 # I know there were 1st, 2nd and 3rd classes on board. 32 | number_of_classes = len(np.unique(data[0::,2])) # But it's better practice to calculate this from the Pclass directly: 33 | # just take the length of an array of UNIQUE values in column index 2 34 | 35 | 36 | # This reference matrix will show the proportion of survivors as a sorted table of 37 | # gender, class and ticket fare. 
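# [Illustrative aside, not part of the original genderclassmodel.py -- with the
#  defaults above (fare_ceiling=40, fare_bracket_size=10, 3 passenger classes),
#  survival_table has shape (2, 3, 4):
#    axis 0 -> gender (0 = female, 1 = male)
#    axis 1 -> passenger class (Pclass - 1)
#    axis 2 -> fare bracket (0-9, 10-19, 20-29, 30-39; fares >= 40 were capped
#              into the last bracket above)
#  Each cell holds the observed proportion of survivors for that combination.]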
38 | # First initialize it with all zeros 39 | survival_table = np.zeros([2,number_of_classes,number_of_price_brackets],float) 40 | 41 | # I can now find the stats of all the women and men on board 42 | for i in xrange(number_of_classes): 43 | for j in xrange(number_of_price_brackets): 44 | 45 | women_only_stats = data[ (data[0::,4] == "female") \ 46 | & (data[0::,2].astype(np.float) == i+1) \ 47 | & (data[0:,9].astype(np.float) >= j*fare_bracket_size) \ 48 | & (data[0:,9].astype(np.float) < (j+1)*fare_bracket_size), 1] 49 | 50 | men_only_stats = data[ (data[0::,4] != "female") \ 51 | & (data[0::,2].astype(np.float) == i+1) \ 52 | & (data[0:,9].astype(np.float) >= j*fare_bracket_size) \ 53 | & (data[0:,9].astype(np.float) < (j+1)*fare_bracket_size), 1] 54 | 55 | #if i == 0 and j == 3: 56 | 57 | survival_table[0,i,j] = np.mean(women_only_stats.astype(np.float)) # Female stats 58 | survival_table[1,i,j] = np.mean(men_only_stats.astype(np.float)) # Male stats 59 | 60 | # Since in Python taking the mean of an empty array (denominator of 0) returns 61 | # nan, we can convert these to 0 by finding where the array does not equal 62 | # itself (nan != nan) and setting those entries to 0. 63 | survival_table[ survival_table != survival_table ] = 0. 64 | 65 | # Now I have my proportion of survivors, simply round them such that if <0.5 66 | # I predict they don't survive, and if >= 0.5 they do 67 | survival_table[ survival_table < 0.5 ] = 0 68 | survival_table[ survival_table >= 0.5 ] = 1 69 | 70 | # Now I have my indicator I can read in the test file and write out 71 | # if a woman then survived (1), if a man then did not survive (0) 72 | # First read in test 73 | test_file = open('test.csv', 'rb') 74 | test_file_object = csv.reader(test_file) 75 | header = test_file_object.next() 76 | 77 | # Also open a new file so I can write to it. 
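# [Illustrative aside, not part of the original genderclassmodel.py -- after the
#  rounding above, survival_table holds only 0s and 1s, so prediction is a pure
#  table lookup. For example, survival_table[0, 1, 3] is the prediction for a
#  female (gender index 0) travelling 2nd class (class index 1) whose fare fell
#  in the 30-39 bracket (fare index 3), which is exactly what the loop below
#  does via survival_table[0, float(row[1]) - 1, bin_fare].]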
78 | predictions_file = open("genderclassmodel.csv", "wb") 79 | predictions_file_object = csv.writer(predictions_file) 80 | predictions_file_object.writerow(["PassengerId", "Survived"]) 81 | 82 | # First thing to do is bin up the price file 83 | for row in test_file_object: 84 | for j in xrange(number_of_price_brackets): 85 | # If there is no fare then place the price of the ticket according to class 86 | try: 87 | row[8] = float(row[8]) # No fare recorded will come up as a string so 88 | # try to make it a float 89 | except: # If fails then just bin the fare according to the class 90 | bin_fare = 3 - float(row[1]) 91 | break # Break from the loop and move to the next row 92 | if row[8] > fare_ceiling: # Otherwise now test to see if it is higher 93 | # than the fare ceiling we set earlier 94 | bin_fare = number_of_price_brackets - 1 95 | break # And then break to the next row 96 | 97 | if row[8] >= j*fare_bracket_size\ 98 | and row[8] < (j+1)*fare_bracket_size: # If passed these tests then loop through 99 | # each bin until you find the right one 100 | # append it to the bin_fare 101 | # and move to the next loop 102 | bin_fare = j 103 | break 104 | # Now I have the binned fare, passenger class, and whether female or male, we can 105 | # just cross ref their details with our survival table 106 | if row[3] == 'female': 107 | predictions_file_object.writerow([row[0], "%d" % int(survival_table[ 0, float(row[1]) - 1, bin_fare ])]) 108 | else: 109 | predictions_file_object.writerow([row[0], "%d" % int(survival_table[ 1, float(row[1]) - 1, bin_fare])]) 110 | 111 | # Close out the files 112 | test_file.close() 113 | predictions_file.close() -------------------------------------------------------------------------------- /data/titanic/gendermodel.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Survived 2 | 892,0 3 | 893,1 4 | 894,0 5 | 895,0 6 | 896,1 7 | 897,0 8 | 898,1 9 | 899,0 10 | 900,1 11 | 901,0 12 | 902,0 13 | 903,0 14 | 904,1 15 | 905,0 16 | 906,1 17 | 907,1 18 | 908,0 19 | 909,0 20 | 910,1 21 | 911,1 22 | 912,0 23 | 913,0 24 | 914,1 25 | 915,0 26 | 916,1 27 | 917,0 28 | 918,1 29 | 919,0 30 | 920,0 31 | 921,0 32 | 922,0 33 | 923,0 34 | 924,1 35 | 925,1 36 | 926,0 37 | 927,0 38 | 928,1 39 | 929,1 40 | 930,0 41 | 931,0 42 | 932,0 43 | 933,0 44 | 934,0 45 | 935,1 46 | 936,1 47 | 937,0 48 | 938,0 49 | 939,0 50 | 940,1 51 | 941,1 52 | 942,0 53 | 943,0 54 | 944,1 55 | 945,1 56 | 946,0 57 | 947,0 58 | 948,0 59 | 949,0 60 | 950,0 61 | 951,1 62 | 952,0 63 | 953,0 64 | 954,0 65 | 955,1 66 | 956,0 67 | 957,1 68 | 958,1 69 | 959,0 70 | 960,0 71 | 961,1 72 | 962,1 73 | 963,0 74 | 964,1 75 | 965,0 76 | 966,1 77 | 967,0 78 | 968,0 79 | 969,1 80 | 970,0 81 | 971,1 82 | 972,0 83 | 973,0 84 | 974,0 85 | 975,0 86 | 976,0 87 | 977,0 88 | 978,1 89 | 979,1 90 | 980,1 91 | 981,0 92 | 982,1 93 | 983,0 94 | 984,1 95 | 985,0 96 | 986,0 97 | 987,0 98 | 988,1 99 | 989,0 100 | 990,1 101 | 991,0 102 | 992,1 103 | 993,0 104 | 994,0 105 | 995,0 106 | 996,1 107 | 997,0 108 | 998,0 109 | 999,0 110 | 1000,0 111 | 1001,0 112 | 1002,0 113 | 1003,1 114 | 1004,1 115 | 1005,1 116 | 1006,1 117 | 1007,0 118 | 1008,0 119 | 1009,1 120 | 1010,0 121 | 1011,1 122 | 1012,1 123 | 1013,0 124 | 1014,1 125 | 1015,0 126 | 1016,0 127 | 1017,1 128 | 1018,0 129 | 1019,1 130 | 1020,0 131 | 1021,0 132 | 1022,0 133 | 1023,0 134 | 1024,1 135 | 1025,0 136 | 1026,0 137 | 1027,0 138 | 1028,0 139 | 1029,0 140 | 1030,1 141 | 1031,0 142 | 1032,1 143 | 1033,1 144 | 1034,0 145 | 1035,0 146 | 1036,0 147 
| 1037,0 148 | 1038,0 149 | 1039,0 150 | 1040,0 151 | 1041,0 152 | 1042,1 153 | 1043,0 154 | 1044,0 155 | 1045,1 156 | 1046,0 157 | 1047,0 158 | 1048,1 159 | 1049,1 160 | 1050,0 161 | 1051,1 162 | 1052,1 163 | 1053,0 164 | 1054,1 165 | 1055,0 166 | 1056,0 167 | 1057,1 168 | 1058,0 169 | 1059,0 170 | 1060,1 171 | 1061,1 172 | 1062,0 173 | 1063,0 174 | 1064,0 175 | 1065,0 176 | 1066,0 177 | 1067,1 178 | 1068,1 179 | 1069,0 180 | 1070,1 181 | 1071,1 182 | 1072,0 183 | 1073,0 184 | 1074,1 185 | 1075,0 186 | 1076,1 187 | 1077,0 188 | 1078,1 189 | 1079,0 190 | 1080,1 191 | 1081,0 192 | 1082,0 193 | 1083,0 194 | 1084,0 195 | 1085,0 196 | 1086,0 197 | 1087,0 198 | 1088,0 199 | 1089,1 200 | 1090,0 201 | 1091,1 202 | 1092,1 203 | 1093,0 204 | 1094,0 205 | 1095,1 206 | 1096,0 207 | 1097,0 208 | 1098,1 209 | 1099,0 210 | 1100,1 211 | 1101,0 212 | 1102,0 213 | 1103,0 214 | 1104,0 215 | 1105,1 216 | 1106,1 217 | 1107,0 218 | 1108,1 219 | 1109,0 220 | 1110,1 221 | 1111,0 222 | 1112,1 223 | 1113,0 224 | 1114,1 225 | 1115,0 226 | 1116,1 227 | 1117,1 228 | 1118,0 229 | 1119,1 230 | 1120,0 231 | 1121,0 232 | 1122,0 233 | 1123,1 234 | 1124,0 235 | 1125,0 236 | 1126,0 237 | 1127,0 238 | 1128,0 239 | 1129,0 240 | 1130,1 241 | 1131,1 242 | 1132,1 243 | 1133,1 244 | 1134,0 245 | 1135,0 246 | 1136,0 247 | 1137,0 248 | 1138,1 249 | 1139,0 250 | 1140,1 251 | 1141,1 252 | 1142,1 253 | 1143,0 254 | 1144,0 255 | 1145,0 256 | 1146,0 257 | 1147,0 258 | 1148,0 259 | 1149,0 260 | 1150,1 261 | 1151,0 262 | 1152,0 263 | 1153,0 264 | 1154,1 265 | 1155,1 266 | 1156,0 267 | 1157,0 268 | 1158,0 269 | 1159,0 270 | 1160,1 271 | 1161,0 272 | 1162,0 273 | 1163,0 274 | 1164,1 275 | 1165,1 276 | 1166,0 277 | 1167,1 278 | 1168,0 279 | 1169,0 280 | 1170,0 281 | 1171,0 282 | 1172,1 283 | 1173,0 284 | 1174,1 285 | 1175,1 286 | 1176,1 287 | 1177,0 288 | 1178,0 289 | 1179,0 290 | 1180,0 291 | 1181,0 292 | 1182,0 293 | 1183,1 294 | 1184,0 295 | 1185,0 296 | 1186,0 297 | 1187,0 298 | 1188,1 299 | 1189,0 300 | 1190,0 301 | 1191,0 302 | 1192,0 303 | 1193,0 304 | 1194,0 305 | 1195,0 306 | 1196,1 307 | 1197,1 308 | 1198,0 309 | 1199,0 310 | 1200,0 311 | 1201,1 312 | 1202,0 313 | 1203,0 314 | 1204,0 315 | 1205,1 316 | 1206,1 317 | 1207,1 318 | 1208,0 319 | 1209,0 320 | 1210,0 321 | 1211,0 322 | 1212,0 323 | 1213,0 324 | 1214,0 325 | 1215,0 326 | 1216,1 327 | 1217,0 328 | 1218,1 329 | 1219,0 330 | 1220,0 331 | 1221,0 332 | 1222,1 333 | 1223,0 334 | 1224,0 335 | 1225,1 336 | 1226,0 337 | 1227,0 338 | 1228,0 339 | 1229,0 340 | 1230,0 341 | 1231,0 342 | 1232,0 343 | 1233,0 344 | 1234,0 345 | 1235,1 346 | 1236,0 347 | 1237,1 348 | 1238,0 349 | 1239,1 350 | 1240,0 351 | 1241,1 352 | 1242,1 353 | 1243,0 354 | 1244,0 355 | 1245,0 356 | 1246,1 357 | 1247,0 358 | 1248,1 359 | 1249,0 360 | 1250,0 361 | 1251,1 362 | 1252,0 363 | 1253,1 364 | 1254,1 365 | 1255,0 366 | 1256,1 367 | 1257,1 368 | 1258,0 369 | 1259,1 370 | 1260,1 371 | 1261,0 372 | 1262,0 373 | 1263,1 374 | 1264,0 375 | 1265,0 376 | 1266,1 377 | 1267,1 378 | 1268,1 379 | 1269,0 380 | 1270,0 381 | 1271,0 382 | 1272,0 383 | 1273,0 384 | 1274,1 385 | 1275,1 386 | 1276,0 387 | 1277,1 388 | 1278,0 389 | 1279,0 390 | 1280,0 391 | 1281,0 392 | 1282,0 393 | 1283,1 394 | 1284,0 395 | 1285,0 396 | 1286,0 397 | 1287,1 398 | 1288,0 399 | 1289,1 400 | 1290,0 401 | 1291,0 402 | 1292,1 403 | 1293,0 404 | 1294,1 405 | 1295,0 406 | 1296,0 407 | 1297,0 408 | 1298,0 409 | 1299,0 410 | 1300,1 411 | 1301,1 412 | 1302,1 413 | 1303,1 414 | 1304,1 415 | 1305,0 416 | 1306,1 417 | 1307,0 418 | 1308,0 419 | 1309,0 420 | 
-------------------------------------------------------------------------------- /data/titanic/gendermodel.py: -------------------------------------------------------------------------------- 1 | """ This simple code is desinged to teach a basic user to read in the files in python, simply find what proportion of males and females survived and make a predictive model based on this 2 | Author : AstroDave 3 | Date : 18 September 2012 4 | Revised: 28 March 2014 5 | 6 | """ 7 | 8 | 9 | import csv as csv 10 | import numpy as np 11 | 12 | csv_file_object = csv.reader(open('train.csv', 'rb')) # Load in the csv file 13 | header = csv_file_object.next() # Skip the fist line as it is a header 14 | data=[] # Create a variable to hold the data 15 | 16 | for row in csv_file_object: # Skip through each row in the csv file, 17 | data.append(row[0:]) # adding each row to the data variable 18 | data = np.array(data) # Then convert from a list to an array. 19 | 20 | # Now I have an array of 12 columns and 891 rows 21 | # I can access any element I want, so the entire first column would 22 | # be data[0::,0].astype(np.float) -- This means all of the rows (from start to end), in column 0 23 | # I have to add the .astype() command, because 24 | # when appending the rows, python thought it was a string - so needed to convert 25 | 26 | # Set some variables 27 | number_passengers = np.size(data[0::,1].astype(np.float)) 28 | number_survived = np.sum(data[0::,1].astype(np.float)) 29 | proportion_survivors = number_survived / number_passengers 30 | 31 | # I can now find the stats of all the women on board, 32 | # by making an array that lists True/False whether each row is female 33 | women_only_stats = data[0::,4] == "female" # This finds where all the women are 34 | men_only_stats = data[0::,4] != "female" # This finds where all the men are (note != means 'not equal') 35 | 36 | # I can now filter the whole data, to find statistics for just women, by just placing 37 | # women_only_stats as a "mask" on my full data -- Use it in place of the '0::' part of the array index. 38 | # You can test it by placing it there, and requesting column index [4], and the output should all read 'female' 39 | # e.g. try typing this: data[women_only_stats,4] 40 | women_onboard = data[women_only_stats,1].astype(np.float) 41 | men_onboard = data[men_only_stats,1].astype(np.float) 42 | 43 | # and derive some statistics about them 44 | proportion_women_survived = np.sum(women_onboard) / np.size(women_onboard) 45 | proportion_men_survived = np.sum(men_onboard) / np.size(men_onboard) 46 | 47 | print 'Proportion of women who survived is %s' % proportion_women_survived 48 | print 'Proportion of men who survived is %s' % proportion_men_survived 49 | 50 | # Now that I have my indicator that women were much more likely to survive, 51 | # I am done with the training set. 52 | # Now I will read in the test file and write out my simplistic prediction: 53 | # if female, then model that she survived (1) 54 | # if male, then model that he did not survive (0) 55 | 56 | # First, read in test.csv 57 | test_file = open('test.csv', 'rb') 58 | test_file_object = csv.reader(test_file) 59 | header = test_file_object.next() 60 | 61 | # Also open the a new file so I can write to it. Call it something descriptive 62 | # Finally, loop through each row in the train file, and look in column index [3] (which is 'Sex') 63 | # Write out the PassengerId, and my prediction. 
64 | 65 | predictions_file = open("gendermodel.csv", "wb") 66 | predictions_file_object = csv.writer(predictions_file) 67 | predictions_file_object.writerow(["PassengerId", "Survived"]) # write the column headers 68 | for row in test_file_object: # For each row in test file, 69 | if row[3] == 'female': # is it a female, if yes then 70 | predictions_file_object.writerow([row[0], "1"]) # write the PassengerId, and predict 1 71 | else: # or else if male, 72 | predictions_file_object.writerow([row[0], "0"]) # write the PassengerId, and predict 0. 73 | test_file.close() # Close out the files. 74 | predictions_file.close() 75 | 76 | -------------------------------------------------------------------------------- /data/titanic/myfirstforest.py: -------------------------------------------------------------------------------- 1 | """ Writing my first randomforest code. 2 | Author : AstroDave 3 | Date : 23rd September 2012 4 | Revised: 15 April 2014 5 | please see packages.python.org/milk/randomforests.html for more 6 | 7 | """ 8 | import pandas as pd 9 | import numpy as np 10 | import csv as csv 11 | from sklearn.ensemble import RandomForestClassifier 12 | 13 | # Data cleanup 14 | # TRAIN DATA 15 | train_df = pd.read_csv('train.csv', header=0) # Load the train file into a dataframe 16 | 17 | # I need to convert all strings to integer classifiers. 18 | # I need to fill in the missing values of the data and make it complete. 19 | 20 | # female = 0, Male = 1 21 | train_df['Gender'] = train_df['Sex'].map( {'female': 0, 'male': 1} ).astype(int) 22 | 23 | # Embarked from 'C', 'Q', 'S' 24 | # Note this is not ideal: in translating categories to numbers, Port "2" is not 2 times greater than Port "1", etc. 25 | 26 | # All missing Embarked -> just make them embark from most common place 27 | if len(train_df.Embarked[ train_df.Embarked.isnull() ]) > 0: 28 | train_df.Embarked[ train_df.Embarked.isnull() ] = train_df.Embarked.dropna().mode().values 29 | 30 | Ports = list(enumerate(np.unique(train_df['Embarked']))) # determine all values of Embarked, 31 | Ports_dict = { name : i for i, name in Ports } # set up a dictionary in the form Ports : index 32 | train_df.Embarked = train_df.Embarked.map( lambda x: Ports_dict[x]).astype(int) # Convert all Embark strings to int 33 | 34 | # All the ages with no data -> make the median of all Ages 35 | median_age = train_df['Age'].dropna().median() 36 | if len(train_df.Age[ train_df.Age.isnull() ]) > 0: 37 | train_df.loc[ (train_df.Age.isnull()), 'Age'] = median_age 38 | 39 | # Remove the Name column, Cabin, Ticket, and Sex (since I copied and filled it to Gender) 40 | train_df = train_df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId'], axis=1) 41 | 42 | 43 | # TEST DATA 44 | test_df = pd.read_csv('test.csv', header=0) # Load the test file into a dataframe 45 | 46 | # I need to do the same with the test data now, so that the columns are the same as the training data 47 | # I need to convert all strings to integer classifiers: 48 | # female = 0, Male = 1 49 | test_df['Gender'] = test_df['Sex'].map( {'female': 0, 'male': 1} ).astype(int) 50 | 51 | # Embarked from 'C', 'Q', 'S' 52 | # All missing Embarked -> just make them embark from most common place 53 | if len(test_df.Embarked[ test_df.Embarked.isnull() ]) > 0: 54 | test_df.Embarked[ test_df.Embarked.isnull() ] = test_df.Embarked.dropna().mode().values 55 | # Again convert all Embarked strings to int 56 | test_df.Embarked = test_df.Embarked.map( lambda x: Ports_dict[x]).astype(int) 57 | 58 | 59 | # All the ages with 
no data -> make the median of all Ages 60 | median_age = test_df['Age'].dropna().median() 61 | if len(test_df.Age[ test_df.Age.isnull() ]) > 0: 62 | test_df.loc[ (test_df.Age.isnull()), 'Age'] = median_age 63 | 64 | # All the missing Fares -> assume median of their respective class 65 | if len(test_df.Fare[ test_df.Fare.isnull() ]) > 0: 66 | median_fare = np.zeros(3) 67 | for f in range(0,3): # loop 0 to 2 68 | median_fare[f] = test_df[ test_df.Pclass == f+1 ]['Fare'].dropna().median() 69 | for f in range(0,3): # loop 0 to 2 70 | test_df.loc[ (test_df.Fare.isnull()) & (test_df.Pclass == f+1 ), 'Fare'] = median_fare[f] 71 | 72 | # Collect the test data's PassengerIds before dropping it 73 | ids = test_df['PassengerId'].values 74 | # Remove the Name column, Cabin, Ticket, and Sex (since I copied and filled it to Gender) 75 | test_df = test_df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId'], axis=1) 76 | 77 | 78 | # The data is now ready to go. So lets fit to the train, then predict to the test! 79 | # Convert back to a numpy array 80 | train_data = train_df.values 81 | test_data = test_df.values 82 | 83 | 84 | print 'Training...' 85 | forest = RandomForestClassifier(n_estimators=100) 86 | forest = forest.fit( train_data[0::,1::], train_data[0::,0] ) 87 | 88 | print 'Predicting...' 89 | output = forest.predict(test_data).astype(int) 90 | 91 | 92 | predictions_file = open("myfirstforest.csv", "wb") 93 | open_file_object = csv.writer(predictions_file) 94 | open_file_object.writerow(["PassengerId","Survived"]) 95 | open_file_object.writerows(zip(ids, output)) 96 | predictions_file.close() 97 | print 'Done.' 98 | -------------------------------------------------------------------------------- /data/titanic/results-rf.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Survived 2 | 892,0.0 3 | 893,0.0 4 | 894,0.0 5 | 895,1.0 6 | 896,1.0 7 | 897,0.0 8 | 898,0.0 9 | 899,0.0 10 | 900,1.0 11 | 901,0.0 12 | 902,0.0 13 | 903,0.0 14 | 904,1.0 15 | 905,0.0 16 | 906,1.0 17 | 907,1.0 18 | 908,0.0 19 | 909,1.0 20 | 910,1.0 21 | 911,1.0 22 | 912,0.0 23 | 913,1.0 24 | 914,1.0 25 | 915,1.0 26 | 916,1.0 27 | 917,0.0 28 | 918,1.0 29 | 919,1.0 30 | 920,1.0 31 | 921,0.0 32 | 922,0.0 33 | 923,0.0 34 | 924,1.0 35 | 925,0.0 36 | 926,1.0 37 | 927,1.0 38 | 928,0.0 39 | 929,0.0 40 | 930,0.0 41 | 931,1.0 42 | 932,0.0 43 | 933,1.0 44 | 934,0.0 45 | 935,1.0 46 | 936,1.0 47 | 937,0.0 48 | 938,1.0 49 | 939,0.0 50 | 940,1.0 51 | 941,1.0 52 | 942,0.0 53 | 943,0.0 54 | 944,1.0 55 | 945,1.0 56 | 946,0.0 57 | 947,0.0 58 | 948,0.0 59 | 949,0.0 60 | 950,0.0 61 | 951,1.0 62 | 952,0.0 63 | 953,0.0 64 | 954,0.0 65 | 955,1.0 66 | 956,1.0 67 | 957,1.0 68 | 958,1.0 69 | 959,0.0 70 | 960,0.0 71 | 961,1.0 72 | 962,1.0 73 | 963,0.0 74 | 964,0.0 75 | 965,0.0 76 | 966,1.0 77 | 967,0.0 78 | 968,0.0 79 | 969,1.0 80 | 970,0.0 81 | 971,1.0 82 | 972,1.0 83 | 973,0.0 84 | 974,0.0 85 | 975,0.0 86 | 976,0.0 87 | 977,0.0 88 | 978,1.0 89 | 979,0.0 90 | 980,0.0 91 | 981,1.0 92 | 982,1.0 93 | 983,0.0 94 | 984,1.0 95 | 985,0.0 96 | 986,0.0 97 | 987,0.0 98 | 988,1.0 99 | 989,0.0 100 | 990,0.0 101 | 991,0.0 102 | 992,1.0 103 | 993,0.0 104 | 994,0.0 105 | 995,0.0 106 | 996,1.0 107 | 997,0.0 108 | 998,0.0 109 | 999,0.0 110 | 1000,0.0 111 | 1001,0.0 112 | 1002,0.0 113 | 1003,0.0 114 | 1004,1.0 115 | 1005,0.0 116 | 1006,1.0 117 | 1007,0.0 118 | 1008,0.0 119 | 1009,1.0 120 | 1010,0.0 121 | 1011,1.0 122 | 1012,1.0 123 | 1013,0.0 124 | 1014,1.0 125 | 1015,0.0 126 | 1016,0.0 127 | 1017,1.0 128 | 1018,0.0 129 | 
1019,0.0 130 | 1020,0.0 131 | 1021,0.0 132 | 1022,1.0 133 | 1023,0.0 134 | 1024,0.0 135 | 1025,0.0 136 | 1026,0.0 137 | 1027,0.0 138 | 1028,0.0 139 | 1029,0.0 140 | 1030,0.0 141 | 1031,0.0 142 | 1032,0.0 143 | 1033,1.0 144 | 1034,0.0 145 | 1035,0.0 146 | 1036,1.0 147 | 1037,0.0 148 | 1038,0.0 149 | 1039,0.0 150 | 1040,1.0 151 | 1041,0.0 152 | 1042,1.0 153 | 1043,0.0 154 | 1044,0.0 155 | 1045,1.0 156 | 1046,0.0 157 | 1047,0.0 158 | 1048,1.0 159 | 1049,0.0 160 | 1050,1.0 161 | 1051,1.0 162 | 1052,0.0 163 | 1053,1.0 164 | 1054,1.0 165 | 1055,0.0 166 | 1056,0.0 167 | 1057,1.0 168 | 1058,0.0 169 | 1059,0.0 170 | 1060,1.0 171 | 1061,0.0 172 | 1062,0.0 173 | 1063,0.0 174 | 1064,0.0 175 | 1065,0.0 176 | 1066,0.0 177 | 1067,1.0 178 | 1068,1.0 179 | 1069,0.0 180 | 1070,1.0 181 | 1071,1.0 182 | 1072,0.0 183 | 1073,0.0 184 | 1074,1.0 185 | 1075,0.0 186 | 1076,1.0 187 | 1077,0.0 188 | 1078,1.0 189 | 1079,0.0 190 | 1080,0.0 191 | 1081,0.0 192 | 1082,0.0 193 | 1083,0.0 194 | 1084,1.0 195 | 1085,0.0 196 | 1086,1.0 197 | 1087,0.0 198 | 1088,1.0 199 | 1089,0.0 200 | 1090,0.0 201 | 1091,0.0 202 | 1092,0.0 203 | 1093,1.0 204 | 1094,0.0 205 | 1095,1.0 206 | 1096,0.0 207 | 1097,0.0 208 | 1098,0.0 209 | 1099,0.0 210 | 1100,1.0 211 | 1101,0.0 212 | 1102,0.0 213 | 1103,0.0 214 | 1104,0.0 215 | 1105,1.0 216 | 1106,0.0 217 | 1107,0.0 218 | 1108,0.0 219 | 1109,0.0 220 | 1110,1.0 221 | 1111,0.0 222 | 1112,1.0 223 | 1113,0.0 224 | 1114,1.0 225 | 1115,1.0 226 | 1116,0.0 227 | 1117,1.0 228 | 1118,0.0 229 | 1119,0.0 230 | 1120,0.0 231 | 1121,0.0 232 | 1122,0.0 233 | 1123,1.0 234 | 1124,0.0 235 | 1125,0.0 236 | 1126,1.0 237 | 1127,0.0 238 | 1128,0.0 239 | 1129,1.0 240 | 1130,1.0 241 | 1131,1.0 242 | 1132,0.0 243 | 1133,1.0 244 | 1134,0.0 245 | 1135,0.0 246 | 1136,0.0 247 | 1137,0.0 248 | 1138,1.0 249 | 1139,0.0 250 | 1140,1.0 251 | 1141,0.0 252 | 1142,1.0 253 | 1143,0.0 254 | 1144,0.0 255 | 1145,0.0 256 | 1146,0.0 257 | 1147,0.0 258 | 1148,0.0 259 | 1149,0.0 260 | 1150,1.0 261 | 1151,0.0 262 | 1152,0.0 263 | 1153,0.0 264 | 1154,1.0 265 | 1155,1.0 266 | 1156,0.0 267 | 1157,0.0 268 | 1158,0.0 269 | 1159,0.0 270 | 1160,0.0 271 | 1161,0.0 272 | 1162,0.0 273 | 1163,0.0 274 | 1164,1.0 275 | 1165,0.0 276 | 1166,0.0 277 | 1167,1.0 278 | 1168,0.0 279 | 1169,0.0 280 | 1170,0.0 281 | 1171,0.0 282 | 1172,0.0 283 | 1173,1.0 284 | 1174,0.0 285 | 1175,0.0 286 | 1176,1.0 287 | 1177,0.0 288 | 1178,0.0 289 | 1179,0.0 290 | 1180,0.0 291 | 1181,0.0 292 | 1182,0.0 293 | 1183,0.0 294 | 1184,0.0 295 | 1185,0.0 296 | 1186,0.0 297 | 1187,0.0 298 | 1188,1.0 299 | 1189,0.0 300 | 1190,0.0 301 | 1191,0.0 302 | 1192,0.0 303 | 1193,0.0 304 | 1194,0.0 305 | 1195,0.0 306 | 1196,0.0 307 | 1197,1.0 308 | 1198,1.0 309 | 1199,1.0 310 | 1200,0.0 311 | 1201,0.0 312 | 1202,0.0 313 | 1203,1.0 314 | 1204,0.0 315 | 1205,0.0 316 | 1206,1.0 317 | 1207,1.0 318 | 1208,0.0 319 | 1209,0.0 320 | 1210,0.0 321 | 1211,0.0 322 | 1212,0.0 323 | 1213,0.0 324 | 1214,0.0 325 | 1215,1.0 326 | 1216,1.0 327 | 1217,0.0 328 | 1218,1.0 329 | 1219,0.0 330 | 1220,0.0 331 | 1221,0.0 332 | 1222,1.0 333 | 1223,1.0 334 | 1224,0.0 335 | 1225,1.0 336 | 1226,0.0 337 | 1227,0.0 338 | 1228,1.0 339 | 1229,0.0 340 | 1230,0.0 341 | 1231,0.0 342 | 1232,0.0 343 | 1233,0.0 344 | 1234,0.0 345 | 1235,1.0 346 | 1236,0.0 347 | 1237,0.0 348 | 1238,0.0 349 | 1239,1.0 350 | 1240,0.0 351 | 1241,1.0 352 | 1242,1.0 353 | 1243,0.0 354 | 1244,0.0 355 | 1245,0.0 356 | 1246,1.0 357 | 1247,0.0 358 | 1248,1.0 359 | 1249,0.0 360 | 1250,0.0 361 | 1251,1.0 362 | 1252,0.0 363 | 1253,1.0 364 | 1254,1.0 365 | 1255,1.0 366 | 
1256,1.0 367 | 1257,0.0 368 | 1258,0.0 369 | 1259,0.0 370 | 1260,1.0 371 | 1261,1.0 372 | 1262,0.0 373 | 1263,1.0 374 | 1264,0.0 375 | 1265,0.0 376 | 1266,1.0 377 | 1267,1.0 378 | 1268,0.0 379 | 1269,0.0 380 | 1270,0.0 381 | 1271,0.0 382 | 1272,0.0 383 | 1273,0.0 384 | 1274,1.0 385 | 1275,1.0 386 | 1276,0.0 387 | 1277,1.0 388 | 1278,0.0 389 | 1279,0.0 390 | 1280,0.0 391 | 1281,0.0 392 | 1282,0.0 393 | 1283,1.0 394 | 1284,0.0 395 | 1285,0.0 396 | 1286,0.0 397 | 1287,1.0 398 | 1288,0.0 399 | 1289,1.0 400 | 1290,0.0 401 | 1291,0.0 402 | 1292,1.0 403 | 1293,0.0 404 | 1294,1.0 405 | 1295,0.0 406 | 1296,0.0 407 | 1297,0.0 408 | 1298,0.0 409 | 1299,0.0 410 | 1300,0.0 411 | 1301,1.0 412 | 1302,0.0 413 | 1303,1.0 414 | 1304,0.0 415 | 1305,0.0 416 | 1306,1.0 417 | 1307,0.0 418 | 1308,0.0 419 | 1309,0.0 420 | -------------------------------------------------------------------------------- /deep-learning/deep-dream/flowers.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/deep-dream/flowers.jpg -------------------------------------------------------------------------------- /deep-learning/deep-dream/sky1024px.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/deep-dream/sky1024px.jpg -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/4. Conclusions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Credits: Forked from [deep-learning-keras-tensorflow](https://github.com/leriomaggio/deep-learning-keras-tensorflow) by Valerio Maggio" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "slideshow": { 14 | "slide_type": "slide" 15 | } 16 | }, 17 | "source": [ 18 | "# Conclusions" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "slideshow": { 25 | "slide_type": "subslide" 26 | } 27 | }, 28 | "source": [ 29 | "* Keras is a powerful and battery-included framework for Deep Learning in Python\n", 30 | "\n", 31 | "* Keras is **simple** to use..\n", 32 | "\n", 33 | "* ...but it is **not** for simple things!" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "slideshow": { 40 | "slide_type": "subslide" 41 | } 42 | }, 43 | "source": [ 44 | "" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "slideshow": { 51 | "slide_type": "subslide" 52 | } 53 | }, 54 | "source": [ 55 | "" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": { 61 | "slideshow": { 62 | "slide_type": "slide" 63 | } 64 | }, 65 | "source": [ 66 | "## Some References for .." 
67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": { 72 | "slideshow": { 73 | "slide_type": "fragment" 74 | } 75 | }, 76 | "source": [ 77 | "#### Cutting Edge\n", 78 | "\n", 79 | "* Fractal Net Implementation with Keras: https://github.com/snf/keras-fractalnet -\n", 80 | "* Please check out: [https://github.com/fchollet/keras-resources]()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": { 86 | "slideshow": { 87 | "slide_type": "fragment" 88 | } 89 | }, 90 | "source": [ 91 | "#### Hyper-Cool\n", 92 | "\n", 93 | "* Hyperas: https://github.com/maxpumperla/hyperas\n", 94 | " - A web dashboard for Keras Models" 95 | ] 96 | } 97 | ], 98 | "metadata": { 99 | "celltoolbar": "Slideshow", 100 | "kernelspec": { 101 | "display_name": "Python 3", 102 | "language": "python", 103 | "name": "python3" 104 | }, 105 | "language_info": { 106 | "codemirror_mode": { 107 | "name": "ipython", 108 | "version": 3 109 | }, 110 | "file_extension": ".py", 111 | "mimetype": "text/x-python", 112 | "name": "python", 113 | "nbconvert_exporter": "python", 114 | "pygments_lexer": "ipython3", 115 | "version": "3.4.3" 116 | } 117 | }, 118 | "nbformat": 4, 119 | "nbformat_minor": 0 120 | } 121 | -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017 MPBA 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/data/female_blog_list.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/data/female_blog_list.txt -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/data/male_blog_list.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/data/male_blog_list.txt -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/data/mnist.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/data/mnist.pkl.gz -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/data/rt-polarity.neg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/data/rt-polarity.neg -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/data/rt-polarity.pos: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/data/rt-polarity.pos -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/data_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | import itertools 4 | from collections import Counter 5 | """ 6 | Original taken from https://github.com/dennybritz/cnn-text-classification-tf 7 | """ 8 | 9 | def clean_str(string): 10 | """ 11 | Tokenization/string cleaning for all datasets except for SST. 12 | Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py 13 | """ 14 | string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) 15 | string = re.sub(r"\'s", " \'s", string) 16 | string = re.sub(r"\'ve", " \'ve", string) 17 | string = re.sub(r"n\'t", " n\'t", string) 18 | string = re.sub(r"\'re", " \'re", string) 19 | string = re.sub(r"\'d", " \'d", string) 20 | string = re.sub(r"\'ll", " \'ll", string) 21 | string = re.sub(r",", " , ", string) 22 | string = re.sub(r"!", " ! ", string) 23 | string = re.sub(r"\(", " \( ", string) 24 | string = re.sub(r"\)", " \) ", string) 25 | string = re.sub(r"\?", " \? ", string) 26 | string = re.sub(r"\s{2,}", " ", string) 27 | return string.strip().lower() 28 | 29 | 30 | def load_data_and_labels(): 31 | """ 32 | Loads MR polarity data from files, splits the data into words and generates labels. 33 | Returns split sentences and labels. 
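    Reads ./data/rt-polarity.pos and ./data/rt-polarity.neg (ISO-8859-1 encoded) relative to the working directory; positive sentences are labelled [0, 1] and negative sentences [1, 0].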
34 | """ 35 | # Load data from files 36 | positive_examples = list(open("./data/rt-polarity.pos", encoding='ISO-8859-1').readlines()) 37 | positive_examples = [s.strip() for s in positive_examples] 38 | negative_examples = list(open("./data/rt-polarity.neg", encoding='ISO-8859-1').readlines()) 39 | negative_examples = [s.strip() for s in negative_examples] 40 | # Split by words 41 | x_text = positive_examples + negative_examples 42 | x_text = [clean_str(sent) for sent in x_text] 43 | x_text = [s.split(" ") for s in x_text] 44 | # Generate labels 45 | positive_labels = [[0, 1] for _ in positive_examples] 46 | negative_labels = [[1, 0] for _ in negative_examples] 47 | y = np.concatenate([positive_labels, negative_labels], 0) 48 | return [x_text, y] 49 | 50 | 51 | def pad_sentences(sentences, padding_word=""): 52 | """ 53 | Pads all sentences to the same length. The length is defined by the longest sentence. 54 | Returns padded sentences. 55 | """ 56 | sequence_length = max(len(x) for x in sentences) 57 | padded_sentences = [] 58 | for i in range(len(sentences)): 59 | sentence = sentences[i] 60 | num_padding = sequence_length - len(sentence) 61 | new_sentence = sentence + [padding_word] * num_padding 62 | padded_sentences.append(new_sentence) 63 | return padded_sentences 64 | 65 | 66 | def build_vocab(sentences): 67 | """ 68 | Builds a vocabulary mapping from word to index based on the sentences. 69 | Returns vocabulary mapping and inverse vocabulary mapping. 70 | """ 71 | # Build vocabulary 72 | word_counts = Counter(itertools.chain(*sentences)) 73 | # Mapping from index to word 74 | vocabulary_inv = [x[0] for x in word_counts.most_common()] 75 | # Mapping from word to index 76 | vocabulary = {x: i for i, x in enumerate(vocabulary_inv)} 77 | return [vocabulary, vocabulary_inv] 78 | 79 | 80 | def build_input_data(sentences, labels, vocabulary): 81 | """ 82 | Maps sentencs and labels to vectors based on a vocabulary. 83 | """ 84 | x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences]) 85 | y = np.array(labels) 86 | return [x, y] 87 | 88 | 89 | def load_data(): 90 | """ 91 | Loads and preprocessed data for the MR dataset. 92 | Returns input vectors, labels, vocabulary, and inverse vocabulary. 93 | """ 94 | # Load and preprocess data 95 | sentences, labels = load_data_and_labels() 96 | sentences_padded = pad_sentences(sentences) 97 | vocabulary, vocabulary_inv = build_vocab(sentences_padded) 98 | x, y = build_input_data(sentences_padded, labels, vocabulary) 99 | return [x, y, vocabulary, vocabulary_inv] 100 | 101 | 102 | def batch_iter(data, batch_size, num_epochs): 103 | """ 104 | Generates a batch iterator for a dataset. 
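    Reshuffles the data at the start of every epoch and yields successive slices of at most batch_size elements, for num_epochs passes over the data.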
105 | """ 106 | data = np.array(data) 107 | data_size = len(data) 108 | num_batches_per_epoch = int(len(data)/batch_size) + 1 109 | for epoch in range(num_epochs): 110 | # Shuffle the data at each epoch 111 | shuffle_indices = np.random.permutation(np.arange(data_size)) 112 | shuffled_data = data[shuffle_indices] 113 | for batch_num in range(num_batches_per_epoch): 114 | start_index = batch_num * batch_size 115 | end_index = min((batch_num + 1) * batch_size, data_size) 116 | yield shuffled_data[start_index:end_index] -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/deep-learning-osx.yml: -------------------------------------------------------------------------------- 1 | name: deep-learning 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - accelerate=2.3.0=np111py35_3 7 | - accelerate_cudalib=2.0=0 8 | - appnope=0.1.0=py35_0 9 | - bokeh=0.12.1=py35_0 10 | - cffi=1.6.0=py35_0 11 | - backports.shutil_get_terminal_size=1.0.0=py35_0 12 | - blas=1.1=openblas 13 | - ca-certificates=2016.8.2=3 14 | - certifi=2016.8.2=py35_0 15 | - cycler=0.10.0=py35_0 16 | - cython=0.24.1=py35_0 17 | - decorator=4.0.10=py35_0 18 | - entrypoints=0.2.2=py35_0 19 | - freetype=2.6.3=1 20 | - h5py=2.6.0=np111py35_6 21 | - hdf5=1.8.17=2 22 | - ipykernel=4.3.1=py35_1 23 | - ipython=5.1.0=py35_0 24 | - ipywidgets=5.2.2=py35_0 25 | - jinja2=2.8=py35_1 26 | - jsonschema=2.5.1=py35_0 27 | - jupyter_client=4.3.0=py35_0 28 | - jupyter_console=5.0.0=py35_0 29 | - jupyter_core=4.1.1=py35_1 30 | - libgfortran=3.0.0=0 31 | - libpng=1.6.24=0 32 | - libsodium=1.0.10=0 33 | - markupsafe=0.23=py35_0 34 | - matplotlib=1.5.2=np111py35_5 35 | - mistune=0.7.3=py35_0 36 | - nbconvert=4.2.0=py35_0 37 | - nbformat=4.0.1=py35_0 38 | - ncurses=5.9=8 39 | - nose=1.3.7=py35_1 40 | - notebook=4.2.2=py35_0 41 | - numpy=1.11.1=py35_blas_openblas_201 42 | - openblas=0.2.18=4 43 | - openssl=1.0.2h=2 44 | - pandas=0.18.1=np111py35_1 45 | - pexpect=4.2.0=py35_1 46 | - pickleshare=0.7.3=py35_0 47 | - pip=8.1.2=py35_0 48 | - prompt_toolkit=1.0.6=py35_0 49 | - ptyprocess=0.5.1=py35_0 50 | - pygments=2.1.3=py35_1 51 | - pyparsing=2.1.7=py35_0 52 | - python=3.5.2=2 53 | - python-dateutil=2.5.3=py35_0 54 | - pytz=2016.6.1=py35_0 55 | - pyyaml=3.11=py35_0 56 | - pyzmq=15.4.0=py35_0 57 | - qtconsole=4.2.1=py35_0 58 | - readline=6.2=0 59 | - requests=2.11.0=py35_0 60 | - scikit-learn=0.17.1=np111py35_blas_openblas_201 61 | - scipy=0.18.0=np111py35_blas_openblas_201 62 | - setuptools=25.1.6=py35_0 63 | - simplegeneric=0.8.1=py35_0 64 | - sip=4.18=py35_0 65 | - six=1.10.0=py35_0 66 | - sqlite=3.13.0=1 67 | - terminado=0.6=py35_0 68 | - tk=8.5.19=0 69 | - tornado=4.4.1=py35_1 70 | - traitlets=4.2.2=py35_0 71 | - wcwidth=0.1.7=py35_0 72 | - wheel=0.29.0=py35_0 73 | - widgetsnbextension=1.2.6=py35_3 74 | - xz=5.2.2=0 75 | - yaml=0.1.6=0 76 | - zeromq=4.1.5=0 77 | - zlib=1.2.8=3 78 | - cudatoolkit=7.5=0 79 | - ipython_genutils=0.1.0=py35_0 80 | - jupyter=1.0.0=py35_3 81 | - llvmlite=0.11.0=py35_0 82 | - mkl=11.3.3=0 83 | - mkl-service=1.1.2=py35_2 84 | - numba=0.26.0=np111py35_0 85 | - pycparser=2.14=py35_1 86 | - pyqt=4.11.4=py35_4 87 | - python.app=1.2=py35_4 88 | - qt=4.8.7=4 89 | - snakeviz=0.4.1=py35_0 90 | - pip: 91 | - backports.shutil-get-terminal-size==1.0.0 92 | - certifi==2016.8.2 93 | - cycler==0.10.0 94 | - cython==0.24.1 95 | - decorator==4.0.10 96 | - h5py==2.6.0 97 | - ipykernel==4.3.1 98 | - ipython==5.1.0 99 | - ipython-genutils==0.1.0 100 | - ipywidgets==5.2.2 101 | 
- jinja2==2.8 102 | - jsonschema==2.5.1 103 | - jupyter-client==4.3.0 104 | - jupyter-console==5.0.0 105 | - jupyter-core==4.1.1 106 | - keras==1.0.7 107 | - markupsafe==0.23 108 | - matplotlib==1.5.2 109 | - mistune==0.7.3 110 | - nbconvert==4.2.0 111 | - nbformat==4.0.1 112 | - nose==1.3.7 113 | - notebook==4.2.2 114 | - numpy==1.11.1 115 | - pandas==0.18.1 116 | - pexpect==4.2.0 117 | - pickleshare==0.7.3 118 | - pip==8.1.2 119 | - prompt-toolkit==1.0.6 120 | - ptyprocess==0.5.1 121 | - pygments==2.1.3 122 | - pyparsing==2.1.7 123 | - python-dateutil==2.5.3 124 | - pytz==2016.6.1 125 | - pyyaml==3.11 126 | - pyzmq==15.4.0 127 | - qtconsole==4.2.1 128 | - requests==2.11.0 129 | - scikit-learn==0.17.1 130 | - scipy==0.18.0 131 | - setuptools==25.1.6 132 | - simplegeneric==0.8.1 133 | - six==1.10.0 134 | - terminado==0.6 135 | - theano==0.8.2 136 | - tornado==4.4.1 137 | - traitlets==4.2.2 138 | - wcwidth==0.1.7 139 | - wheel==0.29.0 140 | - widgetsnbextension==1.2.6 141 | prefix: /Users/valerio/anaconda/envs/deep-learning 142 | 143 | -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/deep-learning.yml: -------------------------------------------------------------------------------- 1 | name: deep-learning 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - accelerate=2.3.0=np111py35_3 7 | - accelerate_cudalib=2.0=0 8 | - bokeh=0.12.1=py35_0 9 | - cffi=1.6.0=py35_0 10 | - backports.shutil_get_terminal_size=1.0.0=py35_0 11 | - blas=1.1=openblas 12 | - ca-certificates=2016.8.2=3 13 | - cairo=1.12.18=8 14 | - certifi=2016.8.2=py35_0 15 | - cycler=0.10.0=py35_0 16 | - cython=0.24.1=py35_0 17 | - decorator=4.0.10=py35_0 18 | - entrypoints=0.2.2=py35_0 19 | - fontconfig=2.11.1=3 20 | - freetype=2.6.3=1 21 | - gettext=0.19.7=1 22 | - glib=2.48.0=4 23 | - h5py=2.6.0=np111py35_6 24 | - harfbuzz=1.0.6=0 25 | - hdf5=1.8.17=2 26 | - icu=56.1=4 27 | - ipykernel=4.3.1=py35_1 28 | - ipython=5.1.0=py35_0 29 | - ipywidgets=5.2.2=py35_0 30 | - jinja2=2.8=py35_1 31 | - jpeg=9b=0 32 | - jsonschema=2.5.1=py35_0 33 | - jupyter_client=4.3.0=py35_0 34 | - jupyter_console=5.0.0=py35_0 35 | - jupyter_core=4.1.1=py35_1 36 | - libffi=3.2.1=2 37 | - libiconv=1.14=3 38 | - libpng=1.6.24=0 39 | - libsodium=1.0.10=0 40 | - libtiff=4.0.6=6 41 | - libxml2=2.9.4=0 42 | - markupsafe=0.23=py35_0 43 | - matplotlib=1.5.2=np111py35_6 44 | - mistune=0.7.3=py35_0 45 | - nbconvert=4.2.0=py35_0 46 | - nbformat=4.0.1=py35_0 47 | - ncurses=5.9=8 48 | - nose=1.3.7=py35_1 49 | - notebook=4.2.2=py35_0 50 | - numpy=1.11.1=py35_blas_openblas_201 51 | - openblas=0.2.18=4 52 | - openssl=1.0.2h=2 53 | - pandas=0.18.1=np111py35_1 54 | - pango=1.40.1=0 55 | - path.py=8.2.1=py35_0 56 | - pcre=8.38=1 57 | - pexpect=4.2.0=py35_1 58 | - pickleshare=0.7.3=py35_0 59 | - pip=8.1.2=py35_0 60 | - pixman=0.32.6=0 61 | - prompt_toolkit=1.0.6=py35_0 62 | - protobuf=3.0.0b3=py35_1 63 | - ptyprocess=0.5.1=py35_0 64 | - pygments=2.1.3=py35_1 65 | - pyparsing=2.1.7=py35_0 66 | - python=3.5.2=2 67 | - python-dateutil=2.5.3=py35_0 68 | - pytz=2016.6.1=py35_0 69 | - pyyaml=3.11=py35_0 70 | - pyzmq=15.4.0=py35_0 71 | - qt=4.8.7=0 72 | - qtconsole=4.2.1=py35_0 73 | - readline=6.2=0 74 | - requests=2.11.0=py35_0 75 | - scikit-learn=0.17.1=np111py35_blas_openblas_201 76 | - scipy=0.18.0=np111py35_blas_openblas_201 77 | - setuptools=25.1.6=py35_0 78 | - simplegeneric=0.8.1=py35_0 79 | - sip=4.18=py35_0 80 | - six=1.10.0=py35_0 81 | - sqlite=3.13.0=1 82 | - terminado=0.6=py35_0 83 | - 
tk=8.5.19=0 84 | - tornado=4.4.1=py35_1 85 | - traitlets=4.2.2=py35_0 86 | - wcwidth=0.1.7=py35_0 87 | - wheel=0.29.0=py35_0 88 | - widgetsnbextension=1.2.6=py35_3 89 | - xz=5.2.2=0 90 | - yaml=0.1.6=0 91 | - zeromq=4.1.5=0 92 | - zlib=1.2.8=3 93 | - cudatoolkit=7.5=0 94 | - ipython_genutils=0.1.0=py35_0 95 | - jupyter=1.0.0=py35_3 96 | - libgfortran=3.0.0=1 97 | - llvmlite=0.11.0=py35_0 98 | - mkl=11.3.3=0 99 | - mkl-service=1.1.2=py35_2 100 | - numba=0.26.0=np111py35_0 101 | - pycparser=2.14=py35_1 102 | - pyqt=4.11.4=py35_4 103 | - snakeviz=0.4.1=py35_0 104 | - pip: 105 | - backports.shutil-get-terminal-size==1.0.0 106 | - certifi==2016.8.2 107 | - cycler==0.10.0 108 | - cython==0.24.1 109 | - decorator==4.0.10 110 | - h5py==2.6.0 111 | - ipykernel==4.3.1 112 | - ipython==5.1.0 113 | - ipython-genutils==0.1.0 114 | - ipywidgets==5.2.2 115 | - jinja2==2.8 116 | - jsonschema==2.5.1 117 | - jupyter-client==4.3.0 118 | - jupyter-console==5.0.0 119 | - jupyter-core==4.1.1 120 | - keras==1.0.7 121 | - mako==1.0.4 122 | - markupsafe==0.23 123 | - matplotlib==1.5.2 124 | - mistune==0.7.3 125 | - nbconvert==4.2.0 126 | - nbformat==4.0.1 127 | - nose==1.3.7 128 | - notebook==4.2.2 129 | - numpy==1.11.1 130 | - pandas==0.18.1 131 | - path.py==8.2.1 132 | - pexpect==4.2.0 133 | - pickleshare==0.7.3 134 | - pip==8.1.2 135 | - prompt-toolkit==1.0.6 136 | - protobuf==3.0.0b2 137 | - ptyprocess==0.5.1 138 | - pygments==2.1.3 139 | - pyparsing==2.1.7 140 | - python-dateutil==2.5.3 141 | - pytz==2016.6.1 142 | - pyyaml==3.11 143 | - pyzmq==15.4.0 144 | - qtconsole==4.2.1 145 | - requests==2.11.0 146 | - scikit-learn==0.17.1 147 | - scipy==0.18.0 148 | - setuptools==25.1.4 149 | - simplegeneric==0.8.1 150 | - six==1.10.0 151 | - terminado==0.6 152 | - theano==0.8.2 153 | - tornado==4.4.1 154 | - traitlets==4.2.2 155 | - wcwidth==0.1.7 156 | - wheel==0.29.0 157 | - widgetsnbextension==1.2.6 158 | prefix: /home/valerio/anaconda3/envs/deep-learning 159 | 160 | -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/deep_learning_models/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 François Chollet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/deep_learning_models/README.md: -------------------------------------------------------------------------------- 1 | # Trained image classification models for Keras 2 | 3 | This repository contains code for the following Keras models: 4 | 5 | - VGG16 6 | - VGG19 7 | - ResNet50 8 | 9 | We plan on adding Inception v3 soon. 10 | 11 | All architectures are compatible with both TensorFlow and Theano, and upon instantiation the models will be built according to the image dimension ordering set in your Keras configuration file at `~/.keras/keras.json`. For instance, if you have set `image_dim_ordering=tf`, then any model loaded from this repository will get built according to the TensorFlow dimension ordering convention, "Width-Height-Depth". 12 | 13 | Weights can be automatically loaded upon instantiation (`weights='imagenet'` argument in model constructor). Weights are automatically downloaded if necessary, and cached locally in `~/.keras/models/`. 14 | 15 | **Note that using these models requires the latest version of Keras (from the Github repo, not PyPI).** 16 | 17 | ## Examples 18 | 19 | ### Classify images 20 | 21 | ```python 22 | from resnet50 import ResNet50 23 | from keras.preprocessing import image 24 | from imagenet_utils import preprocess_input, decode_predictions 25 | 26 | model = ResNet50(weights='imagenet') 27 | 28 | img_path = 'elephant.jpg' 29 | img = image.load_img(img_path, target_size=(224, 224)) 30 | x = image.img_to_array(img) 31 | x = np.expand_dims(x, axis=0) 32 | x = preprocess_input(x) 33 | 34 | preds = model.predict(x) 35 | print('Predicted:', decode_predictions(preds)) 36 | # print: [[u'n02504458', u'African_elephant']] 37 | ``` 38 | 39 | ### Extract features from images 40 | 41 | ```python 42 | from vgg16 import VGG16 43 | from keras.preprocessing import image 44 | from imagenet_utils import preprocess_input 45 | 46 | model = VGG16(weights='imagenet', include_top=False) 47 | 48 | img_path = 'elephant.jpg' 49 | img = image.load_img(img_path, target_size=(224, 224)) 50 | x = image.img_to_array(img) 51 | x = np.expand_dims(x, axis=0) 52 | x = preprocess_input(x) 53 | 54 | features = model.predict(x) 55 | ``` 56 | 57 | ### Extract features from an arbitrary intermediate layer 58 | 59 | ```python 60 | from vgg19 import VGG19 61 | from keras.preprocessing import image 62 | from imagenet_utils import preprocess_input 63 | from keras.models import Model 64 | 65 | base_model = VGG19(weights='imagenet') 66 | model = Model(input=base_model.input, output=base_model.get_layer('block4_pool').output) 67 | 68 | img_path = 'elephant.jpg' 69 | img = image.load_img(img_path, target_size=(224, 224)) 70 | x = image.img_to_array(img) 71 | x = np.expand_dims(x, axis=0) 72 | x = preprocess_input(x) 73 | 74 | block4_pool_features = model.predict(x) 75 | ``` 76 | 77 | ## References 78 | 79 | - [Very Deep Convolutional Networks for Large-Scale Image Recognition](https://arxiv.org/abs/1409.1556) - please cite this paper if you use the VGG models in your work. 80 | - [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) - please cite this paper if you use the ResNet model in your work. 81 | 82 | Additionally, don't forget to [cite Keras](https://keras.io/getting-started/faq/#how-should-i-cite-keras) if you use these models. 83 | 84 | 85 | ## License 86 | 87 | - All code in this repository is under the MIT license as specified by the LICENSE file. 
88 | - The ResNet50 weights are ported from the ones [released by Kaiming He](https://github.com/KaimingHe/deep-residual-networks) under the [MIT license](https://github.com/KaimingHe/deep-residual-networks/blob/master/LICENSE). 89 | - The VGG16 and VGG19 weights are ported from the ones [released by VGG at Oxford](http://www.robots.ox.ac.uk/~vgg/research/very_deep/) under the [Creative Commons Attribution License](https://creativecommons.org/licenses/by/4.0/). -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/deep_learning_models/imagenet_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | 4 | from keras.utils.data_utils import get_file 5 | from keras import backend as K 6 | 7 | CLASS_INDEX = None 8 | CLASS_INDEX_PATH = 'https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json' 9 | 10 | 11 | def preprocess_input(x, dim_ordering='default'): 12 | if dim_ordering == 'default': 13 | dim_ordering = K.image_dim_ordering() 14 | assert dim_ordering in {'tf', 'th'} 15 | 16 | if dim_ordering == 'th': 17 | x[:, 0, :, :] -= 103.939 18 | x[:, 1, :, :] -= 116.779 19 | x[:, 2, :, :] -= 123.68 20 | # 'RGB'->'BGR' 21 | x = x[:, ::-1, :, :] 22 | else: 23 | x[:, :, :, 0] -= 103.939 24 | x[:, :, :, 1] -= 116.779 25 | x[:, :, :, 2] -= 123.68 26 | # 'RGB'->'BGR' 27 | x = x[:, :, :, ::-1] 28 | return x 29 | 30 | 31 | def decode_predictions(preds): 32 | global CLASS_INDEX 33 | assert len(preds.shape) == 2 and preds.shape[1] == 1000 34 | if CLASS_INDEX is None: 35 | fpath = get_file('imagenet_class_index.json', 36 | CLASS_INDEX_PATH, 37 | cache_subdir='models') 38 | CLASS_INDEX = json.load(open(fpath)) 39 | indices = np.argmax(preds, axis=-1) 40 | results = [] 41 | for i in indices: 42 | results.append(CLASS_INDEX[str(i)]) 43 | return results 44 | -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/ConvNet LeNet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/ConvNet LeNet.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/LSTM3-chain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/LSTM3-chain.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/MLP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/MLP.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/MaxPool.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/MaxPool.png -------------------------------------------------------------------------------- 
/deep-learning/keras-tutorial/imgs/Perceptron and MLP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/Perceptron and MLP.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/Perceptron.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/Perceptron.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/RNN-rolled.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/RNN-rolled.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/RNN-unrolled.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/RNN-unrolled.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/autoencoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/autoencoder.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/backprop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/backprop.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/cnn1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/cnn1.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/cnn2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/cnn2.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/cnn3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/cnn3.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/cnn4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/cnn4.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/cnn5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/cnn5.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/cnn6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/cnn6.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/conv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/conv.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/convnets_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/convnets_cover.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/euroscipy_2016_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/euroscipy_2016_logo.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/gru.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/gru.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/imagenet/apricot_565.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/imagenet/apricot_565.jpeg -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/imagenet/apricot_696.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/imagenet/apricot_696.jpeg -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/imagenet/apricot_787.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/imagenet/apricot_787.jpeg -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/imagenet/strawberry_1157.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/imagenet/strawberry_1157.jpeg -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/imagenet/strawberry_1174.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/imagenet/strawberry_1174.jpeg -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/imagenet/strawberry_1189.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/imagenet/strawberry_1189.jpeg -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/keDyv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/keDyv.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/keras-logo-small.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/keras-logo-small.jpg -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/keras_rank_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/keras_rank_1.jpg -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/keras_rank_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/keras_rank_2.jpg -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/mlp_details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/mlp_details.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/overfitting.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/overfitting.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/rnn.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/rnn2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/rnn2.png -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/imgs/sprint.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/keras-tutorial/imgs/sprint.jpg -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/outline.md: -------------------------------------------------------------------------------- 1 | # Outline (Draft) 2 | 3 | - Part I: Introduction 4 | 5 | - Intro to ANN 6 | - (naive pure-Python implementation from `pybrain`) 7 | - fast forward 8 | - sgd + backprop 9 | - Intro to Theano 10 | - Model + SGD with Theano (simple logreg) 11 | 12 | - Introduction to Keras 13 | - Overview and main features 14 | - Theano backend 15 | - Tensorflow backend 16 | - Same LogReg with Keras 17 | 18 | - Part II: Supervised Learning + Keras Internals 19 | - Intro: Focus on Image Classification 20 | - Multi-Layer Perceptron and Fully Connected 21 | - Examples with `keras.models.Sequential` and `Dense` 22 | - HandsOn: MLP with keras 23 | 24 | - Intro to CNN 25 | - meaning of convolutional filters 26 | - examples from ImageNet 27 | 28 | - Meaning of dimensions of Conv filters (through an exmple of ConvNet) 29 | - HandsOn: ConvNet with keras 30 | 31 | - Advanced CNN 32 | - Dropout and MaxPooling 33 | - Famous ANN in Keras (likely moved somewhere else) 34 | - ref: https://github.com/fchollet/deep-learning-models 35 | - VGG16 36 | - VGG19 37 | - LaNet 38 | - Inception/GoogleNet 39 | - ResNet 40 | *Implementation and examples 41 | - HandsOn: Fine tuning a network on new dataset 42 | 43 | - Part III: Unsupervised Learning + Keras Internals 44 | - AutoEncoders 45 | - word2vec & doc2vec (gensim) + `keras.dataset` (i.e. `keras.dataset.imdb`) 46 | - HandsOn: _______ 47 | 48 | *should we include embedding here? 49 | 50 | - Part IV: Advanced Materials 51 | - RNN (LSTM) 52 | - RNN, LSTM, GRU 53 | - Meaning of dimensions of rnn (backprop though time, etc) 54 | - HandsOn: IMDB (?) 55 | 56 | - CNN-RNN 57 | - Time Distributed Convolution 58 | - Some of the recent advances in DL implemented in Keras 59 | - e.g. https://github.com/snf/keras-fractalnet - Fractal Net Implementation with Keras 60 | 61 | 62 | Notes: 63 | 64 | 1) Please, add more details in Part IV (i.e. 
/Advanced Materials/) 65 | 2) As for Keras internals, I Would consider this: https://github.com/wuaalb/keras_extensions/blob/master/keras_extensions/rbm.py 66 | This is just to show how easy it is to extend Keras ( in this case, properly creating a new `Layer`). 67 | -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/solutions/sol_111.py: -------------------------------------------------------------------------------- 1 | ann = ANN(2, 10, 1) 2 | %timeit -n 1 -r 1 ann.train(zip(X,y), iterations=2) 3 | plot_decision_boundary(ann) 4 | plt.title("Our next model with 10 hidden units") 5 | -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/solutions/sol_112.py: -------------------------------------------------------------------------------- 1 | ann = ANN(2, 10, 1) 2 | %timeit -n 1 -r 1 ann.train(zip(X,y), iterations=100) 3 | plot_decision_boundary(ann) 4 | plt.title("Our model with 10 hidden units and 100 iterations") 5 | -------------------------------------------------------------------------------- /deep-learning/keras-tutorial/w2v.py: -------------------------------------------------------------------------------- 1 | from gensim.models import word2vec 2 | from os.path import join, exists, split 3 | import os 4 | import numpy as np 5 | 6 | def train_word2vec(sentence_matrix, vocabulary_inv, 7 | num_features=300, min_word_count=1, context=10): 8 | """ 9 | Trains, saves, loads Word2Vec model 10 | Returns initial weights for embedding layer. 11 | 12 | inputs: 13 | sentence_matrix # int matrix: num_sentences x max_sentence_len 14 | vocabulary_inv # dict {str:int} 15 | num_features # Word vector dimensionality 16 | min_word_count # Minimum word count 17 | context # Context window size 18 | """ 19 | model_dir = 'word2vec_models' 20 | model_name = "{:d}features_{:d}minwords_{:d}context".format(num_features, min_word_count, context) 21 | model_name = join(model_dir, model_name) 22 | if exists(model_name): 23 | embedding_model = word2vec.Word2Vec.load(model_name) 24 | print('Loading existing Word2Vec model \'%s\'' % split(model_name)[-1]) 25 | else: 26 | # Set values for various parameters 27 | num_workers = 2 # Number of threads to run in parallel 28 | downsampling = 1e-3 # Downsample setting for frequent words 29 | 30 | # Initialize and train the model 31 | print("Training Word2Vec model...") 32 | sentences = [[vocabulary_inv[w] for w in s] for s in sentence_matrix] 33 | embedding_model = word2vec.Word2Vec(sentences, workers=num_workers, \ 34 | size=num_features, min_count = min_word_count, \ 35 | window = context, sample = downsampling) 36 | 37 | # If we don't plan to train the model any further, calling 38 | # init_sims will make the model much more memory-efficient. 39 | embedding_model.init_sims(replace=True) 40 | 41 | # Saving the model for later use. 
You can load it later using Word2Vec.load() 42 | if not exists(model_dir): 43 | os.mkdir(model_dir) 44 | print('Saving Word2Vec model \'%s\'' % split(model_name)[-1]) 45 | embedding_model.save(model_name) 46 | 47 | # add unknown words 48 | embedding_weights = [np.array([embedding_model[w] if w in embedding_model\ 49 | else np.random.uniform(-0.25,0.25,embedding_model.vector_size)\ 50 | for w in vocabulary_inv])] 51 | return embedding_weights 52 | 53 | if __name__=='__main__': 54 | import data_helpers 55 | print("Loading data...") 56 | x, _, _, vocabulary_inv = data_helpers.load_data() 57 | w = train_word2vec(x, vocabulary_inv) 58 | -------------------------------------------------------------------------------- /deep-learning/tensor-flow-examples/input_data.py: -------------------------------------------------------------------------------- 1 | """Functions for downloading and reading MNIST data.""" 2 | from __future__ import print_function 3 | import gzip 4 | import os 5 | import urllib 6 | import numpy 7 | SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/' 8 | def maybe_download(filename, work_directory): 9 | """Download the data from Yann's website, unless it's already here.""" 10 | if not os.path.exists(work_directory): 11 | os.mkdir(work_directory) 12 | filepath = os.path.join(work_directory, filename) 13 | if not os.path.exists(filepath): 14 | filepath, _ = urllib.urlretrieve(SOURCE_URL + filename, filepath) 15 | statinfo = os.stat(filepath) 16 | print('Succesfully downloaded', filename, statinfo.st_size, 'bytes.') 17 | return filepath 18 | def _read32(bytestream): 19 | dt = numpy.dtype(numpy.uint32).newbyteorder('>') 20 | return numpy.frombuffer(bytestream.read(4), dtype=dt) 21 | def extract_images(filename): 22 | """Extract the images into a 4D uint8 numpy array [index, y, x, depth].""" 23 | print('Extracting', filename) 24 | with gzip.open(filename) as bytestream: 25 | magic = _read32(bytestream) 26 | if magic != 2051: 27 | raise ValueError( 28 | 'Invalid magic number %d in MNIST image file: %s' % 29 | (magic, filename)) 30 | num_images = _read32(bytestream) 31 | rows = _read32(bytestream) 32 | cols = _read32(bytestream) 33 | buf = bytestream.read(rows * cols * num_images) 34 | data = numpy.frombuffer(buf, dtype=numpy.uint8) 35 | data = data.reshape(num_images, rows, cols, 1) 36 | return data 37 | def dense_to_one_hot(labels_dense, num_classes=10): 38 | """Convert class labels from scalars to one-hot vectors.""" 39 | num_labels = labels_dense.shape[0] 40 | index_offset = numpy.arange(num_labels) * num_classes 41 | labels_one_hot = numpy.zeros((num_labels, num_classes)) 42 | labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1 43 | return labels_one_hot 44 | def extract_labels(filename, one_hot=False): 45 | """Extract the labels into a 1D uint8 numpy array [index].""" 46 | print('Extracting', filename) 47 | with gzip.open(filename) as bytestream: 48 | magic = _read32(bytestream) 49 | if magic != 2049: 50 | raise ValueError( 51 | 'Invalid magic number %d in MNIST label file: %s' % 52 | (magic, filename)) 53 | num_items = _read32(bytestream) 54 | buf = bytestream.read(num_items) 55 | labels = numpy.frombuffer(buf, dtype=numpy.uint8) 56 | if one_hot: 57 | return dense_to_one_hot(labels) 58 | return labels 59 | class DataSet(object): 60 | def __init__(self, images, labels, fake_data=False): 61 | if fake_data: 62 | self._num_examples = 10000 63 | else: 64 | assert images.shape[0] == labels.shape[0], ( 65 | "images.shape: %s labels.shape: %s" % (images.shape, 66 | labels.shape)) 67 
| self._num_examples = images.shape[0] 68 | # Convert shape from [num examples, rows, columns, depth] 69 | # to [num examples, rows*columns] (assuming depth == 1) 70 | assert images.shape[3] == 1 71 | images = images.reshape(images.shape[0], 72 | images.shape[1] * images.shape[2]) 73 | # Convert from [0, 255] -> [0.0, 1.0]. 74 | images = images.astype(numpy.float32) 75 | images = numpy.multiply(images, 1.0 / 255.0) 76 | self._images = images 77 | self._labels = labels 78 | self._epochs_completed = 0 79 | self._index_in_epoch = 0 80 | @property 81 | def images(self): 82 | return self._images 83 | @property 84 | def labels(self): 85 | return self._labels 86 | @property 87 | def num_examples(self): 88 | return self._num_examples 89 | @property 90 | def epochs_completed(self): 91 | return self._epochs_completed 92 | def next_batch(self, batch_size, fake_data=False): 93 | """Return the next `batch_size` examples from this data set.""" 94 | if fake_data: 95 | fake_image = [1.0 for _ in xrange(784)] 96 | fake_label = 0 97 | return [fake_image for _ in xrange(batch_size)], [ 98 | fake_label for _ in xrange(batch_size)] 99 | start = self._index_in_epoch 100 | self._index_in_epoch += batch_size 101 | if self._index_in_epoch > self._num_examples: 102 | # Finished epoch 103 | self._epochs_completed += 1 104 | # Shuffle the data 105 | perm = numpy.arange(self._num_examples) 106 | numpy.random.shuffle(perm) 107 | self._images = self._images[perm] 108 | self._labels = self._labels[perm] 109 | # Start next epoch 110 | start = 0 111 | self._index_in_epoch = batch_size 112 | assert batch_size <= self._num_examples 113 | end = self._index_in_epoch 114 | return self._images[start:end], self._labels[start:end] 115 | def read_data_sets(train_dir, fake_data=False, one_hot=False): 116 | class DataSets(object): 117 | pass 118 | data_sets = DataSets() 119 | if fake_data: 120 | data_sets.train = DataSet([], [], fake_data=True) 121 | data_sets.validation = DataSet([], [], fake_data=True) 122 | data_sets.test = DataSet([], [], fake_data=True) 123 | return data_sets 124 | TRAIN_IMAGES = 'train-images-idx3-ubyte.gz' 125 | TRAIN_LABELS = 'train-labels-idx1-ubyte.gz' 126 | TEST_IMAGES = 't10k-images-idx3-ubyte.gz' 127 | TEST_LABELS = 't10k-labels-idx1-ubyte.gz' 128 | VALIDATION_SIZE = 5000 129 | local_file = maybe_download(TRAIN_IMAGES, train_dir) 130 | train_images = extract_images(local_file) 131 | local_file = maybe_download(TRAIN_LABELS, train_dir) 132 | train_labels = extract_labels(local_file, one_hot=one_hot) 133 | local_file = maybe_download(TEST_IMAGES, train_dir) 134 | test_images = extract_images(local_file) 135 | local_file = maybe_download(TEST_LABELS, train_dir) 136 | test_labels = extract_labels(local_file, one_hot=one_hot) 137 | validation_images = train_images[:VALIDATION_SIZE] 138 | validation_labels = train_labels[:VALIDATION_SIZE] 139 | train_images = train_images[VALIDATION_SIZE:] 140 | train_labels = train_labels[VALIDATION_SIZE:] 141 | data_sets.train = DataSet(train_images, train_labels) 142 | data_sets.validation = DataSet(validation_images, validation_labels) 143 | data_sets.test = DataSet(test_images, test_labels) 144 | return data_sets -------------------------------------------------------------------------------- /deep-learning/tensor-flow-examples/multigpu_basics.py: -------------------------------------------------------------------------------- 1 | #Multi GPU Basic example 2 | ''' 3 | This tutorial requires your machine to have 2 GPUs 4 | "/cpu:0": The CPU of your machine. 
5 | "/gpu:0": The first GPU of your machine 6 | "/gpu:1": The second GPU of your machine 7 | ''' 8 | 9 | import numpy as np 10 | import tensorflow as tf 11 | import datetime 12 | 13 | #Processing Units logs 14 | log_device_placement = True 15 | 16 | #num of multiplications to perform 17 | n = 10 18 | 19 | ''' 20 | Example: compute A^n + B^n on 2 GPUs 21 | Results on 8 cores with 2 GTX-980: 22 | * Single GPU computation time: 0:00:11.277449 23 | * Multi GPU computation time: 0:00:07.131701 24 | ''' 25 | #Create random large matrix 26 | A = np.random.rand(1e4, 1e4).astype('float32') 27 | B = np.random.rand(1e4, 1e4).astype('float32') 28 | 29 | # Creates a graph to store results 30 | c1 = [] 31 | c2 = [] 32 | 33 | def matpow(M, n): 34 | if n < 1: #Abstract cases where n < 1 35 | return M 36 | else: 37 | return tf.matmul(M, matpow(M, n-1)) 38 | 39 | ''' 40 | Single GPU computing 41 | ''' 42 | with tf.device('/gpu:0'): 43 | a = tf.constant(A) 44 | b = tf.constant(B) 45 | #compute A^n and B^n and store results in c1 46 | c1.append(matpow(a, n)) 47 | c1.append(matpow(b, n)) 48 | 49 | with tf.device('/cpu:0'): 50 | sum = tf.add_n(c1) #Addition of all elements in c1, i.e. A^n + B^n 51 | 52 | t1_1 = datetime.datetime.now() 53 | with tf.Session(config=tf.ConfigProto(log_device_placement=log_device_placement)) as sess: 54 | # Runs the op. 55 | sess.run(sum) 56 | t2_1 = datetime.datetime.now() 57 | 58 | 59 | ''' 60 | Multi GPU computing 61 | ''' 62 | #GPU:0 computes A^n 63 | with tf.device('/gpu:0'): 64 | #compute A^n and store result in c2 65 | a = tf.constant(A) 66 | c2.append(matpow(a, n)) 67 | 68 | #GPU:1 computes B^n 69 | with tf.device('/gpu:1'): 70 | #compute B^n and store result in c2 71 | b = tf.constant(B) 72 | c2.append(matpow(b, n)) 73 | 74 | with tf.device('/cpu:0'): 75 | sum = tf.add_n(c2) #Addition of all elements in c2, i.e. A^n + B^n 76 | 77 | t1_2 = datetime.datetime.now() 78 | with tf.Session(config=tf.ConfigProto(log_device_placement=log_device_placement)) as sess: 79 | # Runs the op. 
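    # Because log_device_placement=True, TensorFlow logs which device each op was
    # assigned to, so you can confirm the two matpow chains ran on /gpu:0 and /gpu:1.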
80 | sess.run(sum) 81 | t2_2 = datetime.datetime.now() 82 | 83 | 84 | print "Single GPU computation time: " + str(t2_1-t1_1) 85 | print "Multi GPU computation time: " + str(t2_2-t1_2) -------------------------------------------------------------------------------- /deep-learning/tensor-flow-examples/notebooks/1_intro/basic_operations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Basic Operations in TensorFlow\n", 8 | "\n", 9 | "Credits: Forked from [TensorFlow-Examples](https://github.com/aymericdamien/TensorFlow-Examples) by Aymeric Damien\n", 10 | "\n", 11 | "## Setup\n", 12 | "\n", 13 | "Refer to the [setup instructions](http://nbviewer.ipython.org/github/donnemartin/data-science-ipython-notebooks/blob/master/deep-learning/tensor-flow-examples/Setup_TensorFlow.md)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import tensorflow as tf" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "# Basic constant operations\n", 36 | "# The value returned by the constructor represents the output\n", 37 | "# of the Constant op.\n", 38 | "a = tf.constant(2)\n", 39 | "b = tf.constant(3)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "a=2, b=3\n", 54 | "Addition with constants: 5\n", 55 | "Multiplication with constants: 6\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "# Launch the default graph.\n", 61 | "with tf.Session() as sess:\n", 62 | " print \"a=2, b=3\"\n", 63 | " print \"Addition with constants: %i\" % sess.run(a+b)\n", 64 | " print \"Multiplication with constants: %i\" % sess.run(a*b)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "# Basic Operations with variable as graph input\n", 76 | "# The value returned by the constructor represents the output\n", 77 | "# of the Variable op. 
(define as input when running session)\n", 78 | "# tf Graph input\n", 79 | "a = tf.placeholder(tf.int16)\n", 80 | "b = tf.placeholder(tf.int16)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 6, 86 | "metadata": { 87 | "collapsed": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "# Define some operations\n", 92 | "add = tf.add(a, b)\n", 93 | "mul = tf.mul(a, b)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 7, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "Addition with variables: 5\n", 108 | "Multiplication with variables: 6\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "# Launch the default graph.\n", 114 | "with tf.Session() as sess:\n", 115 | " # Run every operation with variable input\n", 116 | " print \"Addition with variables: %i\" % sess.run(add, feed_dict={a: 2, b: 3})\n", 117 | " print \"Multiplication with variables: %i\" % sess.run(mul, feed_dict={a: 2, b: 3})" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 8, 123 | "metadata": { 124 | "collapsed": true 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "# ----------------\n", 129 | "# More in details:\n", 130 | "# Matrix Multiplication from TensorFlow official tutorial\n", 131 | "\n", 132 | "# Create a Constant op that produces a 1x2 matrix. The op is\n", 133 | "# added as a node to the default graph.\n", 134 | "#\n", 135 | "# The value returned by the constructor represents the output\n", 136 | "# of the Constant op.\n", 137 | "matrix1 = tf.constant([[3., 3.]])" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 9, 143 | "metadata": { 144 | "collapsed": true 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "# Create another Constant that produces a 2x1 matrix.\n", 149 | "matrix2 = tf.constant([[2.],[2.]])" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 10, 155 | "metadata": { 156 | "collapsed": true 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "# Create a Matmul op that takes 'matrix1' and 'matrix2' as inputs.\n", 161 | "# The returned value, 'product', represents the result of the matrix\n", 162 | "# multiplication.\n", 163 | "product = tf.matmul(matrix1, matrix2)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 11, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [ 173 | { 174 | "name": "stdout", 175 | "output_type": "stream", 176 | "text": [ 177 | "[[ 12.]]\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "# To run the matmul op we call the session 'run()' method, passing 'product'\n", 183 | "# which represents the output of the matmul op. This indicates to the call\n", 184 | "# that we want to get the output of the matmul op back.\n", 185 | "#\n", 186 | "# All inputs needed by the op are run automatically by the session. 
They\n", 187 | "# typically are run in parallel.\n", 188 | "#\n", 189 | "# The call 'run(product)' thus causes the execution of threes ops in the\n", 190 | "# graph: the two constants and matmul.\n", 191 | "#\n", 192 | "# The output of the op is returned in 'result' as a numpy `ndarray` object.\n", 193 | "with tf.Session() as sess:\n", 194 | " result = sess.run(product)\n", 195 | " print result" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "collapsed": true 203 | }, 204 | "outputs": [], 205 | "source": [] 206 | } 207 | ], 208 | "metadata": { 209 | "kernelspec": { 210 | "display_name": "Python 2", 211 | "language": "python", 212 | "name": "python2" 213 | }, 214 | "language_info": { 215 | "codemirror_mode": { 216 | "name": "ipython", 217 | "version": 2 218 | }, 219 | "file_extension": ".py", 220 | "mimetype": "text/x-python", 221 | "name": "python", 222 | "nbconvert_exporter": "python", 223 | "pygments_lexer": "ipython2", 224 | "version": "2.7.5+" 225 | } 226 | }, 227 | "nbformat": 4, 228 | "nbformat_minor": 0 229 | } 230 | -------------------------------------------------------------------------------- /deep-learning/tensor-flow-examples/notebooks/2_basic_classifiers/logistic_regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Logistic Regression in TensorFlow\n", 10 | "\n", 11 | "Credits: Forked from [TensorFlow-Examples](https://github.com/aymericdamien/TensorFlow-Examples) by Aymeric Damien\n", 12 | "\n", 13 | "## Setup\n", 14 | "\n", 15 | "Refer to the [setup instructions](http://nbviewer.ipython.org/github/donnemartin/data-science-ipython-notebooks/blob/master/deep-learning/tensor-flow-examples/Setup_TensorFlow.md)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 5, 21 | "metadata": { 22 | "collapsed": false 23 | }, 24 | "outputs": [ 25 | { 26 | "name": "stdout", 27 | "output_type": "stream", 28 | "text": [ 29 | "Extracting /tmp/data/train-images-idx3-ubyte.gz\n", 30 | "Extracting /tmp/data/train-labels-idx1-ubyte.gz\n", 31 | "Extracting /tmp/data/t10k-images-idx3-ubyte.gz\n", 32 | "Extracting /tmp/data/t10k-labels-idx1-ubyte.gz\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "# Import MINST data\n", 38 | "import input_data\n", 39 | "mnist = input_data.read_data_sets(\"/tmp/data/\", one_hot=True)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 6, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "import tensorflow as tf" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 7, 56 | "metadata": { 57 | "collapsed": true 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "# Parameters\n", 62 | "learning_rate = 0.01\n", 63 | "training_epochs = 25\n", 64 | "batch_size = 100\n", 65 | "display_step = 1" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 8, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "# tf Graph Input\n", 77 | "x = tf.placeholder(\"float\", [None, 784]) # mnist data image of shape 28*28=784\n", 78 | "y = tf.placeholder(\"float\", [None, 10]) # 0-9 digits recognition => 10 classes" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 9, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "# Create model\n", 90 | 
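"# Note: zero-initialising W and b is fine here, since softmax regression is a single linear layer with a convex loss and needs no symmetry breaking.\n",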
"\n", 91 | "# Set model weights\n", 92 | "W = tf.Variable(tf.zeros([784, 10]))\n", 93 | "b = tf.Variable(tf.zeros([10]))" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 10, 99 | "metadata": { 100 | "collapsed": true 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "# Construct model\n", 105 | "activation = tf.nn.softmax(tf.matmul(x, W) + b) # Softmax" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 11, 111 | "metadata": { 112 | "collapsed": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "# Minimize error using cross entropy\n", 117 | "# Cross entropy\n", 118 | "cost = -tf.reduce_sum(y*tf.log(activation)) \n", 119 | "# Gradient Descent\n", 120 | "optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost) " 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 12, 126 | "metadata": { 127 | "collapsed": true 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "# Initializing the variables\n", 132 | "init = tf.initialize_all_variables()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 13, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "Epoch: 0001 cost= 29.860479714\n", 147 | "Epoch: 0002 cost= 22.080549484\n", 148 | "Epoch: 0003 cost= 21.237104595\n", 149 | "Epoch: 0004 cost= 20.460196280\n", 150 | "Epoch: 0005 cost= 20.185128237\n", 151 | "Epoch: 0006 cost= 19.940297202\n", 152 | "Epoch: 0007 cost= 19.645111119\n", 153 | "Epoch: 0008 cost= 19.507218031\n", 154 | "Epoch: 0009 cost= 19.389794492\n", 155 | "Epoch: 0010 cost= 19.177005816\n", 156 | "Epoch: 0011 cost= 19.082493615\n", 157 | "Epoch: 0012 cost= 19.072873598\n", 158 | "Epoch: 0013 cost= 18.938005402\n", 159 | "Epoch: 0014 cost= 18.891806430\n", 160 | "Epoch: 0015 cost= 18.839480221\n", 161 | "Epoch: 0016 cost= 18.769349510\n", 162 | "Epoch: 0017 cost= 18.590865587\n", 163 | "Epoch: 0018 cost= 18.623413677\n", 164 | "Epoch: 0019 cost= 18.546149085\n", 165 | "Epoch: 0020 cost= 18.432274895\n", 166 | "Epoch: 0021 cost= 18.358189004\n", 167 | "Epoch: 0022 cost= 18.380014628\n", 168 | "Epoch: 0023 cost= 18.499993471\n", 169 | "Epoch: 0024 cost= 18.386477311\n", 170 | "Epoch: 0025 cost= 18.258080609\n", 171 | "Optimization Finished!\n", 172 | "Accuracy: 0.9048\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "# Launch the graph\n", 178 | "with tf.Session() as sess:\n", 179 | " sess.run(init)\n", 180 | "\n", 181 | " # Training cycle\n", 182 | " for epoch in range(training_epochs):\n", 183 | " avg_cost = 0.\n", 184 | " total_batch = int(mnist.train.num_examples/batch_size)\n", 185 | " # Loop over all batches\n", 186 | " for i in range(total_batch):\n", 187 | " batch_xs, batch_ys = mnist.train.next_batch(batch_size)\n", 188 | " # Fit training using batch data\n", 189 | " sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})\n", 190 | " # Compute average loss\n", 191 | " avg_cost += sess.run(cost, feed_dict={x: batch_xs, y: batch_ys})/total_batch\n", 192 | " # Display logs per epoch step\n", 193 | " if epoch % display_step == 0:\n", 194 | " print \"Epoch:\", '%04d' % (epoch+1), \"cost=\", \"{:.9f}\".format(avg_cost)\n", 195 | "\n", 196 | " print \"Optimization Finished!\"\n", 197 | "\n", 198 | " # Test model\n", 199 | " correct_prediction = tf.equal(tf.argmax(activation, 1), tf.argmax(y, 1))\n", 200 | " # Calculate accuracy\n", 201 | " accuracy = 
tf.reduce_mean(tf.cast(correct_prediction, \"float\"))\n", 202 | " print \"Accuracy:\", accuracy.eval({x: mnist.test.images, y: mnist.test.labels})" 203 | ] 204 | } 205 | ], 206 | "metadata": { 207 | "kernelspec": { 208 | "display_name": "Python 3", 209 | "language": "python", 210 | "name": "python3" 211 | }, 212 | "language_info": { 213 | "codemirror_mode": { 214 | "name": "ipython", 215 | "version": 3 216 | }, 217 | "file_extension": ".py", 218 | "mimetype": "text/x-python", 219 | "name": "python", 220 | "nbconvert_exporter": "python", 221 | "pygments_lexer": "ipython3", 222 | "version": "3.4.3" 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 0 227 | } 228 | -------------------------------------------------------------------------------- /deep-learning/tensor-flow-examples/notebooks/4_multi_gpu/multigpu_basics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Basic Multi GPU Computation in TensorFlow\n", 8 | "\n", 9 | "Credits: Forked from [TensorFlow-Examples](https://github.com/aymericdamien/TensorFlow-Examples) by Aymeric Damien\n", 10 | "\n", 11 | "## Setup\n", 12 | "\n", 13 | "Refer to the [setup instructions](http://nbviewer.ipython.org/github/donnemartin/data-science-ipython-notebooks/blob/master/deep-learning/tensor-flow-examples/Setup_TensorFlow.md)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "source": [ 22 | "This tutorial requires your machine to have 2 GPUs\n", 23 | "* \"/cpu:0\": The CPU of your machine.\n", 24 | "* \"/gpu:0\": The first GPU of your machine\n", 25 | "* \"/gpu:1\": The second GPU of your machine\n", 26 | "* For this example, we are using 2 GTX-980" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import numpy as np\n", 38 | "import tensorflow as tf\n", 39 | "import datetime" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "#Processing Units logs\n", 51 | "log_device_placement = True\n", 52 | "\n", 53 | "#num of multiplications to perform\n", 54 | "n = 10" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "# Example: compute A^n + B^n on 2 GPUs\n", 66 | "\n", 67 | "# Create random large matrix\n", 68 | "A = np.random.rand(1e4, 1e4).astype('float32')\n", 69 | "B = np.random.rand(1e4, 1e4).astype('float32')\n", 70 | "\n", 71 | "# Creates a graph to store results\n", 72 | "c1 = []\n", 73 | "c2 = []\n", 74 | "\n", 75 | "# Define matrix power\n", 76 | "def matpow(M, n):\n", 77 | " if n < 1: #Abstract cases where n < 1\n", 78 | " return M\n", 79 | " else:\n", 80 | " return tf.matmul(M, matpow(M, n-1))" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 6, 86 | "metadata": { 87 | "collapsed": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "# Single GPU computing\n", 92 | "\n", 93 | "with tf.device('/gpu:0'):\n", 94 | " a = tf.constant(A)\n", 95 | " b = tf.constant(B)\n", 96 | " #compute A^n and B^n and store results in c1\n", 97 | " c1.append(matpow(a, n))\n", 98 | " c1.append(matpow(b, n))\n", 99 | "\n", 100 | "with tf.device('/cpu:0'):\n", 101 | " sum = tf.add_n(c1) 
#Addition of all elements in c1, i.e. A^n + B^n\n", 102 | "\n", 103 | "t1_1 = datetime.datetime.now()\n", 104 | "with tf.Session(config=tf.ConfigProto(log_device_placement=log_device_placement)) as sess:\n", 105 | " # Runs the op.\n", 106 | " sess.run(sum)\n", 107 | "t2_1 = datetime.datetime.now()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 7, 113 | "metadata": { 114 | "collapsed": true 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "# Multi GPU computing\n", 119 | "# GPU:0 computes A^n\n", 120 | "with tf.device('/gpu:0'):\n", 121 | " #compute A^n and store result in c2\n", 122 | " a = tf.constant(A)\n", 123 | " c2.append(matpow(a, n))\n", 124 | "\n", 125 | "#GPU:1 computes B^n\n", 126 | "with tf.device('/gpu:1'):\n", 127 | " #compute B^n and store result in c2\n", 128 | " b = tf.constant(B)\n", 129 | " c2.append(matpow(b, n))\n", 130 | "\n", 131 | "with tf.device('/cpu:0'):\n", 132 | " sum = tf.add_n(c2) #Addition of all elements in c2, i.e. A^n + B^n\n", 133 | "\n", 134 | "t1_2 = datetime.datetime.now()\n", 135 | "with tf.Session(config=tf.ConfigProto(log_device_placement=log_device_placement)) as sess:\n", 136 | " # Runs the op.\n", 137 | " sess.run(sum)\n", 138 | "t2_2 = datetime.datetime.now()" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 8, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "Single GPU computation time: 0:00:11.833497\n", 153 | "Multi GPU computation time: 0:00:07.085913\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "print \"Single GPU computation time: \" + str(t2_1-t1_1)\n", 159 | "print \"Multi GPU computation time: \" + str(t2_2-t1_2)" 160 | ] 161 | } 162 | ], 163 | "metadata": { 164 | "kernelspec": { 165 | "display_name": "Python 3", 166 | "language": "python", 167 | "name": "python3" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | "file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.4.3" 180 | } 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 0 184 | } 185 | -------------------------------------------------------------------------------- /deep-learning/tensor-flow-exercises/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM b.gcr.io/tensorflow/tensorflow:latest 2 | MAINTAINER Vincent Vanhoucke 3 | RUN pip install scikit-learn 4 | ADD *.ipynb /notebooks/ 5 | WORKDIR /notebooks 6 | CMD ["/run_jupyter.sh"] 7 | -------------------------------------------------------------------------------- /deep-learning/tensor-flow-exercises/README.md: -------------------------------------------------------------------------------- 1 | Exercises 2 | =========================================================== 3 | 4 | Building the Docker container 5 | ----------------------------- 6 | 7 | docker build -t $USER/exercises . 
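To keep any edits you make to the notebooks inside the container, one optional variant of the run command in the next section is to mount your local copy of the exercises over the image's /notebooks working directory (the directory set up by the Dockerfile above):

    docker run -v $(pwd):/notebooks -p 8888:8888 -it --rm $USER/exercises

This assumes the current directory holds the *.ipynb exercise files; while the mount is active it hides the copies baked in by the Dockerfile's ADD step.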
8 | 9 | Running the container 10 | --------------------- 11 | 12 | docker run -p 8888:8888 -it --rm $USER/exercises 13 | -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/intro_theano/Makefile: -------------------------------------------------------------------------------- 1 | intro_theano.pdf: slides_source/intro_theano.tex 2 | cd slides_source; pdflatex --shell-escape intro_theano.tex 3 | mv slides_source/intro_theano.pdf . 4 | -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/intro_theano/intro_theano.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/theano-tutorial/intro_theano/intro_theano.pdf -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/intro_theano/utils.py: -------------------------------------------------------------------------------- 1 | """ This file contains different utility functions that are not connected 2 | in anyway to the networks presented in the tutorials, but rather help in 3 | processing the outputs into a more understandable way. 4 | 5 | For example ``tile_raster_images`` helps in generating a easy to grasp 6 | image from a set of samples or weights. 7 | """ 8 | 9 | 10 | import numpy 11 | from six.moves import xrange 12 | 13 | 14 | def scale_to_unit_interval(ndar, eps=1e-8): 15 | """ Scales all values in the ndarray ndar to be between 0 and 1 """ 16 | ndar = ndar.copy() 17 | ndar -= ndar.min() 18 | ndar *= 1.0 / (ndar.max() + eps) 19 | return ndar 20 | 21 | 22 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0), 23 | scale_rows_to_unit_interval=True, 24 | output_pixel_vals=True): 25 | """ 26 | Transform an array with one flattened image per row, into an array in 27 | which images are reshaped and layed out like tiles on a floor. 28 | 29 | This function is useful for visualizing datasets whose rows are images, 30 | and also columns of matrices for transforming those rows 31 | (such as the first layer of a neural net). 32 | 33 | :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can 34 | be 2-D ndarrays or None; 35 | :param X: a 2-D array in which every row is a flattened image. 36 | 37 | :type img_shape: tuple; (height, width) 38 | :param img_shape: the original shape of each image 39 | 40 | :type tile_shape: tuple; (rows, cols) 41 | :param tile_shape: the number of images to tile (rows, cols) 42 | 43 | :param output_pixel_vals: if output should be pixel values (i.e. int8 44 | values) or floats 45 | 46 | :param scale_rows_to_unit_interval: if the values need to be scaled before 47 | being plotted to [0,1] or not 48 | 49 | 50 | :returns: array suitable for viewing as an image. 51 | (See:`Image.fromarray`.) 52 | :rtype: a 2-d array with same dtype as X. 
53 | 54 | """ 55 | 56 | assert len(img_shape) == 2 57 | assert len(tile_shape) == 2 58 | assert len(tile_spacing) == 2 59 | 60 | # The expression below can be re-written in a more C style as 61 | # follows : 62 | # 63 | # out_shape = [0,0] 64 | # out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] - 65 | # tile_spacing[0] 66 | # out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] - 67 | # tile_spacing[1] 68 | out_shape = [ 69 | (ishp + tsp) * tshp - tsp 70 | for ishp, tshp, tsp in zip(img_shape, tile_shape, tile_spacing) 71 | ] 72 | 73 | if isinstance(X, tuple): 74 | assert len(X) == 4 75 | # Create an output numpy ndarray to store the image 76 | if output_pixel_vals: 77 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), 78 | dtype='uint8') 79 | else: 80 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), 81 | dtype=X.dtype) 82 | 83 | #colors default to 0, alpha defaults to 1 (opaque) 84 | if output_pixel_vals: 85 | channel_defaults = [0, 0, 0, 255] 86 | else: 87 | channel_defaults = [0., 0., 0., 1.] 88 | 89 | for i in xrange(4): 90 | if X[i] is None: 91 | # if channel is None, fill it with zeros of the correct 92 | # dtype 93 | dt = out_array.dtype 94 | if output_pixel_vals: 95 | dt = 'uint8' 96 | out_array[:, :, i] = numpy.zeros( 97 | out_shape, 98 | dtype=dt 99 | ) + channel_defaults[i] 100 | else: 101 | # use a recurrent call to compute the channel and store it 102 | # in the output 103 | out_array[:, :, i] = tile_raster_images( 104 | X[i], img_shape, tile_shape, tile_spacing, 105 | scale_rows_to_unit_interval, output_pixel_vals) 106 | return out_array 107 | 108 | else: 109 | # if we are dealing with only one channel 110 | H, W = img_shape 111 | Hs, Ws = tile_spacing 112 | 113 | # generate a matrix to store the output 114 | dt = X.dtype 115 | if output_pixel_vals: 116 | dt = 'uint8' 117 | out_array = numpy.zeros(out_shape, dtype=dt) 118 | 119 | for tile_row in xrange(tile_shape[0]): 120 | for tile_col in xrange(tile_shape[1]): 121 | if tile_row * tile_shape[1] + tile_col < X.shape[0]: 122 | this_x = X[tile_row * tile_shape[1] + tile_col] 123 | if scale_rows_to_unit_interval: 124 | # if we should scale values to be between 0 and 1 125 | # do this by calling the `scale_to_unit_interval` 126 | # function 127 | this_img = scale_to_unit_interval( 128 | this_x.reshape(img_shape)) 129 | else: 130 | this_img = this_x.reshape(img_shape) 131 | # add the slice to the corresponding position in the 132 | # output array 133 | c = 1 134 | if output_pixel_vals: 135 | c = 255 136 | out_array[ 137 | tile_row * (H + Hs): tile_row * (H + Hs) + H, 138 | tile_col * (W + Ws): tile_col * (W + Ws) + W 139 | ] = this_img * c 140 | return out_array 141 | -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/rnn_tutorial/Makefile: -------------------------------------------------------------------------------- 1 | all: instruction.pdf rnn_lstm.pdf 2 | 3 | instruction.pdf: slides_source/instruction.tex 4 | cd slides_source; pdflatex --shell-escape instruction.tex 5 | cd slides_source; pdflatex --shell-escape instruction.tex 6 | cd slides_source; pdflatex --shell-escape instruction.tex 7 | mv slides_source/instruction.pdf . 8 | 9 | rnn_lstm.pdf: slides_source/rnn_lstm.tex 10 | cd slides_source; pdflatex --shell-escape rnn_lstm.tex 11 | cd slides_source; pdflatex --shell-escape rnn_lstm.tex 12 | cd slides_source; pdflatex --shell-escape rnn_lstm.tex 13 | mv slides_source/rnn_lstm.pdf . 
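As a quick usage sketch for the `tile_raster_images` helper in `intro_theano/utils.py` above: the snippet below tiles 100 random 28x28 patches into one image array. The shapes and the 10x10 grid are invented for illustration, and it assumes `utils.py` is importable from the current directory.

```python
import numpy

from utils import tile_raster_images  # helper defined in intro_theano/utils.py

# 100 fake "filters", one flattened 28x28 patch per row.
rng = numpy.random.RandomState(0)
W = rng.rand(100, 28 * 28)

# Lay the patches out on a 10x10 grid with 1 pixel of spacing between tiles.
tiled = tile_raster_images(
    X=W,
    img_shape=(28, 28),
    tile_shape=(10, 10),
    tile_spacing=(1, 1),
)

print(tiled.shape)  # (289, 289): (28 + 1) * 10 - 1 pixels per side
print(tiled.dtype)  # uint8, since output_pixel_vals defaults to True
```

The resulting array can then be viewed with `Image.fromarray(tiled)`, as the docstring suggests, which is the usual way to inspect first-layer weight matrices as a grid of images.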
14 | -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/rnn_tutorial/instruction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/theano-tutorial/rnn_tutorial/instruction.pdf -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/rnn_tutorial/rnn_lstm.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/theano-tutorial/rnn_tutorial/rnn_lstm.pdf -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/rnn_tutorial/synthetic.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import numpy as np 3 | 4 | 5 | def mackey_glass(sample_len=1000, tau=17, seed=None, n_samples = 1): 6 | ''' 7 | mackey_glass(sample_len=1000, tau=17, seed = None, n_samples = 1) -> input 8 | Generate the Mackey Glass time-series. Parameters are: 9 | - sample_len: length of the time-series in timesteps. Default is 1000. 10 | - tau: delay of the MG - system. Commonly used values are tau=17 (mild 11 | chaos) and tau=30 (moderate chaos). Default is 17. 12 | - seed: to seed the random generator, can be used to generate the same 13 | timeseries at each invocation. 14 | - n_samples : number of samples to generate 15 | ''' 16 | delta_t = 10 17 | history_len = tau * delta_t 18 | # Initial conditions for the history of the system 19 | timeseries = 1.2 20 | 21 | if seed is not None: 22 | np.random.seed(seed) 23 | 24 | samples = [] 25 | 26 | for _ in range(n_samples): 27 | history = collections.deque(1.2 * np.ones(history_len) + 0.2 * \ 28 | (np.random.rand(history_len) - 0.5)) 29 | # Preallocate the array for the time-series 30 | inp = np.zeros((sample_len,1)) 31 | 32 | for timestep in range(sample_len): 33 | for _ in range(delta_t): 34 | xtau = history.popleft() 35 | history.append(timeseries) 36 | timeseries = history[-1] + (0.2 * xtau / (1.0 + xtau ** 10) - \ 37 | 0.1 * history[-1]) / delta_t 38 | inp[timestep] = timeseries 39 | 40 | # Squash timeseries through tanh 41 | inp = np.tanh(inp - 1) 42 | samples.append(inp) 43 | return samples 44 | 45 | 46 | def mso(sample_len=1000, n_samples = 1): 47 | ''' 48 | mso(sample_len=1000, n_samples = 1) -> input 49 | Generate the Multiple Sinewave Oscillator time-series, a sum of two sines 50 | with incommensurable periods. Parameters are: 51 | - sample_len: length of the time-series in timesteps 52 | - n_samples: number of samples to generate 53 | ''' 54 | signals = [] 55 | for _ in range(n_samples): 56 | phase = np.random.rand() 57 | x = np.atleast_2d(np.arange(sample_len)).T 58 | signals.append(np.sin(0.2 * x + phase) + np.sin(0.311 * x + phase)) 59 | return signals 60 | 61 | 62 | def lorentz(sample_len=1000, sigma=10, rho=28, beta=8 / 3, step=0.01): 63 | """This function generates a Lorentz time series of length sample_len, 64 | with standard parameters sigma, rho and beta. 65 | """ 66 | 67 | x = np.zeros([sample_len]) 68 | y = np.zeros([sample_len]) 69 | z = np.zeros([sample_len]) 70 | 71 | # Initial conditions taken from 'Chaos and Time Series Analysis', J. 
Sprott 72 | x[0] = 0; 73 | y[0] = -0.01; 74 | z[0] = 9; 75 | 76 | for t in range(sample_len - 1): 77 | x[t + 1] = x[t] + sigma * (y[t] - x[t]) * step 78 | y[t + 1] = y[t] + (x[t] * (rho - z[t]) - y[t]) * step 79 | z[t + 1] = z[t] + (x[t] * y[t] - beta * z[t]) * step 80 | 81 | x.shape += (1,) 82 | y.shape += (1,) 83 | z.shape += (1,) 84 | 85 | return np.concatenate((x, y, z), axis=1) 86 | -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/scan_tutorial/scan_ex1_solution.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | import numpy as np 4 | 5 | coefficients = T.vector("coefficients") 6 | x = T.scalar("x") 7 | max_coefficients_supported = 10000 8 | 9 | 10 | def step(coeff, power, prior_value, free_var): 11 | return prior_value + (coeff * (free_var ** power)) 12 | 13 | # Generate the components of the polynomial 14 | full_range = T.arange(max_coefficients_supported) 15 | outputs_info = np.zeros((), dtype=theano.config.floatX) 16 | 17 | components, updates = theano.scan(fn=step, 18 | sequences=[coefficients, full_range], 19 | outputs_info=outputs_info, 20 | non_sequences=x) 21 | 22 | polynomial = components[-1] 23 | calculate_polynomial = theano.function(inputs=[coefficients, x], 24 | outputs=polynomial, 25 | updates=updates) 26 | 27 | test_coeff = np.asarray([1, 0, 2], dtype=theano.config.floatX) 28 | print(calculate_polynomial(test_coeff, 3)) 29 | -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/scan_tutorial/scan_ex2_solution.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | import numpy as np 4 | 5 | probabilities = T.vector() 6 | nb_samples = T.iscalar() 7 | 8 | rng = T.shared_randomstreams.RandomStreams(1234) 9 | 10 | 11 | def sample_from_pvect(pvect): 12 | """ Provided utility function: given a symbolic vector of 13 | probabilities (which MUST sum to 1), sample one element 14 | and return its index. 15 | """ 16 | onehot_sample = rng.multinomial(n=1, pvals=pvect) 17 | sample = onehot_sample.argmax() 18 | return sample 19 | 20 | 21 | def set_p_to_zero(pvect, i): 22 | """ Provided utility function: given a symbolic vector of 23 | probabilities and an index 'i', set the probability of the 24 | i-th element to 0 and renormalize the probabilities so they 25 | sum to 1. 26 | """ 27 | new_pvect = T.set_subtensor(pvect[i], 0.) 
28 | new_pvect = new_pvect / new_pvect.sum() 29 | return new_pvect 30 | 31 | 32 | def step(p): 33 | sample = sample_from_pvect(p) 34 | new_p = set_p_to_zero(p, sample) 35 | return new_p, sample 36 | 37 | output, updates = theano.scan(fn=step, 38 | outputs_info=[probabilities, None], 39 | n_steps=nb_samples) 40 | 41 | modified_probabilities, samples = output 42 | 43 | f = theano.function(inputs=[probabilities, nb_samples], 44 | outputs=[samples], 45 | updates=updates) 46 | 47 | # Testing the function 48 | test_probs = np.asarray([0.6, 0.3, 0.1], dtype=theano.config.floatX) 49 | for i in range(10): 50 | print(f(test_probs, 2)) 51 | -------------------------------------------------------------------------------- /images/README.sketch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/README.sketch -------------------------------------------------------------------------------- /images/README_1200x800.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/README_1200x800.gif -------------------------------------------------------------------------------- /images/aws.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/aws.png -------------------------------------------------------------------------------- /images/commands.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/commands.png -------------------------------------------------------------------------------- /images/cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/cover.png -------------------------------------------------------------------------------- /images/coversmall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/coversmall.png -------------------------------------------------------------------------------- /images/coversmall_alt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/coversmall_alt.png -------------------------------------------------------------------------------- /images/deep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/deep.png -------------------------------------------------------------------------------- /images/k-means.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/k-means.gif 
-------------------------------------------------------------------------------- /images/kaggle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/kaggle.png -------------------------------------------------------------------------------- /images/keras.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/keras.jpg -------------------------------------------------------------------------------- /images/matplotlib.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/matplotlib.png -------------------------------------------------------------------------------- /images/mrjob.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/mrjob.png -------------------------------------------------------------------------------- /images/numpy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/numpy.png -------------------------------------------------------------------------------- /images/pandas.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/pandas.png -------------------------------------------------------------------------------- /images/python.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/python.png -------------------------------------------------------------------------------- /images/regex-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/regex-1.png -------------------------------------------------------------------------------- /images/regex-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/regex-2.png -------------------------------------------------------------------------------- /images/scikitlearn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/scikitlearn.png -------------------------------------------------------------------------------- /images/scipy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/scipy.png 
-------------------------------------------------------------------------------- /images/spark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/spark.png -------------------------------------------------------------------------------- /images/svm.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/svm.gif -------------------------------------------------------------------------------- /images/tensorflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/tensorflow.png -------------------------------------------------------------------------------- /images/theano.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/theano.png -------------------------------------------------------------------------------- /kaggle/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/kaggle/__init__.py -------------------------------------------------------------------------------- /mapreduce/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/mapreduce/__init__.py -------------------------------------------------------------------------------- /mapreduce/mr_s3_log_parser.py: -------------------------------------------------------------------------------- 1 | 2 | import time 3 | from mrjob.job import MRJob 4 | from mrjob.protocol import RawValueProtocol, ReprProtocol 5 | import re 6 | 7 | 8 | class MrS3LogParser(MRJob): 9 | """Parses the logs from S3 based on the S3 logging format: 10 | http://docs.aws.amazon.com/AmazonS3/latest/dev/LogFormat.html 11 | 12 | Aggregates a user's daily requests by user agent and operation 13 | 14 | Outputs date_time, requester, user_agent, operation, count 15 | """ 16 | 17 | LOGPATS = r'(\S+) (\S+) \[(.*?)\] (\S+) (\S+) ' \ 18 | r'(\S+) (\S+) (\S+) ("([^"]+)"|-) ' \ 19 | r'(\S+) (\S+) (\S+) (\S+) (\S+) (\S+) ' \ 20 | r'("([^"]+)"|-) ("([^"]+)"|-)' 21 | NUM_ENTRIES_PER_LINE = 17 22 | logpat = re.compile(LOGPATS) 23 | 24 | (S3_LOG_BUCKET_OWNER, 25 | S3_LOG_BUCKET, 26 | S3_LOG_DATE_TIME, 27 | S3_LOG_IP, 28 | S3_LOG_REQUESTER_ID, 29 | S3_LOG_REQUEST_ID, 30 | S3_LOG_OPERATION, 31 | S3_LOG_KEY, 32 | S3_LOG_HTTP_METHOD, 33 | S3_LOG_HTTP_STATUS, 34 | S3_LOG_S3_ERROR, 35 | S3_LOG_BYTES_SENT, 36 | S3_LOG_OBJECT_SIZE, 37 | S3_LOG_TOTAL_TIME, 38 | S3_LOG_TURN_AROUND_TIME, 39 | S3_LOG_REFERER, 40 | S3_LOG_USER_AGENT) = range(NUM_ENTRIES_PER_LINE) 41 | 42 | DELIMITER = '\t' 43 | 44 | # We use RawValueProtocol for input to be format agnostic 45 | # and avoid any type of parsing errors 46 | INPUT_PROTOCOL = RawValueProtocol 47 | 48 | # We use RawValueProtocol for output so we can output raw lines 49 | # instead of (k, v) pairs 50 | OUTPUT_PROTOCOL = 
RawValueProtocol 51 | 52 | # Encode the intermediate records using repr() instead of JSON, so the 53 | # record doesn't get Unicode-encoded 54 | INTERNAL_PROTOCOL = ReprProtocol 55 | 56 | def clean_date_time_zone(self, raw_date_time_zone): 57 | """Converts entry 22/Jul/2013:21:04:17 +0000 to the format 58 | 'YYYY-MM-DD HH:MM:SS' which is more suitable for loading into 59 | a database such as Redshift or RDS 60 | 61 | Note: requires the chars "[ ]" to be stripped prior to input 62 | Returns the converted datetime annd timezone 63 | or None for both values if failed 64 | 65 | TODO: Needs to combine timezone with date as one field 66 | """ 67 | date_time = None 68 | time_zone_parsed = None 69 | 70 | # TODO: Probably cleaner to parse this with a regex 71 | date_parsed = raw_date_time_zone[:raw_date_time_zone.find(":")] 72 | time_parsed = raw_date_time_zone[raw_date_time_zone.find(":") + 1: 73 | raw_date_time_zone.find("+") - 1] 74 | time_zone_parsed = raw_date_time_zone[raw_date_time_zone.find("+"):] 75 | 76 | try: 77 | date_struct = time.strptime(date_parsed, "%d/%b/%Y") 78 | converted_date = time.strftime("%Y-%m-%d", date_struct) 79 | date_time = converted_date + " " + time_parsed 80 | 81 | # Throws a ValueError exception if the operation fails that is 82 | # caught by the calling function and is handled appropriately 83 | except ValueError as error: 84 | raise ValueError(error) 85 | else: 86 | return converted_date, date_time, time_zone_parsed 87 | 88 | def mapper(self, _, line): 89 | line = line.strip() 90 | match = self.logpat.search(line) 91 | 92 | date_time = None 93 | requester = None 94 | user_agent = None 95 | operation = None 96 | 97 | try: 98 | for n in range(self.NUM_ENTRIES_PER_LINE): 99 | group = match.group(1 + n) 100 | 101 | if n == self.S3_LOG_DATE_TIME: 102 | date, date_time, time_zone_parsed = \ 103 | self.clean_date_time_zone(group) 104 | # Leave the following line of code if 105 | # you want to aggregate by date 106 | date_time = date + " 00:00:00" 107 | elif n == self.S3_LOG_REQUESTER_ID: 108 | requester = group 109 | elif n == self.S3_LOG_USER_AGENT: 110 | user_agent = group 111 | elif n == self.S3_LOG_OPERATION: 112 | operation = group 113 | else: 114 | pass 115 | 116 | except Exception: 117 | yield (("Error while parsing line: %s", line), 1) 118 | else: 119 | yield ((date_time, requester, user_agent, operation), 1) 120 | 121 | def reducer(self, key, values): 122 | output = list(key) 123 | output = self.DELIMITER.join(output) + \ 124 | self.DELIMITER + \ 125 | str(sum(values)) 126 | 127 | yield None, output 128 | 129 | def steps(self): 130 | return [ 131 | self.mr(mapper=self.mapper, 132 | reducer=self.reducer) 133 | ] 134 | 135 | 136 | if __name__ == '__main__': 137 | MrS3LogParser.run() -------------------------------------------------------------------------------- /mapreduce/test_mr_s3_log_parser.py: -------------------------------------------------------------------------------- 1 | 2 | from StringIO import StringIO 3 | import unittest2 as unittest 4 | from mr_s3_log_parser import MrS3LogParser 5 | 6 | 7 | class MrTestsUtil: 8 | 9 | def run_mr_sandbox(self, mr_job, stdin): 10 | # inline runs the job in the same process so small jobs tend to 11 | # run faster and stack traces are simpler 12 | # --no-conf prevents options from local mrjob.conf from polluting 13 | # the testing environment 14 | # "-" reads from standard in 15 | mr_job.sandbox(stdin=stdin) 16 | 17 | # make_runner ensures job cleanup is performed regardless of 18 | # success or failure 19 | with 
mr_job.make_runner() as runner: 20 | runner.run() 21 | for line in runner.stream_output(): 22 | key, value = mr_job.parse_output_line(line) 23 | yield value 24 | 25 | 26 | class TestMrS3LogParser(unittest.TestCase): 27 | 28 | mr_job = None 29 | mr_tests_util = None 30 | 31 | RAW_LOG_LINE_INVALID = \ 32 | '00000fe9688b6e57f75bd2b7f7c1610689e8f01000000' \ 33 | '00000388225bcc00000 ' \ 34 | 's3-storage [22/Jul/2013:21:03:27 +0000] ' \ 35 | '00.111.222.33 ' \ 36 | 37 | RAW_LOG_LINE_VALID = \ 38 | '00000fe9688b6e57f75bd2b7f7c1610689e8f01000000' \ 39 | '00000388225bcc00000 ' \ 40 | 's3-storage [22/Jul/2013:21:03:27 +0000] ' \ 41 | '00.111.222.33 ' \ 42 | 'arn:aws:sts::000005646931:federated-user/user 00000AB825500000 ' \ 43 | 'REST.HEAD.OBJECT user/file.pdf ' \ 44 | '"HEAD /user/file.pdf?versionId=00000XMHZJp6DjM9x500000' \ 45 | '00000SDZk ' \ 46 | 'HTTP/1.1" 200 - - 4000272 18 - "-" ' \ 47 | '"Boto/2.5.1 (darwin) USER-AGENT/1.0.14.0" ' \ 48 | '00000XMHZJp6DjM9x5JVEAMo8MG00000' 49 | 50 | DATE_TIME_ZONE_INVALID = "AB/Jul/2013:21:04:17 +0000" 51 | DATE_TIME_ZONE_VALID = "22/Jul/2013:21:04:17 +0000" 52 | DATE_VALID = "2013-07-22" 53 | DATE_TIME_VALID = "2013-07-22 21:04:17" 54 | TIME_ZONE_VALID = "+0000" 55 | 56 | def __init__(self, *args, **kwargs): 57 | super(TestMrS3LogParser, self).__init__(*args, **kwargs) 58 | self.mr_job = MrS3LogParser(['-r', 'inline', '--no-conf', '-']) 59 | self.mr_tests_util = MrTestsUtil() 60 | 61 | def test_invalid_log_lines(self): 62 | stdin = StringIO(self.RAW_LOG_LINE_INVALID) 63 | 64 | for result in self.mr_tests_util.run_mr_sandbox(self.mr_job, stdin): 65 | self.assertEqual(result.find("Error"), 0) 66 | 67 | def test_valid_log_lines(self): 68 | stdin = StringIO(self.RAW_LOG_LINE_VALID) 69 | 70 | for result in self.mr_tests_util.run_mr_sandbox(self.mr_job, stdin): 71 | self.assertEqual(result.find("Error"), -1) 72 | 73 | def test_clean_date_time_zone(self): 74 | date, date_time, time_zone_parsed = \ 75 | self.mr_job.clean_date_time_zone(self.DATE_TIME_ZONE_VALID) 76 | self.assertEqual(date, self.DATE_VALID) 77 | self.assertEqual(date_time, self.DATE_TIME_VALID) 78 | self.assertEqual(time_zone_parsed, self.TIME_ZONE_VALID) 79 | 80 | # Use a lambda to delay the calling of clean_date_time_zone so that 81 | # assertRaises has enough time to handle it properly 82 | self.assertRaises(ValueError, 83 | lambda: self.mr_job.clean_date_time_zone( 84 | self.DATE_TIME_ZONE_INVALID)) 85 | 86 | if __name__ == '__main__': 87 | unittest.main() 88 | -------------------------------------------------------------------------------- /matplotlib/04.15-Further-Resources.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "\n", 9 | "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", 10 | "\n", 11 | "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*\n", 12 | "\n", 13 | "*No changes were made to the contents of this notebook from the original.*" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "\n", 21 | "< [Visualization with Seaborn](04.14-Visualization-With-Seaborn.ipynb) | [Contents](Index.ipynb) | [Machine Learning](05.00-Machine-Learning.ipynb) >" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# Further Resources" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Matplotlib Resources\n", 36 | "\n", 37 | "A single chapter in a book can never hope to cover all the available features and plot types available in Matplotlib.\n", 38 | "As with other packages we've seen, liberal use of IPython's tab-completion and help functions (see [Help and Documentation in IPython](01.01-Help-And-Documentation.ipynb)) can be very helpful when exploring Matplotlib's API.\n", 39 | "In addition, Matplotlib’s [online documentation](http://matplotlib.org/) can be a helpful reference.\n", 40 | "See in particular the [Matplotlib gallery](http://matplotlib.org/gallery.html) linked on that page: it shows thumbnails of hundreds of different plot types, each one linked to a page with the Python code snippet used to generate it.\n", 41 | "In this way, you can visually inspect and learn about a wide range of different plotting styles and visualization techniques.\n", 42 | "\n", 43 | "For a book-length treatment of Matplotlib, I would recommend [*Interactive Applications Using Matplotlib*](https://www.packtpub.com/application-development/interactive-applications-using-matplotlib), written by Matplotlib core developer Ben Root." 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Other Python Graphics Libraries\n", 51 | "\n", 52 | "Although Matplotlib is the most prominent Python visualization library, there are other more modern tools that are worth exploring as well.\n", 53 | "I'll mention a few of them briefly here:\n", 54 | "\n", 55 | "- [Bokeh](http://bokeh.pydata.org) is a JavaScript visualization library with a Python frontend that creates highly interactive visualizations capable of handling very large and/or streaming datasets. The Python front-end outputs a JSON data structure that can be interpreted by the Bokeh JS engine.\n", 56 | "- [Plotly](http://plot.ly) is the eponymous open source product of the Plotly company, and is similar in spirit to Bokeh. Because Plotly is the main product of a startup, it is receiving a high level of development effort. Use of the library is entirely free.\n", 57 | "- [Vispy](http://vispy.org/) is an actively developed project focused on dynamic visualizations of very large datasets. Because it is built to target OpenGL and make use of efficient graphics processors in your computer, it is able to render some quite large and stunning visualizations.\n", 58 | "- [Vega](https://vega.github.io/) and [Vega-Lite](https://vega.github.io/vega-lite) are declarative graphics representations, and are the product of years of research into the fundamental language of data visualization. The reference rendering implementation is JavaScript, but the API is language agnostic. There is a Python API under development in the [Altair](https://altair-viz.github.io/) package. 
Though as of summer 2016 it's not yet fully mature, I'm quite excited for the possibilities of this project to provide a common reference point for visualization in Python and other languages.\n", 59 | "\n", 60 | "The visualization space in the Python community is very dynamic, and I fully expect this list to be out of date as soon as it is published.\n", 61 | "Keep an eye out for what's coming in the future!" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "\n", 69 | "< [Visualization with Seaborn](04.14-Visualization-With-Seaborn.ipynb) | [Contents](Index.ipynb) | [Machine Learning](05.00-Machine-Learning.ipynb) >" 70 | ] 71 | } 72 | ], 73 | "metadata": { 74 | "kernelspec": { 75 | "display_name": "Python 3", 76 | "language": "python", 77 | "name": "python3" 78 | }, 79 | "language_info": { 80 | "codemirror_mode": { 81 | "name": "ipython", 82 | "version": 3 83 | }, 84 | "file_extension": ".py", 85 | "mimetype": "text/x-python", 86 | "name": "python", 87 | "nbconvert_exporter": "python", 88 | "pygments_lexer": "ipython3", 89 | "version": "3.4.3" 90 | } 91 | }, 92 | "nbformat": 4, 93 | "nbformat_minor": 0 94 | } 95 | -------------------------------------------------------------------------------- /matplotlib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/matplotlib/__init__.py -------------------------------------------------------------------------------- /matplotlib/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/matplotlib/tests/__init__.py -------------------------------------------------------------------------------- /misc/regex.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Regex\n", 8 | "\n", 9 | "Credits: Material from [Regular Expressions Cheat Sheet](http://www.cheatography.com/davechild/cheat-sheets/regular-expressions/) by Dave Child\n", 10 | "\n", 11 | "Use with http://www.regexr.com to generate regular expressions." 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "

(regex cheat sheet images)\n", 19 | " \n", 20 | " \n", 21 | "
\n", 22 | "

" 23 | ] 24 | } 25 | ], 26 | "metadata": { 27 | "kernelspec": { 28 | "display_name": "Python 2", 29 | "language": "python", 30 | "name": "python2" 31 | }, 32 | "language_info": { 33 | "codemirror_mode": { 34 | "name": "ipython", 35 | "version": 2 36 | }, 37 | "file_extension": ".py", 38 | "mimetype": "text/x-python", 39 | "name": "python", 40 | "nbconvert_exporter": "python", 41 | "pygments_lexer": "ipython2", 42 | "version": "2.7.10" 43 | } 44 | }, 45 | "nbformat": 4, 46 | "nbformat_minor": 0 47 | } 48 | -------------------------------------------------------------------------------- /numpy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/numpy/__init__.py -------------------------------------------------------------------------------- /numpy/figures/02.05-broadcasting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/numpy/figures/02.05-broadcasting.png -------------------------------------------------------------------------------- /numpy/figures/PDSH-cover-small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/numpy/figures/PDSH-cover-small.png -------------------------------------------------------------------------------- /numpy/figures/array_vs_list.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/numpy/figures/array_vs_list.png -------------------------------------------------------------------------------- /numpy/figures/cint_vs_pyint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/numpy/figures/cint_vs_pyint.png -------------------------------------------------------------------------------- /numpy/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/numpy/tests/__init__.py -------------------------------------------------------------------------------- /pandas/03.00-Introduction-to-Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "\n", 9 | "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", 10 | "\n", 11 | "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). 
If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*\n", 12 | "\n", 13 | "*No changes were made to the contents of this notebook from the original.*" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "\n", 21 | "< [Structured Data: NumPy's Structured Arrays](02.09-Structured-Data-NumPy.ipynb) | [Contents](Index.ipynb) | [Introducing Pandas Objects](03.01-Introducing-Pandas-Objects.ipynb) >" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# Data Manipulation with Pandas" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "In the previous chapter, we dove into detail on NumPy and its ``ndarray`` object, which provides efficient storage and manipulation of dense typed arrays in Python.\n", 36 | "Here we'll build on this knowledge by looking in detail at the data structures provided by the Pandas library.\n", 37 | "Pandas is a newer package built on top of NumPy, and provides an efficient implementation of a ``DataFrame``.\n", 38 | "``DataFrame``s are essentially multidimensional arrays with attached row and column labels, and often with heterogeneous types and/or missing data.\n", 39 | "As well as offering a convenient storage interface for labeled data, Pandas implements a number of powerful data operations familiar to users of both database frameworks and spreadsheet programs.\n", 40 | "\n", 41 | "As we saw, NumPy's ``ndarray`` data structure provides essential features for the type of clean, well-organized data typically seen in numerical computing tasks.\n", 42 | "While it serves this purpose very well, its limitations become clear when we need more flexibility (e.g., attaching labels to data, working with missing data, etc.) and when attempting operations that do not map well to element-wise broadcasting (e.g., groupings, pivots, etc.), each of which is an important piece of analyzing the less structured data available in many forms in the world around us.\n", 43 | "Pandas, and in particular its ``Series`` and ``DataFrame`` objects, builds on the NumPy array structure and provides efficient access to these sorts of \"data munging\" tasks that occupy much of a data scientist's time.\n", 44 | "\n", 45 | "In this chapter, we will focus on the mechanics of using ``Series``, ``DataFrame``, and related structures effectively.\n", 46 | "We will use examples drawn from real datasets where appropriate, but these examples are not necessarily the focus." 
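As a concrete (invented) illustration of those ideas — row and column labels, a missing value, and a label-aware group operation — a minimal sketch:

```python
import numpy as np
import pandas as pd

# A small labeled table with heterogeneous columns and one missing entry.
df = pd.DataFrame(
    {"city": ["SF", "NY", "SF", "NY"],
     "year": [2014, 2014, 2015, 2015],
     "sales": [10.0, 12.5, np.nan, 15.0]},
    index=["a", "b", "c", "d"],
)

print(df.isnull().sum())                    # missing values per column
print(df.groupby("city")["sales"].mean())   # aggregation that skips the NaN
```

The column names and numbers here are made up; the point is only that the row index, column labels, and missing-data handling come along automatically, which is exactly what the ``ndarray`` alone does not give you.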
47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "## Installing and Using Pandas\n", 54 | "\n", 55 | "Installation of Pandas on your system requires NumPy to be installed, and if building the library from source, requires the appropriate tools to compile the C and Cython sources on which Pandas is built.\n", 56 | "Details on this installation can be found in the [Pandas documentation](http://pandas.pydata.org/).\n", 57 | "If you followed the advice outlined in the [Preface](00.00-Preface.ipynb) and used the Anaconda stack, you already have Pandas installed.\n", 58 | "\n", 59 | "Once Pandas is installed, you can import it and check the version:" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 1, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "'0.18.1'" 73 | ] 74 | }, 75 | "execution_count": 1, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "import pandas\n", 82 | "pandas.__version__" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "Just as we generally import NumPy under the alias ``np``, we will import Pandas under the alias ``pd``:" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 2, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "import pandas as pd" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "This import convention will be used throughout the remainder of this book." 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## Reminder about Built-In Documentation\n", 115 | "\n", 116 | "As you read through this chapter, don't forget that IPython gives you the ability to quickly explore the contents of a package (by using the tab-completion feature) as well as the documentation of various functions (using the ``?`` character). (Refer back to [Help and Documentation in IPython](01.01-Help-And-Documentation.ipynb) if you need a refresher on this.)\n", 117 | "\n", 118 | "For example, to display all the contents of the pandas namespace, you can type\n", 119 | "\n", 120 | "```ipython\n", 121 | "In [3]: pd.\n", 122 | "```\n", 123 | "\n", 124 | "And to display Pandas's built-in documentation, you can use this:\n", 125 | "\n", 126 | "```ipython\n", 127 | "In [4]: pd?\n", 128 | "```\n", 129 | "\n", 130 | "More detailed documentation, along with tutorials and other resources, can be found at http://pandas.pydata.org/." 
131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "\n", 138 | "< [Structured Data: NumPy's Structured Arrays](02.09-Structured-Data-NumPy.ipynb) | [Contents](Index.ipynb) | [Introducing Pandas Objects](03.01-Introducing-Pandas-Objects.ipynb) >" 139 | ] 140 | } 141 | ], 142 | "metadata": { 143 | "anaconda-cloud": {}, 144 | "kernelspec": { 145 | "display_name": "Python 3", 146 | "language": "python", 147 | "name": "python3" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 3 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | "nbconvert_exporter": "python", 158 | "pygments_lexer": "ipython3", 159 | "version": "3.4.3" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 0 164 | } 165 | -------------------------------------------------------------------------------- /pandas/03.13-Further-Resources.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "\n", 9 | "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", 10 | "\n", 11 | "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*\n", 12 | "\n", 13 | "*No changes were made to the contents of this notebook from the original.*" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "\n", 21 | "< [High-Performance Pandas: eval() and query()](03.12-Performance-Eval-and-Query.ipynb) | [Contents](Index.ipynb) | [Visualization with Matplotlib](04.00-Introduction-To-Matplotlib.ipynb) >" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# Further Resources\n", 29 | "\n", 30 | "In this chapter, we've covered many of the basics of using Pandas effectively for data analysis.\n", 31 | "Still, much has been omitted from our discussion.\n", 32 | "To learn more about Pandas, I recommend the following resources:\n", 33 | "\n", 34 | "- [Pandas online documentation](http://pandas.pydata.org/): This is the go-to source for complete documentation of the package. While the examples in the documentation tend to be small generated datasets, the description of the options is complete and generally very useful for understanding the use of various functions.\n", 35 | "\n", 36 | "- [*Python for Data Analysis*](http://shop.oreilly.com/product/0636920023784.do) Written by Wes McKinney (the original creator of Pandas), this book contains much more detail on the Pandas package than we had room for in this chapter. In particular, he takes a deep dive into tools for time series, which were his bread and butter as a financial consultant. The book also has many entertaining examples of applying Pandas to gain insight from real-world datasets. 
Keep in mind, though, that the book is now several years old, and the Pandas package has quite a few new features that this book does not cover (but be on the lookout for a new edition in 2017).\n", 37 | "\n", 38 | "- [Stack Overflow](http://stackoverflow.com/questions/tagged/pandas): Pandas has so many users that any question you have has likely been asked and answered on Stack Overflow. Using Pandas is a case where some Google-Fu is your best friend. Simply go to your favorite search engine and type in the question, problem, or error you're coming across–more than likely you'll find your answer on a Stack Overflow page.\n", 39 | "\n", 40 | "- [Pandas on PyVideo](http://pyvideo.org/search?q=pandas): From PyCon to SciPy to PyData, many conferences have featured tutorials from Pandas developers and power users. The PyCon tutorials in particular tend to be given by very well-vetted presenters.\n", 41 | "\n", 42 | "Using these resources, combined with the walk-through given in this chapter, my hope is that you'll be poised to use Pandas to tackle any data analysis problem you come across!" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "\n", 50 | "< [High-Performance Pandas: eval() and query()](03.12-Performance-Eval-and-Query.ipynb) | [Contents](Index.ipynb) | [Visualization with Matplotlib](04.00-Introduction-To-Matplotlib.ipynb) >" 51 | ] 52 | } 53 | ], 54 | "metadata": { 55 | "anaconda-cloud": {}, 56 | "kernelspec": { 57 | "display_name": "Python 3", 58 | "language": "python", 59 | "name": "python3" 60 | }, 61 | "language_info": { 62 | "codemirror_mode": { 63 | "name": "ipython", 64 | "version": 3 65 | }, 66 | "file_extension": ".py", 67 | "mimetype": "text/x-python", 68 | "name": "python", 69 | "nbconvert_exporter": "python", 70 | "pygments_lexer": "ipython3", 71 | "version": "3.4.3" 72 | } 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 0 76 | } 77 | -------------------------------------------------------------------------------- /pandas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/pandas/__init__.py -------------------------------------------------------------------------------- /pandas/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/pandas/tests/__init__.py -------------------------------------------------------------------------------- /python-data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/python-data/__init__.py -------------------------------------------------------------------------------- /python-data/files.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)." 
8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Files\n", 15 | "\n", 16 | "* Read a File\n", 17 | "* Write a File\n", 18 | "* Read and Write UTF-8" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Read a File\n", 26 | "\n", 27 | "Open a file in read-only mode.\n", 28 | "Iterate over the file lines. rstrip removes the EOL markers." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "class TypeUtil:\n", 43 | "\n", 44 | " @classmethod\n", 45 | " def is_iterable(cls, obj):\n", 46 | " \"\"\"Determines if obj is iterable.\n", 47 | "\n", 48 | " Useful when writing functions that can accept multiple types of\n", 49 | " input (list, tuple, ndarray, iterator). Pairs well with\n", 50 | " convert_to_list.\n", 51 | " \"\"\"\n", 52 | " try:\n", 53 | " iter(obj)\n", 54 | " return True\n", 55 | " except TypeError:\n", 56 | " return False\n", 57 | "\n", 58 | " @classmethod\n", 59 | " def convert_to_list(cls, obj):\n", 60 | " \"\"\"Converts obj to a list if it is not a list and it is iterable,\n", 61 | " else returns the original obj.\n", 62 | " \"\"\"\n", 63 | " if not isinstance(obj, list) and cls.is_iterable(obj):\n", 64 | " obj = list(obj)\n", 65 | " return obj\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "old_file_path = 'type_util.py'\n", 71 | "with open(old_file_path, 'r') as old_file:\n", 72 | " for line in old_file:\n", 73 | " print(line.rstrip())" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "## Write to a file\n", 81 | "\n", 82 | "Create a new file overwriting any previous file with the same name, write text, then close the file:" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 2, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "new_file_path = 'hello_world.txt'\n", 94 | "with open(new_file_path, 'w') as new_file:\n", 95 | " new_file.write('hello world!')" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## Read and Write UTF-8" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 3, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "import codecs\n", 114 | "with codecs.open(\"hello_world_new.txt\", \"a\", \"utf-8\") as new_file:\n", 115 | " with codecs.open(\"hello_world.txt\", \"r\", \"utf-8\") as old_file: \n", 116 | " for line in old_file:\n", 117 | " new_file.write(line + '\\n')" 118 | ] 119 | } 120 | ], 121 | "metadata": { 122 | "kernelspec": { 123 | "display_name": "Python 2", 124 | "language": "python", 125 | "name": "python2" 126 | }, 127 | "language_info": { 128 | "codemirror_mode": { 129 | "name": "ipython", 130 | "version": 2 131 | }, 132 | "file_extension": ".py", 133 | "mimetype": "text/x-python", 134 | "name": "python", 135 | "nbconvert_exporter": "python", 136 | "pygments_lexer": "ipython2", 137 | "version": "2.7.10" 138 | } 139 | }, 140 | "nbformat": 4, 141 | "nbformat_minor": 0 142 | } 143 | -------------------------------------------------------------------------------- /python-data/hello_world.txt: -------------------------------------------------------------------------------- 1 | hello world! 
-------------------------------------------------------------------------------- /python-data/logs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Logging in Python\n", 15 | "* Logging with RotatingFileHandler\n", 16 | "* Logging with TimedRotatingFileHandler " 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## Logging with RotatingFileHandler\n", 24 | "\n", 25 | "The logging discussion is taken from the [Python Logging Cookbook](https://docs.python.org/2/howto/logging-cookbook.html#using-file-rotation):\n", 26 | "\n", 27 | "Sometimes you want to let a log file grow to a certain size, then open a new file and log to that. You may want to keep a certain number of these files, and when that many files have been created, rotate the files so that the number of files and the size of the files both remain bounded. For this usage pattern, the logging package provides a RotatingFileHandler.\n", 28 | "\n", 29 | "The most current file is always logging_rotatingfile_example.out, and each time it reaches the size limit it is renamed with the suffix .1. Each of the existing backup files is renamed to increment the suffix (.1 becomes .2, etc.) and the .6 file is erased.\n", 30 | "\n", 31 | "The following code snippet is taken from [here](http://www.blog.pythonlibrary.org/2014/02/11/python-how-to-create-rotating-logs/)." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import logging\n", 43 | "import time\n", 44 | " \n", 45 | "from logging.handlers import RotatingFileHandler\n", 46 | " \n", 47 | "#----------------------------------------------------------------------\n", 48 | "def create_rotating_log(path):\n", 49 | " \"\"\"\n", 50 | " Creates a rotating log\n", 51 | " \"\"\"\n", 52 | " logger = logging.getLogger(\"Rotating Log\")\n", 53 | " logger.setLevel(logging.INFO)\n", 54 | " \n", 55 | " # add a rotating handler\n", 56 | " handler = RotatingFileHandler(path, maxBytes=20,\n", 57 | " backupCount=5)\n", 58 | " logger.addHandler(handler)\n", 59 | " \n", 60 | " for i in range(10):\n", 61 | " logger.info(\"This is test log line %s\" % i)\n", 62 | " time.sleep(1.5)\n", 63 | " \n", 64 | "#----------------------------------------------------------------------\n", 65 | "if __name__ == \"__main__\":\n", 66 | " log_file = \"test.log\"\n", 67 | " create_rotating_log(log_file)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "## Logging with TimedRotatingFileHandler\n", 75 | "\n", 76 | "The following code snippet is taken from [here](http://www.blog.pythonlibrary.org/2014/02/11/python-how-to-create-rotating-logs/)." 
77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "collapsed": false 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "import logging\n", 88 | "import time\n", 89 | " \n", 90 | "from logging.handlers import TimedRotatingFileHandler\n", 91 | " \n", 92 | "#----------------------------------------------------------------------\n", 93 | "def create_timed_rotating_log(path):\n", 94 | " \"\"\"\"\"\"\n", 95 | " logger = logging.getLogger(\"Rotating Log\")\n", 96 | " logger.setLevel(logging.INFO)\n", 97 | " \n", 98 | " # Rotate log based on when parameter:\n", 99 | " # second (s)\n", 100 | " # minute (m)\n", 101 | " # hour (h)\n", 102 | " # day (d)\n", 103 | " # w0-w6 (weekday, 0=Monday)\n", 104 | " # midnight\n", 105 | " handler = TimedRotatingFileHandler(path,\n", 106 | " when=\"m\",\n", 107 | " interval=1,\n", 108 | " backupCount=5)\n", 109 | " logger.addHandler(handler)\n", 110 | " \n", 111 | " for i in range(20):\n", 112 | " logger.info(\"This is a test!\")\n", 113 | " time.sleep(1.5)\n", 114 | " \n", 115 | "#----------------------------------------------------------------------\n", 116 | "if __name__ == \"__main__\":\n", 117 | " log_file = \"timed_test.log\"\n", 118 | " create_timed_rotating_log(log_file)" 119 | ] 120 | } 121 | ], 122 | "metadata": { 123 | "kernelspec": { 124 | "display_name": "Python 2", 125 | "language": "python", 126 | "name": "python2" 127 | }, 128 | "language_info": { 129 | "codemirror_mode": { 130 | "name": "ipython", 131 | "version": 2 132 | }, 133 | "file_extension": ".py", 134 | "mimetype": "text/x-python", 135 | "name": "python", 136 | "nbconvert_exporter": "python", 137 | "pygments_lexer": "ipython2", 138 | "version": "2.7.10" 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 0 143 | } 144 | -------------------------------------------------------------------------------- /python-data/pdb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# PDB\n", 15 | "\n", 16 | "The pdb module defines an interactive source code debugger for Python programs. 
Below are frequently used commands:" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "# Run pdb when this line is hit\n", 28 | "import pdb; pdb.set_trace()\n", 29 | "\n", 30 | "# Run pdb when the script is run\n", 31 | "python -m pdb script.py\n", 32 | "\n", 33 | "# Help\n", 34 | "h[elp]\n", 35 | "\n", 36 | "# Show current content\n", 37 | "l[ist]\n", 38 | "\n", 39 | "# Examine variables\n", 40 | "p[rint]\n", 41 | "\n", 42 | "# Pretty print\n", 43 | "pp\n", 44 | "\n", 45 | "# Go to next line\n", 46 | "n[ext]\n", 47 | "\n", 48 | "# Step into\n", 49 | "s[tep]\n", 50 | "\n", 51 | "# Continue execution until the line with the line number greater \n", 52 | "# than the current one is reached or when returning from current frame.\n", 53 | "until\n", 54 | "\n", 55 | "# Return\n", 56 | "r[eturn]\n", 57 | "\n", 58 | "# See all breakpoints\n", 59 | "b to see all breakpoints\n", 60 | "\n", 61 | "# Set breakpoint at line 16\n", 62 | "b 16 \n", 63 | "\n", 64 | "# Clear breakpoint 1\n", 65 | "cl[ear] 1\n", 66 | "\n", 67 | "# Continue\n", 68 | "c[ontinue]\n", 69 | "\n", 70 | "# Conditional breakpoints, line 11\n", 71 | "b 11, this_year == 2015\n", 72 | "\n", 73 | "# Stack location\n", 74 | "w[here]\n", 75 | "\n", 76 | "# Go up in stack\n", 77 | "u[p]\n", 78 | "\n", 79 | "# Go down in stack\n", 80 | "d[own]\n", 81 | "\n", 82 | "# Longlist shows full method of where you're in (Python 3)\n", 83 | "ll\n", 84 | "\n", 85 | "# Quit\n", 86 | "q[uit]" 87 | ] 88 | } 89 | ], 90 | "metadata": { 91 | "kernelspec": { 92 | "display_name": "Python 2", 93 | "language": "python", 94 | "name": "python2" 95 | }, 96 | "language_info": { 97 | "codemirror_mode": { 98 | "name": "ipython", 99 | "version": 2 100 | }, 101 | "file_extension": ".py", 102 | "mimetype": "text/x-python", 103 | "name": "python", 104 | "nbconvert_exporter": "python", 105 | "pygments_lexer": "ipython2", 106 | "version": "2.7.10" 107 | } 108 | }, 109 | "nbformat": 4, 110 | "nbformat_minor": 0 111 | } 112 | -------------------------------------------------------------------------------- /python-data/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/python-data/tests/__init__.py -------------------------------------------------------------------------------- /python-data/tests/test_transform_util.py: -------------------------------------------------------------------------------- 1 | from nose.tools import assert_equal 2 | from ..transform_util import TransformUtil 3 | 4 | 5 | class TestTransformUtil(): 6 | 7 | states = [' Alabama ', 'Georgia!', 'Georgia', 'georgia', \ 8 | 'FlOrIda', 'south carolina##', 'West virginia?'] 9 | 10 | expected_output = ['Alabama', 11 | 'Georgia', 12 | 'Georgia', 13 | 'Georgia', 14 | 'Florida', 15 | 'South Carolina', 16 | 'West Virginia'] 17 | 18 | def test_remove_punctuation(self): 19 | assert_equal(TransformUtil.remove_punctuation('!#?'), '') 20 | 21 | def test_map_remove_punctuation(self): 22 | # Map applies a function to a collection 23 | output = map(TransformUtil.remove_punctuation, self.states) 24 | assert_equal('!#?' 
not in output, True) 25 | 26 | def test_clean_strings(self): 27 | clean_ops = [str.strip, TransformUtil.remove_punctuation, str.title] 28 | output = TransformUtil.clean_strings(self.states, clean_ops) 29 | assert_equal(output, self.expected_output) -------------------------------------------------------------------------------- /python-data/tests/test_type_util.py: -------------------------------------------------------------------------------- 1 | from nose.tools import assert_equal 2 | from ..type_util import TypeUtil 3 | 4 | 5 | class TestUtil(): 6 | 7 | def test_is_iterable(self): 8 | assert_equal(TypeUtil.is_iterable('foo'), True) 9 | assert_equal(TypeUtil.is_iterable(7), False) 10 | 11 | def test_convert_to_list(self): 12 | assert_equal(isinstance(TypeUtil.convert_to_list('foo'), list), True) 13 | assert_equal(isinstance(TypeUtil.convert_to_list(7), list), False) -------------------------------------------------------------------------------- /python-data/transform_util.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | class TransformUtil: 5 | 6 | @classmethod 7 | def remove_punctuation(cls, value): 8 | """Removes !, #, and ?. 9 | """ 10 | return re.sub('[!#?]', '', value) 11 | 12 | @classmethod 13 | def clean_strings(cls, strings, ops): 14 | """General purpose method to clean strings. 15 | 16 | Pass in a sequence of strings and the operations to perform. 17 | """ 18 | result = [] 19 | for value in strings: 20 | for function in ops: 21 | value = function(value) 22 | result.append(value) 23 | return result -------------------------------------------------------------------------------- /python-data/type_util.py: -------------------------------------------------------------------------------- 1 | class TypeUtil: 2 | 3 | @classmethod 4 | def is_iterable(cls, obj): 5 | """Determines if obj is iterable. 6 | 7 | Useful when writing functions that can accept multiple types of 8 | input (list, tuple, ndarray, iterator). Pairs well with 9 | convert_to_list. 10 | """ 11 | try: 12 | iter(obj) 13 | return True 14 | except TypeError: 15 | return False 16 | 17 | @classmethod 18 | def convert_to_list(cls, obj): 19 | """Converts obj to a list if it is not a list and it is iterable, 20 | else returns the original obj. 21 | """ 22 | if not isinstance(obj, list) and cls.is_iterable(obj): 23 | obj = list(obj) 24 | return obj -------------------------------------------------------------------------------- /python-data/unit_tests.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Nose Unit Tests with IPython Notebook" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Nose\n", 22 | "\n", 23 | "Testing is a vital part of software development. Nose extends unittest to make testing easier." 
24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Install Nose\n", 31 | "\n", 32 | "Run the following command line:" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "!pip install nose" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Create the Code\n", 51 | "\n", 52 | "Save your code to a file with the %%file magic:" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 1, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "Overwriting type_util.py\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "%%file type_util.py\n", 72 | "class TypeUtil:\n", 73 | "\n", 74 | " @classmethod\n", 75 | " def is_iterable(cls, obj):\n", 76 | " \"\"\"Determines if obj is iterable.\n", 77 | "\n", 78 | " Useful when writing functions that can accept multiple types of\n", 79 | " input (list, tuple, ndarray, iterator). Pairs well with\n", 80 | " convert_to_list.\n", 81 | " \"\"\"\n", 82 | " try:\n", 83 | " iter(obj)\n", 84 | " return True\n", 85 | " except TypeError:\n", 86 | " return False\n", 87 | "\n", 88 | " @classmethod\n", 89 | " def convert_to_list(cls, obj):\n", 90 | " \"\"\"Converts obj to a list if it is not a list and it is iterable, \n", 91 | " else returns the original obj.\n", 92 | " \"\"\"\n", 93 | " if not isinstance(obj, list) and cls.is_iterable(obj):\n", 94 | " obj = list(obj)\n", 95 | " return obj\n" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## Create the Nose Tests\n", 103 | "\n", 104 | "Save your test to a file with the %%file magic:" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 2, 110 | "metadata": { 111 | "collapsed": false 112 | }, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "Overwriting tests/test_type_util.py\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "%%file tests/test_type_util.py\n", 124 | "from nose.tools import assert_equal\n", 125 | "from ..type_util import TypeUtil\n", 126 | "\n", 127 | "\n", 128 | "class TestUtil():\n", 129 | "\n", 130 | " def test_is_iterable(self):\n", 131 | " assert_equal(TypeUtil.is_iterable('foo'), True)\n", 132 | " assert_equal(TypeUtil.is_iterable(7), False)\n", 133 | "\n", 134 | " def test_convert_to_list(self):\n", 135 | " assert_equal(isinstance(TypeUtil.convert_to_list('foo'), list), True)\n", 136 | " assert_equal(isinstance(TypeUtil.convert_to_list(7), list), False)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "## Run the Nose Tests\n", 144 | "\n", 145 | "Run the following command line:" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 3, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [ 155 | { 156 | "name": "stdout", 157 | "output_type": "stream", 158 | "text": [ 159 | "core.tests.test_type_util.TestUtil.test_convert_to_list ... ok\r\n", 160 | "core.tests.test_type_util.TestUtil.test_is_iterable ... 
ok\r\n", 161 | "\r\n", 162 | "----------------------------------------------------------------------\r\n", 163 | "Ran 2 tests in 0.001s\r\n", 164 | "\r\n", 165 | "OK\r\n" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "!nosetests tests/test_type_util.py -v" 171 | ] 172 | } 173 | ], 174 | "metadata": { 175 | "kernelspec": { 176 | "display_name": "Python 2", 177 | "language": "python", 178 | "name": "python2" 179 | }, 180 | "language_info": { 181 | "codemirror_mode": { 182 | "name": "ipython", 183 | "version": 2 184 | }, 185 | "file_extension": ".py", 186 | "mimetype": "text/x-python", 187 | "name": "python", 188 | "nbconvert_exporter": "python", 189 | "pygments_lexer": "ipython2", 190 | "version": "2.7.10" 191 | } 192 | }, 193 | "nbformat": 4, 194 | "nbformat_minor": 0 195 | } 196 | -------------------------------------------------------------------------------- /scikit-learn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/scikit-learn/__init__.py -------------------------------------------------------------------------------- /scikit-learn/fig_code/ML_flow_chart.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tutorial Diagrams 3 | ----------------- 4 | 5 | This script plots the flow-charts used in the scikit-learn tutorials. 6 | """ 7 | 8 | import numpy as np 9 | import pylab as pl 10 | from matplotlib.patches import Circle, Rectangle, Polygon, Arrow, FancyArrow 11 | 12 | def create_base(box_bg = '#CCCCCC', 13 | arrow1 = '#88CCFF', 14 | arrow2 = '#88FF88', 15 | supervised=True): 16 | fig = pl.figure(figsize=(9, 6), facecolor='w') 17 | ax = pl.axes((0, 0, 1, 1), 18 | xticks=[], yticks=[], frameon=False) 19 | ax.set_xlim(0, 9) 20 | ax.set_ylim(0, 6) 21 | 22 | patches = [Rectangle((0.3, 3.6), 1.5, 1.8, zorder=1, fc=box_bg), 23 | Rectangle((0.5, 3.8), 1.5, 1.8, zorder=2, fc=box_bg), 24 | Rectangle((0.7, 4.0), 1.5, 1.8, zorder=3, fc=box_bg), 25 | 26 | Rectangle((2.9, 3.6), 0.2, 1.8, fc=box_bg), 27 | Rectangle((3.1, 3.8), 0.2, 1.8, fc=box_bg), 28 | Rectangle((3.3, 4.0), 0.2, 1.8, fc=box_bg), 29 | 30 | Rectangle((0.3, 0.2), 1.5, 1.8, fc=box_bg), 31 | 32 | Rectangle((2.9, 0.2), 0.2, 1.8, fc=box_bg), 33 | 34 | Circle((5.5, 3.5), 1.0, fc=box_bg), 35 | 36 | Polygon([[5.5, 1.7], 37 | [6.1, 1.1], 38 | [5.5, 0.5], 39 | [4.9, 1.1]], fc=box_bg), 40 | 41 | FancyArrow(2.3, 4.6, 0.35, 0, fc=arrow1, 42 | width=0.25, head_width=0.5, head_length=0.2), 43 | 44 | FancyArrow(3.75, 4.2, 0.5, -0.2, fc=arrow1, 45 | width=0.25, head_width=0.5, head_length=0.2), 46 | 47 | FancyArrow(5.5, 2.4, 0, -0.4, fc=arrow1, 48 | width=0.25, head_width=0.5, head_length=0.2), 49 | 50 | FancyArrow(2.0, 1.1, 0.5, 0, fc=arrow2, 51 | width=0.25, head_width=0.5, head_length=0.2), 52 | 53 | FancyArrow(3.3, 1.1, 1.3, 0, fc=arrow2, 54 | width=0.25, head_width=0.5, head_length=0.2), 55 | 56 | FancyArrow(6.2, 1.1, 0.8, 0, fc=arrow2, 57 | width=0.25, head_width=0.5, head_length=0.2)] 58 | 59 | if supervised: 60 | patches += [Rectangle((0.3, 2.4), 1.5, 0.5, zorder=1, fc=box_bg), 61 | Rectangle((0.5, 2.6), 1.5, 0.5, zorder=2, fc=box_bg), 62 | Rectangle((0.7, 2.8), 1.5, 0.5, zorder=3, fc=box_bg), 63 | FancyArrow(2.3, 2.9, 2.0, 0, fc=arrow1, 64 | width=0.25, head_width=0.5, head_length=0.2), 65 | Rectangle((7.3, 0.85), 1.5, 0.5, fc=box_bg)] 66 | else: 67 | patches += [Rectangle((7.3, 0.2), 1.5, 1.8, fc=box_bg)] 68 | 69 | for p in 
patches: 70 | ax.add_patch(p) 71 | 72 | pl.text(1.45, 4.9, "Training\nText,\nDocuments,\nImages,\netc.", 73 | ha='center', va='center', fontsize=14) 74 | 75 | pl.text(3.6, 4.9, "Feature\nVectors", 76 | ha='left', va='center', fontsize=14) 77 | 78 | pl.text(5.5, 3.5, "Machine\nLearning\nAlgorithm", 79 | ha='center', va='center', fontsize=14) 80 | 81 | pl.text(1.05, 1.1, "New Text,\nDocument,\nImage,\netc.", 82 | ha='center', va='center', fontsize=14) 83 | 84 | pl.text(3.3, 1.7, "Feature\nVector", 85 | ha='left', va='center', fontsize=14) 86 | 87 | pl.text(5.5, 1.1, "Predictive\nModel", 88 | ha='center', va='center', fontsize=12) 89 | 90 | if supervised: 91 | pl.text(1.45, 3.05, "Labels", 92 | ha='center', va='center', fontsize=14) 93 | 94 | pl.text(8.05, 1.1, "Expected\nLabel", 95 | ha='center', va='center', fontsize=14) 96 | pl.text(8.8, 5.8, "Supervised Learning Model", 97 | ha='right', va='top', fontsize=18) 98 | 99 | else: 100 | pl.text(8.05, 1.1, 101 | "Likelihood\nor Cluster ID\nor Better\nRepresentation", 102 | ha='center', va='center', fontsize=12) 103 | pl.text(8.8, 5.8, "Unsupervised Learning Model", 104 | ha='right', va='top', fontsize=18) 105 | 106 | 107 | 108 | def plot_supervised_chart(annotate=False): 109 | create_base(supervised=True) 110 | if annotate: 111 | fontdict = dict(color='r', weight='bold', size=14) 112 | pl.text(1.9, 4.55, 'X = vec.fit_transform(input)', 113 | fontdict=fontdict, 114 | rotation=20, ha='left', va='bottom') 115 | pl.text(3.7, 3.2, 'clf.fit(X, y)', 116 | fontdict=fontdict, 117 | rotation=20, ha='left', va='bottom') 118 | pl.text(1.7, 1.5, 'X_new = vec.transform(input)', 119 | fontdict=fontdict, 120 | rotation=20, ha='left', va='bottom') 121 | pl.text(6.1, 1.5, 'y_new = clf.predict(X_new)', 122 | fontdict=fontdict, 123 | rotation=20, ha='left', va='bottom') 124 | 125 | def plot_unsupervised_chart(): 126 | create_base(supervised=False) 127 | 128 | 129 | if __name__ == '__main__': 130 | plot_supervised_chart(False) 131 | plot_supervised_chart(True) 132 | plot_unsupervised_chart() 133 | pl.show() 134 | 135 | 136 | -------------------------------------------------------------------------------- /scikit-learn/fig_code/__init__.py: -------------------------------------------------------------------------------- 1 | from .data import * 2 | from .figures import * 3 | 4 | from .sgd_separator import plot_sgd_separator 5 | from .linear_regression import plot_linear_regression 6 | from .helpers import plot_iris_knn 7 | -------------------------------------------------------------------------------- /scikit-learn/fig_code/data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def linear_data_sample(N=40, rseed=0, m=3, b=-2): 5 | rng = np.random.RandomState(rseed) 6 | 7 | x = 10 * rng.rand(N) 8 | dy = m / 2 * (1 + rng.rand(N)) 9 | y = m * x + b + dy * rng.randn(N) 10 | 11 | return (x, y, dy) 12 | 13 | 14 | def linear_data_sample_big_errs(N=40, rseed=0, m=3, b=-2): 15 | rng = np.random.RandomState(rseed) 16 | 17 | x = 10 * rng.rand(N) 18 | dy = m / 2 * (1 + rng.rand(N)) 19 | dy[20:25] *= 10 20 | y = m * x + b + dy * rng.randn(N) 21 | 22 | return (x, y, dy) 23 | 24 | 25 | def sample_light_curve(phased=True): 26 | from astroML.datasets import fetch_LINEAR_sample 27 | data = fetch_LINEAR_sample() 28 | t, y, dy = data[18525697].T 29 | 30 | if phased: 31 | P_best = 0.580313015651 32 | t /= P_best 33 | 34 | return (t, y, dy) 35 | 36 | 37 | def sample_light_curve_2(phased=True): 38 | from astroML.datasets 
import fetch_LINEAR_sample 39 | data = fetch_LINEAR_sample() 40 | t, y, dy = data[10022663].T 41 | 42 | if phased: 43 | P_best = 0.61596079804 44 | t /= P_best 45 | 46 | return (t, y, dy) 47 | 48 | -------------------------------------------------------------------------------- /scikit-learn/fig_code/helpers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Small helpers for code that is not shown in the notebooks 3 | """ 4 | 5 | from sklearn import neighbors, datasets, linear_model 6 | import pylab as pl 7 | import numpy as np 8 | from matplotlib.colors import ListedColormap 9 | 10 | # Create color maps for 3-class classification problem, as with iris 11 | cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF']) 12 | cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF']) 13 | 14 | def plot_iris_knn(): 15 | iris = datasets.load_iris() 16 | X = iris.data[:, :2] # we only take the first two features. We could 17 | # avoid this ugly slicing by using a two-dim dataset 18 | y = iris.target 19 | 20 | knn = neighbors.KNeighborsClassifier(n_neighbors=5) 21 | knn.fit(X, y) 22 | 23 | x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1 24 | y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1 25 | xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), 26 | np.linspace(y_min, y_max, 100)) 27 | Z = knn.predict(np.c_[xx.ravel(), yy.ravel()]) 28 | 29 | # Put the result into a color plot 30 | Z = Z.reshape(xx.shape) 31 | pl.figure() 32 | pl.pcolormesh(xx, yy, Z, cmap=cmap_light) 33 | 34 | # Plot also the training points 35 | pl.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold) 36 | pl.xlabel('sepal length (cm)') 37 | pl.ylabel('sepal width (cm)') 38 | pl.axis('tight') 39 | 40 | 41 | def plot_polynomial_regression(): 42 | rng = np.random.RandomState(0) 43 | x = 2*rng.rand(100) - 1 44 | 45 | f = lambda t: 1.2 * t**2 + .1 * t**3 - .4 * t **5 - .5 * t ** 9 46 | y = f(x) + .4 * rng.normal(size=100) 47 | 48 | x_test = np.linspace(-1, 1, 100) 49 | 50 | pl.figure() 51 | pl.scatter(x, y, s=4) 52 | 53 | X = np.array([x**i for i in range(5)]).T 54 | X_test = np.array([x_test**i for i in range(5)]).T 55 | regr = linear_model.LinearRegression() 56 | regr.fit(X, y) 57 | pl.plot(x_test, regr.predict(X_test), label='4th order') 58 | 59 | X = np.array([x**i for i in range(10)]).T 60 | X_test = np.array([x_test**i for i in range(10)]).T 61 | regr = linear_model.LinearRegression() 62 | regr.fit(X, y) 63 | pl.plot(x_test, regr.predict(X_test), label='9th order') 64 | 65 | pl.legend(loc='best') 66 | pl.axis('tight') 67 | pl.title('Fitting a 4th and a 9th order polynomial') 68 | 69 | pl.figure() 70 | pl.scatter(x, y, s=4) 71 | pl.plot(x_test, f(x_test), label="truth") 72 | pl.axis('tight') 73 | pl.title('Ground truth (9th order polynomial)') 74 | 75 | 76 | -------------------------------------------------------------------------------- /scikit-learn/fig_code/linear_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.linear_model import LinearRegression 4 | 5 | 6 | def plot_linear_regression(): 7 | a = 0.5 8 | b = 1.0 9 | 10 | # x from 0 to 10 11 | x = 30 * np.random.random(20) 12 | 13 | # y = a*x + b with noise 14 | y = a * x + b + np.random.normal(size=x.shape) 15 | 16 | # create a linear regression classifier 17 | clf = LinearRegression() 18 | clf.fit(x[:, None], y) 19 | 20 | # predict y from the data 21 | x_new = np.linspace(0, 30, 100) 22 | y_new = 
clf.predict(x_new[:, None]) 23 | 24 | # plot the results 25 | ax = plt.axes() 26 | ax.scatter(x, y) 27 | ax.plot(x_new, y_new) 28 | 29 | ax.set_xlabel('x') 30 | ax.set_ylabel('y') 31 | 32 | ax.axis('tight') 33 | 34 | 35 | if __name__ == '__main__': 36 | plot_linear_regression() 37 | plt.show() 38 | -------------------------------------------------------------------------------- /scikit-learn/fig_code/scikit-learn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:29899a15bea89b9d8275879798b23011cecabc0eff03dd41bb606324221e0bc3" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "# scikit-learn" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "collapsed": false, 21 | "input": [ 22 | "%matplotlib inline\n", 23 | "\n", 24 | "# set seaborn plot defaults.\n", 25 | "# This can be safely commented out\n", 26 | "import seaborn; seaborn.set()" 27 | ], 28 | "language": "python", 29 | "metadata": {}, 30 | "outputs": [], 31 | "prompt_number": 3 32 | }, 33 | { 34 | "cell_type": "code", 35 | "collapsed": false, 36 | "input": [ 37 | "# Import the example plot from the figures directory\n", 38 | "from fig_code import plot_sgd_separator\n", 39 | "plot_sgd_separator()" 40 | ], 41 | "language": "python", 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "ename": "ImportError", 46 | "evalue": "No module named fig_code", 47 | "output_type": "pyerr", 48 | "traceback": [ 49 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", 50 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Import the example plot from the figures directory\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mfig_code\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mplot_sgd_separator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mplot_sgd_separator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 51 | "\u001b[0;31mImportError\u001b[0m: No module named fig_code" 52 | ] 53 | } 54 | ], 55 | "prompt_number": 4 56 | }, 57 | { 58 | "cell_type": "code", 59 | "collapsed": false, 60 | "input": [], 61 | "language": "python", 62 | "metadata": {}, 63 | "outputs": [] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "collapsed": false, 68 | "input": [], 69 | "language": "python", 70 | "metadata": {}, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "collapsed": false, 76 | "input": [], 77 | "language": "python", 78 | "metadata": {}, 79 | "outputs": [] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "collapsed": false, 84 | "input": [], 85 | "language": "python", 86 | "metadata": {}, 87 | "outputs": [] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "collapsed": false, 92 | "input": [], 93 | "language": "python", 94 | "metadata": {}, 95 | "outputs": [] 96 | } 97 | ], 98 | "metadata": {} 99 | } 100 | ] 101 | } -------------------------------------------------------------------------------- /scikit-learn/fig_code/sgd_separator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.linear_model import 
SGDClassifier 4 | from sklearn.datasets.samples_generator import make_blobs 5 | 6 | def plot_sgd_separator(): 7 | # we create 50 separable points 8 | X, Y = make_blobs(n_samples=50, centers=2, 9 | random_state=0, cluster_std=0.60) 10 | 11 | # fit the model 12 | clf = SGDClassifier(loss="hinge", alpha=0.01, 13 | n_iter=200, fit_intercept=True) 14 | clf.fit(X, Y) 15 | 16 | # plot the line, the points, and the nearest vectors to the plane 17 | xx = np.linspace(-1, 5, 10) 18 | yy = np.linspace(-1, 5, 10) 19 | 20 | X1, X2 = np.meshgrid(xx, yy) 21 | Z = np.empty(X1.shape) 22 | for (i, j), val in np.ndenumerate(X1): 23 | x1 = val 24 | x2 = X2[i, j] 25 | p = clf.decision_function([x1, x2]) 26 | Z[i, j] = p[0] 27 | levels = [-1.0, 0.0, 1.0] 28 | linestyles = ['dashed', 'solid', 'dashed'] 29 | colors = 'k' 30 | 31 | ax = plt.axes() 32 | ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles) 33 | ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired) 34 | 35 | ax.axis('tight') 36 | 37 | 38 | if __name__ == '__main__': 39 | plot_sgd_separator() 40 | plt.show() 41 | -------------------------------------------------------------------------------- /scikit-learn/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/scikit-learn/tests/__init__.py -------------------------------------------------------------------------------- /scipy/2002FemPreg.dat.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/scipy/2002FemPreg.dat.gz -------------------------------------------------------------------------------- /scipy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/scipy/__init__.py -------------------------------------------------------------------------------- /scipy/first.py: -------------------------------------------------------------------------------- 1 | """This file contains code used in "Think Stats", 2 | by Allen B. Downey, available from greenteapress.com 3 | 4 | Copyright 2014 Allen B. Downey 5 | License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html 6 | """ 7 | 8 | from __future__ import print_function 9 | 10 | import math 11 | import numpy as np 12 | 13 | import nsfg 14 | import thinkstats2 15 | import thinkplot 16 | 17 | 18 | def MakeFrames(): 19 | """Reads pregnancy data and partitions first babies and others. 
20 | 21 | returns: DataFrames (all live births, first babies, others) 22 | """ 23 | preg = nsfg.ReadFemPreg() 24 | 25 | live = preg[preg.outcome == 1] 26 | firsts = live[live.birthord == 1] 27 | others = live[live.birthord != 1] 28 | 29 | assert len(live) == 9148 30 | assert len(firsts) == 4413 31 | assert len(others) == 4735 32 | 33 | return live, firsts, others 34 | 35 | 36 | def Summarize(live, firsts, others): 37 | """Print various summary statistics.""" 38 | 39 | mean = live.prglngth.mean() 40 | var = live.prglngth.var() 41 | std = live.prglngth.std() 42 | 43 | print('Live mean', mean) 44 | print('Live variance', var) 45 | print('Live std', std) 46 | 47 | mean1 = firsts.prglngth.mean() 48 | mean2 = others.prglngth.mean() 49 | 50 | var1 = firsts.prglngth.var() 51 | var2 = others.prglngth.var() 52 | 53 | print('Mean') 54 | print('First babies', mean1) 55 | print('Others', mean2) 56 | 57 | print('Variance') 58 | print('First babies', var1) 59 | print('Others', var2) 60 | 61 | print('Difference in weeks', mean1 - mean2) 62 | print('Difference in hours', (mean1 - mean2) * 7 * 24) 63 | 64 | print('Difference relative to 39 weeks', (mean1 - mean2) / 39 * 100) 65 | 66 | d = thinkstats2.CohenEffectSize(firsts.prglngth, others.prglngth) 67 | print('Cohen d', d) 68 | 69 | 70 | def PrintExtremes(live): 71 | """Plots the histogram of pregnancy lengths and prints the extremes. 72 | 73 | live: DataFrame of live births 74 | """ 75 | hist = thinkstats2.Hist(live.prglngth) 76 | thinkplot.Hist(hist, label='live births') 77 | 78 | thinkplot.Save(root='first_nsfg_hist_live', 79 | title='Histogram', 80 | xlabel='weeks', 81 | ylabel='frequency') 82 | 83 | print('Shortest lengths:') 84 | for weeks, freq in hist.Smallest(10): 85 | print(weeks, freq) 86 | 87 | print('Longest lengths:') 88 | for weeks, freq in hist.Largest(10): 89 | print(weeks, freq) 90 | 91 | 92 | def MakeHists(live): 93 | """Plot Hists for live births 94 | 95 | live: DataFrame 96 | others: DataFrame 97 | """ 98 | hist = thinkstats2.Hist(live.birthwgt_lb, label='birthwgt_lb') 99 | thinkplot.Hist(hist) 100 | thinkplot.Save(root='first_wgt_lb_hist', 101 | xlabel='pounds', 102 | ylabel='frequency', 103 | axis=[-1, 14, 0, 3200]) 104 | 105 | hist = thinkstats2.Hist(live.birthwgt_oz, label='birthwgt_oz') 106 | thinkplot.Hist(hist) 107 | thinkplot.Save(root='first_wgt_oz_hist', 108 | xlabel='ounces', 109 | ylabel='frequency', 110 | axis=[-1, 16, 0, 1200]) 111 | 112 | hist = thinkstats2.Hist(np.floor(live.agepreg), label='agepreg') 113 | thinkplot.Hist(hist) 114 | thinkplot.Save(root='first_agepreg_hist', 115 | xlabel='years', 116 | ylabel='frequency') 117 | 118 | hist = thinkstats2.Hist(live.prglngth, label='prglngth') 119 | thinkplot.Hist(hist) 120 | thinkplot.Save(root='first_prglngth_hist', 121 | xlabel='weeks', 122 | ylabel='frequency', 123 | axis=[-1, 53, 0, 5000]) 124 | 125 | 126 | def MakeComparison(firsts, others): 127 | """Plots histograms of pregnancy length for first babies and others. 
128 | 129 | firsts: DataFrame 130 | others: DataFrame 131 | """ 132 | first_hist = thinkstats2.Hist(firsts.prglngth, label='first') 133 | other_hist = thinkstats2.Hist(others.prglngth, label='other') 134 | 135 | width = 0.45 136 | thinkplot.PrePlot(2) 137 | thinkplot.Hist(first_hist, align='right', width=width) 138 | thinkplot.Hist(other_hist, align='left', width=width) 139 | 140 | thinkplot.Save(root='first_nsfg_hist', 141 | title='Histogram', 142 | xlabel='weeks', 143 | ylabel='frequency', 144 | axis=[27, 46, 0, 2700]) 145 | 146 | 147 | def main(script): 148 | live, firsts, others = MakeFrames() 149 | 150 | MakeHists(live) 151 | PrintExtremes(live) 152 | MakeComparison(firsts, others) 153 | Summarize(live, firsts, others) 154 | 155 | 156 | if __name__ == '__main__': 157 | import sys 158 | main(*sys.argv) 159 | 160 | 161 | -------------------------------------------------------------------------------- /scipy/nsfg.py: -------------------------------------------------------------------------------- 1 | """This file contains code for use with "Think Stats", 2 | by Allen B. Downey, available from greenteapress.com 3 | 4 | Copyright 2010 Allen B. Downey 5 | License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html 6 | """ 7 | 8 | from __future__ import print_function 9 | 10 | from collections import defaultdict 11 | import numpy as np 12 | import sys 13 | 14 | import thinkstats2 15 | 16 | 17 | def ReadFemPreg(dct_file='2002FemPreg.dct', 18 | dat_file='2002FemPreg.dat.gz'): 19 | """Reads the NSFG pregnancy data. 20 | 21 | dct_file: string file name 22 | dat_file: string file name 23 | 24 | returns: DataFrame 25 | """ 26 | dct = thinkstats2.ReadStataDct(dct_file) 27 | df = dct.ReadFixedWidth(dat_file, compression='gzip') 28 | CleanFemPreg(df) 29 | return df 30 | 31 | 32 | def CleanFemPreg(df): 33 | """Recodes variables from the pregnancy frame. 34 | 35 | df: DataFrame 36 | """ 37 | # mother's age is encoded in centiyears; convert to years 38 | df.agepreg /= 100.0 39 | 40 | # birthwgt_lb contains at least one bogus value (51 lbs) 41 | # replace with NaN 42 | df.birthwgt_lb[df.birthwgt_lb > 20] = np.nan 43 | 44 | # replace 'not ascertained', 'refused', 'don't know' with NaN 45 | na_vals = [97, 98, 99] 46 | df.birthwgt_lb.replace(na_vals, np.nan, inplace=True) 47 | df.birthwgt_oz.replace(na_vals, np.nan, inplace=True) 48 | df.hpagelb.replace(na_vals, np.nan, inplace=True) 49 | 50 | df.babysex.replace([7, 9], np.nan, inplace=True) 51 | df.nbrnaliv.replace([9], np.nan, inplace=True) 52 | 53 | # birthweight is stored in two columns, lbs and oz. 54 | # convert to a single column in lb 55 | # NOTE: creating a new column requires dictionary syntax, 56 | # not attribute assignment (like df.totalwgt_lb) 57 | df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0 58 | 59 | # due to a bug in ReadStataDct, the last variable gets clipped; 60 | # so for now set it to NaN 61 | df.cmintvw = np.nan 62 | 63 | 64 | def MakePregMap(df): 65 | """Make a map from caseid to list of preg indices. 66 | 67 | df: DataFrame 68 | 69 | returns: dict that maps from caseid to list of indices into preg df 70 | """ 71 | d = defaultdict(list) 72 | for index, caseid in df.caseid.iteritems(): 73 | d[caseid].append(index) 74 | return d 75 | 76 | 77 | def main(script): 78 | """Tests the functions in this module. 
79 | 80 | script: string script name 81 | """ 82 | df = ReadFemPreg() 83 | print(df.shape) 84 | 85 | assert len(df) == 13593 86 | 87 | assert df.caseid[13592] == 12571 88 | assert df.pregordr.value_counts()[1] == 5033 89 | assert df.nbrnaliv.value_counts()[1] == 8981 90 | assert df.babysex.value_counts()[1] == 4641 91 | assert df.birthwgt_lb.value_counts()[7] == 3049 92 | assert df.birthwgt_oz.value_counts()[0] == 1037 93 | assert df.prglngth.value_counts()[39] == 4744 94 | assert df.outcome.value_counts()[1] == 9148 95 | assert df.birthord.value_counts()[1] == 4413 96 | assert df.agepreg.value_counts()[22.75] == 100 97 | assert df.totalwgt_lb.value_counts()[7.5] == 302 98 | 99 | weights = df.finalwgt.value_counts() 100 | key = max(weights.keys()) 101 | assert df.finalwgt.value_counts()[key] == 6 102 | 103 | print('%s: All tests passed.' % script) 104 | 105 | if __name__ == '__main__': 106 | main(*sys.argv) 107 | -------------------------------------------------------------------------------- /scipy/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/scipy/tests/__init__.py -------------------------------------------------------------------------------- /spark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/spark/__init__.py -------------------------------------------------------------------------------- /spark/hdfs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)." 
8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# HDFS" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Run an HDFS command:" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "collapsed": false 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "!hdfs" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "Run a file system command on the file systems (FsShell):" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "!hdfs dfs" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "List the user's home directory:" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "!hdfs dfs -ls" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "List the HDFS root directory:" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "!hdfs dfs -ls /" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Copy a local file to the user's directory on HDFS:" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "!hdfs dfs -put file.txt file.txt" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "Display the contents of the specified HDFS file:" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "!hdfs dfs -cat file.txt" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "Print the last 10 lines of the file to the terminal:" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "!hdfs dfs -cat file.txt | tail -n 10" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "View a directory and all of its files:" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "collapsed": false 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "!hdfs dfs -cat dir/* | less" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "Copy an HDFS file to local:" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "collapsed": false 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "!hdfs dfs -get file.txt file.txt" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "Create a directory on HDFS:" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": false 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "!hdfs dfs -mkdir dir" 195 | ] 196 | }, 197 | { 198 | "cell_type": 
"markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "Recursively delete the specified directory and all of its contents:" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": { 208 | "collapsed": false 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "!hdfs dfs -rm -r dir" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "Specify HDFS file in Spark (paths are relative to the user's home HDFS directory):" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": { 226 | "collapsed": false 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "data = sc.textFile (\"hdfs://hdfs-host:port/path/file.txt\")" 231 | ] 232 | } 233 | ], 234 | "metadata": { 235 | "kernelspec": { 236 | "display_name": "Python 2", 237 | "language": "python", 238 | "name": "python2" 239 | }, 240 | "language_info": { 241 | "codemirror_mode": { 242 | "name": "ipython", 243 | "version": 2 244 | }, 245 | "file_extension": ".py", 246 | "mimetype": "text/x-python", 247 | "name": "python", 248 | "nbconvert_exporter": "python", 249 | "pygments_lexer": "ipython2", 250 | "version": "2.7.10" 251 | } 252 | }, 253 | "nbformat": 4, 254 | "nbformat_minor": 0 255 | } 256 | --------------------------------------------------------------------------------