3 | RUN pip install scikit-learn
4 | ADD *.ipynb /notebooks/
5 | WORKDIR /notebooks
6 | CMD ["/run_jupyter.sh"]
7 |
--------------------------------------------------------------------------------
/deep-learning/tensor-flow-exercises/README.md:
--------------------------------------------------------------------------------
1 | Exercises
2 | ===========================================================
3 |
4 | Building the Docker container
5 | -----------------------------
6 |
7 | docker build -t $USER/exercises .
8 |
9 | Running the container
10 | ---------------------
11 |
12 | docker run -p 8888:8888 -it --rm $USER/exercises
13 |
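Once the container is running, the notebooks should be reachable at http://localhost:8888 in a browser (assuming Docker publishes the mapped port on localhost).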
--------------------------------------------------------------------------------
/deep-learning/theano-tutorial/intro_theano/Makefile:
--------------------------------------------------------------------------------
1 | intro_theano.pdf: slides_source/intro_theano.tex
2 | cd slides_source; pdflatex --shell-escape intro_theano.tex
3 | mv slides_source/intro_theano.pdf .
4 |
--------------------------------------------------------------------------------
/deep-learning/theano-tutorial/intro_theano/intro_theano.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/theano-tutorial/intro_theano/intro_theano.pdf
--------------------------------------------------------------------------------
/deep-learning/theano-tutorial/intro_theano/utils.py:
--------------------------------------------------------------------------------
1 | """ This file contains different utility functions that are not connected
2 | in anyway to the networks presented in the tutorials, but rather help in
3 | processing the outputs into a more understandable way.
4 |
5 | For example ``tile_raster_images`` helps in generating a easy to grasp
6 | image from a set of samples or weights.
7 | """
8 |
9 |
10 | import numpy
11 | from six.moves import xrange
12 |
13 |
14 | def scale_to_unit_interval(ndar, eps=1e-8):
15 | """ Scales all values in the ndarray ndar to be between 0 and 1 """
16 | ndar = ndar.copy()
17 | ndar -= ndar.min()
18 | ndar *= 1.0 / (ndar.max() + eps)
19 | return ndar
20 |
21 |
22 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0),
23 | scale_rows_to_unit_interval=True,
24 | output_pixel_vals=True):
25 | """
26 | Transform an array with one flattened image per row into an array in
27 | which images are reshaped and laid out like tiles on a floor.
28 |
29 | This function is useful for visualizing datasets whose rows are images,
30 | and also columns of matrices for transforming those rows
31 | (such as the first layer of a neural net).
32 |
33 | :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
34 | be 2-D ndarrays or None;
35 | :param X: a 2-D array in which every row is a flattened image.
36 |
37 | :type img_shape: tuple; (height, width)
38 | :param img_shape: the original shape of each image
39 |
40 | :type tile_shape: tuple; (rows, cols)
41 | :param tile_shape: the number of images to tile (rows, cols)
42 |
43 | :param output_pixel_vals: if output should be pixel values (i.e. uint8
44 | values) or floats
45 |
46 | :param scale_rows_to_unit_interval: whether the values should be scaled
47 | to [0, 1] before being plotted
48 |
49 |
50 | :returns: array suitable for viewing as an image.
51 | (See: ``Image.fromarray``.)
52 | :rtype: a 2-D array with the same dtype as X.
53 |
54 | """
55 |
56 | assert len(img_shape) == 2
57 | assert len(tile_shape) == 2
58 | assert len(tile_spacing) == 2
59 |
60 | # The expression below can be re-written in a more C-like style
61 | # as follows:
62 | #
63 | # out_shape = [0,0]
64 | # out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] -
65 | # tile_spacing[0]
66 | # out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] -
67 | # tile_spacing[1]
68 | out_shape = [
69 | (ishp + tsp) * tshp - tsp
70 | for ishp, tshp, tsp in zip(img_shape, tile_shape, tile_spacing)
71 | ]
72 |
73 | if isinstance(X, tuple):
74 | assert len(X) == 4
75 | # Create an output numpy ndarray to store the image
76 | if output_pixel_vals:
77 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4),
78 | dtype='uint8')
79 | else:
80 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4),
81 | dtype=X.dtype)
82 |
83 | # colors default to 0, alpha defaults to 1 (opaque)
84 | if output_pixel_vals:
85 | channel_defaults = [0, 0, 0, 255]
86 | else:
87 | channel_defaults = [0., 0., 0., 1.]
88 |
89 | for i in xrange(4):
90 | if X[i] is None:
91 | # if channel is None, fill it with zeros of the correct
92 | # dtype
93 | dt = out_array.dtype
94 | if output_pixel_vals:
95 | dt = 'uint8'
96 | out_array[:, :, i] = numpy.zeros(
97 | out_shape,
98 | dtype=dt
99 | ) + channel_defaults[i]
100 | else:
101 | # use a recursive call to compute the channel and store it
102 | # in the output
103 | out_array[:, :, i] = tile_raster_images(
104 | X[i], img_shape, tile_shape, tile_spacing,
105 | scale_rows_to_unit_interval, output_pixel_vals)
106 | return out_array
107 |
108 | else:
109 | # if we are dealing with only one channel
110 | H, W = img_shape
111 | Hs, Ws = tile_spacing
112 |
113 | # generate a matrix to store the output
114 | dt = X.dtype
115 | if output_pixel_vals:
116 | dt = 'uint8'
117 | out_array = numpy.zeros(out_shape, dtype=dt)
118 |
119 | for tile_row in xrange(tile_shape[0]):
120 | for tile_col in xrange(tile_shape[1]):
121 | if tile_row * tile_shape[1] + tile_col < X.shape[0]:
122 | this_x = X[tile_row * tile_shape[1] + tile_col]
123 | if scale_rows_to_unit_interval:
124 | # if we should scale values to be between 0 and 1
125 | # do this by calling the `scale_to_unit_interval`
126 | # function
127 | this_img = scale_to_unit_interval(
128 | this_x.reshape(img_shape))
129 | else:
130 | this_img = this_x.reshape(img_shape)
131 | # add the slice to the corresponding position in the
132 | # output array
133 | c = 1
134 | if output_pixel_vals:
135 | c = 255
136 | out_array[
137 | tile_row * (H + Hs): tile_row * (H + Hs) + H,
138 | tile_col * (W + Ws): tile_col * (W + Ws) + W
139 | ] = this_img * c
140 | return out_array
141 |
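A minimal usage sketch for ``tile_raster_images`` (assuming Pillow is installed, and substituting random noise for real samples or weights):

    import numpy
    from PIL import Image
    from utils import tile_raster_images

    # 100 flattened 28x28 "images" of random noise, one per row
    samples = numpy.random.rand(100, 28 * 28)
    tiled = tile_raster_images(samples, img_shape=(28, 28),
                               tile_shape=(10, 10), tile_spacing=(1, 1))
    Image.fromarray(tiled).save('tiled_samples.png')

With the default output_pixel_vals=True, the returned array is uint8 and can be viewed or saved directly via ``Image.fromarray``.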
--------------------------------------------------------------------------------
/deep-learning/theano-tutorial/rnn_tutorial/Makefile:
--------------------------------------------------------------------------------
1 | all: instruction.pdf rnn_lstm.pdf
2 |
3 | instruction.pdf: slides_source/instruction.tex
4 | cd slides_source; pdflatex --shell-escape instruction.tex
5 | cd slides_source; pdflatex --shell-escape instruction.tex
6 | cd slides_source; pdflatex --shell-escape instruction.tex
7 | mv slides_source/instruction.pdf .
8 |
9 | rnn_lstm.pdf: slides_source/rnn_lstm.tex
10 | cd slides_source; pdflatex --shell-escape rnn_lstm.tex
11 | cd slides_source; pdflatex --shell-escape rnn_lstm.tex
12 | cd slides_source; pdflatex --shell-escape rnn_lstm.tex
13 | mv slides_source/rnn_lstm.pdf .
14 |
--------------------------------------------------------------------------------
/deep-learning/theano-tutorial/rnn_tutorial/instruction.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/theano-tutorial/rnn_tutorial/instruction.pdf
--------------------------------------------------------------------------------
/deep-learning/theano-tutorial/rnn_tutorial/rnn_lstm.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/theano-tutorial/rnn_tutorial/rnn_lstm.pdf
--------------------------------------------------------------------------------
/deep-learning/theano-tutorial/rnn_tutorial/synthetic.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import numpy as np
3 |
4 |
5 | def mackey_glass(sample_len=1000, tau=17, seed=None, n_samples=1):
6 | '''
7 | mackey_glass(sample_len=1000, tau=17, seed=None, n_samples=1) -> input
8 | Generate the Mackey-Glass time-series. Parameters are:
9 | - sample_len: length of the time-series in timesteps. Default is 1000.
10 | - tau: delay of the MG system. Commonly used values are tau=17 (mild
11 | chaos) and tau=30 (moderate chaos). Default is 17.
12 | - seed: seeds the random generator, allowing the same time-series to be
13 | generated at each invocation.
14 | - n_samples: number of samples to generate
15 | '''
16 | delta_t = 10
17 | history_len = tau * delta_t
18 | # Initial conditions for the history of the system
19 | timeseries = 1.2
20 |
21 | if seed is not None:
22 | np.random.seed(seed)
23 |
24 | samples = []
25 |
26 | for _ in range(n_samples):
27 | history = collections.deque(1.2 * np.ones(history_len) + 0.2 * \
28 | (np.random.rand(history_len) - 0.5))
29 | # Preallocate the array for the time-series
30 | inp = np.zeros((sample_len,1))
31 |
32 | for timestep in range(sample_len):
33 | for _ in range(delta_t):
34 | xtau = history.popleft()
35 | history.append(timeseries)
36 | timeseries = history[-1] + (0.2 * xtau / (1.0 + xtau ** 10) - \
37 | 0.1 * history[-1]) / delta_t
38 | inp[timestep] = timeseries
39 |
40 | # Squash timeseries through tanh
41 | inp = np.tanh(inp - 1)
42 | samples.append(inp)
43 | return samples
44 |
45 |
46 | def mso(sample_len=1000, n_samples=1):
47 | '''
48 | mso(sample_len=1000, n_samples=1) -> input
49 | Generate the Multiple Sinewave Oscillator time-series, a sum of two sines
50 | with incommensurable periods. Parameters are:
51 | - sample_len: length of the time-series in timesteps
52 | - n_samples: number of samples to generate
53 | '''
54 | signals = []
55 | for _ in range(n_samples):
56 | phase = np.random.rand()
57 | x = np.atleast_2d(np.arange(sample_len)).T
58 | signals.append(np.sin(0.2 * x + phase) + np.sin(0.311 * x + phase))
59 | return signals
60 |
61 |
62 | def lorentz(sample_len=1000, sigma=10, rho=28, beta=8 / 3, step=0.01):
63 | """This function generates a Lorenz time series of length sample_len,
64 | with standard parameters sigma, rho and beta.
65 | """
66 |
67 | x = np.zeros([sample_len])
68 | y = np.zeros([sample_len])
69 | z = np.zeros([sample_len])
70 |
71 | # Initial conditions taken from 'Chaos and Time Series Analysis', J. Sprott
72 | x[0] = 0
73 | y[0] = -0.01
74 | z[0] = 9
75 |
76 | for t in range(sample_len - 1):
77 | x[t + 1] = x[t] + sigma * (y[t] - x[t]) * step
78 | y[t + 1] = y[t] + (x[t] * (rho - z[t]) - y[t]) * step
79 | z[t + 1] = z[t] + (x[t] * y[t] - beta * z[t]) * step
80 |
81 | x.shape += (1,)
82 | y.shape += (1,)
83 | z.shape += (1,)
84 |
85 | return np.concatenate((x, y, z), axis=1)
86 |
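A minimal usage sketch for these generators (assuming this module is importable as ``synthetic`` and matplotlib is installed):

    import matplotlib.pyplot as plt
    from synthetic import mackey_glass, lorentz

    # One reproducible Mackey-Glass sample of 1000 timesteps
    sample = mackey_glass(sample_len=1000, tau=17, seed=42, n_samples=1)[0]
    plt.plot(sample)
    plt.title('Mackey-Glass time-series (tau=17)')
    plt.show()

    # The Lorenz-system generator returns an (N, 3) array of (x, y, z)
    xyz = lorentz(sample_len=5000)
    print(xyz.shape)  # (5000, 3)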
--------------------------------------------------------------------------------
/deep-learning/theano-tutorial/scan_tutorial/scan_ex1_solution.py:
--------------------------------------------------------------------------------
1 | import theano
2 | import theano.tensor as T
3 | import numpy as np
4 |
5 | coefficients = T.vector("coefficients")
6 | x = T.scalar("x")
7 | max_coefficients_supported = 10000
8 |
9 |
10 | def step(coeff, power, prior_value, free_var):
11 | return prior_value + (coeff * (free_var ** power))
12 |
13 | # Generate the components of the polynomial
14 | full_range = T.arange(max_coefficients_supported)
15 | outputs_info = np.zeros((), dtype=theano.config.floatX)
16 |
17 | components, updates = theano.scan(fn=step,
18 | sequences=[coefficients, full_range],
19 | outputs_info=outputs_info,
20 | non_sequences=x)
21 |
22 | polynomial = components[-1]
23 | calculate_polynomial = theano.function(inputs=[coefficients, x],
24 | outputs=polynomial,
25 | updates=updates)
26 |
27 | test_coeff = np.asarray([1, 0, 2], dtype=theano.config.floatX)
28 | print(calculate_polynomial(test_coeff, 3))
29 |
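Because scan returns the full sequence of step outputs, ``components`` holds the running partial sums of the polynomial, and ``components[-1]`` is the completed value. With the test coefficients [1, 0, 2] evaluated at x = 3, the partial sums are 1, 1, 19, so the script prints 19.0 (= 1*3**0 + 0*3**1 + 2*3**2).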
--------------------------------------------------------------------------------
/deep-learning/theano-tutorial/scan_tutorial/scan_ex2_solution.py:
--------------------------------------------------------------------------------
1 | import theano
2 | import theano.tensor as T
3 | import numpy as np
4 |
5 | probabilities = T.vector()
6 | nb_samples = T.iscalar()
7 |
8 | rng = T.shared_randomstreams.RandomStreams(1234)
9 |
10 |
11 | def sample_from_pvect(pvect):
12 | """ Provided utility function: given a symbolic vector of
13 | probabilities (which MUST sum to 1), sample one element
14 | and return its index.
15 | """
16 | onehot_sample = rng.multinomial(n=1, pvals=pvect)
17 | sample = onehot_sample.argmax()
18 | return sample
19 |
20 |
21 | def set_p_to_zero(pvect, i):
22 | """ Provided utility function: given a symbolic vector of
23 | probabilities and an index 'i', set the probability of the
24 | i-th element to 0 and renormalize the probabilities so they
25 | sum to 1.
26 | """
27 | new_pvect = T.set_subtensor(pvect[i], 0.)
28 | new_pvect = new_pvect / new_pvect.sum()
29 | return new_pvect
30 |
31 |
32 | def step(p):
33 | sample = sample_from_pvect(p)
34 | new_p = set_p_to_zero(p, sample)
35 | return new_p, sample
36 |
37 | output, updates = theano.scan(fn=step,
38 | outputs_info=[probabilities, None],
39 | n_steps=nb_samples)
40 |
41 | modified_probabilities, samples = output
42 |
43 | f = theano.function(inputs=[probabilities, nb_samples],
44 | outputs=[samples],
45 | updates=updates)
46 |
47 | # Testing the function
48 | test_probs = np.asarray([0.6, 0.3, 0.1], dtype=theano.config.floatX)
49 | for i in range(10):
50 | print(f(test_probs, 2))
51 |
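Each call to ``f`` draws ``nb_samples`` indices without replacement: ``step`` zeroes out the probability of the index it just sampled and renormalizes before the next draw, so ``f(test_probs, 2)`` prints two distinct indices from {0, 1, 2} (e.g. [array([0, 1])]). Because the RandomStreams updates are passed to ``theano.function``, the random state advances across calls and successive pairs can differ.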
--------------------------------------------------------------------------------
/images/README.sketch:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/README.sketch
--------------------------------------------------------------------------------
/images/README_1200x800.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/README_1200x800.gif
--------------------------------------------------------------------------------
/images/aws.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/aws.png
--------------------------------------------------------------------------------
/images/commands.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/commands.png
--------------------------------------------------------------------------------
/images/cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/cover.png
--------------------------------------------------------------------------------
/images/coversmall.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/coversmall.png
--------------------------------------------------------------------------------
/images/coversmall_alt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/coversmall_alt.png
--------------------------------------------------------------------------------
/images/deep.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/deep.png
--------------------------------------------------------------------------------
/images/k-means.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/k-means.gif
--------------------------------------------------------------------------------
/images/kaggle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/kaggle.png
--------------------------------------------------------------------------------
/images/keras.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/keras.jpg
--------------------------------------------------------------------------------
/images/matplotlib.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/matplotlib.png
--------------------------------------------------------------------------------
/images/mrjob.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/mrjob.png
--------------------------------------------------------------------------------
/images/numpy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/numpy.png
--------------------------------------------------------------------------------
/images/pandas.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/pandas.png
--------------------------------------------------------------------------------
/images/python.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/python.png
--------------------------------------------------------------------------------
/images/regex-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/regex-1.png
--------------------------------------------------------------------------------
/images/regex-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/regex-2.png
--------------------------------------------------------------------------------
/images/scikitlearn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/scikitlearn.png
--------------------------------------------------------------------------------
/images/scipy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/scipy.png
--------------------------------------------------------------------------------
/images/spark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/spark.png
--------------------------------------------------------------------------------
/images/svm.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/svm.gif
--------------------------------------------------------------------------------
/images/tensorflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/tensorflow.png
--------------------------------------------------------------------------------
/images/theano.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/theano.png
--------------------------------------------------------------------------------
/kaggle/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/kaggle/__init__.py
--------------------------------------------------------------------------------
/mapreduce/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/mapreduce/__init__.py
--------------------------------------------------------------------------------
/mapreduce/mr_s3_log_parser.py:
--------------------------------------------------------------------------------
1 |
2 | import time
3 | from mrjob.job import MRJob
4 | from mrjob.protocol import RawValueProtocol, ReprProtocol
5 | import re
6 |
7 |
8 | class MrS3LogParser(MRJob):
9 | """Parses the logs from S3 based on the S3 logging format:
10 | http://docs.aws.amazon.com/AmazonS3/latest/dev/LogFormat.html
11 |
12 | Aggregates a user's daily requests by user agent and operation
13 |
14 | Outputs date_time, requester, user_agent, operation, count
15 | """
16 |
17 | LOGPATS = r'(\S+) (\S+) \[(.*?)\] (\S+) (\S+) ' \
18 | r'(\S+) (\S+) (\S+) ("([^"]+)"|-) ' \
19 | r'(\S+) (\S+) (\S+) (\S+) (\S+) (\S+) ' \
20 | r'("([^"]+)"|-) ("([^"]+)"|-)'
21 | NUM_ENTRIES_PER_LINE = 17
22 | logpat = re.compile(LOGPATS)
23 |
24 | (S3_LOG_BUCKET_OWNER,
25 | S3_LOG_BUCKET,
26 | S3_LOG_DATE_TIME,
27 | S3_LOG_IP,
28 | S3_LOG_REQUESTER_ID,
29 | S3_LOG_REQUEST_ID,
30 | S3_LOG_OPERATION,
31 | S3_LOG_KEY,
32 | S3_LOG_HTTP_METHOD,
33 | S3_LOG_HTTP_STATUS,
34 | S3_LOG_S3_ERROR,
35 | S3_LOG_BYTES_SENT,
36 | S3_LOG_OBJECT_SIZE,
37 | S3_LOG_TOTAL_TIME,
38 | S3_LOG_TURN_AROUND_TIME,
39 | S3_LOG_REFERER,
40 | S3_LOG_USER_AGENT) = range(NUM_ENTRIES_PER_LINE)
41 |
42 | DELIMITER = '\t'
43 |
44 | # We use RawValueProtocol for input to be format agnostic
45 | # and avoid any type of parsing errors
46 | INPUT_PROTOCOL = RawValueProtocol
47 |
48 | # We use RawValueProtocol for output so we can output raw lines
49 | # instead of (k, v) pairs
50 | OUTPUT_PROTOCOL = RawValueProtocol
51 |
52 | # Encode the intermediate records using repr() instead of JSON, so the
53 | # record doesn't get Unicode-encoded
54 | INTERNAL_PROTOCOL = ReprProtocol
55 |
56 | def clean_date_time_zone(self, raw_date_time_zone):
57 | """Converts entry 22/Jul/2013:21:04:17 +0000 to the format
58 | 'YYYY-MM-DD HH:MM:SS' which is more suitable for loading into
59 | a database such as Redshift or RDS
60 |
61 | Note: requires the chars "[ ]" to be stripped prior to input
62 | Returns the converted datetime annd timezone
63 | or None for both values if failed
64 |
65 | TODO: Needs to combine timezone with date as one field
66 | """
67 | date_time = None
68 | time_zone_parsed = None
69 |
70 | # TODO: Probably cleaner to parse this with a regex
71 | date_parsed = raw_date_time_zone[:raw_date_time_zone.find(":")]
72 | time_parsed = raw_date_time_zone[raw_date_time_zone.find(":") + 1:
73 | raw_date_time_zone.find("+") - 1]
74 | time_zone_parsed = raw_date_time_zone[raw_date_time_zone.find("+"):]
75 |
76 | try:
77 | date_struct = time.strptime(date_parsed, "%d/%b/%Y")
78 | converted_date = time.strftime("%Y-%m-%d", date_struct)
79 | date_time = converted_date + " " + time_parsed
80 |
81 | # A ValueError raised here propagates to the calling function,
82 | # which catches and handles it appropriately
83 | except ValueError:
84 | raise
85 | else:
86 | return converted_date, date_time, time_zone_parsed
87 |
88 | def mapper(self, _, line):
89 | line = line.strip()
90 | match = self.logpat.search(line)
91 |
92 | date_time = None
93 | requester = None
94 | user_agent = None
95 | operation = None
96 |
97 | try:
98 | for n in range(self.NUM_ENTRIES_PER_LINE):
99 | group = match.group(1 + n)
100 |
101 | if n == self.S3_LOG_DATE_TIME:
102 | date, date_time, time_zone_parsed = \
103 | self.clean_date_time_zone(group)
104 | # Leave the following line of code if
105 | # you want to aggregate by date
106 | date_time = date + " 00:00:00"
107 | elif n == self.S3_LOG_REQUESTER_ID:
108 | requester = group
109 | elif n == self.S3_LOG_USER_AGENT:
110 | user_agent = group
111 | elif n == self.S3_LOG_OPERATION:
112 | operation = group
113 | else:
114 | pass
115 |
116 | except Exception:
117 | yield (("Error while parsing line: %s", line), 1)
118 | else:
119 | yield ((date_time, requester, user_agent, operation), 1)
120 |
121 | def reducer(self, key, values):
122 | output = list(key)
123 | output = self.DELIMITER.join(output) + \
124 | self.DELIMITER + \
125 | str(sum(values))
126 |
127 | yield None, output
128 |
129 | def steps(self):
130 | return [
131 | self.mr(mapper=self.mapper,
132 | reducer=self.reducer)
133 | ]
134 |
135 |
136 | if __name__ == '__main__':
137 | MrS3LogParser.run()
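A typical invocation, assuming an S3 access log saved locally as s3_log.txt (hypothetical file name); as the accompanying tests show, mrjob's inline runner and "-" for stdin also work:

    python mr_s3_log_parser.py s3_log.txt
    python mr_s3_log_parser.py -r inline --no-conf - < s3_log.txt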
--------------------------------------------------------------------------------
/mapreduce/test_mr_s3_log_parser.py:
--------------------------------------------------------------------------------
1 |
2 | from StringIO import StringIO
3 | import unittest2 as unittest
4 | from mr_s3_log_parser import MrS3LogParser
5 |
6 |
7 | class MrTestsUtil:
8 |
9 | def run_mr_sandbox(self, mr_job, stdin):
10 | # inline runs the job in the same process so small jobs tend to
11 | # run faster and stack traces are simpler
12 | # --no-conf prevents options from local mrjob.conf from polluting
13 | # the testing environment
14 | # "-" reads from standard in
15 | mr_job.sandbox(stdin=stdin)
16 |
17 | # make_runner ensures job cleanup is performed regardless of
18 | # success or failure
19 | with mr_job.make_runner() as runner:
20 | runner.run()
21 | for line in runner.stream_output():
22 | key, value = mr_job.parse_output_line(line)
23 | yield value
24 |
25 |
26 | class TestMrS3LogParser(unittest.TestCase):
27 |
28 | mr_job = None
29 | mr_tests_util = None
30 |
31 | RAW_LOG_LINE_INVALID = \
32 | '00000fe9688b6e57f75bd2b7f7c1610689e8f01000000' \
33 | '00000388225bcc00000 ' \
34 | 's3-storage [22/Jul/2013:21:03:27 +0000] ' \
35 | '00.111.222.33 '
36 |
37 | RAW_LOG_LINE_VALID = \
38 | '00000fe9688b6e57f75bd2b7f7c1610689e8f01000000' \
39 | '00000388225bcc00000 ' \
40 | 's3-storage [22/Jul/2013:21:03:27 +0000] ' \
41 | '00.111.222.33 ' \
42 | 'arn:aws:sts::000005646931:federated-user/user 00000AB825500000 ' \
43 | 'REST.HEAD.OBJECT user/file.pdf ' \
44 | '"HEAD /user/file.pdf?versionId=00000XMHZJp6DjM9x500000' \
45 | '00000SDZk ' \
46 | 'HTTP/1.1" 200 - - 4000272 18 - "-" ' \
47 | '"Boto/2.5.1 (darwin) USER-AGENT/1.0.14.0" ' \
48 | '00000XMHZJp6DjM9x5JVEAMo8MG00000'
49 |
50 | DATE_TIME_ZONE_INVALID = "AB/Jul/2013:21:04:17 +0000"
51 | DATE_TIME_ZONE_VALID = "22/Jul/2013:21:04:17 +0000"
52 | DATE_VALID = "2013-07-22"
53 | DATE_TIME_VALID = "2013-07-22 21:04:17"
54 | TIME_ZONE_VALID = "+0000"
55 |
56 | def __init__(self, *args, **kwargs):
57 | super(TestMrS3LogParser, self).__init__(*args, **kwargs)
58 | self.mr_job = MrS3LogParser(['-r', 'inline', '--no-conf', '-'])
59 | self.mr_tests_util = MrTestsUtil()
60 |
61 | def test_invalid_log_lines(self):
62 | stdin = StringIO(self.RAW_LOG_LINE_INVALID)
63 |
64 | for result in self.mr_tests_util.run_mr_sandbox(self.mr_job, stdin):
65 | self.assertEqual(result.find("Error"), 0)
66 |
67 | def test_valid_log_lines(self):
68 | stdin = StringIO(self.RAW_LOG_LINE_VALID)
69 |
70 | for result in self.mr_tests_util.run_mr_sandbox(self.mr_job, stdin):
71 | self.assertEqual(result.find("Error"), -1)
72 |
73 | def test_clean_date_time_zone(self):
74 | date, date_time, time_zone_parsed = \
75 | self.mr_job.clean_date_time_zone(self.DATE_TIME_ZONE_VALID)
76 | self.assertEqual(date, self.DATE_VALID)
77 | self.assertEqual(date_time, self.DATE_TIME_VALID)
78 | self.assertEqual(time_zone_parsed, self.TIME_ZONE_VALID)
79 |
80 | # Use a lambda to delay the calling of clean_date_time_zone so that
81 | # assertRaises has enough time to handle it properly
82 | self.assertRaises(ValueError,
83 | lambda: self.mr_job.clean_date_time_zone(
84 | self.DATE_TIME_ZONE_INVALID))
85 |
86 | if __name__ == '__main__':
87 | unittest.main()
88 |
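The tests target Python 2 (note the StringIO import) and can be run directly, assuming mrjob and unittest2 are installed:

    python test_mr_s3_log_parser.py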
--------------------------------------------------------------------------------
/matplotlib/04.15-Further-Resources.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "\n",
8 | "
\n",
9 | "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n",
10 | "\n",
11 | "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*\n",
12 | "\n",
13 | "*No changes were made to the contents of this notebook from the original.*"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "\n",
21 | "< [Visualization with Seaborn](04.14-Visualization-With-Seaborn.ipynb) | [Contents](Index.ipynb) | [Machine Learning](05.00-Machine-Learning.ipynb) >"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "# Further Resources"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "## Matplotlib Resources\n",
36 | "\n",
37 | "A single chapter in a book can never hope to cover all the available features and plot types available in Matplotlib.\n",
38 | "As with other packages we've seen, liberal use of IPython's tab-completion and help functions (see [Help and Documentation in IPython](01.01-Help-And-Documentation.ipynb)) can be very helpful when exploring Matplotlib's API.\n",
39 | "In addition, Matplotlib’s [online documentation](http://matplotlib.org/) can be a helpful reference.\n",
40 | "See in particular the [Matplotlib gallery](http://matplotlib.org/gallery.html) linked on that page: it shows thumbnails of hundreds of different plot types, each one linked to a page with the Python code snippet used to generate it.\n",
41 | "In this way, you can visually inspect and learn about a wide range of different plotting styles and visualization techniques.\n",
42 | "\n",
43 | "For a book-length treatment of Matplotlib, I would recommend [*Interactive Applications Using Matplotlib*](https://www.packtpub.com/application-development/interactive-applications-using-matplotlib), written by Matplotlib core developer Ben Root."
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "## Other Python Graphics Libraries\n",
51 | "\n",
52 | "Although Matplotlib is the most prominent Python visualization library, there are other more modern tools that are worth exploring as well.\n",
53 | "I'll mention a few of them briefly here:\n",
54 | "\n",
55 | "- [Bokeh](http://bokeh.pydata.org) is a JavaScript visualization library with a Python frontend that creates highly interactive visualizations capable of handling very large and/or streaming datasets. The Python front-end outputs a JSON data structure that can be interpreted by the Bokeh JS engine.\n",
56 | "- [Plotly](http://plot.ly) is the eponymous open source product of the Plotly company, and is similar in spirit to Bokeh. Because Plotly is the main product of a startup, it is receiving a high level of development effort. Use of the library is entirely free.\n",
57 | "- [Vispy](http://vispy.org/) is an actively developed project focused on dynamic visualizations of very large datasets. Because it is built to target OpenGL and make use of efficient graphics processors in your computer, it is able to render some quite large and stunning visualizations.\n",
58 | "- [Vega](https://vega.github.io/) and [Vega-Lite](https://vega.github.io/vega-lite) are declarative graphics representations, and are the product of years of research into the fundamental language of data visualization. The reference rendering implementation is JavaScript, but the API is language agnostic. There is a Python API under development in the [Altair](https://altair-viz.github.io/) package. Though as of summer 2016 it's not yet fully mature, I'm quite excited for the possibilities of this project to provide a common reference point for visualization in Python and other languages.\n",
59 | "\n",
60 | "The visualization space in the Python community is very dynamic, and I fully expect this list to be out of date as soon as it is published.\n",
61 | "Keep an eye out for what's coming in the future!"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "\n",
69 | "< [Visualization with Seaborn](04.14-Visualization-With-Seaborn.ipynb) | [Contents](Index.ipynb) | [Machine Learning](05.00-Machine-Learning.ipynb) >"
70 | ]
71 | }
72 | ],
73 | "metadata": {
74 | "kernelspec": {
75 | "display_name": "Python 3",
76 | "language": "python",
77 | "name": "python3"
78 | },
79 | "language_info": {
80 | "codemirror_mode": {
81 | "name": "ipython",
82 | "version": 3
83 | },
84 | "file_extension": ".py",
85 | "mimetype": "text/x-python",
86 | "name": "python",
87 | "nbconvert_exporter": "python",
88 | "pygments_lexer": "ipython3",
89 | "version": "3.4.3"
90 | }
91 | },
92 | "nbformat": 4,
93 | "nbformat_minor": 0
94 | }
95 |
--------------------------------------------------------------------------------
/matplotlib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/matplotlib/__init__.py
--------------------------------------------------------------------------------
/matplotlib/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/matplotlib/tests/__init__.py
--------------------------------------------------------------------------------
/misc/regex.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Regex\n",
8 | "\n",
9 | "Credits: Material from [Regular Expressions Cheat Sheet](http://www.cheatography.com/davechild/cheat-sheets/regular-expressions/) by Dave Child\n",
10 | "\n",
11 | "Use with http://www.regexr.com to generate regular expressions."
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "\n",
19 | "
\n",
20 | "
\n",
21 | "
\n",
22 | "
"
23 | ]
24 | }
25 | ],
26 | "metadata": {
27 | "kernelspec": {
28 | "display_name": "Python 2",
29 | "language": "python",
30 | "name": "python2"
31 | },
32 | "language_info": {
33 | "codemirror_mode": {
34 | "name": "ipython",
35 | "version": 2
36 | },
37 | "file_extension": ".py",
38 | "mimetype": "text/x-python",
39 | "name": "python",
40 | "nbconvert_exporter": "python",
41 | "pygments_lexer": "ipython2",
42 | "version": "2.7.10"
43 | }
44 | },
45 | "nbformat": 4,
46 | "nbformat_minor": 0
47 | }
48 |
--------------------------------------------------------------------------------
/numpy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/numpy/__init__.py
--------------------------------------------------------------------------------
/numpy/figures/02.05-broadcasting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/numpy/figures/02.05-broadcasting.png
--------------------------------------------------------------------------------
/numpy/figures/PDSH-cover-small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/numpy/figures/PDSH-cover-small.png
--------------------------------------------------------------------------------
/numpy/figures/array_vs_list.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/numpy/figures/array_vs_list.png
--------------------------------------------------------------------------------
/numpy/figures/cint_vs_pyint.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/numpy/figures/cint_vs_pyint.png
--------------------------------------------------------------------------------
/numpy/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/numpy/tests/__init__.py
--------------------------------------------------------------------------------
/pandas/03.00-Introduction-to-Pandas.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "\n",
8 | "
\n",
9 | "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n",
10 | "\n",
11 | "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*\n",
12 | "\n",
13 | "*No changes were made to the contents of this notebook from the original.*"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "\n",
21 | "< [Structured Data: NumPy's Structured Arrays](02.09-Structured-Data-NumPy.ipynb) | [Contents](Index.ipynb) | [Introducing Pandas Objects](03.01-Introducing-Pandas-Objects.ipynb) >"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "# Data Manipulation with Pandas"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "In the previous chapter, we dove into detail on NumPy and its ``ndarray`` object, which provides efficient storage and manipulation of dense typed arrays in Python.\n",
36 | "Here we'll build on this knowledge by looking in detail at the data structures provided by the Pandas library.\n",
37 | "Pandas is a newer package built on top of NumPy, and provides an efficient implementation of a ``DataFrame``.\n",
38 | "``DataFrame``s are essentially multidimensional arrays with attached row and column labels, and often with heterogeneous types and/or missing data.\n",
39 | "As well as offering a convenient storage interface for labeled data, Pandas implements a number of powerful data operations familiar to users of both database frameworks and spreadsheet programs.\n",
40 | "\n",
41 | "As we saw, NumPy's ``ndarray`` data structure provides essential features for the type of clean, well-organized data typically seen in numerical computing tasks.\n",
42 | "While it serves this purpose very well, its limitations become clear when we need more flexibility (e.g., attaching labels to data, working with missing data, etc.) and when attempting operations that do not map well to element-wise broadcasting (e.g., groupings, pivots, etc.), each of which is an important piece of analyzing the less structured data available in many forms in the world around us.\n",
43 | "Pandas, and in particular its ``Series`` and ``DataFrame`` objects, builds on the NumPy array structure and provides efficient access to these sorts of \"data munging\" tasks that occupy much of a data scientist's time.\n",
44 | "\n",
45 | "In this chapter, we will focus on the mechanics of using ``Series``, ``DataFrame``, and related structures effectively.\n",
46 | "We will use examples drawn from real datasets where appropriate, but these examples are not necessarily the focus."
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "## Installing and Using Pandas\n",
54 | "\n",
55 | "Installation of Pandas on your system requires NumPy to be installed, and if building the library from source, requires the appropriate tools to compile the C and Cython sources on which Pandas is built.\n",
56 | "Details on this installation can be found in the [Pandas documentation](http://pandas.pydata.org/).\n",
57 | "If you followed the advice outlined in the [Preface](00.00-Preface.ipynb) and used the Anaconda stack, you already have Pandas installed.\n",
58 | "\n",
59 | "Once Pandas is installed, you can import it and check the version:"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 1,
65 | "metadata": {
66 | "collapsed": false
67 | },
68 | "outputs": [
69 | {
70 | "data": {
71 | "text/plain": [
72 | "'0.18.1'"
73 | ]
74 | },
75 | "execution_count": 1,
76 | "metadata": {},
77 | "output_type": "execute_result"
78 | }
79 | ],
80 | "source": [
81 | "import pandas\n",
82 | "pandas.__version__"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "Just as we generally import NumPy under the alias ``np``, we will import Pandas under the alias ``pd``:"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 2,
95 | "metadata": {
96 | "collapsed": true
97 | },
98 | "outputs": [],
99 | "source": [
100 | "import pandas as pd"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "This import convention will be used throughout the remainder of this book."
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {},
113 | "source": [
114 | "## Reminder about Built-In Documentation\n",
115 | "\n",
116 | "As you read through this chapter, don't forget that IPython gives you the ability to quickly explore the contents of a package (by using the tab-completion feature) as well as the documentation of various functions (using the ``?`` character). (Refer back to [Help and Documentation in IPython](01.01-Help-And-Documentation.ipynb) if you need a refresher on this.)\n",
117 | "\n",
118 | "For example, to display all the contents of the pandas namespace, you can type\n",
119 | "\n",
120 | "```ipython\n",
121 | "In [3]: pd.\n",
122 | "```\n",
123 | "\n",
124 | "And to display Pandas's built-in documentation, you can use this:\n",
125 | "\n",
126 | "```ipython\n",
127 | "In [4]: pd?\n",
128 | "```\n",
129 | "\n",
130 | "More detailed documentation, along with tutorials and other resources, can be found at http://pandas.pydata.org/."
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "\n",
138 | "< [Structured Data: NumPy's Structured Arrays](02.09-Structured-Data-NumPy.ipynb) | [Contents](Index.ipynb) | [Introducing Pandas Objects](03.01-Introducing-Pandas-Objects.ipynb) >"
139 | ]
140 | }
141 | ],
142 | "metadata": {
143 | "anaconda-cloud": {},
144 | "kernelspec": {
145 | "display_name": "Python 3",
146 | "language": "python",
147 | "name": "python3"
148 | },
149 | "language_info": {
150 | "codemirror_mode": {
151 | "name": "ipython",
152 | "version": 3
153 | },
154 | "file_extension": ".py",
155 | "mimetype": "text/x-python",
156 | "name": "python",
157 | "nbconvert_exporter": "python",
158 | "pygments_lexer": "ipython3",
159 | "version": "3.4.3"
160 | }
161 | },
162 | "nbformat": 4,
163 | "nbformat_minor": 0
164 | }
165 |
--------------------------------------------------------------------------------
/pandas/03.13-Further-Resources.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "\n",
8 | "
\n",
9 | "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n",
10 | "\n",
11 | "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*\n",
12 | "\n",
13 | "*No changes were made to the contents of this notebook from the original.*"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "\n",
21 | "< [High-Performance Pandas: eval() and query()](03.12-Performance-Eval-and-Query.ipynb) | [Contents](Index.ipynb) | [Visualization with Matplotlib](04.00-Introduction-To-Matplotlib.ipynb) >"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "# Further Resources\n",
29 | "\n",
30 | "In this chapter, we've covered many of the basics of using Pandas effectively for data analysis.\n",
31 | "Still, much has been omitted from our discussion.\n",
32 | "To learn more about Pandas, I recommend the following resources:\n",
33 | "\n",
34 | "- [Pandas online documentation](http://pandas.pydata.org/): This is the go-to source for complete documentation of the package. While the examples in the documentation tend to be small generated datasets, the description of the options is complete and generally very useful for understanding the use of various functions.\n",
35 | "\n",
36 | "- [*Python for Data Analysis*](http://shop.oreilly.com/product/0636920023784.do) Written by Wes McKinney (the original creator of Pandas), this book contains much more detail on the Pandas package than we had room for in this chapter. In particular, he takes a deep dive into tools for time series, which were his bread and butter as a financial consultant. The book also has many entertaining examples of applying Pandas to gain insight from real-world datasets. Keep in mind, though, that the book is now several years old, and the Pandas package has quite a few new features that this book does not cover (but be on the lookout for a new edition in 2017).\n",
37 | "\n",
38 | "- [Stack Overflow](http://stackoverflow.com/questions/tagged/pandas): Pandas has so many users that any question you have has likely been asked and answered on Stack Overflow. Using Pandas is a case where some Google-Fu is your best friend. Simply go to your favorite search engine and type in the question, problem, or error you're coming across–more than likely you'll find your answer on a Stack Overflow page.\n",
39 | "\n",
40 | "- [Pandas on PyVideo](http://pyvideo.org/search?q=pandas): From PyCon to SciPy to PyData, many conferences have featured tutorials from Pandas developers and power users. The PyCon tutorials in particular tend to be given by very well-vetted presenters.\n",
41 | "\n",
42 | "Using these resources, combined with the walk-through given in this chapter, my hope is that you'll be poised to use Pandas to tackle any data analysis problem you come across!"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "\n",
50 | "< [High-Performance Pandas: eval() and query()](03.12-Performance-Eval-and-Query.ipynb) | [Contents](Index.ipynb) | [Visualization with Matplotlib](04.00-Introduction-To-Matplotlib.ipynb) >"
51 | ]
52 | }
53 | ],
54 | "metadata": {
55 | "anaconda-cloud": {},
56 | "kernelspec": {
57 | "display_name": "Python 3",
58 | "language": "python",
59 | "name": "python3"
60 | },
61 | "language_info": {
62 | "codemirror_mode": {
63 | "name": "ipython",
64 | "version": 3
65 | },
66 | "file_extension": ".py",
67 | "mimetype": "text/x-python",
68 | "name": "python",
69 | "nbconvert_exporter": "python",
70 | "pygments_lexer": "ipython3",
71 | "version": "3.4.3"
72 | }
73 | },
74 | "nbformat": 4,
75 | "nbformat_minor": 0
76 | }
77 |
--------------------------------------------------------------------------------
/pandas/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/pandas/__init__.py
--------------------------------------------------------------------------------
/pandas/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/pandas/tests/__init__.py
--------------------------------------------------------------------------------
/python-data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/python-data/__init__.py
--------------------------------------------------------------------------------
/python-data/files.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)."
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Files\n",
15 | "\n",
16 | "* Read a File\n",
17 | "* Write a File\n",
18 | "* Read and Write UTF-8"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## Read a File\n",
26 | "\n",
27 | "Open a file in read-only mode.
\n",
28 | "Iterate over the file lines. rstrip removes the EOL markers.
"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 1,
34 | "metadata": {
35 | "collapsed": false
36 | },
37 | "outputs": [
38 | {
39 | "name": "stdout",
40 | "output_type": "stream",
41 | "text": [
42 | "class TypeUtil:\n",
43 | "\n",
44 | " @classmethod\n",
45 | " def is_iterable(cls, obj):\n",
46 | " \"\"\"Determines if obj is iterable.\n",
47 | "\n",
48 | " Useful when writing functions that can accept multiple types of\n",
49 | " input (list, tuple, ndarray, iterator). Pairs well with\n",
50 | " convert_to_list.\n",
51 | " \"\"\"\n",
52 | " try:\n",
53 | " iter(obj)\n",
54 | " return True\n",
55 | " except TypeError:\n",
56 | " return False\n",
57 | "\n",
58 | " @classmethod\n",
59 | " def convert_to_list(cls, obj):\n",
60 | " \"\"\"Converts obj to a list if it is not a list and it is iterable,\n",
61 | " else returns the original obj.\n",
62 | " \"\"\"\n",
63 | " if not isinstance(obj, list) and cls.is_iterable(obj):\n",
64 | " obj = list(obj)\n",
65 | " return obj\n"
66 | ]
67 | }
68 | ],
69 | "source": [
70 | "old_file_path = 'type_util.py'\n",
71 | "with open(old_file_path, 'r') as old_file:\n",
72 | " for line in old_file:\n",
73 | " print(line.rstrip())"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "## Write to a file\n",
81 | "\n",
82 | "Create a new file overwriting any previous file with the same name, write text, then close the file:"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 2,
88 | "metadata": {
89 | "collapsed": false
90 | },
91 | "outputs": [],
92 | "source": [
93 | "new_file_path = 'hello_world.txt'\n",
94 | "with open(new_file_path, 'w') as new_file:\n",
95 | " new_file.write('hello world!')"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "## Read and Write UTF-8"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 3,
108 | "metadata": {
109 | "collapsed": false
110 | },
111 | "outputs": [],
112 | "source": [
113 | "import codecs\n",
114 | "with codecs.open(\"hello_world_new.txt\", \"a\", \"utf-8\") as new_file:\n",
115 | " with codecs.open(\"hello_world.txt\", \"r\", \"utf-8\") as old_file: \n",
116 | " for line in old_file:\n",
117 | " new_file.write(line + '\\n')"
118 | ]
119 | }
120 | ],
121 | "metadata": {
122 | "kernelspec": {
123 | "display_name": "Python 2",
124 | "language": "python",
125 | "name": "python2"
126 | },
127 | "language_info": {
128 | "codemirror_mode": {
129 | "name": "ipython",
130 | "version": 2
131 | },
132 | "file_extension": ".py",
133 | "mimetype": "text/x-python",
134 | "name": "python",
135 | "nbconvert_exporter": "python",
136 | "pygments_lexer": "ipython2",
137 | "version": "2.7.10"
138 | }
139 | },
140 | "nbformat": 4,
141 | "nbformat_minor": 0
142 | }
143 |
--------------------------------------------------------------------------------
/python-data/hello_world.txt:
--------------------------------------------------------------------------------
1 | hello world!
--------------------------------------------------------------------------------
/python-data/logs.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)."
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Logging in Python\n",
15 | "* Logging with RotatingFileHandler\n",
16 | "* Logging with TimedRotatingFileHandler "
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "## Logging with RotatingFileHandler\n",
24 | "\n",
25 | "The logging discussion is taken from the [Python Logging Cookbook](https://docs.python.org/2/howto/logging-cookbook.html#using-file-rotation):\n",
26 | "\n",
27 | "Sometimes you want to let a log file grow to a certain size, then open a new file and log to that. You may want to keep a certain number of these files, and when that many files have been created, rotate the files so that the number of files and the size of the files both remain bounded. For this usage pattern, the logging package provides a RotatingFileHandler.\n",
28 | "\n",
29 | "The most current file is always logging_rotatingfile_example.out, and each time it reaches the size limit it is renamed with the suffix .1. Each of the existing backup files is renamed to increment the suffix (.1 becomes .2, etc.) and the .6 file is erased.\n",
30 | "\n",
31 | "The following code snippet is taken from [here](http://www.blog.pythonlibrary.org/2014/02/11/python-how-to-create-rotating-logs/)."
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {
38 | "collapsed": true
39 | },
40 | "outputs": [],
41 | "source": [
42 | "import logging\n",
43 | "import time\n",
44 | " \n",
45 | "from logging.handlers import RotatingFileHandler\n",
46 | " \n",
47 | "#----------------------------------------------------------------------\n",
48 | "def create_rotating_log(path):\n",
49 | " \"\"\"\n",
50 | " Creates a rotating log\n",
51 | " \"\"\"\n",
52 | " logger = logging.getLogger(\"Rotating Log\")\n",
53 | " logger.setLevel(logging.INFO)\n",
54 | " \n",
55 | " # add a rotating handler\n",
56 | " handler = RotatingFileHandler(path, maxBytes=20,\n",
57 | " backupCount=5)\n",
58 | " logger.addHandler(handler)\n",
59 | " \n",
60 | " for i in range(10):\n",
61 | " logger.info(\"This is test log line %s\" % i)\n",
62 | " time.sleep(1.5)\n",
63 | " \n",
64 | "#----------------------------------------------------------------------\n",
65 | "if __name__ == \"__main__\":\n",
66 | " log_file = \"test.log\"\n",
67 | " create_rotating_log(log_file)"
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {},
73 | "source": [
74 | "## Logging with TimedRotatingFileHandler\n",
75 | "\n",
76 | "The following code snippet is taken from [here](http://www.blog.pythonlibrary.org/2014/02/11/python-how-to-create-rotating-logs/)."
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {
83 | "collapsed": false
84 | },
85 | "outputs": [],
86 | "source": [
87 | "import logging\n",
88 | "import time\n",
89 | " \n",
90 | "from logging.handlers import TimedRotatingFileHandler\n",
91 | " \n",
92 | "#----------------------------------------------------------------------\n",
93 | "def create_timed_rotating_log(path):\n",
94 | " \"\"\"\"\"\"\n",
95 | " logger = logging.getLogger(\"Rotating Log\")\n",
96 | " logger.setLevel(logging.INFO)\n",
97 | " \n",
98 | " # Rotate log based on when parameter:\n",
99 | " # second (s)\n",
100 | " # minute (m)\n",
101 | " # hour (h)\n",
102 | " # day (d)\n",
103 | " # w0-w6 (weekday, 0=Monday)\n",
104 | " # midnight\n",
105 | " handler = TimedRotatingFileHandler(path,\n",
106 | " when=\"m\",\n",
107 | " interval=1,\n",
108 | " backupCount=5)\n",
109 | " logger.addHandler(handler)\n",
110 | " \n",
111 | " for i in range(20):\n",
112 | " logger.info(\"This is a test!\")\n",
113 | " time.sleep(1.5)\n",
114 | " \n",
115 | "#----------------------------------------------------------------------\n",
116 | "if __name__ == \"__main__\":\n",
117 | " log_file = \"timed_test.log\"\n",
118 | " create_timed_rotating_log(log_file)"
119 | ]
120 | }
121 | ],
122 | "metadata": {
123 | "kernelspec": {
124 | "display_name": "Python 2",
125 | "language": "python",
126 | "name": "python2"
127 | },
128 | "language_info": {
129 | "codemirror_mode": {
130 | "name": "ipython",
131 | "version": 2
132 | },
133 | "file_extension": ".py",
134 | "mimetype": "text/x-python",
135 | "name": "python",
136 | "nbconvert_exporter": "python",
137 | "pygments_lexer": "ipython2",
138 | "version": "2.7.10"
139 | }
140 | },
141 | "nbformat": 4,
142 | "nbformat_minor": 0
143 | }
144 |
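Neither snippet above attaches a formatter, so the rotated files contain bare messages. A hedged sketch of adding timestamps and level names with `logging.Formatter` (the logger and file names here are illustrative):

```python
import logging
from logging.handlers import RotatingFileHandler

logger = logging.getLogger("Formatted Rotating Log")
logger.setLevel(logging.INFO)

handler = RotatingFileHandler("formatted_test.log",
                              maxBytes=1024, backupCount=5)
# Prefix each record with a timestamp and its level name
handler.setFormatter(
    logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
logger.addHandler(handler)

logger.info("This record carries a timestamp and level")
```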
--------------------------------------------------------------------------------
/python-data/pdb.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)."
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# PDB\n",
15 | "\n",
16 | "The pdb module defines an interactive source code debugger for Python programs. Below are frequently used commands:"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": null,
22 | "metadata": {
23 | "collapsed": false
24 | },
25 | "outputs": [],
26 | "source": [
27 | "# Run pdb when this line is hit\n",
28 | "import pdb; pdb.set_trace()\n",
29 | "\n",
30 | "# Run pdb when the script is run\n",
31 | "python -m pdb script.py\n",
32 | "\n",
33 | "# Help\n",
34 | "h[elp]\n",
35 | "\n",
36 | "# Show current content\n",
37 | "l[ist]\n",
38 | "\n",
39 | "# Examine variables\n",
40 | "p[rint]\n",
41 | "\n",
42 | "# Pretty print\n",
43 | "pp\n",
44 | "\n",
45 | "# Go to next line\n",
46 | "n[ext]\n",
47 | "\n",
48 | "# Step into\n",
49 | "s[tep]\n",
50 | "\n",
51 | "# Continue execution until the line with the line number greater \n",
52 | "# than the current one is reached or when returning from current frame.\n",
53 | "until\n",
54 | "\n",
55 | "# Return\n",
56 | "r[eturn]\n",
57 | "\n",
58 | "# See all breakpoints\n",
59 | "b to see all breakpoints\n",
60 | "\n",
61 | "# Set breakpoint at line 16\n",
62 | "b 16 \n",
63 | "\n",
64 | "# Clear breakpoint 1\n",
65 | "cl[ear] 1\n",
66 | "\n",
67 | "# Continue\n",
68 | "c[ontinue]\n",
69 | "\n",
70 | "# Conditional breakpoints, line 11\n",
71 | "b 11, this_year == 2015\n",
72 | "\n",
73 | "# Stack location\n",
74 | "w[here]\n",
75 | "\n",
76 | "# Go up in stack\n",
77 | "u[p]\n",
78 | "\n",
79 | "# Go down in stack\n",
80 | "d[own]\n",
81 | "\n",
82 | "# Longlist shows full method of where you're in (Python 3)\n",
83 | "ll\n",
84 | "\n",
85 | "# Quit\n",
86 | "q[uit]"
87 | ]
88 | }
89 | ],
90 | "metadata": {
91 | "kernelspec": {
92 | "display_name": "Python 2",
93 | "language": "python",
94 | "name": "python2"
95 | },
96 | "language_info": {
97 | "codemirror_mode": {
98 | "name": "ipython",
99 | "version": 2
100 | },
101 | "file_extension": ".py",
102 | "mimetype": "text/x-python",
103 | "name": "python",
104 | "nbconvert_exporter": "python",
105 | "pygments_lexer": "ipython2",
106 | "version": "2.7.10"
107 | }
108 | },
109 | "nbformat": 4,
110 | "nbformat_minor": 0
111 | }
112 |
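To tie the cheat sheet together, a minimal throwaway script (the file name `buggy.py` is hypothetical) that pauses in the debugger just before a failing line; at the `(Pdb)` prompt the commands above apply directly:

```python
# buggy.py -- run with: python buggy.py
import pdb


def average(values):
    total = sum(values)
    pdb.set_trace()  # execution pauses here; try: p total, l, n, c
    return total / len(values)  # ZeroDivisionError when values is empty


if __name__ == '__main__':
    print(average([1, 2, 3]))
    print(average([]))
```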
--------------------------------------------------------------------------------
/python-data/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/python-data/tests/__init__.py
--------------------------------------------------------------------------------
/python-data/tests/test_transform_util.py:
--------------------------------------------------------------------------------
1 | from nose.tools import assert_equal
2 | from ..transform_util import TransformUtil
3 |
4 |
5 | class TestTransformUtil():
6 |
7 |     states = [' Alabama ', 'Georgia!', 'Georgia', 'georgia',
8 | 'FlOrIda', 'south carolina##', 'West virginia?']
9 |
10 | expected_output = ['Alabama',
11 | 'Georgia',
12 | 'Georgia',
13 | 'Georgia',
14 | 'Florida',
15 | 'South Carolina',
16 | 'West Virginia']
17 |
18 | def test_remove_punctuation(self):
19 | assert_equal(TransformUtil.remove_punctuation('!#?'), '')
20 |
21 | def test_map_remove_punctuation(self):
22 | # Map applies a function to a collection
23 | output = map(TransformUtil.remove_punctuation, self.states)
24 | assert_equal('!#?' not in output, True)
25 |
26 | def test_clean_strings(self):
27 | clean_ops = [str.strip, TransformUtil.remove_punctuation, str.title]
28 | output = TransformUtil.clean_strings(self.states, clean_ops)
29 | assert_equal(output, self.expected_output)
--------------------------------------------------------------------------------
/python-data/tests/test_type_util.py:
--------------------------------------------------------------------------------
1 | from nose.tools import assert_equal
2 | from ..type_util import TypeUtil
3 |
4 |
5 | class TestUtil():
6 |
7 | def test_is_iterable(self):
8 | assert_equal(TypeUtil.is_iterable('foo'), True)
9 | assert_equal(TypeUtil.is_iterable(7), False)
10 |
11 | def test_convert_to_list(self):
12 | assert_equal(isinstance(TypeUtil.convert_to_list('foo'), list), True)
13 | assert_equal(isinstance(TypeUtil.convert_to_list(7), list), False)
--------------------------------------------------------------------------------
/python-data/transform_util.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | class TransformUtil:
5 |
6 | @classmethod
7 | def remove_punctuation(cls, value):
8 | """Removes !, #, and ?.
9 | """
10 | return re.sub('[!#?]', '', value)
11 |
12 | @classmethod
13 | def clean_strings(cls, strings, ops):
14 | """General purpose method to clean strings.
15 |
16 | Pass in a sequence of strings and the operations to perform.
17 | """
18 | result = []
19 | for value in strings:
20 | for function in ops:
21 | value = function(value)
22 | result.append(value)
23 | return result
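For reference, a short usage sketch of `clean_strings` that mirrors the test suite above (assumes it is run from the same directory so the plain import resolves):

```python
from transform_util import TransformUtil

states = [' Alabama ', 'georgia!']
ops = [str.strip, TransformUtil.remove_punctuation, str.title]
print(TransformUtil.clean_strings(states, ops))  # ['Alabama', 'Georgia']
```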
--------------------------------------------------------------------------------
/python-data/type_util.py:
--------------------------------------------------------------------------------
1 | class TypeUtil:
2 |
3 | @classmethod
4 | def is_iterable(cls, obj):
5 | """Determines if obj is iterable.
6 |
7 | Useful when writing functions that can accept multiple types of
8 | input (list, tuple, ndarray, iterator). Pairs well with
9 | convert_to_list.
10 | """
11 | try:
12 | iter(obj)
13 | return True
14 | except TypeError:
15 | return False
16 |
17 | @classmethod
18 | def convert_to_list(cls, obj):
19 | """Converts obj to a list if it is not a list and it is iterable,
20 | else returns the original obj.
21 | """
22 | if not isinstance(obj, list) and cls.is_iterable(obj):
23 | obj = list(obj)
24 | return obj
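A quick usage sketch, again assuming the module is importable from the working directory:

```python
from type_util import TypeUtil

TypeUtil.is_iterable('foo')       # True: strings support iter()
TypeUtil.is_iterable(7)           # False: iter(7) raises TypeError
TypeUtil.convert_to_list((1, 2))  # [1, 2]: an iterable non-list becomes a list
TypeUtil.convert_to_list(7)       # 7: non-iterables pass through unchanged
```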
--------------------------------------------------------------------------------
/python-data/unit_tests.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)."
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Nose Unit Tests with IPython Notebook"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Nose\n",
22 | "\n",
23 | "Testing is a vital part of software development. Nose extends unittest to make testing easier."
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "## Install Nose\n",
31 | "\n",
32 | "Run the following command line:"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "metadata": {
39 | "collapsed": false
40 | },
41 | "outputs": [],
42 | "source": [
43 | "!pip install nose"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "## Create the Code\n",
51 | "\n",
52 | "Save your code to a file with the %%file magic:"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 1,
58 | "metadata": {
59 | "collapsed": false
60 | },
61 | "outputs": [
62 | {
63 | "name": "stdout",
64 | "output_type": "stream",
65 | "text": [
66 | "Overwriting type_util.py\n"
67 | ]
68 | }
69 | ],
70 | "source": [
71 | "%%file type_util.py\n",
72 | "class TypeUtil:\n",
73 | "\n",
74 | " @classmethod\n",
75 | " def is_iterable(cls, obj):\n",
76 | " \"\"\"Determines if obj is iterable.\n",
77 | "\n",
78 | " Useful when writing functions that can accept multiple types of\n",
79 | " input (list, tuple, ndarray, iterator). Pairs well with\n",
80 | " convert_to_list.\n",
81 | " \"\"\"\n",
82 | " try:\n",
83 | " iter(obj)\n",
84 | " return True\n",
85 | " except TypeError:\n",
86 | " return False\n",
87 | "\n",
88 | " @classmethod\n",
89 | " def convert_to_list(cls, obj):\n",
90 | " \"\"\"Converts obj to a list if it is not a list and it is iterable, \n",
91 | " else returns the original obj.\n",
92 | " \"\"\"\n",
93 | " if not isinstance(obj, list) and cls.is_iterable(obj):\n",
94 | " obj = list(obj)\n",
95 | " return obj\n"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "## Create the Nose Tests\n",
103 | "\n",
104 | "Save your test to a file with the %%file magic:"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 2,
110 | "metadata": {
111 | "collapsed": false
112 | },
113 | "outputs": [
114 | {
115 | "name": "stdout",
116 | "output_type": "stream",
117 | "text": [
118 | "Overwriting tests/test_type_util.py\n"
119 | ]
120 | }
121 | ],
122 | "source": [
123 | "%%file tests/test_type_util.py\n",
124 | "from nose.tools import assert_equal\n",
125 | "from ..type_util import TypeUtil\n",
126 | "\n",
127 | "\n",
128 | "class TestUtil():\n",
129 | "\n",
130 | " def test_is_iterable(self):\n",
131 | " assert_equal(TypeUtil.is_iterable('foo'), True)\n",
132 | " assert_equal(TypeUtil.is_iterable(7), False)\n",
133 | "\n",
134 | " def test_convert_to_list(self):\n",
135 | " assert_equal(isinstance(TypeUtil.convert_to_list('foo'), list), True)\n",
136 | " assert_equal(isinstance(TypeUtil.convert_to_list(7), list), False)"
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {},
142 | "source": [
143 | "## Run the Nose Tests\n",
144 | "\n",
145 | "Run the following command line:"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 3,
151 | "metadata": {
152 | "collapsed": false
153 | },
154 | "outputs": [
155 | {
156 | "name": "stdout",
157 | "output_type": "stream",
158 | "text": [
159 | "core.tests.test_type_util.TestUtil.test_convert_to_list ... ok\r\n",
160 | "core.tests.test_type_util.TestUtil.test_is_iterable ... ok\r\n",
161 | "\r\n",
162 | "----------------------------------------------------------------------\r\n",
163 | "Ran 2 tests in 0.001s\r\n",
164 | "\r\n",
165 | "OK\r\n"
166 | ]
167 | }
168 | ],
169 | "source": [
170 | "!nosetests tests/test_type_util.py -v"
171 | ]
172 | }
173 | ],
174 | "metadata": {
175 | "kernelspec": {
176 | "display_name": "Python 2",
177 | "language": "python",
178 | "name": "python2"
179 | },
180 | "language_info": {
181 | "codemirror_mode": {
182 | "name": "ipython",
183 | "version": 2
184 | },
185 | "file_extension": ".py",
186 | "mimetype": "text/x-python",
187 | "name": "python",
188 | "nbconvert_exporter": "python",
189 | "pygments_lexer": "ipython2",
190 | "version": "2.7.10"
191 | }
192 | },
193 | "nbformat": 4,
194 | "nbformat_minor": 0
195 | }
196 |
--------------------------------------------------------------------------------
/scikit-learn/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/scikit-learn/__init__.py
--------------------------------------------------------------------------------
/scikit-learn/fig_code/ML_flow_chart.py:
--------------------------------------------------------------------------------
1 | """
2 | Tutorial Diagrams
3 | -----------------
4 |
5 | This script plots the flow-charts used in the scikit-learn tutorials.
6 | """
7 |
8 | import numpy as np
9 | import pylab as pl
10 | from matplotlib.patches import Circle, Rectangle, Polygon, Arrow, FancyArrow
11 |
12 | def create_base(box_bg = '#CCCCCC',
13 | arrow1 = '#88CCFF',
14 | arrow2 = '#88FF88',
15 | supervised=True):
16 | fig = pl.figure(figsize=(9, 6), facecolor='w')
17 | ax = pl.axes((0, 0, 1, 1),
18 | xticks=[], yticks=[], frameon=False)
19 | ax.set_xlim(0, 9)
20 | ax.set_ylim(0, 6)
21 |
22 | patches = [Rectangle((0.3, 3.6), 1.5, 1.8, zorder=1, fc=box_bg),
23 | Rectangle((0.5, 3.8), 1.5, 1.8, zorder=2, fc=box_bg),
24 | Rectangle((0.7, 4.0), 1.5, 1.8, zorder=3, fc=box_bg),
25 |
26 | Rectangle((2.9, 3.6), 0.2, 1.8, fc=box_bg),
27 | Rectangle((3.1, 3.8), 0.2, 1.8, fc=box_bg),
28 | Rectangle((3.3, 4.0), 0.2, 1.8, fc=box_bg),
29 |
30 | Rectangle((0.3, 0.2), 1.5, 1.8, fc=box_bg),
31 |
32 | Rectangle((2.9, 0.2), 0.2, 1.8, fc=box_bg),
33 |
34 | Circle((5.5, 3.5), 1.0, fc=box_bg),
35 |
36 | Polygon([[5.5, 1.7],
37 | [6.1, 1.1],
38 | [5.5, 0.5],
39 | [4.9, 1.1]], fc=box_bg),
40 |
41 | FancyArrow(2.3, 4.6, 0.35, 0, fc=arrow1,
42 | width=0.25, head_width=0.5, head_length=0.2),
43 |
44 | FancyArrow(3.75, 4.2, 0.5, -0.2, fc=arrow1,
45 | width=0.25, head_width=0.5, head_length=0.2),
46 |
47 | FancyArrow(5.5, 2.4, 0, -0.4, fc=arrow1,
48 | width=0.25, head_width=0.5, head_length=0.2),
49 |
50 | FancyArrow(2.0, 1.1, 0.5, 0, fc=arrow2,
51 | width=0.25, head_width=0.5, head_length=0.2),
52 |
53 | FancyArrow(3.3, 1.1, 1.3, 0, fc=arrow2,
54 | width=0.25, head_width=0.5, head_length=0.2),
55 |
56 | FancyArrow(6.2, 1.1, 0.8, 0, fc=arrow2,
57 | width=0.25, head_width=0.5, head_length=0.2)]
58 |
59 | if supervised:
60 | patches += [Rectangle((0.3, 2.4), 1.5, 0.5, zorder=1, fc=box_bg),
61 | Rectangle((0.5, 2.6), 1.5, 0.5, zorder=2, fc=box_bg),
62 | Rectangle((0.7, 2.8), 1.5, 0.5, zorder=3, fc=box_bg),
63 | FancyArrow(2.3, 2.9, 2.0, 0, fc=arrow1,
64 | width=0.25, head_width=0.5, head_length=0.2),
65 | Rectangle((7.3, 0.85), 1.5, 0.5, fc=box_bg)]
66 | else:
67 | patches += [Rectangle((7.3, 0.2), 1.5, 1.8, fc=box_bg)]
68 |
69 | for p in patches:
70 | ax.add_patch(p)
71 |
72 | pl.text(1.45, 4.9, "Training\nText,\nDocuments,\nImages,\netc.",
73 | ha='center', va='center', fontsize=14)
74 |
75 | pl.text(3.6, 4.9, "Feature\nVectors",
76 | ha='left', va='center', fontsize=14)
77 |
78 | pl.text(5.5, 3.5, "Machine\nLearning\nAlgorithm",
79 | ha='center', va='center', fontsize=14)
80 |
81 | pl.text(1.05, 1.1, "New Text,\nDocument,\nImage,\netc.",
82 | ha='center', va='center', fontsize=14)
83 |
84 | pl.text(3.3, 1.7, "Feature\nVector",
85 | ha='left', va='center', fontsize=14)
86 |
87 | pl.text(5.5, 1.1, "Predictive\nModel",
88 | ha='center', va='center', fontsize=12)
89 |
90 | if supervised:
91 | pl.text(1.45, 3.05, "Labels",
92 | ha='center', va='center', fontsize=14)
93 |
94 | pl.text(8.05, 1.1, "Expected\nLabel",
95 | ha='center', va='center', fontsize=14)
96 | pl.text(8.8, 5.8, "Supervised Learning Model",
97 | ha='right', va='top', fontsize=18)
98 |
99 | else:
100 | pl.text(8.05, 1.1,
101 | "Likelihood\nor Cluster ID\nor Better\nRepresentation",
102 | ha='center', va='center', fontsize=12)
103 | pl.text(8.8, 5.8, "Unsupervised Learning Model",
104 | ha='right', va='top', fontsize=18)
105 |
106 |
107 |
108 | def plot_supervised_chart(annotate=False):
109 | create_base(supervised=True)
110 | if annotate:
111 | fontdict = dict(color='r', weight='bold', size=14)
112 | pl.text(1.9, 4.55, 'X = vec.fit_transform(input)',
113 | fontdict=fontdict,
114 | rotation=20, ha='left', va='bottom')
115 | pl.text(3.7, 3.2, 'clf.fit(X, y)',
116 | fontdict=fontdict,
117 | rotation=20, ha='left', va='bottom')
118 | pl.text(1.7, 1.5, 'X_new = vec.transform(input)',
119 | fontdict=fontdict,
120 | rotation=20, ha='left', va='bottom')
121 | pl.text(6.1, 1.5, 'y_new = clf.predict(X_new)',
122 | fontdict=fontdict,
123 | rotation=20, ha='left', va='bottom')
124 |
125 | def plot_unsupervised_chart():
126 | create_base(supervised=False)
127 |
128 |
129 | if __name__ == '__main__':
130 | plot_supervised_chart(False)
131 | plot_supervised_chart(True)
132 | plot_unsupervised_chart()
133 | pl.show()
134 |
135 |
136 |
--------------------------------------------------------------------------------
/scikit-learn/fig_code/__init__.py:
--------------------------------------------------------------------------------
1 | from .data import *
2 | from .figures import *
3 |
4 | from .sgd_separator import plot_sgd_separator
5 | from .linear_regression import plot_linear_regression
6 | from .helpers import plot_iris_knn
7 |
--------------------------------------------------------------------------------
/scikit-learn/fig_code/data.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def linear_data_sample(N=40, rseed=0, m=3, b=-2):
5 | rng = np.random.RandomState(rseed)
6 |
7 | x = 10 * rng.rand(N)
8 | dy = m / 2 * (1 + rng.rand(N))
9 | y = m * x + b + dy * rng.randn(N)
10 |
11 | return (x, y, dy)
12 |
13 |
14 | def linear_data_sample_big_errs(N=40, rseed=0, m=3, b=-2):
15 | rng = np.random.RandomState(rseed)
16 |
17 | x = 10 * rng.rand(N)
18 | dy = m / 2 * (1 + rng.rand(N))
19 | dy[20:25] *= 10
20 | y = m * x + b + dy * rng.randn(N)
21 |
22 | return (x, y, dy)
23 |
24 |
25 | def sample_light_curve(phased=True):
26 | from astroML.datasets import fetch_LINEAR_sample
27 | data = fetch_LINEAR_sample()
28 | t, y, dy = data[18525697].T
29 |
30 | if phased:
31 | P_best = 0.580313015651
32 | t /= P_best
33 |
34 | return (t, y, dy)
35 |
36 |
37 | def sample_light_curve_2(phased=True):
38 | from astroML.datasets import fetch_LINEAR_sample
39 | data = fetch_LINEAR_sample()
40 | t, y, dy = data[10022663].T
41 |
42 | if phased:
43 | P_best = 0.61596079804
44 | t /= P_best
45 |
46 | return (t, y, dy)
47 |
48 |
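Since `dy` is returned as a per-point error estimate, a hedged plotting sketch (assumes matplotlib is available and the script runs from the fig_code directory):

```python
import matplotlib.pyplot as plt
from data import linear_data_sample

x, y, dy = linear_data_sample()
plt.errorbar(x, y, dy, fmt='o')  # the third positional argument is yerr
plt.show()
```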
--------------------------------------------------------------------------------
/scikit-learn/fig_code/helpers.py:
--------------------------------------------------------------------------------
1 | """
2 | Small helpers for code that is not shown in the notebooks
3 | """
4 |
5 | from sklearn import neighbors, datasets, linear_model
6 | import pylab as pl
7 | import numpy as np
8 | from matplotlib.colors import ListedColormap
9 |
10 | # Create color maps for 3-class classification problem, as with iris
11 | cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
12 | cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
13 |
14 | def plot_iris_knn():
15 | iris = datasets.load_iris()
16 | X = iris.data[:, :2] # we only take the first two features. We could
17 | # avoid this ugly slicing by using a two-dim dataset
18 | y = iris.target
19 |
20 | knn = neighbors.KNeighborsClassifier(n_neighbors=5)
21 | knn.fit(X, y)
22 |
23 | x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1
24 | y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1
25 | xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
26 | np.linspace(y_min, y_max, 100))
27 | Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
28 |
29 | # Put the result into a color plot
30 | Z = Z.reshape(xx.shape)
31 | pl.figure()
32 | pl.pcolormesh(xx, yy, Z, cmap=cmap_light)
33 |
34 | # Plot also the training points
35 | pl.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
36 | pl.xlabel('sepal length (cm)')
37 | pl.ylabel('sepal width (cm)')
38 | pl.axis('tight')
39 |
40 |
41 | def plot_polynomial_regression():
42 | rng = np.random.RandomState(0)
43 | x = 2*rng.rand(100) - 1
44 |
45 |     f = lambda t: 1.2 * t**2 + .1 * t**3 - .4 * t**5 - .5 * t**9
46 | y = f(x) + .4 * rng.normal(size=100)
47 |
48 | x_test = np.linspace(-1, 1, 100)
49 |
50 | pl.figure()
51 | pl.scatter(x, y, s=4)
52 |
53 | X = np.array([x**i for i in range(5)]).T
54 | X_test = np.array([x_test**i for i in range(5)]).T
55 | regr = linear_model.LinearRegression()
56 | regr.fit(X, y)
57 | pl.plot(x_test, regr.predict(X_test), label='4th order')
58 |
59 | X = np.array([x**i for i in range(10)]).T
60 | X_test = np.array([x_test**i for i in range(10)]).T
61 | regr = linear_model.LinearRegression()
62 | regr.fit(X, y)
63 | pl.plot(x_test, regr.predict(X_test), label='9th order')
64 |
65 | pl.legend(loc='best')
66 | pl.axis('tight')
67 | pl.title('Fitting a 4th and a 9th order polynomial')
68 |
69 | pl.figure()
70 | pl.scatter(x, y, s=4)
71 | pl.plot(x_test, f(x_test), label="truth")
72 | pl.axis('tight')
73 | pl.title('Ground truth (9th order polynomial)')
74 |
75 |
76 |
--------------------------------------------------------------------------------
/scikit-learn/fig_code/linear_regression.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.linear_model import LinearRegression
4 |
5 |
6 | def plot_linear_regression():
7 | a = 0.5
8 | b = 1.0
9 |
10 |     # x ranges from 0 to 30
11 | x = 30 * np.random.random(20)
12 |
13 | # y = a*x + b with noise
14 | y = a * x + b + np.random.normal(size=x.shape)
15 |
16 | # create a linear regression classifier
17 | clf = LinearRegression()
18 | clf.fit(x[:, None], y)
19 |
20 | # predict y from the data
21 | x_new = np.linspace(0, 30, 100)
22 | y_new = clf.predict(x_new[:, None])
23 |
24 | # plot the results
25 | ax = plt.axes()
26 | ax.scatter(x, y)
27 | ax.plot(x_new, y_new)
28 |
29 | ax.set_xlabel('x')
30 | ax.set_ylabel('y')
31 |
32 | ax.axis('tight')
33 |
34 |
35 | if __name__ == '__main__':
36 | plot_linear_regression()
37 | plt.show()
38 |
--------------------------------------------------------------------------------
/scikit-learn/fig_code/scikit-learn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": "",
4 | "signature": "sha256:29899a15bea89b9d8275879798b23011cecabc0eff03dd41bb606324221e0bc3"
5 | },
6 | "nbformat": 3,
7 | "nbformat_minor": 0,
8 | "worksheets": [
9 | {
10 | "cells": [
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "# scikit-learn"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "collapsed": false,
21 | "input": [
22 | "%matplotlib inline\n",
23 | "\n",
24 | "# set seaborn plot defaults.\n",
25 | "# This can be safely commented out\n",
26 | "import seaborn; seaborn.set()"
27 | ],
28 | "language": "python",
29 | "metadata": {},
30 | "outputs": [],
31 | "prompt_number": 3
32 | },
33 | {
34 | "cell_type": "code",
35 | "collapsed": false,
36 | "input": [
37 | "# Import the example plot from the figures directory\n",
38 | "from fig_code import plot_sgd_separator\n",
39 | "plot_sgd_separator()"
40 | ],
41 | "language": "python",
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "ename": "ImportError",
46 | "evalue": "No module named fig_code",
47 | "output_type": "pyerr",
48 | "traceback": [
49 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
50 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Import the example plot from the figures directory\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mfig_code\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mplot_sgd_separator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mplot_sgd_separator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
51 | "\u001b[0;31mImportError\u001b[0m: No module named fig_code"
52 | ]
53 | }
54 | ],
55 | "prompt_number": 4
56 | },
57 | {
58 | "cell_type": "code",
59 | "collapsed": false,
60 | "input": [],
61 | "language": "python",
62 | "metadata": {},
63 | "outputs": []
64 | },
65 | {
66 | "cell_type": "code",
67 | "collapsed": false,
68 | "input": [],
69 | "language": "python",
70 | "metadata": {},
71 | "outputs": []
72 | },
73 | {
74 | "cell_type": "code",
75 | "collapsed": false,
76 | "input": [],
77 | "language": "python",
78 | "metadata": {},
79 | "outputs": []
80 | },
81 | {
82 | "cell_type": "code",
83 | "collapsed": false,
84 | "input": [],
85 | "language": "python",
86 | "metadata": {},
87 | "outputs": []
88 | },
89 | {
90 | "cell_type": "code",
91 | "collapsed": false,
92 | "input": [],
93 | "language": "python",
94 | "metadata": {},
95 | "outputs": []
96 | }
97 | ],
98 | "metadata": {}
99 | }
100 | ]
101 | }
--------------------------------------------------------------------------------
/scikit-learn/fig_code/sgd_separator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.linear_model import SGDClassifier
4 | from sklearn.datasets.samples_generator import make_blobs
5 |
6 | def plot_sgd_separator():
7 | # we create 50 separable points
8 | X, Y = make_blobs(n_samples=50, centers=2,
9 | random_state=0, cluster_std=0.60)
10 |
11 | # fit the model
12 | clf = SGDClassifier(loss="hinge", alpha=0.01,
13 | n_iter=200, fit_intercept=True)
14 | clf.fit(X, Y)
15 |
16 | # plot the line, the points, and the nearest vectors to the plane
17 | xx = np.linspace(-1, 5, 10)
18 | yy = np.linspace(-1, 5, 10)
19 |
20 | X1, X2 = np.meshgrid(xx, yy)
21 | Z = np.empty(X1.shape)
22 | for (i, j), val in np.ndenumerate(X1):
23 | x1 = val
24 | x2 = X2[i, j]
25 | p = clf.decision_function([x1, x2])
26 | Z[i, j] = p[0]
27 | levels = [-1.0, 0.0, 1.0]
28 | linestyles = ['dashed', 'solid', 'dashed']
29 | colors = 'k'
30 |
31 | ax = plt.axes()
32 | ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
33 | ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)
34 |
35 | ax.axis('tight')
36 |
37 |
38 | if __name__ == '__main__':
39 | plot_sgd_separator()
40 | plt.show()
41 |
--------------------------------------------------------------------------------
/scikit-learn/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/scikit-learn/tests/__init__.py
--------------------------------------------------------------------------------
/scipy/2002FemPreg.dat.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/scipy/2002FemPreg.dat.gz
--------------------------------------------------------------------------------
/scipy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/scipy/__init__.py
--------------------------------------------------------------------------------
/scipy/first.py:
--------------------------------------------------------------------------------
1 | """This file contains code used in "Think Stats",
2 | by Allen B. Downey, available from greenteapress.com
3 |
4 | Copyright 2014 Allen B. Downey
5 | License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6 | """
7 |
8 | from __future__ import print_function
9 |
10 | import math
11 | import numpy as np
12 |
13 | import nsfg
14 | import thinkstats2
15 | import thinkplot
16 |
17 |
18 | def MakeFrames():
19 | """Reads pregnancy data and partitions first babies and others.
20 |
21 | returns: DataFrames (all live births, first babies, others)
22 | """
23 | preg = nsfg.ReadFemPreg()
24 |
25 | live = preg[preg.outcome == 1]
26 | firsts = live[live.birthord == 1]
27 | others = live[live.birthord != 1]
28 |
29 | assert len(live) == 9148
30 | assert len(firsts) == 4413
31 | assert len(others) == 4735
32 |
33 | return live, firsts, others
34 |
35 |
36 | def Summarize(live, firsts, others):
37 | """Print various summary statistics."""
38 |
39 | mean = live.prglngth.mean()
40 | var = live.prglngth.var()
41 | std = live.prglngth.std()
42 |
43 | print('Live mean', mean)
44 | print('Live variance', var)
45 | print('Live std', std)
46 |
47 | mean1 = firsts.prglngth.mean()
48 | mean2 = others.prglngth.mean()
49 |
50 | var1 = firsts.prglngth.var()
51 | var2 = others.prglngth.var()
52 |
53 | print('Mean')
54 | print('First babies', mean1)
55 | print('Others', mean2)
56 |
57 | print('Variance')
58 | print('First babies', var1)
59 | print('Others', var2)
60 |
61 | print('Difference in weeks', mean1 - mean2)
62 | print('Difference in hours', (mean1 - mean2) * 7 * 24)
63 |
64 | print('Difference relative to 39 weeks', (mean1 - mean2) / 39 * 100)
65 |
66 | d = thinkstats2.CohenEffectSize(firsts.prglngth, others.prglngth)
67 | print('Cohen d', d)
68 |
69 |
70 | def PrintExtremes(live):
71 | """Plots the histogram of pregnancy lengths and prints the extremes.
72 |
73 | live: DataFrame of live births
74 | """
75 | hist = thinkstats2.Hist(live.prglngth)
76 | thinkplot.Hist(hist, label='live births')
77 |
78 | thinkplot.Save(root='first_nsfg_hist_live',
79 | title='Histogram',
80 | xlabel='weeks',
81 | ylabel='frequency')
82 |
83 | print('Shortest lengths:')
84 | for weeks, freq in hist.Smallest(10):
85 | print(weeks, freq)
86 |
87 | print('Longest lengths:')
88 | for weeks, freq in hist.Largest(10):
89 | print(weeks, freq)
90 |
91 |
92 | def MakeHists(live):
93 | """Plot Hists for live births
94 |
95 | live: DataFrame
96 | others: DataFrame
97 | """
98 | hist = thinkstats2.Hist(live.birthwgt_lb, label='birthwgt_lb')
99 | thinkplot.Hist(hist)
100 | thinkplot.Save(root='first_wgt_lb_hist',
101 | xlabel='pounds',
102 | ylabel='frequency',
103 | axis=[-1, 14, 0, 3200])
104 |
105 | hist = thinkstats2.Hist(live.birthwgt_oz, label='birthwgt_oz')
106 | thinkplot.Hist(hist)
107 | thinkplot.Save(root='first_wgt_oz_hist',
108 | xlabel='ounces',
109 | ylabel='frequency',
110 | axis=[-1, 16, 0, 1200])
111 |
112 | hist = thinkstats2.Hist(np.floor(live.agepreg), label='agepreg')
113 | thinkplot.Hist(hist)
114 | thinkplot.Save(root='first_agepreg_hist',
115 | xlabel='years',
116 | ylabel='frequency')
117 |
118 | hist = thinkstats2.Hist(live.prglngth, label='prglngth')
119 | thinkplot.Hist(hist)
120 | thinkplot.Save(root='first_prglngth_hist',
121 | xlabel='weeks',
122 | ylabel='frequency',
123 | axis=[-1, 53, 0, 5000])
124 |
125 |
126 | def MakeComparison(firsts, others):
127 | """Plots histograms of pregnancy length for first babies and others.
128 |
129 | firsts: DataFrame
130 | others: DataFrame
131 | """
132 | first_hist = thinkstats2.Hist(firsts.prglngth, label='first')
133 | other_hist = thinkstats2.Hist(others.prglngth, label='other')
134 |
135 | width = 0.45
136 | thinkplot.PrePlot(2)
137 | thinkplot.Hist(first_hist, align='right', width=width)
138 | thinkplot.Hist(other_hist, align='left', width=width)
139 |
140 | thinkplot.Save(root='first_nsfg_hist',
141 | title='Histogram',
142 | xlabel='weeks',
143 | ylabel='frequency',
144 | axis=[27, 46, 0, 2700])
145 |
146 |
147 | def main(script):
148 | live, firsts, others = MakeFrames()
149 |
150 | MakeHists(live)
151 | PrintExtremes(live)
152 | MakeComparison(firsts, others)
153 | Summarize(live, firsts, others)
154 |
155 |
156 | if __name__ == '__main__':
157 | import sys
158 | main(*sys.argv)
159 |
160 |
161 |
--------------------------------------------------------------------------------
/scipy/nsfg.py:
--------------------------------------------------------------------------------
1 | """This file contains code for use with "Think Stats",
2 | by Allen B. Downey, available from greenteapress.com
3 |
4 | Copyright 2010 Allen B. Downey
5 | License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6 | """
7 |
8 | from __future__ import print_function
9 |
10 | from collections import defaultdict
11 | import numpy as np
12 | import sys
13 |
14 | import thinkstats2
15 |
16 |
17 | def ReadFemPreg(dct_file='2002FemPreg.dct',
18 | dat_file='2002FemPreg.dat.gz'):
19 | """Reads the NSFG pregnancy data.
20 |
21 | dct_file: string file name
22 | dat_file: string file name
23 |
24 | returns: DataFrame
25 | """
26 | dct = thinkstats2.ReadStataDct(dct_file)
27 | df = dct.ReadFixedWidth(dat_file, compression='gzip')
28 | CleanFemPreg(df)
29 | return df
30 |
31 |
32 | def CleanFemPreg(df):
33 | """Recodes variables from the pregnancy frame.
34 |
35 | df: DataFrame
36 | """
37 | # mother's age is encoded in centiyears; convert to years
38 | df.agepreg /= 100.0
39 |
40 | # birthwgt_lb contains at least one bogus value (51 lbs)
41 | # replace with NaN
42 | df.birthwgt_lb[df.birthwgt_lb > 20] = np.nan
43 |
44 | # replace 'not ascertained', 'refused', 'don't know' with NaN
45 | na_vals = [97, 98, 99]
46 | df.birthwgt_lb.replace(na_vals, np.nan, inplace=True)
47 | df.birthwgt_oz.replace(na_vals, np.nan, inplace=True)
48 | df.hpagelb.replace(na_vals, np.nan, inplace=True)
49 |
50 | df.babysex.replace([7, 9], np.nan, inplace=True)
51 | df.nbrnaliv.replace([9], np.nan, inplace=True)
52 |
53 | # birthweight is stored in two columns, lbs and oz.
54 | # convert to a single column in lb
55 | # NOTE: creating a new column requires dictionary syntax,
56 | # not attribute assignment (like df.totalwgt_lb)
57 | df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0
58 |
59 | # due to a bug in ReadStataDct, the last variable gets clipped;
60 | # so for now set it to NaN
61 | df.cmintvw = np.nan
62 |
63 |
64 | def MakePregMap(df):
65 | """Make a map from caseid to list of preg indices.
66 |
67 | df: DataFrame
68 |
69 | returns: dict that maps from caseid to list of indices into preg df
70 | """
71 | d = defaultdict(list)
72 | for index, caseid in df.caseid.iteritems():
73 | d[caseid].append(index)
74 | return d
75 |
76 |
77 | def main(script):
78 | """Tests the functions in this module.
79 |
80 | script: string script name
81 | """
82 | df = ReadFemPreg()
83 | print(df.shape)
84 |
85 | assert len(df) == 13593
86 |
87 | assert df.caseid[13592] == 12571
88 | assert df.pregordr.value_counts()[1] == 5033
89 | assert df.nbrnaliv.value_counts()[1] == 8981
90 | assert df.babysex.value_counts()[1] == 4641
91 | assert df.birthwgt_lb.value_counts()[7] == 3049
92 | assert df.birthwgt_oz.value_counts()[0] == 1037
93 | assert df.prglngth.value_counts()[39] == 4744
94 | assert df.outcome.value_counts()[1] == 9148
95 | assert df.birthord.value_counts()[1] == 4413
96 | assert df.agepreg.value_counts()[22.75] == 100
97 | assert df.totalwgt_lb.value_counts()[7.5] == 302
98 |
99 | weights = df.finalwgt.value_counts()
100 | key = max(weights.keys())
101 | assert df.finalwgt.value_counts()[key] == 6
102 |
103 | print('%s: All tests passed.' % script)
104 |
105 | if __name__ == '__main__':
106 | main(*sys.argv)
107 |
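A hedged usage sketch of `MakePregMap` (assumes the NSFG data files sit in the working directory; the caseid below is illustrative, and a missing key simply yields an empty list because the map is a defaultdict):

```python
import nsfg

df = nsfg.ReadFemPreg()
preg_map = nsfg.MakePregMap(df)

# All row indices for one respondent; the caseid is illustrative and a
# missing key yields [] because preg_map is a defaultdict(list).
print(preg_map[10229])
```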
--------------------------------------------------------------------------------
/scipy/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/scipy/tests/__init__.py
--------------------------------------------------------------------------------
/spark/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/spark/__init__.py
--------------------------------------------------------------------------------
/spark/hdfs.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)."
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# HDFS"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "Run an HDFS command:"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {
28 | "collapsed": false
29 | },
30 | "outputs": [],
31 | "source": [
32 | "!hdfs"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "Run a file system command on the file systems (FsShell):"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {
46 | "collapsed": false
47 | },
48 | "outputs": [],
49 | "source": [
50 | "!hdfs dfs"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "List the user's home directory:"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": null,
63 | "metadata": {
64 | "collapsed": false
65 | },
66 | "outputs": [],
67 | "source": [
68 | "!hdfs dfs -ls"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "List the HDFS root directory:"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {
82 | "collapsed": false
83 | },
84 | "outputs": [],
85 | "source": [
86 | "!hdfs dfs -ls /"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "Copy a local file to the user's directory on HDFS:"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {
100 | "collapsed": false
101 | },
102 | "outputs": [],
103 | "source": [
104 | "!hdfs dfs -put file.txt file.txt"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "Display the contents of the specified HDFS file:"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {
118 | "collapsed": false
119 | },
120 | "outputs": [],
121 | "source": [
122 | "!hdfs dfs -cat file.txt"
123 | ]
124 | },
125 | {
126 | "cell_type": "markdown",
127 | "metadata": {},
128 | "source": [
129 | "Print the last 10 lines of the file to the terminal:"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {
136 | "collapsed": false
137 | },
138 | "outputs": [],
139 | "source": [
140 | "!hdfs dfs -cat file.txt | tail -n 10"
141 | ]
142 | },
143 | {
144 | "cell_type": "markdown",
145 | "metadata": {},
146 | "source": [
147 | "View a directory and all of its files:"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "metadata": {
154 | "collapsed": false
155 | },
156 | "outputs": [],
157 | "source": [
158 | "!hdfs dfs -cat dir/* | less"
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {},
164 | "source": [
165 | "Copy an HDFS file to local:"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": null,
171 | "metadata": {
172 | "collapsed": false
173 | },
174 | "outputs": [],
175 | "source": [
176 | "!hdfs dfs -get file.txt file.txt"
177 | ]
178 | },
179 | {
180 | "cell_type": "markdown",
181 | "metadata": {},
182 | "source": [
183 | "Create a directory on HDFS:"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {
190 | "collapsed": false
191 | },
192 | "outputs": [],
193 | "source": [
194 | "!hdfs dfs -mkdir dir"
195 | ]
196 | },
197 | {
198 | "cell_type": "markdown",
199 | "metadata": {},
200 | "source": [
201 | "Recursively delete the specified directory and all of its contents:"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "metadata": {
208 | "collapsed": false
209 | },
210 | "outputs": [],
211 | "source": [
212 | "!hdfs dfs -rm -r dir"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {},
218 | "source": [
219 | "Specify HDFS file in Spark (paths are relative to the user's home HDFS directory):"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "metadata": {
226 | "collapsed": false
227 | },
228 | "outputs": [],
229 | "source": [
230 | "data = sc.textFile (\"hdfs://hdfs-host:port/path/file.txt\")"
231 | ]
232 | }
233 | ],
234 | "metadata": {
235 | "kernelspec": {
236 | "display_name": "Python 2",
237 | "language": "python",
238 | "name": "python2"
239 | },
240 | "language_info": {
241 | "codemirror_mode": {
242 | "name": "ipython",
243 | "version": 2
244 | },
245 | "file_extension": ".py",
246 | "mimetype": "text/x-python",
247 | "name": "python",
248 | "nbconvert_exporter": "python",
249 | "pygments_lexer": "ipython2",
250 | "version": "2.7.10"
251 | }
252 | },
253 | "nbformat": 4,
254 | "nbformat_minor": 0
255 | }
256 |
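As a hedged follow-on in the same PySpark session (assumes `sc` is a live SparkContext and the HDFS path exists), the returned RDD can be inspected immediately:

```python
# Continuing from the cell above: data is an RDD of lines
data = sc.textFile("hdfs://hdfs-host:port/path/file.txt")

print(data.count())  # number of lines in the file
print(data.first())  # first line

# Keep only lines containing ERROR, then pull back a small sample
errors = data.filter(lambda line: 'ERROR' in line)
print(errors.take(5))
```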
--------------------------------------------------------------------------------