├── __init__.py ├── aws └── __init__.py ├── kaggle └── __init__.py ├── numpy ├── __init__.py └── tests │ └── __init__.py ├── pandas ├── __init__.py └── tests │ └── __init__.py ├── scipy ├── __init__.py ├── tests │ └── __init__.py ├── 2002FemPreg.dat.gz ├── nsfg.py └── first.py ├── spark ├── __init__.py └── hdfs.ipynb ├── analyses ├── __init__.py └── churn_measurements.py ├── commands ├── __init__.py └── styles │ └── custom.css ├── mapreduce ├── __init__.py ├── test_mr_s3_log_parser.py └── mr_s3_log_parser.py ├── matplotlib ├── __init__.py └── tests │ └── __init__.py ├── python-data ├── __init__.py ├── tests │ ├── __init__.py │ ├── test_type_util.py │ └── test_transform_util.py ├── hello_world.txt ├── transform_util.py ├── type_util.py ├── pdb.ipynb ├── files.ipynb ├── logs.ipynb ├── unit_tests.ipynb └── datetime.ipynb ├── scikit-learn ├── __init__.py ├── tests │ └── __init__.py └── fig_code │ ├── __init__.py │ ├── linear_regression.py │ ├── data.py │ ├── sgd_separator.py │ ├── helpers.py │ ├── scikit-learn.ipynb │ ├── ML_flow_chart.py │ └── figures.py ├── .gitattributes ├── images ├── aws.png ├── cover.png ├── deep.png ├── mrjob.png ├── numpy.png ├── scipy.png ├── spark.png ├── svm.gif ├── k-means.gif ├── kaggle.png ├── pandas.png ├── python.png ├── regex-1.png ├── regex-2.png ├── theano.png ├── README.sketch ├── commands.png ├── coversmall.png ├── matplotlib.png ├── tensorflow.png ├── scikitlearn.png ├── README_1200x800.gif └── coversmall_alt.png ├── data ├── confusion_matrix.png ├── ozone.csv ├── titanic │ ├── gendermodel.py │ ├── myfirstforest.py │ ├── gendermodel.csv │ ├── genderclassmodel.csv │ ├── results-rf.csv │ └── genderclassmodel.py └── ozone_copy.csv ├── deep-learning ├── deep-dream │ ├── flowers.jpg │ └── sky1024px.jpg ├── theano-tutorial │ ├── rnn_tutorial │ │ ├── rnn_lstm.pdf │ │ ├── instruction.pdf │ │ ├── Makefile │ │ ├── synthetic.py │ │ ├── rnn_precompile.py │ │ └── lstm_text.py │ ├── intro_theano │ │ ├── intro_theano.pdf │ │ ├── Makefile │ │ └── utils.py │ └── scan_tutorial │ │ ├── scan_ex1_solution.py │ │ └── scan_ex2_solution.py ├── tensor-flow-exercises │ ├── Dockerfile │ ├── README.md │ └── 3_regularization.ipynb └── tensor-flow-examples │ ├── multigpu_basics.py │ ├── notebooks │ ├── 4_multi_gpu │ │ └── multigpu_basics.ipynb │ ├── 1_intro │ │ └── basic_operations.ipynb │ ├── 2_basic_classifiers │ │ └── logistic_regression.ipynb │ └── 3_neural_networks │ │ └── multilayer_perceptron.ipynb │ └── input_data.py ├── requirements.txt ├── LICENSE ├── .gitignore └── misc └── regex.ipynb /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aws/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kaggle/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /numpy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pandas/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scipy/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /analyses/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mapreduce/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matplotlib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /numpy/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python-data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scipy/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matplotlib/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pandas/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scikit-learn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python-data/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scikit-learn/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python-data/hello_world.txt: -------------------------------------------------------------------------------- 1 | hello world! 
-------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-language=Python 2 | -------------------------------------------------------------------------------- /images/aws.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/aws.png -------------------------------------------------------------------------------- /images/cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/cover.png -------------------------------------------------------------------------------- /images/deep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/deep.png -------------------------------------------------------------------------------- /images/mrjob.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/mrjob.png -------------------------------------------------------------------------------- /images/numpy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/numpy.png -------------------------------------------------------------------------------- /images/scipy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/scipy.png -------------------------------------------------------------------------------- /images/spark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/spark.png -------------------------------------------------------------------------------- /images/svm.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/svm.gif -------------------------------------------------------------------------------- /images/k-means.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/k-means.gif -------------------------------------------------------------------------------- /images/kaggle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/kaggle.png -------------------------------------------------------------------------------- /images/pandas.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/pandas.png -------------------------------------------------------------------------------- /images/python.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/python.png 
-------------------------------------------------------------------------------- /images/regex-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/regex-1.png -------------------------------------------------------------------------------- /images/regex-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/regex-2.png -------------------------------------------------------------------------------- /images/theano.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/theano.png -------------------------------------------------------------------------------- /images/README.sketch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/README.sketch -------------------------------------------------------------------------------- /images/commands.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/commands.png -------------------------------------------------------------------------------- /images/coversmall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/coversmall.png -------------------------------------------------------------------------------- /images/matplotlib.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/matplotlib.png -------------------------------------------------------------------------------- /images/tensorflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/tensorflow.png -------------------------------------------------------------------------------- /images/scikitlearn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/scikitlearn.png -------------------------------------------------------------------------------- /data/confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/data/confusion_matrix.png -------------------------------------------------------------------------------- /images/README_1200x800.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/README_1200x800.gif -------------------------------------------------------------------------------- /images/coversmall_alt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/coversmall_alt.png -------------------------------------------------------------------------------- /scipy/2002FemPreg.dat.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/scipy/2002FemPreg.dat.gz -------------------------------------------------------------------------------- /deep-learning/deep-dream/flowers.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/deep-learning/deep-dream/flowers.jpg -------------------------------------------------------------------------------- /deep-learning/deep-dream/sky1024px.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/deep-learning/deep-dream/sky1024px.jpg -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/rnn_tutorial/rnn_lstm.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/deep-learning/theano-tutorial/rnn_tutorial/rnn_lstm.pdf -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/rnn_tutorial/instruction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/deep-learning/theano-tutorial/rnn_tutorial/instruction.pdf -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/intro_theano/intro_theano.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/deep-learning/theano-tutorial/intro_theano/intro_theano.pdf -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/intro_theano/Makefile: -------------------------------------------------------------------------------- 1 | intro_theano.pdf: slides_source/intro_theano.tex 2 | cd slides_source; pdflatex --shell-escape intro_theano.tex 3 | mv slides_source/intro_theano.pdf . 4 | -------------------------------------------------------------------------------- /scikit-learn/fig_code/__init__.py: -------------------------------------------------------------------------------- 1 | from .data import * 2 | from .figures import * 3 | 4 | from .sgd_separator import plot_sgd_separator 5 | from .linear_regression import plot_linear_regression 6 | from .helpers import plot_iris_knn 7 | -------------------------------------------------------------------------------- /deep-learning/tensor-flow-exercises/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM b.gcr.io/tensorflow/tensorflow:latest 2 | MAINTAINER Vincent Vanhoucke 3 | RUN pip install scikit-learn 4 | ADD *.ipynb /notebooks/ 5 | WORKDIR /notebooks 6 | CMD ["/run_jupyter.sh"] 7 | -------------------------------------------------------------------------------- /deep-learning/tensor-flow-exercises/README.md: -------------------------------------------------------------------------------- 1 | Exercises 2 | =========================================================== 3 | 4 | Building the Docker container 5 | ----------------------------- 6 | 7 | docker build -t $USER/exercises . 
8 | 9 | Running the container 10 | --------------------- 11 | 12 | docker run -p 8888:8888 -it --rm $USER/exercises 13 | -------------------------------------------------------------------------------- /python-data/tests/test_type_util.py: -------------------------------------------------------------------------------- 1 | from nose.tools import assert_equal 2 | from ..type_util import TypeUtil 3 | 4 | 5 | class TestUtil(): 6 | 7 | def test_is_iterable(self): 8 | assert_equal(TypeUtil.is_iterable('foo'), True) 9 | assert_equal(TypeUtil.is_iterable(7), False) 10 | 11 | def test_convert_to_list(self): 12 | assert_equal(isinstance(TypeUtil.convert_to_list('foo'), list), True) 13 | assert_equal(isinstance(TypeUtil.convert_to_list(7), list), False) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | backports.ssl-match-hostname==3.4.0.2 2 | certifi==2015.4.28 3 | functools32==3.2.3.post1 4 | gnureadline==6.3.3 5 | ipython==3.2.0 6 | Jinja2==2.7.3 7 | jsonschema==2.5.1 8 | MarkupSafe==0.23 9 | matplotlib==1.4.3 10 | mistune==0.6 11 | mock==1.0.1 12 | nose==1.3.7 13 | numpy==1.9.2 14 | pandas==0.16.2 15 | ptyprocess==0.5 16 | Pygments==2.0.2 17 | pyparsing==2.0.3 18 | python-dateutil==2.4.2 19 | pytz==2015.4 20 | pyzmq==14.7.0 21 | scikit-learn==0.16.1 22 | scipy==0.15.1 23 | seaborn==0.6.0 24 | six==1.9.0 25 | sympy==0.7.6 26 | terminado==0.5 27 | tornado==4.2 28 | wheel==0.24.0 29 | -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/rnn_tutorial/Makefile: -------------------------------------------------------------------------------- 1 | all: instruction.pdf rnn_lstm.pdf 2 | 3 | instruction.pdf: slides_source/instruction.tex 4 | cd slides_source; pdflatex --shell-escape instruction.tex 5 | cd slides_source; pdflatex --shell-escape instruction.tex 6 | cd slides_source; pdflatex --shell-escape instruction.tex 7 | mv slides_source/instruction.pdf . 8 | 9 | rnn_lstm.pdf: slides_source/rnn_lstm.tex 10 | cd slides_source; pdflatex --shell-escape rnn_lstm.tex 11 | cd slides_source; pdflatex --shell-escape rnn_lstm.tex 12 | cd slides_source; pdflatex --shell-escape rnn_lstm.tex 13 | mv slides_source/rnn_lstm.pdf . 14 | -------------------------------------------------------------------------------- /python-data/transform_util.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | class TransformUtil: 5 | 6 | @classmethod 7 | def remove_punctuation(cls, value): 8 | """Removes !, #, and ?. 9 | """ 10 | return re.sub('[!#?]', '', value) 11 | 12 | @classmethod 13 | def clean_strings(cls, strings, ops): 14 | """General purpose method to clean strings. 15 | 16 | Pass in a sequence of strings and the operations to perform. 17 | """ 18 | result = [] 19 | for value in strings: 20 | for function in ops: 21 | value = function(value) 22 | result.append(value) 23 | return result -------------------------------------------------------------------------------- /python-data/type_util.py: -------------------------------------------------------------------------------- 1 | class TypeUtil: 2 | 3 | @classmethod 4 | def is_iterable(cls, obj): 5 | """Determines if obj is iterable. 6 | 7 | Useful when writing functions that can accept multiple types of 8 | input (list, tuple, ndarray, iterator). Pairs well with 9 | convert_to_list. 
10 | """ 11 | try: 12 | iter(obj) 13 | return True 14 | except TypeError: 15 | return False 16 | 17 | @classmethod 18 | def convert_to_list(cls, obj): 19 | """Converts obj to a list if it is not a list and it is iterable, 20 | else returns the original obj. 21 | """ 22 | if not isinstance(obj, list) and cls.is_iterable(obj): 23 | obj = list(obj) 24 | return obj -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This repository contains a variety of content; some developed by Donne Martin, and some from third-parties. The third-party content is distributed under the license provided by those parties. 2 | 3 | The content developed by Donne Martin is distributed under the following license: 4 | 5 | Copyright 2015 Donne Martin 6 | 7 | Licensed under the Apache License, Version 2.0 (the "License"); 8 | you may not use this file except in compliance with the License. 9 | You may obtain a copy of the License at 10 | 11 | http://www.apache.org/licenses/LICENSE-2.0 12 | 13 | Unless required by applicable law or agreed to in writing, software 14 | distributed under the License is distributed on an "AS IS" BASIS, 15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | See the License for the specific language governing permissions and 17 | limitations under the License. -------------------------------------------------------------------------------- /scikit-learn/fig_code/linear_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.linear_model import LinearRegression 4 | 5 | 6 | def plot_linear_regression(): 7 | a = 0.5 8 | b = 1.0 9 | 10 | # x from 0 to 10 11 | x = 30 * np.random.random(20) 12 | 13 | # y = a*x + b with noise 14 | y = a * x + b + np.random.normal(size=x.shape) 15 | 16 | # create a linear regression classifier 17 | clf = LinearRegression() 18 | clf.fit(x[:, None], y) 19 | 20 | # predict y from the data 21 | x_new = np.linspace(0, 30, 100) 22 | y_new = clf.predict(x_new[:, None]) 23 | 24 | # plot the results 25 | ax = plt.axes() 26 | ax.scatter(x, y) 27 | ax.plot(x_new, y_new) 28 | 29 | ax.set_xlabel('x') 30 | ax.set_ylabel('y') 31 | 32 | ax.axis('tight') 33 | 34 | 35 | if __name__ == '__main__': 36 | plot_linear_regression() 37 | plt.show() 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | 56 | # IPython notebook 57 | .ipynb_checkpoints 58 | 59 | # Repo scratch directory 60 | scratch/ -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/scan_tutorial/scan_ex1_solution.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | import numpy as np 4 | 5 | coefficients = T.vector("coefficients") 6 | x = T.scalar("x") 7 | max_coefficients_supported = 10000 8 | 9 | 10 | def step(coeff, power, prior_value, free_var): 11 | return prior_value + (coeff * (free_var ** power)) 12 | 13 | # Generate the components of the polynomial 14 | full_range = T.arange(max_coefficients_supported) 15 | outputs_info = np.zeros((), dtype=theano.config.floatX) 16 | 17 | components, updates = theano.scan(fn=step, 18 | sequences=[coefficients, full_range], 19 | outputs_info=outputs_info, 20 | non_sequences=x) 21 | 22 | polynomial = components[-1] 23 | calculate_polynomial = theano.function(inputs=[coefficients, x], 24 | outputs=polynomial, 25 | updates=updates) 26 | 27 | test_coeff = np.asarray([1, 0, 2], dtype=theano.config.floatX) 28 | print(calculate_polynomial(test_coeff, 3)) 29 | -------------------------------------------------------------------------------- /python-data/tests/test_transform_util.py: -------------------------------------------------------------------------------- 1 | from nose.tools import assert_equal 2 | from ..transform_util import TransformUtil 3 | 4 | 5 | class TestTransformUtil(): 6 | 7 | states = [' Alabama ', 'Georgia!', 'Georgia', 'georgia', \ 8 | 'FlOrIda', 'south carolina##', 'West virginia?'] 9 | 10 | expected_output = ['Alabama', 11 | 'Georgia', 12 | 'Georgia', 13 | 'Georgia', 14 | 'Florida', 15 | 'South Carolina', 16 | 'West Virginia'] 17 | 18 | def test_remove_punctuation(self): 19 | assert_equal(TransformUtil.remove_punctuation('!#?'), '') 20 | 21 | def test_map_remove_punctuation(self): 22 | # Map applies a function to a collection 23 | output = map(TransformUtil.remove_punctuation, self.states) 24 | assert_equal('!#?' 
not in output, True) 25 | 26 | def test_clean_strings(self): 27 | clean_ops = [str.strip, TransformUtil.remove_punctuation, str.title] 28 | output = TransformUtil.clean_strings(self.states, clean_ops) 29 | assert_equal(output, self.expected_output) -------------------------------------------------------------------------------- /scikit-learn/fig_code/data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def linear_data_sample(N=40, rseed=0, m=3, b=-2): 5 | rng = np.random.RandomState(rseed) 6 | 7 | x = 10 * rng.rand(N) 8 | dy = m / 2 * (1 + rng.rand(N)) 9 | y = m * x + b + dy * rng.randn(N) 10 | 11 | return (x, y, dy) 12 | 13 | 14 | def linear_data_sample_big_errs(N=40, rseed=0, m=3, b=-2): 15 | rng = np.random.RandomState(rseed) 16 | 17 | x = 10 * rng.rand(N) 18 | dy = m / 2 * (1 + rng.rand(N)) 19 | dy[20:25] *= 10 20 | y = m * x + b + dy * rng.randn(N) 21 | 22 | return (x, y, dy) 23 | 24 | 25 | def sample_light_curve(phased=True): 26 | from astroML.datasets import fetch_LINEAR_sample 27 | data = fetch_LINEAR_sample() 28 | t, y, dy = data[18525697].T 29 | 30 | if phased: 31 | P_best = 0.580313015651 32 | t /= P_best 33 | 34 | return (t, y, dy) 35 | 36 | 37 | def sample_light_curve_2(phased=True): 38 | from astroML.datasets import fetch_LINEAR_sample 39 | data = fetch_LINEAR_sample() 40 | t, y, dy = data[10022663].T 41 | 42 | if phased: 43 | P_best = 0.61596079804 44 | t /= P_best 45 | 46 | return (t, y, dy) 47 | 48 | -------------------------------------------------------------------------------- /scikit-learn/fig_code/sgd_separator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.linear_model import SGDClassifier 4 | from sklearn.datasets.samples_generator import make_blobs 5 | 6 | def plot_sgd_separator(): 7 | # we create 50 separable points 8 | X, Y = make_blobs(n_samples=50, centers=2, 9 | random_state=0, cluster_std=0.60) 10 | 11 | # fit the model 12 | clf = SGDClassifier(loss="hinge", alpha=0.01, 13 | n_iter=200, fit_intercept=True) 14 | clf.fit(X, Y) 15 | 16 | # plot the line, the points, and the nearest vectors to the plane 17 | xx = np.linspace(-1, 5, 10) 18 | yy = np.linspace(-1, 5, 10) 19 | 20 | X1, X2 = np.meshgrid(xx, yy) 21 | Z = np.empty(X1.shape) 22 | for (i, j), val in np.ndenumerate(X1): 23 | x1 = val 24 | x2 = X2[i, j] 25 | p = clf.decision_function([x1, x2]) 26 | Z[i, j] = p[0] 27 | levels = [-1.0, 0.0, 1.0] 28 | linestyles = ['dashed', 'solid', 'dashed'] 29 | colors = 'k' 30 | 31 | ax = plt.axes() 32 | ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles) 33 | ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired) 34 | 35 | ax.axis('tight') 36 | 37 | 38 | if __name__ == '__main__': 39 | plot_sgd_separator() 40 | plt.show() 41 | -------------------------------------------------------------------------------- /misc/regex.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Regex\n", 8 | "\n", 9 | "Credits: Material from [Regular Expressions Cheat Sheet](http://www.cheatography.com/davechild/cheat-sheets/regular-expressions/) by Dave Child\n", 10 | "\n", 11 | "Use with http://www.regexr.com to generate regular expressions." 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "
(embedded regex cheat sheet image markup was stripped from this cell)
" 23 | ] 24 | } 25 | ], 26 | "metadata": { 27 | "kernelspec": { 28 | "display_name": "Python 2", 29 | "language": "python", 30 | "name": "python2" 31 | }, 32 | "language_info": { 33 | "codemirror_mode": { 34 | "name": "ipython", 35 | "version": 2 36 | }, 37 | "file_extension": ".py", 38 | "mimetype": "text/x-python", 39 | "name": "python", 40 | "nbconvert_exporter": "python", 41 | "pygments_lexer": "ipython2", 42 | "version": "2.7.10" 43 | } 44 | }, 45 | "nbformat": 4, 46 | "nbformat_minor": 0 47 | } 48 | -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/scan_tutorial/scan_ex2_solution.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | import numpy as np 4 | 5 | probabilities = T.vector() 6 | nb_samples = T.iscalar() 7 | 8 | rng = T.shared_randomstreams.RandomStreams(1234) 9 | 10 | 11 | def sample_from_pvect(pvect): 12 | """ Provided utility function: given a symbolic vector of 13 | probabilities (which MUST sum to 1), sample one element 14 | and return its index. 15 | """ 16 | onehot_sample = rng.multinomial(n=1, pvals=pvect) 17 | sample = onehot_sample.argmax() 18 | return sample 19 | 20 | 21 | def set_p_to_zero(pvect, i): 22 | """ Provided utility function: given a symbolic vector of 23 | probabilities and an index 'i', set the probability of the 24 | i-th element to 0 and renormalize the probabilities so they 25 | sum to 1. 26 | """ 27 | new_pvect = T.set_subtensor(pvect[i], 0.) 28 | new_pvect = new_pvect / new_pvect.sum() 29 | return new_pvect 30 | 31 | 32 | def step(p): 33 | sample = sample_from_pvect(p) 34 | new_p = set_p_to_zero(p, sample) 35 | return new_p, sample 36 | 37 | output, updates = theano.scan(fn=step, 38 | outputs_info=[probabilities, None], 39 | n_steps=nb_samples) 40 | 41 | modified_probabilities, samples = output 42 | 43 | f = theano.function(inputs=[probabilities, nb_samples], 44 | outputs=[samples], 45 | updates=updates) 46 | 47 | # Testing the function 48 | test_probs = np.asarray([0.6, 0.3, 0.1], dtype=theano.config.floatX) 49 | for i in range(10): 50 | print(f(test_probs, 2)) 51 | -------------------------------------------------------------------------------- /commands/styles/custom.css: -------------------------------------------------------------------------------- 1 | 46 | -------------------------------------------------------------------------------- /deep-learning/tensor-flow-examples/multigpu_basics.py: -------------------------------------------------------------------------------- 1 | #Multi GPU Basic example 2 | ''' 3 | This tutorial requires your machine to have 2 GPUs 4 | "/cpu:0": The CPU of your machine. 
5 | "/gpu:0": The first GPU of your machine 6 | "/gpu:1": The second GPU of your machine 7 | ''' 8 | 9 | import numpy as np 10 | import tensorflow as tf 11 | import datetime 12 | 13 | #Processing Units logs 14 | log_device_placement = True 15 | 16 | #num of multiplications to perform 17 | n = 10 18 | 19 | ''' 20 | Example: compute A^n + B^n on 2 GPUs 21 | Results on 8 cores with 2 GTX-980: 22 | * Single GPU computation time: 0:00:11.277449 23 | * Multi GPU computation time: 0:00:07.131701 24 | ''' 25 | #Create random large matrix 26 | A = np.random.rand(1e4, 1e4).astype('float32') 27 | B = np.random.rand(1e4, 1e4).astype('float32') 28 | 29 | # Creates a graph to store results 30 | c1 = [] 31 | c2 = [] 32 | 33 | def matpow(M, n): 34 | if n < 1: #Abstract cases where n < 1 35 | return M 36 | else: 37 | return tf.matmul(M, matpow(M, n-1)) 38 | 39 | ''' 40 | Single GPU computing 41 | ''' 42 | with tf.device('/gpu:0'): 43 | a = tf.constant(A) 44 | b = tf.constant(B) 45 | #compute A^n and B^n and store results in c1 46 | c1.append(matpow(a, n)) 47 | c1.append(matpow(b, n)) 48 | 49 | with tf.device('/cpu:0'): 50 | sum = tf.add_n(c1) #Addition of all elements in c1, i.e. A^n + B^n 51 | 52 | t1_1 = datetime.datetime.now() 53 | with tf.Session(config=tf.ConfigProto(log_device_placement=log_device_placement)) as sess: 54 | # Runs the op. 55 | sess.run(sum) 56 | t2_1 = datetime.datetime.now() 57 | 58 | 59 | ''' 60 | Multi GPU computing 61 | ''' 62 | #GPU:0 computes A^n 63 | with tf.device('/gpu:0'): 64 | #compute A^n and store result in c2 65 | a = tf.constant(A) 66 | c2.append(matpow(a, n)) 67 | 68 | #GPU:1 computes B^n 69 | with tf.device('/gpu:1'): 70 | #compute B^n and store result in c2 71 | b = tf.constant(B) 72 | c2.append(matpow(b, n)) 73 | 74 | with tf.device('/cpu:0'): 75 | sum = tf.add_n(c2) #Addition of all elements in c2, i.e. A^n + B^n 76 | 77 | t1_2 = datetime.datetime.now() 78 | with tf.Session(config=tf.ConfigProto(log_device_placement=log_device_placement)) as sess: 79 | # Runs the op. 80 | sess.run(sum) 81 | t2_2 = datetime.datetime.now() 82 | 83 | 84 | print "Single GPU computation time: " + str(t2_1-t1_1) 85 | print "Multi GPU computation time: " + str(t2_2-t1_2) -------------------------------------------------------------------------------- /scikit-learn/fig_code/helpers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Small helpers for code that is not shown in the notebooks 3 | """ 4 | 5 | from sklearn import neighbors, datasets, linear_model 6 | import pylab as pl 7 | import numpy as np 8 | from matplotlib.colors import ListedColormap 9 | 10 | # Create color maps for 3-class classification problem, as with iris 11 | cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF']) 12 | cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF']) 13 | 14 | def plot_iris_knn(): 15 | iris = datasets.load_iris() 16 | X = iris.data[:, :2] # we only take the first two features. 
We could 17 | # avoid this ugly slicing by using a two-dim dataset 18 | y = iris.target 19 | 20 | knn = neighbors.KNeighborsClassifier(n_neighbors=5) 21 | knn.fit(X, y) 22 | 23 | x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1 24 | y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1 25 | xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), 26 | np.linspace(y_min, y_max, 100)) 27 | Z = knn.predict(np.c_[xx.ravel(), yy.ravel()]) 28 | 29 | # Put the result into a color plot 30 | Z = Z.reshape(xx.shape) 31 | pl.figure() 32 | pl.pcolormesh(xx, yy, Z, cmap=cmap_light) 33 | 34 | # Plot also the training points 35 | pl.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold) 36 | pl.xlabel('sepal length (cm)') 37 | pl.ylabel('sepal width (cm)') 38 | pl.axis('tight') 39 | 40 | 41 | def plot_polynomial_regression(): 42 | rng = np.random.RandomState(0) 43 | x = 2*rng.rand(100) - 1 44 | 45 | f = lambda t: 1.2 * t**2 + .1 * t**3 - .4 * t **5 - .5 * t ** 9 46 | y = f(x) + .4 * rng.normal(size=100) 47 | 48 | x_test = np.linspace(-1, 1, 100) 49 | 50 | pl.figure() 51 | pl.scatter(x, y, s=4) 52 | 53 | X = np.array([x**i for i in range(5)]).T 54 | X_test = np.array([x_test**i for i in range(5)]).T 55 | regr = linear_model.LinearRegression() 56 | regr.fit(X, y) 57 | pl.plot(x_test, regr.predict(X_test), label='4th order') 58 | 59 | X = np.array([x**i for i in range(10)]).T 60 | X_test = np.array([x_test**i for i in range(10)]).T 61 | regr = linear_model.LinearRegression() 62 | regr.fit(X, y) 63 | pl.plot(x_test, regr.predict(X_test), label='9th order') 64 | 65 | pl.legend(loc='best') 66 | pl.axis('tight') 67 | pl.title('Fitting a 4th and a 9th order polynomial') 68 | 69 | pl.figure() 70 | pl.scatter(x, y, s=4) 71 | pl.plot(x_test, f(x_test), label="truth") 72 | pl.axis('tight') 73 | pl.title('Ground truth (9th order polynomial)') 74 | 75 | 76 | -------------------------------------------------------------------------------- /python-data/pdb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# PDB\n", 15 | "\n", 16 | "The pdb module defines an interactive source code debugger for Python programs. 
Below are frequently used commands:" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "# Run pdb when this line is hit\n", 28 | "import pdb; pdb.set_trace()\n", 29 | "\n", 30 | "# Run pdb when the script is run\n", 31 | "python -m pdb script.py\n", 32 | "\n", 33 | "# Help\n", 34 | "h[elp]\n", 35 | "\n", 36 | "# Show current content\n", 37 | "l[ist]\n", 38 | "\n", 39 | "# Examine variables\n", 40 | "p[rint]\n", 41 | "\n", 42 | "# Pretty print\n", 43 | "pp\n", 44 | "\n", 45 | "# Go to next line\n", 46 | "n[ext]\n", 47 | "\n", 48 | "# Step into\n", 49 | "s[tep]\n", 50 | "\n", 51 | "# Continue execution until the line with the line number greater \n", 52 | "# than the current one is reached or when returning from current frame.\n", 53 | "until\n", 54 | "\n", 55 | "# Return\n", 56 | "r[eturn]\n", 57 | "\n", 58 | "# See all breakpoints\n", 59 | "b to see all breakpoints\n", 60 | "\n", 61 | "# Set breakpoint at line 16\n", 62 | "b 16 \n", 63 | "\n", 64 | "# Clear breakpoint 1\n", 65 | "cl[ear] 1\n", 66 | "\n", 67 | "# Continue\n", 68 | "c[ontinue]\n", 69 | "\n", 70 | "# Conditional breakpoints, line 11\n", 71 | "b 11, this_year == 2015\n", 72 | "\n", 73 | "# Stack location\n", 74 | "w[here]\n", 75 | "\n", 76 | "# Go up in stack\n", 77 | "u[p]\n", 78 | "\n", 79 | "# Go down in stack\n", 80 | "d[own]\n", 81 | "\n", 82 | "# Longlist shows full method of where you're in (Python 3)\n", 83 | "ll\n", 84 | "\n", 85 | "# Quit\n", 86 | "q[uit]" 87 | ] 88 | } 89 | ], 90 | "metadata": { 91 | "kernelspec": { 92 | "display_name": "Python 2", 93 | "language": "python", 94 | "name": "python2" 95 | }, 96 | "language_info": { 97 | "codemirror_mode": { 98 | "name": "ipython", 99 | "version": 2 100 | }, 101 | "file_extension": ".py", 102 | "mimetype": "text/x-python", 103 | "name": "python", 104 | "nbconvert_exporter": "python", 105 | "pygments_lexer": "ipython2", 106 | "version": "2.7.10" 107 | } 108 | }, 109 | "nbformat": 4, 110 | "nbformat_minor": 0 111 | } 112 | -------------------------------------------------------------------------------- /analyses/churn_measurements.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | 4 | __author__ = "Eric Chiang" 5 | __email__ = "eric[at]yhathq.com" 6 | 7 | """ 8 | 9 | Measurements inspired by Philip Tetlock's "Expert Political Judgment" 10 | 11 | Equations take from Yaniv, Yates, & Smith (1991): 12 | "Measures of Descrimination Skill in Probabilistic Judgement" 13 | 14 | """ 15 | 16 | 17 | def calibration(prob,outcome,n_bins=10): 18 | """Calibration measurement for a set of predictions. 19 | 20 | When predicting events at a given probability, how far is frequency 21 | of positive outcomes from that probability? 22 | NOTE: Lower scores are better 23 | 24 | prob: array_like, float 25 | Probability estimates for a set of events 26 | 27 | outcome: array_like, bool 28 | If event predicted occurred 29 | 30 | n_bins: int 31 | Number of judgement categories to prefrom calculation over. 32 | Prediction are binned based on probability, since "descrete" 33 | probabilities aren't required. 34 | 35 | """ 36 | prob = np.array(prob) 37 | outcome = np.array(outcome) 38 | 39 | c = 0.0 40 | # Construct bins 41 | judgement_bins = np.arange(n_bins + 1) / n_bins 42 | # Which bin is each prediction in? 
43 | bin_num = np.digitize(prob,judgement_bins) 44 | for j_bin in np.unique(bin_num): 45 | # Is event in bin 46 | in_bin = bin_num == j_bin 47 | # Predicted probability taken as average of preds in bin 48 | predicted_prob = np.mean(prob[in_bin]) 49 | # How often did events in this bin actually happen? 50 | true_bin_prob = np.mean(outcome[in_bin]) 51 | # Squared distance between predicted and true times num of obs 52 | c += np.sum(in_bin) * ((predicted_prob - true_bin_prob) ** 2) 53 | return c / len(prob) 54 | 55 | def discrimination(prob,outcome,n_bins=10): 56 | """Discrimination measurement for a set of predictions. 57 | 58 | For each judgement category, how far from the base probability 59 | is the true frequency of that bin? 60 | NOTE: High scores are better 61 | 62 | prob: array_like, float 63 | Probability estimates for a set of events 64 | 65 | outcome: array_like, bool 66 | If event predicted occurred 67 | 68 | n_bins: int 69 | Number of judgement categories to prefrom calculation over. 70 | Prediction are binned based on probability, since "descrete" 71 | probabilities aren't required. 72 | 73 | """ 74 | prob = np.array(prob) 75 | outcome = np.array(outcome) 76 | 77 | d = 0.0 78 | # Base frequency of outcomes 79 | base_prob = np.mean(outcome) 80 | # Construct bins 81 | judgement_bins = np.arange(n_bins + 1) / n_bins 82 | # Which bin is each prediction in? 83 | bin_num = np.digitize(prob,judgement_bins) 84 | for j_bin in np.unique(bin_num): 85 | in_bin = bin_num == j_bin 86 | true_bin_prob = np.mean(outcome[in_bin]) 87 | # Squared distance between true and base times num of obs 88 | d += np.sum(in_bin) * ((true_bin_prob - base_prob) ** 2) 89 | return d / len(prob) 90 | -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/rnn_tutorial/synthetic.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import numpy as np 3 | 4 | 5 | def mackey_glass(sample_len=1000, tau=17, seed=None, n_samples = 1): 6 | ''' 7 | mackey_glass(sample_len=1000, tau=17, seed = None, n_samples = 1) -> input 8 | Generate the Mackey Glass time-series. Parameters are: 9 | - sample_len: length of the time-series in timesteps. Default is 1000. 10 | - tau: delay of the MG - system. Commonly used values are tau=17 (mild 11 | chaos) and tau=30 (moderate chaos). Default is 17. 12 | - seed: to seed the random generator, can be used to generate the same 13 | timeseries at each invocation. 
14 | - n_samples : number of samples to generate 15 | ''' 16 | delta_t = 10 17 | history_len = tau * delta_t 18 | # Initial conditions for the history of the system 19 | timeseries = 1.2 20 | 21 | if seed is not None: 22 | np.random.seed(seed) 23 | 24 | samples = [] 25 | 26 | for _ in range(n_samples): 27 | history = collections.deque(1.2 * np.ones(history_len) + 0.2 * \ 28 | (np.random.rand(history_len) - 0.5)) 29 | # Preallocate the array for the time-series 30 | inp = np.zeros((sample_len,1)) 31 | 32 | for timestep in range(sample_len): 33 | for _ in range(delta_t): 34 | xtau = history.popleft() 35 | history.append(timeseries) 36 | timeseries = history[-1] + (0.2 * xtau / (1.0 + xtau ** 10) - \ 37 | 0.1 * history[-1]) / delta_t 38 | inp[timestep] = timeseries 39 | 40 | # Squash timeseries through tanh 41 | inp = np.tanh(inp - 1) 42 | samples.append(inp) 43 | return samples 44 | 45 | 46 | def mso(sample_len=1000, n_samples = 1): 47 | ''' 48 | mso(sample_len=1000, n_samples = 1) -> input 49 | Generate the Multiple Sinewave Oscillator time-series, a sum of two sines 50 | with incommensurable periods. Parameters are: 51 | - sample_len: length of the time-series in timesteps 52 | - n_samples: number of samples to generate 53 | ''' 54 | signals = [] 55 | for _ in range(n_samples): 56 | phase = np.random.rand() 57 | x = np.atleast_2d(np.arange(sample_len)).T 58 | signals.append(np.sin(0.2 * x + phase) + np.sin(0.311 * x + phase)) 59 | return signals 60 | 61 | 62 | def lorentz(sample_len=1000, sigma=10, rho=28, beta=8 / 3, step=0.01): 63 | """This function generates a Lorentz time series of length sample_len, 64 | with standard parameters sigma, rho and beta. 65 | """ 66 | 67 | x = np.zeros([sample_len]) 68 | y = np.zeros([sample_len]) 69 | z = np.zeros([sample_len]) 70 | 71 | # Initial conditions taken from 'Chaos and Time Series Analysis', J. 
Sprott 72 | x[0] = 0; 73 | y[0] = -0.01; 74 | z[0] = 9; 75 | 76 | for t in range(sample_len - 1): 77 | x[t + 1] = x[t] + sigma * (y[t] - x[t]) * step 78 | y[t + 1] = y[t] + (x[t] * (rho - z[t]) - y[t]) * step 79 | z[t + 1] = z[t] + (x[t] * y[t] - beta * z[t]) * step 80 | 81 | x.shape += (1,) 82 | y.shape += (1,) 83 | z.shape += (1,) 84 | 85 | return np.concatenate((x, y, z), axis=1) 86 | -------------------------------------------------------------------------------- /scikit-learn/fig_code/scikit-learn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:29899a15bea89b9d8275879798b23011cecabc0eff03dd41bb606324221e0bc3" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "# scikit-learn" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "collapsed": false, 21 | "input": [ 22 | "%matplotlib inline\n", 23 | "\n", 24 | "# set seaborn plot defaults.\n", 25 | "# This can be safely commented out\n", 26 | "import seaborn; seaborn.set()" 27 | ], 28 | "language": "python", 29 | "metadata": {}, 30 | "outputs": [], 31 | "prompt_number": 3 32 | }, 33 | { 34 | "cell_type": "code", 35 | "collapsed": false, 36 | "input": [ 37 | "# Import the example plot from the figures directory\n", 38 | "from fig_code import plot_sgd_separator\n", 39 | "plot_sgd_separator()" 40 | ], 41 | "language": "python", 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "ename": "ImportError", 46 | "evalue": "No module named fig_code", 47 | "output_type": "pyerr", 48 | "traceback": [ 49 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", 50 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Import the example plot from the figures directory\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mfig_code\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mplot_sgd_separator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mplot_sgd_separator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 51 | "\u001b[0;31mImportError\u001b[0m: No module named fig_code" 52 | ] 53 | } 54 | ], 55 | "prompt_number": 4 56 | }, 57 | { 58 | "cell_type": "code", 59 | "collapsed": false, 60 | "input": [], 61 | "language": "python", 62 | "metadata": {}, 63 | "outputs": [] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "collapsed": false, 68 | "input": [], 69 | "language": "python", 70 | "metadata": {}, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "collapsed": false, 76 | "input": [], 77 | "language": "python", 78 | "metadata": {}, 79 | "outputs": [] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "collapsed": false, 84 | "input": [], 85 | "language": "python", 86 | "metadata": {}, 87 | "outputs": [] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "collapsed": false, 92 | "input": [], 93 | "language": "python", 94 | "metadata": {}, 95 | "outputs": [] 96 | } 97 | ], 98 | "metadata": {} 99 | } 100 | ] 101 | } -------------------------------------------------------------------------------- /mapreduce/test_mr_s3_log_parser.py: -------------------------------------------------------------------------------- 1 | 2 | from 
StringIO import StringIO 3 | import unittest2 as unittest 4 | from mr_s3_log_parser import MrS3LogParser 5 | 6 | 7 | class MrTestsUtil: 8 | 9 | def run_mr_sandbox(self, mr_job, stdin): 10 | # inline runs the job in the same process so small jobs tend to 11 | # run faster and stack traces are simpler 12 | # --no-conf prevents options from local mrjob.conf from polluting 13 | # the testing environment 14 | # "-" reads from standard in 15 | mr_job.sandbox(stdin=stdin) 16 | 17 | # make_runner ensures job cleanup is performed regardless of 18 | # success or failure 19 | with mr_job.make_runner() as runner: 20 | runner.run() 21 | for line in runner.stream_output(): 22 | key, value = mr_job.parse_output_line(line) 23 | yield value 24 | 25 | 26 | class TestMrS3LogParser(unittest.TestCase): 27 | 28 | mr_job = None 29 | mr_tests_util = None 30 | 31 | RAW_LOG_LINE_INVALID = \ 32 | '00000fe9688b6e57f75bd2b7f7c1610689e8f01000000' \ 33 | '00000388225bcc00000 ' \ 34 | 's3-storage [22/Jul/2013:21:03:27 +0000] ' \ 35 | '00.111.222.33 ' \ 36 | 37 | RAW_LOG_LINE_VALID = \ 38 | '00000fe9688b6e57f75bd2b7f7c1610689e8f01000000' \ 39 | '00000388225bcc00000 ' \ 40 | 's3-storage [22/Jul/2013:21:03:27 +0000] ' \ 41 | '00.111.222.33 ' \ 42 | 'arn:aws:sts::000005646931:federated-user/user 00000AB825500000 ' \ 43 | 'REST.HEAD.OBJECT user/file.pdf ' \ 44 | '"HEAD /user/file.pdf?versionId=00000XMHZJp6DjM9x500000' \ 45 | '00000SDZk ' \ 46 | 'HTTP/1.1" 200 - - 4000272 18 - "-" ' \ 47 | '"Boto/2.5.1 (darwin) USER-AGENT/1.0.14.0" ' \ 48 | '00000XMHZJp6DjM9x5JVEAMo8MG00000' 49 | 50 | DATE_TIME_ZONE_INVALID = "AB/Jul/2013:21:04:17 +0000" 51 | DATE_TIME_ZONE_VALID = "22/Jul/2013:21:04:17 +0000" 52 | DATE_VALID = "2013-07-22" 53 | DATE_TIME_VALID = "2013-07-22 21:04:17" 54 | TIME_ZONE_VALID = "+0000" 55 | 56 | def __init__(self, *args, **kwargs): 57 | super(TestMrS3LogParser, self).__init__(*args, **kwargs) 58 | self.mr_job = MrS3LogParser(['-r', 'inline', '--no-conf', '-']) 59 | self.mr_tests_util = MrTestsUtil() 60 | 61 | def test_invalid_log_lines(self): 62 | stdin = StringIO(self.RAW_LOG_LINE_INVALID) 63 | 64 | for result in self.mr_tests_util.run_mr_sandbox(self.mr_job, stdin): 65 | self.assertEqual(result.find("Error"), 0) 66 | 67 | def test_valid_log_lines(self): 68 | stdin = StringIO(self.RAW_LOG_LINE_VALID) 69 | 70 | for result in self.mr_tests_util.run_mr_sandbox(self.mr_job, stdin): 71 | self.assertEqual(result.find("Error"), -1) 72 | 73 | def test_clean_date_time_zone(self): 74 | date, date_time, time_zone_parsed = \ 75 | self.mr_job.clean_date_time_zone(self.DATE_TIME_ZONE_VALID) 76 | self.assertEqual(date, self.DATE_VALID) 77 | self.assertEqual(date_time, self.DATE_TIME_VALID) 78 | self.assertEqual(time_zone_parsed, self.TIME_ZONE_VALID) 79 | 80 | # Use a lambda to delay the calling of clean_date_time_zone so that 81 | # assertRaises has enough time to handle it properly 82 | self.assertRaises(ValueError, 83 | lambda: self.mr_job.clean_date_time_zone( 84 | self.DATE_TIME_ZONE_INVALID)) 85 | 86 | if __name__ == '__main__': 87 | unittest.main() 88 | -------------------------------------------------------------------------------- /scipy/nsfg.py: -------------------------------------------------------------------------------- 1 | """This file contains code for use with "Think Stats", 2 | by Allen B. Downey, available from greenteapress.com 3 | 4 | Copyright 2010 Allen B. 
Downey 5 | License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html 6 | """ 7 | 8 | from __future__ import print_function 9 | 10 | from collections import defaultdict 11 | import numpy as np 12 | import sys 13 | 14 | import thinkstats2 15 | 16 | 17 | def ReadFemPreg(dct_file='2002FemPreg.dct', 18 | dat_file='2002FemPreg.dat.gz'): 19 | """Reads the NSFG pregnancy data. 20 | 21 | dct_file: string file name 22 | dat_file: string file name 23 | 24 | returns: DataFrame 25 | """ 26 | dct = thinkstats2.ReadStataDct(dct_file) 27 | df = dct.ReadFixedWidth(dat_file, compression='gzip') 28 | CleanFemPreg(df) 29 | return df 30 | 31 | 32 | def CleanFemPreg(df): 33 | """Recodes variables from the pregnancy frame. 34 | 35 | df: DataFrame 36 | """ 37 | # mother's age is encoded in centiyears; convert to years 38 | df.agepreg /= 100.0 39 | 40 | # birthwgt_lb contains at least one bogus value (51 lbs) 41 | # replace with NaN 42 | df.birthwgt_lb[df.birthwgt_lb > 20] = np.nan 43 | 44 | # replace 'not ascertained', 'refused', 'don't know' with NaN 45 | na_vals = [97, 98, 99] 46 | df.birthwgt_lb.replace(na_vals, np.nan, inplace=True) 47 | df.birthwgt_oz.replace(na_vals, np.nan, inplace=True) 48 | df.hpagelb.replace(na_vals, np.nan, inplace=True) 49 | 50 | df.babysex.replace([7, 9], np.nan, inplace=True) 51 | df.nbrnaliv.replace([9], np.nan, inplace=True) 52 | 53 | # birthweight is stored in two columns, lbs and oz. 54 | # convert to a single column in lb 55 | # NOTE: creating a new column requires dictionary syntax, 56 | # not attribute assignment (like df.totalwgt_lb) 57 | df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0 58 | 59 | # due to a bug in ReadStataDct, the last variable gets clipped; 60 | # so for now set it to NaN 61 | df.cmintvw = np.nan 62 | 63 | 64 | def MakePregMap(df): 65 | """Make a map from caseid to list of preg indices. 66 | 67 | df: DataFrame 68 | 69 | returns: dict that maps from caseid to list of indices into preg df 70 | """ 71 | d = defaultdict(list) 72 | for index, caseid in df.caseid.iteritems(): 73 | d[caseid].append(index) 74 | return d 75 | 76 | 77 | def main(script): 78 | """Tests the functions in this module. 79 | 80 | script: string script name 81 | """ 82 | df = ReadFemPreg() 83 | print(df.shape) 84 | 85 | assert len(df) == 13593 86 | 87 | assert df.caseid[13592] == 12571 88 | assert df.pregordr.value_counts()[1] == 5033 89 | assert df.nbrnaliv.value_counts()[1] == 8981 90 | assert df.babysex.value_counts()[1] == 4641 91 | assert df.birthwgt_lb.value_counts()[7] == 3049 92 | assert df.birthwgt_oz.value_counts()[0] == 1037 93 | assert df.prglngth.value_counts()[39] == 4744 94 | assert df.outcome.value_counts()[1] == 9148 95 | assert df.birthord.value_counts()[1] == 4413 96 | assert df.agepreg.value_counts()[22.75] == 100 97 | assert df.totalwgt_lb.value_counts()[7.5] == 302 98 | 99 | weights = df.finalwgt.value_counts() 100 | key = max(weights.keys()) 101 | assert df.finalwgt.value_counts()[key] == 6 102 | 103 | print('%s: All tests passed.' 
% script) 104 | 105 | if __name__ == '__main__': 106 | main(*sys.argv) 107 | -------------------------------------------------------------------------------- /data/ozone.csv: -------------------------------------------------------------------------------- 1 | "Ozone","Solar.R","Wind","Temp","Month","Day" 2 | 41,190,7.4,67,5,1 3 | 36,118,8,72,5,2 4 | 12,149,12.6,74,5,3 5 | 18,313,11.5,62,5,4 6 | NA,NA,14.3,56,5,5 7 | 28,NA,14.9,66,5,6 8 | 23,299,8.6,65,5,7 9 | 19,99,13.8,59,5,8 10 | 8,19,20.1,61,5,9 11 | NA,194,8.6,69,5,10 12 | 7,NA,6.9,74,5,11 13 | 16,256,9.7,69,5,12 14 | 11,290,9.2,66,5,13 15 | 14,274,10.9,68,5,14 16 | 18,65,13.2,58,5,15 17 | 14,334,11.5,64,5,16 18 | 34,307,12,66,5,17 19 | 6,78,18.4,57,5,18 20 | 30,322,11.5,68,5,19 21 | 11,44,9.7,62,5,20 22 | 1,8,9.7,59,5,21 23 | 11,320,16.6,73,5,22 24 | 4,25,9.7,61,5,23 25 | 32,92,12,61,5,24 26 | NA,66,16.6,57,5,25 27 | NA,266,14.9,58,5,26 28 | NA,NA,8,57,5,27 29 | 23,13,12,67,5,28 30 | 45,252,14.9,81,5,29 31 | 115,223,5.7,79,5,30 32 | 37,279,7.4,76,5,31 33 | NA,286,8.6,78,6,1 34 | NA,287,9.7,74,6,2 35 | NA,242,16.1,67,6,3 36 | NA,186,9.2,84,6,4 37 | NA,220,8.6,85,6,5 38 | NA,264,14.3,79,6,6 39 | 29,127,9.7,82,6,7 40 | NA,273,6.9,87,6,8 41 | 71,291,13.8,90,6,9 42 | 39,323,11.5,87,6,10 43 | NA,259,10.9,93,6,11 44 | NA,250,9.2,92,6,12 45 | 23,148,8,82,6,13 46 | NA,332,13.8,80,6,14 47 | NA,322,11.5,79,6,15 48 | 21,191,14.9,77,6,16 49 | 37,284,20.7,72,6,17 50 | 20,37,9.2,65,6,18 51 | 12,120,11.5,73,6,19 52 | 13,137,10.3,76,6,20 53 | NA,150,6.3,77,6,21 54 | NA,59,1.7,76,6,22 55 | NA,91,4.6,76,6,23 56 | NA,250,6.3,76,6,24 57 | NA,135,8,75,6,25 58 | NA,127,8,78,6,26 59 | NA,47,10.3,73,6,27 60 | NA,98,11.5,80,6,28 61 | NA,31,14.9,77,6,29 62 | NA,138,8,83,6,30 63 | 135,269,4.1,84,7,1 64 | 49,248,9.2,85,7,2 65 | 32,236,9.2,81,7,3 66 | NA,101,10.9,84,7,4 67 | 64,175,4.6,83,7,5 68 | 40,314,10.9,83,7,6 69 | 77,276,5.1,88,7,7 70 | 97,267,6.3,92,7,8 71 | 97,272,5.7,92,7,9 72 | 85,175,7.4,89,7,10 73 | NA,139,8.6,82,7,11 74 | 10,264,14.3,73,7,12 75 | 27,175,14.9,81,7,13 76 | NA,291,14.9,91,7,14 77 | 7,48,14.3,80,7,15 78 | 48,260,6.9,81,7,16 79 | 35,274,10.3,82,7,17 80 | 61,285,6.3,84,7,18 81 | 79,187,5.1,87,7,19 82 | 63,220,11.5,85,7,20 83 | 16,7,6.9,74,7,21 84 | NA,258,9.7,81,7,22 85 | NA,295,11.5,82,7,23 86 | 80,294,8.6,86,7,24 87 | 108,223,8,85,7,25 88 | 20,81,8.6,82,7,26 89 | 52,82,12,86,7,27 90 | 82,213,7.4,88,7,28 91 | 50,275,7.4,86,7,29 92 | 64,253,7.4,83,7,30 93 | 59,254,9.2,81,7,31 94 | 39,83,6.9,81,8,1 95 | 9,24,13.8,81,8,2 96 | 16,77,7.4,82,8,3 97 | 78,NA,6.9,86,8,4 98 | 35,NA,7.4,85,8,5 99 | 66,NA,4.6,87,8,6 100 | 122,255,4,89,8,7 101 | 89,229,10.3,90,8,8 102 | 110,207,8,90,8,9 103 | NA,222,8.6,92,8,10 104 | NA,137,11.5,86,8,11 105 | 44,192,11.5,86,8,12 106 | 28,273,11.5,82,8,13 107 | 65,157,9.7,80,8,14 108 | NA,64,11.5,79,8,15 109 | 22,71,10.3,77,8,16 110 | 59,51,6.3,79,8,17 111 | 23,115,7.4,76,8,18 112 | 31,244,10.9,78,8,19 113 | 44,190,10.3,78,8,20 114 | 21,259,15.5,77,8,21 115 | 9,36,14.3,72,8,22 116 | NA,255,12.6,75,8,23 117 | 45,212,9.7,79,8,24 118 | 168,238,3.4,81,8,25 119 | 73,215,8,86,8,26 120 | NA,153,5.7,88,8,27 121 | 76,203,9.7,97,8,28 122 | 118,225,2.3,94,8,29 123 | 84,237,6.3,96,8,30 124 | 85,188,6.3,94,8,31 125 | 96,167,6.9,91,9,1 126 | 78,197,5.1,92,9,2 127 | 73,183,2.8,93,9,3 128 | 91,189,4.6,93,9,4 129 | 47,95,7.4,87,9,5 130 | 32,92,15.5,84,9,6 131 | 20,252,10.9,80,9,7 132 | 23,220,10.3,78,9,8 133 | 21,230,10.9,75,9,9 134 | 24,259,9.7,73,9,10 135 | 44,236,14.9,81,9,11 136 | 21,259,15.5,76,9,12 137 | 28,238,6.3,77,9,13 138 | 
9,24,10.9,71,9,14 139 | 13,112,11.5,71,9,15 140 | 46,237,6.9,78,9,16 141 | 18,224,13.8,67,9,17 142 | 13,27,10.3,76,9,18 143 | 24,238,10.3,68,9,19 144 | 16,201,8,82,9,20 145 | 13,238,12.6,64,9,21 146 | 23,14,9.2,71,9,22 147 | 36,139,10.3,81,9,23 148 | 7,49,10.3,69,9,24 149 | 14,20,16.6,63,9,25 150 | 30,193,6.9,70,9,26 151 | NA,145,13.2,77,9,27 152 | 14,191,14.3,75,9,28 153 | 18,131,8,76,9,29 154 | 20,223,11.5,68,9,30 155 | -------------------------------------------------------------------------------- /data/titanic/gendermodel.py: -------------------------------------------------------------------------------- 1 | """ This simple code is designed to teach a basic user to read in the files in python, simply find what proportion of males and females survived and make a predictive model based on this 2 | Author : AstroDave 3 | Date : 18 September 2012 4 | Revised: 28 March 2014 5 | 6 | """ 7 | 8 | 9 | import csv as csv 10 | import numpy as np 11 | 12 | csv_file_object = csv.reader(open('train.csv', 'rb')) # Load in the csv file 13 | header = csv_file_object.next() # Skip the first line as it is a header 14 | data=[] # Create a variable to hold the data 15 | 16 | for row in csv_file_object: # Step through each row in the csv file, 17 | data.append(row[0:]) # adding each row to the data variable 18 | data = np.array(data) # Then convert from a list to an array. 19 | 20 | # Now I have an array of 12 columns and 891 rows 21 | # I can access any element I want, so the entire first column would 22 | # be data[0::,0].astype(np.float) -- This means all of the rows (from start to end), in column 0 23 | # I have to add the .astype() command, because 24 | # when appending the rows, python thought it was a string - so needed to convert 25 | 26 | # Set some variables 27 | number_passengers = np.size(data[0::,1].astype(np.float)) 28 | number_survived = np.sum(data[0::,1].astype(np.float)) 29 | proportion_survivors = number_survived / number_passengers 30 | 31 | # I can now find the stats of all the women on board, 32 | # by making an array that lists True/False whether each row is female 33 | women_only_stats = data[0::,4] == "female" # This finds where all the women are 34 | men_only_stats = data[0::,4] != "female" # This finds where all the men are (note != means 'not equal') 35 | 36 | # I can now filter the whole data, to find statistics for just women, by just placing 37 | # women_only_stats as a "mask" on my full data -- Use it in place of the '0::' part of the array index. 38 | # You can test it by placing it there, and requesting column index [4], and the output should all read 'female' 39 | # e.g. try typing this: data[women_only_stats,4] 40 | women_onboard = data[women_only_stats,1].astype(np.float) 41 | men_onboard = data[men_only_stats,1].astype(np.float) 42 | 43 | # and derive some statistics about them 44 | proportion_women_survived = np.sum(women_onboard) / np.size(women_onboard) 45 | proportion_men_survived = np.sum(men_onboard) / np.size(men_onboard) 46 | 47 | print 'Proportion of women who survived is %s' % proportion_women_survived 48 | print 'Proportion of men who survived is %s' % proportion_men_survived 49 | 50 | # Now that I have my indicator that women were much more likely to survive, 51 | # I am done with the training set.
52 | # Now I will read in the test file and write out my simplistic prediction: 53 | # if female, then model that she survived (1) 54 | # if male, then model that he did not survive (0) 55 | 56 | # First, read in test.csv 57 | test_file = open('test.csv', 'rb') 58 | test_file_object = csv.reader(test_file) 59 | header = test_file_object.next() 60 | 61 | # Also open the a new file so I can write to it. Call it something descriptive 62 | # Finally, loop through each row in the train file, and look in column index [3] (which is 'Sex') 63 | # Write out the PassengerId, and my prediction. 64 | 65 | predictions_file = open("gendermodel.csv", "wb") 66 | predictions_file_object = csv.writer(predictions_file) 67 | predictions_file_object.writerow(["PassengerId", "Survived"]) # write the column headers 68 | for row in test_file_object: # For each row in test file, 69 | if row[3] == 'female': # is it a female, if yes then 70 | predictions_file_object.writerow([row[0], "1"]) # write the PassengerId, and predict 1 71 | else: # or else if male, 72 | predictions_file_object.writerow([row[0], "0"]) # write the PassengerId, and predict 0. 73 | test_file.close() # Close out the files. 74 | predictions_file.close() 75 | 76 | -------------------------------------------------------------------------------- /data/ozone_copy.csv: -------------------------------------------------------------------------------- 1 | 41.0,190.0,7.4,67,5,1 2 | 36.0,118.0,8.0,72,5,2 3 | 12.0,149.0,12.6,74,5,3 4 | 18.0,313.0,11.5,62,5,4 5 | ,,14.3,56,5,5 6 | 28.0,,14.9,66,5,6 7 | 23.0,299.0,8.6,65,5,7 8 | 19.0,99.0,13.8,59,5,8 9 | 8.0,19.0,20.1,61,5,9 10 | ,194.0,8.6,69,5,10 11 | 7.0,,6.9,74,5,11 12 | 16.0,256.0,9.7,69,5,12 13 | 11.0,290.0,9.2,66,5,13 14 | 14.0,274.0,10.9,68,5,14 15 | 18.0,65.0,13.2,58,5,15 16 | 14.0,334.0,11.5,64,5,16 17 | 34.0,307.0,12.0,66,5,17 18 | 6.0,78.0,18.4,57,5,18 19 | 30.0,322.0,11.5,68,5,19 20 | 11.0,44.0,9.7,62,5,20 21 | 1.0,8.0,9.7,59,5,21 22 | 11.0,320.0,16.6,73,5,22 23 | 4.0,25.0,9.7,61,5,23 24 | 32.0,92.0,12.0,61,5,24 25 | ,66.0,16.6,57,5,25 26 | ,266.0,14.9,58,5,26 27 | ,,8.0,57,5,27 28 | 23.0,13.0,12.0,67,5,28 29 | 45.0,252.0,14.9,81,5,29 30 | 115.0,223.0,5.7,79,5,30 31 | 37.0,279.0,7.4,76,5,31 32 | ,286.0,8.6,78,6,1 33 | ,287.0,9.7,74,6,2 34 | ,242.0,16.1,67,6,3 35 | ,186.0,9.2,84,6,4 36 | ,220.0,8.6,85,6,5 37 | ,264.0,14.3,79,6,6 38 | 29.0,127.0,9.7,82,6,7 39 | ,273.0,6.9,87,6,8 40 | 71.0,291.0,13.8,90,6,9 41 | 39.0,323.0,11.5,87,6,10 42 | ,259.0,10.9,93,6,11 43 | ,250.0,9.2,92,6,12 44 | 23.0,148.0,8.0,82,6,13 45 | ,332.0,13.8,80,6,14 46 | ,322.0,11.5,79,6,15 47 | 21.0,191.0,14.9,77,6,16 48 | 37.0,284.0,20.7,72,6,17 49 | 20.0,37.0,9.2,65,6,18 50 | 12.0,120.0,11.5,73,6,19 51 | 13.0,137.0,10.3,76,6,20 52 | ,150.0,6.3,77,6,21 53 | ,59.0,1.7,76,6,22 54 | ,91.0,4.6,76,6,23 55 | ,250.0,6.3,76,6,24 56 | ,135.0,8.0,75,6,25 57 | ,127.0,8.0,78,6,26 58 | ,47.0,10.3,73,6,27 59 | ,98.0,11.5,80,6,28 60 | ,31.0,14.9,77,6,29 61 | ,138.0,8.0,83,6,30 62 | 135.0,269.0,4.1,84,7,1 63 | 49.0,248.0,9.2,85,7,2 64 | 32.0,236.0,9.2,81,7,3 65 | ,101.0,10.9,84,7,4 66 | 64.0,175.0,4.6,83,7,5 67 | 40.0,314.0,10.9,83,7,6 68 | 77.0,276.0,5.1,88,7,7 69 | 97.0,267.0,6.3,92,7,8 70 | 97.0,272.0,5.7,92,7,9 71 | 85.0,175.0,7.4,89,7,10 72 | ,139.0,8.6,82,7,11 73 | 10.0,264.0,14.3,73,7,12 74 | 27.0,175.0,14.9,81,7,13 75 | ,291.0,14.9,91,7,14 76 | 7.0,48.0,14.3,80,7,15 77 | 48.0,260.0,6.9,81,7,16 78 | 35.0,274.0,10.3,82,7,17 79 | 61.0,285.0,6.3,84,7,18 80 | 79.0,187.0,5.1,87,7,19 81 | 63.0,220.0,11.5,85,7,20 82 | 16.0,7.0,6.9,74,7,21 83 | 
,258.0,9.7,81,7,22 84 | ,295.0,11.5,82,7,23 85 | 80.0,294.0,8.6,86,7,24 86 | 108.0,223.0,8.0,85,7,25 87 | 20.0,81.0,8.6,82,7,26 88 | 52.0,82.0,12.0,86,7,27 89 | 82.0,213.0,7.4,88,7,28 90 | 50.0,275.0,7.4,86,7,29 91 | 64.0,253.0,7.4,83,7,30 92 | 59.0,254.0,9.2,81,7,31 93 | 39.0,83.0,6.9,81,8,1 94 | 9.0,24.0,13.8,81,8,2 95 | 16.0,77.0,7.4,82,8,3 96 | 78.0,,6.9,86,8,4 97 | 35.0,,7.4,85,8,5 98 | 66.0,,4.6,87,8,6 99 | 122.0,255.0,4.0,89,8,7 100 | 89.0,229.0,10.3,90,8,8 101 | 110.0,207.0,8.0,90,8,9 102 | ,222.0,8.6,92,8,10 103 | ,137.0,11.5,86,8,11 104 | 44.0,192.0,11.5,86,8,12 105 | 28.0,273.0,11.5,82,8,13 106 | 65.0,157.0,9.7,80,8,14 107 | ,64.0,11.5,79,8,15 108 | 22.0,71.0,10.3,77,8,16 109 | 59.0,51.0,6.3,79,8,17 110 | 23.0,115.0,7.4,76,8,18 111 | 31.0,244.0,10.9,78,8,19 112 | 44.0,190.0,10.3,78,8,20 113 | 21.0,259.0,15.5,77,8,21 114 | 9.0,36.0,14.3,72,8,22 115 | ,255.0,12.6,75,8,23 116 | 45.0,212.0,9.7,79,8,24 117 | 168.0,238.0,3.4,81,8,25 118 | 73.0,215.0,8.0,86,8,26 119 | ,153.0,5.7,88,8,27 120 | 76.0,203.0,9.7,97,8,28 121 | 118.0,225.0,2.3,94,8,29 122 | 84.0,237.0,6.3,96,8,30 123 | 85.0,188.0,6.3,94,8,31 124 | 96.0,167.0,6.9,91,9,1 125 | 78.0,197.0,5.1,92,9,2 126 | 73.0,183.0,2.8,93,9,3 127 | 91.0,189.0,4.6,93,9,4 128 | 47.0,95.0,7.4,87,9,5 129 | 32.0,92.0,15.5,84,9,6 130 | 20.0,252.0,10.9,80,9,7 131 | 23.0,220.0,10.3,78,9,8 132 | 21.0,230.0,10.9,75,9,9 133 | 24.0,259.0,9.7,73,9,10 134 | 44.0,236.0,14.9,81,9,11 135 | 21.0,259.0,15.5,76,9,12 136 | 28.0,238.0,6.3,77,9,13 137 | 9.0,24.0,10.9,71,9,14 138 | 13.0,112.0,11.5,71,9,15 139 | 46.0,237.0,6.9,78,9,16 140 | 18.0,224.0,13.8,67,9,17 141 | 13.0,27.0,10.3,76,9,18 142 | 24.0,238.0,10.3,68,9,19 143 | 16.0,201.0,8.0,82,9,20 144 | 13.0,238.0,12.6,64,9,21 145 | 23.0,14.0,9.2,71,9,22 146 | 36.0,139.0,10.3,81,9,23 147 | 7.0,49.0,10.3,69,9,24 148 | 14.0,20.0,16.6,63,9,25 149 | 30.0,193.0,6.9,70,9,26 150 | ,145.0,13.2,77,9,27 151 | 14.0,191.0,14.3,75,9,28 152 | 18.0,131.0,8.0,76,9,29 153 | 20.0,223.0,11.5,68,9,30 154 | -------------------------------------------------------------------------------- /python-data/files.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Files\n", 15 | "\n", 16 | "* Read a File\n", 17 | "* Write a File\n", 18 | "* Read and Write UTF-8" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Read a File\n", 26 | "\n", 27 | "Open a file in read-only mode.\n", 28 | "Iterate over the file lines. rstrip removes the EOL markers." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "class TypeUtil:\n", 43 | "\n", 44 | " @classmethod\n", 45 | " def is_iterable(cls, obj):\n", 46 | " \"\"\"Determines if obj is iterable.\n", 47 | "\n", 48 | " Useful when writing functions that can accept multiple types of\n", 49 | " input (list, tuple, ndarray, iterator). 
Pairs well with\n", 50 | " convert_to_list.\n", 51 | " \"\"\"\n", 52 | " try:\n", 53 | " iter(obj)\n", 54 | " return True\n", 55 | " except TypeError:\n", 56 | " return False\n", 57 | "\n", 58 | " @classmethod\n", 59 | " def convert_to_list(cls, obj):\n", 60 | " \"\"\"Converts obj to a list if it is not a list and it is iterable,\n", 61 | " else returns the original obj.\n", 62 | " \"\"\"\n", 63 | " if not isinstance(obj, list) and cls.is_iterable(obj):\n", 64 | " obj = list(obj)\n", 65 | " return obj\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "old_file_path = 'type_util.py'\n", 71 | "with open(old_file_path, 'r') as old_file:\n", 72 | " for line in old_file:\n", 73 | " print(line.rstrip())" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "## Write to a file\n", 81 | "\n", 82 | "Create a new file overwriting any previous file with the same name, write text, then close the file:" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 2, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "new_file_path = 'hello_world.txt'\n", 94 | "with open(new_file_path, 'w') as new_file:\n", 95 | " new_file.write('hello world!')" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## Read and Write UTF-8" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 3, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "import codecs\n", 114 | "with codecs.open(\"hello_world_new.txt\", \"a\", \"utf-8\") as new_file:\n", 115 | " with codecs.open(\"hello_world.txt\", \"r\", \"utf-8\") as old_file: \n", 116 | " for line in old_file:\n", 117 | " new_file.write(line + '\\n')" 118 | ] 119 | } 120 | ], 121 | "metadata": { 122 | "kernelspec": { 123 | "display_name": "Python 2", 124 | "language": "python", 125 | "name": "python2" 126 | }, 127 | "language_info": { 128 | "codemirror_mode": { 129 | "name": "ipython", 130 | "version": 2 131 | }, 132 | "file_extension": ".py", 133 | "mimetype": "text/x-python", 134 | "name": "python", 135 | "nbconvert_exporter": "python", 136 | "pygments_lexer": "ipython2", 137 | "version": "2.7.10" 138 | } 139 | }, 140 | "nbformat": 4, 141 | "nbformat_minor": 0 142 | } 143 | -------------------------------------------------------------------------------- /data/titanic/myfirstforest.py: -------------------------------------------------------------------------------- 1 | """ Writing my first randomforest code. 2 | Author : AstroDave 3 | Date : 23rd September 2012 4 | Revised: 15 April 2014 5 | please see packages.python.org/milk/randomforests.html for more 6 | 7 | """ 8 | import pandas as pd 9 | import numpy as np 10 | import csv as csv 11 | from sklearn.ensemble import RandomForestClassifier 12 | 13 | # Data cleanup 14 | # TRAIN DATA 15 | train_df = pd.read_csv('train.csv', header=0) # Load the train file into a dataframe 16 | 17 | # I need to convert all strings to integer classifiers. 18 | # I need to fill in the missing values of the data and make it complete. 19 | 20 | # female = 0, Male = 1 21 | train_df['Gender'] = train_df['Sex'].map( {'female': 0, 'male': 1} ).astype(int) 22 | 23 | # Embarked from 'C', 'Q', 'S' 24 | # Note this is not ideal: in translating categories to numbers, Port "2" is not 2 times greater than Port "1", etc. 
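# [Editor's sketch -- not part of the original script.] The Ports_dict mapping built
# further below is exactly the ordinal encoding the comment above warns about. A
# hypothetical alternative, using only pandas (already imported as pd), is one-hot
# encoding, so no ordering is implied between ports:
#
#   embarked_dummies = pd.get_dummies(train_df['Embarked'], prefix='Embarked')
#   train_df = train_df.join(embarked_dummies)
#
# For a tree-based model like the random forest used here the ordinal encoding is
# usually harmless, which is presumably why the author kept it.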
25 | 26 | # All missing Embarked -> just make them embark from most common place 27 | if len(train_df.Embarked[ train_df.Embarked.isnull() ]) > 0: 28 | train_df.Embarked[ train_df.Embarked.isnull() ] = train_df.Embarked.dropna().mode().values 29 | 30 | Ports = list(enumerate(np.unique(train_df['Embarked']))) # determine all values of Embarked, 31 | Ports_dict = { name : i for i, name in Ports } # set up a dictionary in the form Ports : index 32 | train_df.Embarked = train_df.Embarked.map( lambda x: Ports_dict[x]).astype(int) # Convert all Embark strings to int 33 | 34 | # All the ages with no data -> make the median of all Ages 35 | median_age = train_df['Age'].dropna().median() 36 | if len(train_df.Age[ train_df.Age.isnull() ]) > 0: 37 | train_df.loc[ (train_df.Age.isnull()), 'Age'] = median_age 38 | 39 | # Remove the Name column, Cabin, Ticket, and Sex (since I copied and filled it to Gender) 40 | train_df = train_df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId'], axis=1) 41 | 42 | 43 | # TEST DATA 44 | test_df = pd.read_csv('test.csv', header=0) # Load the test file into a dataframe 45 | 46 | # I need to do the same with the test data now, so that the columns are the same as the training data 47 | # I need to convert all strings to integer classifiers: 48 | # female = 0, Male = 1 49 | test_df['Gender'] = test_df['Sex'].map( {'female': 0, 'male': 1} ).astype(int) 50 | 51 | # Embarked from 'C', 'Q', 'S' 52 | # All missing Embarked -> just make them embark from most common place 53 | if len(test_df.Embarked[ test_df.Embarked.isnull() ]) > 0: 54 | test_df.Embarked[ test_df.Embarked.isnull() ] = test_df.Embarked.dropna().mode().values 55 | # Again convert all Embarked strings to int 56 | test_df.Embarked = test_df.Embarked.map( lambda x: Ports_dict[x]).astype(int) 57 | 58 | 59 | # All the ages with no data -> make the median of all Ages 60 | median_age = test_df['Age'].dropna().median() 61 | if len(test_df.Age[ test_df.Age.isnull() ]) > 0: 62 | test_df.loc[ (test_df.Age.isnull()), 'Age'] = median_age 63 | 64 | # All the missing Fares -> assume median of their respective class 65 | if len(test_df.Fare[ test_df.Fare.isnull() ]) > 0: 66 | median_fare = np.zeros(3) 67 | for f in range(0,3): # loop 0 to 2 68 | median_fare[f] = test_df[ test_df.Pclass == f+1 ]['Fare'].dropna().median() 69 | for f in range(0,3): # loop 0 to 2 70 | test_df.loc[ (test_df.Fare.isnull()) & (test_df.Pclass == f+1 ), 'Fare'] = median_fare[f] 71 | 72 | # Collect the test data's PassengerIds before dropping it 73 | ids = test_df['PassengerId'].values 74 | # Remove the Name column, Cabin, Ticket, and Sex (since I copied and filled it to Gender) 75 | test_df = test_df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId'], axis=1) 76 | 77 | 78 | # The data is now ready to go. So lets fit to the train, then predict to the test! 79 | # Convert back to a numpy array 80 | train_data = train_df.values 81 | test_data = test_df.values 82 | 83 | 84 | print 'Training...' 85 | forest = RandomForestClassifier(n_estimators=100) 86 | forest = forest.fit( train_data[0::,1::], train_data[0::,0] ) 87 | 88 | print 'Predicting...' 89 | output = forest.predict(test_data).astype(int) 90 | 91 | 92 | predictions_file = open("myfirstforest.csv", "wb") 93 | open_file_object = csv.writer(predictions_file) 94 | open_file_object.writerow(["PassengerId","Survived"]) 95 | open_file_object.writerows(zip(ids, output)) 96 | predictions_file.close() 97 | print 'Done.' 
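# [Editor's sketch -- not part of the original script.] A minimal way to sanity-check
# the forest before writing predictions is k-fold cross-validation on the training
# data. This assumes the scikit-learn of this era (sklearn.cross_validation; newer
# releases moved the same helper to sklearn.model_selection):
#
#   from sklearn.cross_validation import cross_val_score
#   cv_scores = cross_val_score(RandomForestClassifier(n_estimators=100),
#                               train_data[0::, 1::], train_data[0::, 0], cv=5)
#   print 'CV accuracy: %.3f +/- %.3f' % (cv_scores.mean(), cv_scores.std())
#
# This only estimates out-of-sample accuracy; it does not change the submission file.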
98 | -------------------------------------------------------------------------------- /scipy/first.py: -------------------------------------------------------------------------------- 1 | """This file contains code used in "Think Stats", 2 | by Allen B. Downey, available from greenteapress.com 3 | 4 | Copyright 2014 Allen B. Downey 5 | License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html 6 | """ 7 | 8 | from __future__ import print_function 9 | 10 | import math 11 | import numpy as np 12 | 13 | import nsfg 14 | import thinkstats2 15 | import thinkplot 16 | 17 | 18 | def MakeFrames(): 19 | """Reads pregnancy data and partitions first babies and others. 20 | 21 | returns: DataFrames (all live births, first babies, others) 22 | """ 23 | preg = nsfg.ReadFemPreg() 24 | 25 | live = preg[preg.outcome == 1] 26 | firsts = live[live.birthord == 1] 27 | others = live[live.birthord != 1] 28 | 29 | assert len(live) == 9148 30 | assert len(firsts) == 4413 31 | assert len(others) == 4735 32 | 33 | return live, firsts, others 34 | 35 | 36 | def Summarize(live, firsts, others): 37 | """Print various summary statistics.""" 38 | 39 | mean = live.prglngth.mean() 40 | var = live.prglngth.var() 41 | std = live.prglngth.std() 42 | 43 | print('Live mean', mean) 44 | print('Live variance', var) 45 | print('Live std', std) 46 | 47 | mean1 = firsts.prglngth.mean() 48 | mean2 = others.prglngth.mean() 49 | 50 | var1 = firsts.prglngth.var() 51 | var2 = others.prglngth.var() 52 | 53 | print('Mean') 54 | print('First babies', mean1) 55 | print('Others', mean2) 56 | 57 | print('Variance') 58 | print('First babies', var1) 59 | print('Others', var2) 60 | 61 | print('Difference in weeks', mean1 - mean2) 62 | print('Difference in hours', (mean1 - mean2) * 7 * 24) 63 | 64 | print('Difference relative to 39 weeks', (mean1 - mean2) / 39 * 100) 65 | 66 | d = thinkstats2.CohenEffectSize(firsts.prglngth, others.prglngth) 67 | print('Cohen d', d) 68 | 69 | 70 | def PrintExtremes(live): 71 | """Plots the histogram of pregnancy lengths and prints the extremes. 
72 | 73 | live: DataFrame of live births 74 | """ 75 | hist = thinkstats2.Hist(live.prglngth) 76 | thinkplot.Hist(hist, label='live births') 77 | 78 | thinkplot.Save(root='first_nsfg_hist_live', 79 | title='Histogram', 80 | xlabel='weeks', 81 | ylabel='frequency') 82 | 83 | print('Shortest lengths:') 84 | for weeks, freq in hist.Smallest(10): 85 | print(weeks, freq) 86 | 87 | print('Longest lengths:') 88 | for weeks, freq in hist.Largest(10): 89 | print(weeks, freq) 90 | 91 | 92 | def MakeHists(live): 93 | """Plot Hists for live births 94 | 95 | live: DataFrame 96 | others: DataFrame 97 | """ 98 | hist = thinkstats2.Hist(live.birthwgt_lb, label='birthwgt_lb') 99 | thinkplot.Hist(hist) 100 | thinkplot.Save(root='first_wgt_lb_hist', 101 | xlabel='pounds', 102 | ylabel='frequency', 103 | axis=[-1, 14, 0, 3200]) 104 | 105 | hist = thinkstats2.Hist(live.birthwgt_oz, label='birthwgt_oz') 106 | thinkplot.Hist(hist) 107 | thinkplot.Save(root='first_wgt_oz_hist', 108 | xlabel='ounces', 109 | ylabel='frequency', 110 | axis=[-1, 16, 0, 1200]) 111 | 112 | hist = thinkstats2.Hist(np.floor(live.agepreg), label='agepreg') 113 | thinkplot.Hist(hist) 114 | thinkplot.Save(root='first_agepreg_hist', 115 | xlabel='years', 116 | ylabel='frequency') 117 | 118 | hist = thinkstats2.Hist(live.prglngth, label='prglngth') 119 | thinkplot.Hist(hist) 120 | thinkplot.Save(root='first_prglngth_hist', 121 | xlabel='weeks', 122 | ylabel='frequency', 123 | axis=[-1, 53, 0, 5000]) 124 | 125 | 126 | def MakeComparison(firsts, others): 127 | """Plots histograms of pregnancy length for first babies and others. 128 | 129 | firsts: DataFrame 130 | others: DataFrame 131 | """ 132 | first_hist = thinkstats2.Hist(firsts.prglngth, label='first') 133 | other_hist = thinkstats2.Hist(others.prglngth, label='other') 134 | 135 | width = 0.45 136 | thinkplot.PrePlot(2) 137 | thinkplot.Hist(first_hist, align='right', width=width) 138 | thinkplot.Hist(other_hist, align='left', width=width) 139 | 140 | thinkplot.Save(root='first_nsfg_hist', 141 | title='Histogram', 142 | xlabel='weeks', 143 | ylabel='frequency', 144 | axis=[27, 46, 0, 2700]) 145 | 146 | 147 | def main(script): 148 | live, firsts, others = MakeFrames() 149 | 150 | MakeHists(live) 151 | PrintExtremes(live) 152 | MakeComparison(firsts, others) 153 | Summarize(live, firsts, others) 154 | 155 | 156 | if __name__ == '__main__': 157 | import sys 158 | main(*sys.argv) 159 | 160 | 161 | -------------------------------------------------------------------------------- /mapreduce/mr_s3_log_parser.py: -------------------------------------------------------------------------------- 1 | 2 | import time 3 | from mrjob.job import MRJob 4 | from mrjob.protocol import RawValueProtocol, ReprProtocol 5 | import re 6 | 7 | 8 | class MrS3LogParser(MRJob): 9 | """Parses the logs from S3 based on the S3 logging format: 10 | http://docs.aws.amazon.com/AmazonS3/latest/dev/LogFormat.html 11 | 12 | Aggregates a user's daily requests by user agent and operation 13 | 14 | Outputs date_time, requester, user_agent, operation, count 15 | """ 16 | 17 | LOGPATS = r'(\S+) (\S+) \[(.*?)\] (\S+) (\S+) ' \ 18 | r'(\S+) (\S+) (\S+) ("([^"]+)"|-) ' \ 19 | r'(\S+) (\S+) (\S+) (\S+) (\S+) (\S+) ' \ 20 | r'("([^"]+)"|-) ("([^"]+)"|-)' 21 | NUM_ENTRIES_PER_LINE = 17 22 | logpat = re.compile(LOGPATS) 23 | 24 | (S3_LOG_BUCKET_OWNER, 25 | S3_LOG_BUCKET, 26 | S3_LOG_DATE_TIME, 27 | S3_LOG_IP, 28 | S3_LOG_REQUESTER_ID, 29 | S3_LOG_REQUEST_ID, 30 | S3_LOG_OPERATION, 31 | S3_LOG_KEY, 32 | S3_LOG_HTTP_METHOD, 33 | 
S3_LOG_HTTP_STATUS, 34 | S3_LOG_S3_ERROR, 35 | S3_LOG_BYTES_SENT, 36 | S3_LOG_OBJECT_SIZE, 37 | S3_LOG_TOTAL_TIME, 38 | S3_LOG_TURN_AROUND_TIME, 39 | S3_LOG_REFERER, 40 | S3_LOG_USER_AGENT) = range(NUM_ENTRIES_PER_LINE) 41 | 42 | DELIMITER = '\t' 43 | 44 | # We use RawValueProtocol for input to be format agnostic 45 | # and avoid any type of parsing errors 46 | INPUT_PROTOCOL = RawValueProtocol 47 | 48 | # We use RawValueProtocol for output so we can output raw lines 49 | # instead of (k, v) pairs 50 | OUTPUT_PROTOCOL = RawValueProtocol 51 | 52 | # Encode the intermediate records using repr() instead of JSON, so the 53 | # record doesn't get Unicode-encoded 54 | INTERNAL_PROTOCOL = ReprProtocol 55 | 56 | def clean_date_time_zone(self, raw_date_time_zone): 57 | """Converts entry 22/Jul/2013:21:04:17 +0000 to the format 58 | 'YYYY-MM-DD HH:MM:SS' which is more suitable for loading into 59 | a database such as Redshift or RDS 60 | 61 | Note: requires the chars "[ ]" to be stripped prior to input 62 | Returns the converted datetime annd timezone 63 | or None for both values if failed 64 | 65 | TODO: Needs to combine timezone with date as one field 66 | """ 67 | date_time = None 68 | time_zone_parsed = None 69 | 70 | # TODO: Probably cleaner to parse this with a regex 71 | date_parsed = raw_date_time_zone[:raw_date_time_zone.find(":")] 72 | time_parsed = raw_date_time_zone[raw_date_time_zone.find(":") + 1: 73 | raw_date_time_zone.find("+") - 1] 74 | time_zone_parsed = raw_date_time_zone[raw_date_time_zone.find("+"):] 75 | 76 | try: 77 | date_struct = time.strptime(date_parsed, "%d/%b/%Y") 78 | converted_date = time.strftime("%Y-%m-%d", date_struct) 79 | date_time = converted_date + " " + time_parsed 80 | 81 | # Throws a ValueError exception if the operation fails that is 82 | # caught by the calling function and is handled appropriately 83 | except ValueError as error: 84 | raise ValueError(error) 85 | else: 86 | return converted_date, date_time, time_zone_parsed 87 | 88 | def mapper(self, _, line): 89 | line = line.strip() 90 | match = self.logpat.search(line) 91 | 92 | date_time = None 93 | requester = None 94 | user_agent = None 95 | operation = None 96 | 97 | try: 98 | for n in range(self.NUM_ENTRIES_PER_LINE): 99 | group = match.group(1 + n) 100 | 101 | if n == self.S3_LOG_DATE_TIME: 102 | date, date_time, time_zone_parsed = \ 103 | self.clean_date_time_zone(group) 104 | # Leave the following line of code if 105 | # you want to aggregate by date 106 | date_time = date + " 00:00:00" 107 | elif n == self.S3_LOG_REQUESTER_ID: 108 | requester = group 109 | elif n == self.S3_LOG_USER_AGENT: 110 | user_agent = group 111 | elif n == self.S3_LOG_OPERATION: 112 | operation = group 113 | else: 114 | pass 115 | 116 | except Exception: 117 | yield (("Error while parsing line: %s", line), 1) 118 | else: 119 | yield ((date_time, requester, user_agent, operation), 1) 120 | 121 | def reducer(self, key, values): 122 | output = list(key) 123 | output = self.DELIMITER.join(output) + \ 124 | self.DELIMITER + \ 125 | str(sum(values)) 126 | 127 | yield None, output 128 | 129 | def steps(self): 130 | return [ 131 | self.mr(mapper=self.mapper, 132 | reducer=self.reducer) 133 | ] 134 | 135 | 136 | if __name__ == '__main__': 137 | MrS3LogParser.run() -------------------------------------------------------------------------------- /python-data/logs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | 
"source": [ 7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Logging in Python\n", 15 | "* Logging with RotatingFileHandler\n", 16 | "* Logging with TimedRotatingFileHandler " 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## Logging with RotatingFileHandler\n", 24 | "\n", 25 | "The logging discussion is taken from the [Python Logging Cookbook](https://docs.python.org/2/howto/logging-cookbook.html#using-file-rotation):\n", 26 | "\n", 27 | "Sometimes you want to let a log file grow to a certain size, then open a new file and log to that. You may want to keep a certain number of these files, and when that many files have been created, rotate the files so that the number of files and the size of the files both remain bounded. For this usage pattern, the logging package provides a RotatingFileHandler.\n", 28 | "\n", 29 | "The most current file is always logging_rotatingfile_example.out, and each time it reaches the size limit it is renamed with the suffix .1. Each of the existing backup files is renamed to increment the suffix (.1 becomes .2, etc.) and the .6 file is erased.\n", 30 | "\n", 31 | "The following code snippet is taken from [here](http://www.blog.pythonlibrary.org/2014/02/11/python-how-to-create-rotating-logs/)." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import logging\n", 43 | "import time\n", 44 | " \n", 45 | "from logging.handlers import RotatingFileHandler\n", 46 | " \n", 47 | "#----------------------------------------------------------------------\n", 48 | "def create_rotating_log(path):\n", 49 | " \"\"\"\n", 50 | " Creates a rotating log\n", 51 | " \"\"\"\n", 52 | " logger = logging.getLogger(\"Rotating Log\")\n", 53 | " logger.setLevel(logging.INFO)\n", 54 | " \n", 55 | " # add a rotating handler\n", 56 | " handler = RotatingFileHandler(path, maxBytes=20,\n", 57 | " backupCount=5)\n", 58 | " logger.addHandler(handler)\n", 59 | " \n", 60 | " for i in range(10):\n", 61 | " logger.info(\"This is test log line %s\" % i)\n", 62 | " time.sleep(1.5)\n", 63 | " \n", 64 | "#----------------------------------------------------------------------\n", 65 | "if __name__ == \"__main__\":\n", 66 | " log_file = \"test.log\"\n", 67 | " create_rotating_log(log_file)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "## Logging with TimedRotatingFileHandler\n", 75 | "\n", 76 | "The following code snippet is taken from [here](http://www.blog.pythonlibrary.org/2014/02/11/python-how-to-create-rotating-logs/)." 
77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "collapsed": false 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "import logging\n", 88 | "import time\n", 89 | " \n", 90 | "from logging.handlers import TimedRotatingFileHandler\n", 91 | " \n", 92 | "#----------------------------------------------------------------------\n", 93 | "def create_timed_rotating_log(path):\n", 94 | " \"\"\"\"\"\"\n", 95 | " logger = logging.getLogger(\"Rotating Log\")\n", 96 | " logger.setLevel(logging.INFO)\n", 97 | " \n", 98 | " # Rotate log based on when parameter:\n", 99 | " # second (s)\n", 100 | " # minute (m)\n", 101 | " # hour (h)\n", 102 | " # day (d)\n", 103 | " # w0-w6 (weekday, 0=Monday)\n", 104 | " # midnight\n", 105 | " handler = TimedRotatingFileHandler(path,\n", 106 | " when=\"m\",\n", 107 | " interval=1,\n", 108 | " backupCount=5)\n", 109 | " logger.addHandler(handler)\n", 110 | " \n", 111 | " for i in range(20):\n", 112 | " logger.info(\"This is a test!\")\n", 113 | " time.sleep(1.5)\n", 114 | " \n", 115 | "#----------------------------------------------------------------------\n", 116 | "if __name__ == \"__main__\":\n", 117 | " log_file = \"timed_test.log\"\n", 118 | " create_timed_rotating_log(log_file)" 119 | ] 120 | } 121 | ], 122 | "metadata": { 123 | "kernelspec": { 124 | "display_name": "Python 2", 125 | "language": "python", 126 | "name": "python2" 127 | }, 128 | "language_info": { 129 | "codemirror_mode": { 130 | "name": "ipython", 131 | "version": 2 132 | }, 133 | "file_extension": ".py", 134 | "mimetype": "text/x-python", 135 | "name": "python", 136 | "nbconvert_exporter": "python", 137 | "pygments_lexer": "ipython2", 138 | "version": "2.7.10" 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 0 143 | } 144 | -------------------------------------------------------------------------------- /python-data/unit_tests.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Nose Unit Tests with IPython Notebook" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Nose\n", 22 | "\n", 23 | "Testing is a vital part of software development. Nose extends unittest to make testing easier." 
24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Install Nose\n", 31 | "\n", 32 | "Run the following command line:" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "!pip install nose" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Create the Code\n", 51 | "\n", 52 | "Save your code to a file with the %%file magic:" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 1, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "Overwriting type_util.py\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "%%file type_util.py\n", 72 | "class TypeUtil:\n", 73 | "\n", 74 | " @classmethod\n", 75 | " def is_iterable(cls, obj):\n", 76 | " \"\"\"Determines if obj is iterable.\n", 77 | "\n", 78 | " Useful when writing functions that can accept multiple types of\n", 79 | " input (list, tuple, ndarray, iterator). Pairs well with\n", 80 | " convert_to_list.\n", 81 | " \"\"\"\n", 82 | " try:\n", 83 | " iter(obj)\n", 84 | " return True\n", 85 | " except TypeError:\n", 86 | " return False\n", 87 | "\n", 88 | " @classmethod\n", 89 | " def convert_to_list(cls, obj):\n", 90 | " \"\"\"Converts obj to a list if it is not a list and it is iterable, \n", 91 | " else returns the original obj.\n", 92 | " \"\"\"\n", 93 | " if not isinstance(obj, list) and cls.is_iterable(obj):\n", 94 | " obj = list(obj)\n", 95 | " return obj\n" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## Create the Nose Tests\n", 103 | "\n", 104 | "Save your test to a file with the %%file magic:" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 2, 110 | "metadata": { 111 | "collapsed": false 112 | }, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "Overwriting tests/test_type_util.py\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "%%file tests/test_type_util.py\n", 124 | "from nose.tools import assert_equal\n", 125 | "from ..type_util import TypeUtil\n", 126 | "\n", 127 | "\n", 128 | "class TestUtil():\n", 129 | "\n", 130 | " def test_is_iterable(self):\n", 131 | " assert_equal(TypeUtil.is_iterable('foo'), True)\n", 132 | " assert_equal(TypeUtil.is_iterable(7), False)\n", 133 | "\n", 134 | " def test_convert_to_list(self):\n", 135 | " assert_equal(isinstance(TypeUtil.convert_to_list('foo'), list), True)\n", 136 | " assert_equal(isinstance(TypeUtil.convert_to_list(7), list), False)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "## Run the Nose Tests\n", 144 | "\n", 145 | "Run the following command line:" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 3, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [ 155 | { 156 | "name": "stdout", 157 | "output_type": "stream", 158 | "text": [ 159 | "core.tests.test_type_util.TestUtil.test_convert_to_list ... ok\r\n", 160 | "core.tests.test_type_util.TestUtil.test_is_iterable ... 
ok\r\n", 161 | "\r\n", 162 | "----------------------------------------------------------------------\r\n", 163 | "Ran 2 tests in 0.001s\r\n", 164 | "\r\n", 165 | "OK\r\n" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "!nosetests tests/test_type_util.py -v" 171 | ] 172 | } 173 | ], 174 | "metadata": { 175 | "kernelspec": { 176 | "display_name": "Python 2", 177 | "language": "python", 178 | "name": "python2" 179 | }, 180 | "language_info": { 181 | "codemirror_mode": { 182 | "name": "ipython", 183 | "version": 2 184 | }, 185 | "file_extension": ".py", 186 | "mimetype": "text/x-python", 187 | "name": "python", 188 | "nbconvert_exporter": "python", 189 | "pygments_lexer": "ipython2", 190 | "version": "2.7.10" 191 | } 192 | }, 193 | "nbformat": 4, 194 | "nbformat_minor": 0 195 | } 196 | -------------------------------------------------------------------------------- /deep-learning/tensor-flow-examples/notebooks/4_multi_gpu/multigpu_basics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Basic Multi GPU Computation in TensorFlow\n", 8 | "\n", 9 | "Credits: Forked from [TensorFlow-Examples](https://github.com/aymericdamien/TensorFlow-Examples) by Aymeric Damien\n", 10 | "\n", 11 | "## Setup\n", 12 | "\n", 13 | "Refer to the [setup instructions](http://nbviewer.ipython.org/github/donnemartin/data-science-ipython-notebooks/blob/master/deep-learning/tensor-flow-examples/Setup_TensorFlow.md)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "source": [ 22 | "This tutorial requires your machine to have 2 GPUs\n", 23 | "* \"/cpu:0\": The CPU of your machine.\n", 24 | "* \"/gpu:0\": The first GPU of your machine\n", 25 | "* \"/gpu:1\": The second GPU of your machine\n", 26 | "* For this example, we are using 2 GTX-980" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import numpy as np\n", 38 | "import tensorflow as tf\n", 39 | "import datetime" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "#Processing Units logs\n", 51 | "log_device_placement = True\n", 52 | "\n", 53 | "#num of multiplications to perform\n", 54 | "n = 10" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "# Example: compute A^n + B^n on 2 GPUs\n", 66 | "\n", 67 | "# Create random large matrix\n", 68 | "A = np.random.rand(1e4, 1e4).astype('float32')\n", 69 | "B = np.random.rand(1e4, 1e4).astype('float32')\n", 70 | "\n", 71 | "# Creates a graph to store results\n", 72 | "c1 = []\n", 73 | "c2 = []\n", 74 | "\n", 75 | "# Define matrix power\n", 76 | "def matpow(M, n):\n", 77 | " if n < 1: #Abstract cases where n < 1\n", 78 | " return M\n", 79 | " else:\n", 80 | " return tf.matmul(M, matpow(M, n-1))" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 6, 86 | "metadata": { 87 | "collapsed": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "# Single GPU computing\n", 92 | "\n", 93 | "with tf.device('/gpu:0'):\n", 94 | " a = tf.constant(A)\n", 95 | " b = tf.constant(B)\n", 96 | " #compute A^n and B^n and store results in c1\n", 97 | " c1.append(matpow(a, n))\n", 98 | 
" c1.append(matpow(b, n))\n", 99 | "\n", 100 | "with tf.device('/cpu:0'):\n", 101 | " sum = tf.add_n(c1) #Addition of all elements in c1, i.e. A^n + B^n\n", 102 | "\n", 103 | "t1_1 = datetime.datetime.now()\n", 104 | "with tf.Session(config=tf.ConfigProto(log_device_placement=log_device_placement)) as sess:\n", 105 | " # Runs the op.\n", 106 | " sess.run(sum)\n", 107 | "t2_1 = datetime.datetime.now()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 7, 113 | "metadata": { 114 | "collapsed": true 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "# Multi GPU computing\n", 119 | "# GPU:0 computes A^n\n", 120 | "with tf.device('/gpu:0'):\n", 121 | " #compute A^n and store result in c2\n", 122 | " a = tf.constant(A)\n", 123 | " c2.append(matpow(a, n))\n", 124 | "\n", 125 | "#GPU:1 computes B^n\n", 126 | "with tf.device('/gpu:1'):\n", 127 | " #compute B^n and store result in c2\n", 128 | " b = tf.constant(B)\n", 129 | " c2.append(matpow(b, n))\n", 130 | "\n", 131 | "with tf.device('/cpu:0'):\n", 132 | " sum = tf.add_n(c2) #Addition of all elements in c2, i.e. A^n + B^n\n", 133 | "\n", 134 | "t1_2 = datetime.datetime.now()\n", 135 | "with tf.Session(config=tf.ConfigProto(log_device_placement=log_device_placement)) as sess:\n", 136 | " # Runs the op.\n", 137 | " sess.run(sum)\n", 138 | "t2_2 = datetime.datetime.now()" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 8, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "Single GPU computation time: 0:00:11.833497\n", 153 | "Multi GPU computation time: 0:00:07.085913\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "print \"Single GPU computation time: \" + str(t2_1-t1_1)\n", 159 | "print \"Multi GPU computation time: \" + str(t2_2-t1_2)" 160 | ] 161 | } 162 | ], 163 | "metadata": { 164 | "kernelspec": { 165 | "display_name": "Python 3", 166 | "language": "python", 167 | "name": "python3" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | "file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.4.3" 180 | } 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 0 184 | } 185 | -------------------------------------------------------------------------------- /data/titanic/gendermodel.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Survived 2 | 892,0 3 | 893,1 4 | 894,0 5 | 895,0 6 | 896,1 7 | 897,0 8 | 898,1 9 | 899,0 10 | 900,1 11 | 901,0 12 | 902,0 13 | 903,0 14 | 904,1 15 | 905,0 16 | 906,1 17 | 907,1 18 | 908,0 19 | 909,0 20 | 910,1 21 | 911,1 22 | 912,0 23 | 913,0 24 | 914,1 25 | 915,0 26 | 916,1 27 | 917,0 28 | 918,1 29 | 919,0 30 | 920,0 31 | 921,0 32 | 922,0 33 | 923,0 34 | 924,1 35 | 925,1 36 | 926,0 37 | 927,0 38 | 928,1 39 | 929,1 40 | 930,0 41 | 931,0 42 | 932,0 43 | 933,0 44 | 934,0 45 | 935,1 46 | 936,1 47 | 937,0 48 | 938,0 49 | 939,0 50 | 940,1 51 | 941,1 52 | 942,0 53 | 943,0 54 | 944,1 55 | 945,1 56 | 946,0 57 | 947,0 58 | 948,0 59 | 949,0 60 | 950,0 61 | 951,1 62 | 952,0 63 | 953,0 64 | 954,0 65 | 955,1 66 | 956,0 67 | 957,1 68 | 958,1 69 | 959,0 70 | 960,0 71 | 961,1 72 | 962,1 73 | 963,0 74 | 964,1 75 | 965,0 76 | 966,1 77 | 967,0 78 | 968,0 79 | 969,1 80 | 970,0 81 | 971,1 82 | 972,0 83 | 973,0 84 | 974,0 85 | 975,0 86 | 
976,0 87 | 977,0 88 | 978,1 89 | 979,1 90 | 980,1 91 | 981,0 92 | 982,1 93 | 983,0 94 | 984,1 95 | 985,0 96 | 986,0 97 | 987,0 98 | 988,1 99 | 989,0 100 | 990,1 101 | 991,0 102 | 992,1 103 | 993,0 104 | 994,0 105 | 995,0 106 | 996,1 107 | 997,0 108 | 998,0 109 | 999,0 110 | 1000,0 111 | 1001,0 112 | 1002,0 113 | 1003,1 114 | 1004,1 115 | 1005,1 116 | 1006,1 117 | 1007,0 118 | 1008,0 119 | 1009,1 120 | 1010,0 121 | 1011,1 122 | 1012,1 123 | 1013,0 124 | 1014,1 125 | 1015,0 126 | 1016,0 127 | 1017,1 128 | 1018,0 129 | 1019,1 130 | 1020,0 131 | 1021,0 132 | 1022,0 133 | 1023,0 134 | 1024,1 135 | 1025,0 136 | 1026,0 137 | 1027,0 138 | 1028,0 139 | 1029,0 140 | 1030,1 141 | 1031,0 142 | 1032,1 143 | 1033,1 144 | 1034,0 145 | 1035,0 146 | 1036,0 147 | 1037,0 148 | 1038,0 149 | 1039,0 150 | 1040,0 151 | 1041,0 152 | 1042,1 153 | 1043,0 154 | 1044,0 155 | 1045,1 156 | 1046,0 157 | 1047,0 158 | 1048,1 159 | 1049,1 160 | 1050,0 161 | 1051,1 162 | 1052,1 163 | 1053,0 164 | 1054,1 165 | 1055,0 166 | 1056,0 167 | 1057,1 168 | 1058,0 169 | 1059,0 170 | 1060,1 171 | 1061,1 172 | 1062,0 173 | 1063,0 174 | 1064,0 175 | 1065,0 176 | 1066,0 177 | 1067,1 178 | 1068,1 179 | 1069,0 180 | 1070,1 181 | 1071,1 182 | 1072,0 183 | 1073,0 184 | 1074,1 185 | 1075,0 186 | 1076,1 187 | 1077,0 188 | 1078,1 189 | 1079,0 190 | 1080,1 191 | 1081,0 192 | 1082,0 193 | 1083,0 194 | 1084,0 195 | 1085,0 196 | 1086,0 197 | 1087,0 198 | 1088,0 199 | 1089,1 200 | 1090,0 201 | 1091,1 202 | 1092,1 203 | 1093,0 204 | 1094,0 205 | 1095,1 206 | 1096,0 207 | 1097,0 208 | 1098,1 209 | 1099,0 210 | 1100,1 211 | 1101,0 212 | 1102,0 213 | 1103,0 214 | 1104,0 215 | 1105,1 216 | 1106,1 217 | 1107,0 218 | 1108,1 219 | 1109,0 220 | 1110,1 221 | 1111,0 222 | 1112,1 223 | 1113,0 224 | 1114,1 225 | 1115,0 226 | 1116,1 227 | 1117,1 228 | 1118,0 229 | 1119,1 230 | 1120,0 231 | 1121,0 232 | 1122,0 233 | 1123,1 234 | 1124,0 235 | 1125,0 236 | 1126,0 237 | 1127,0 238 | 1128,0 239 | 1129,0 240 | 1130,1 241 | 1131,1 242 | 1132,1 243 | 1133,1 244 | 1134,0 245 | 1135,0 246 | 1136,0 247 | 1137,0 248 | 1138,1 249 | 1139,0 250 | 1140,1 251 | 1141,1 252 | 1142,1 253 | 1143,0 254 | 1144,0 255 | 1145,0 256 | 1146,0 257 | 1147,0 258 | 1148,0 259 | 1149,0 260 | 1150,1 261 | 1151,0 262 | 1152,0 263 | 1153,0 264 | 1154,1 265 | 1155,1 266 | 1156,0 267 | 1157,0 268 | 1158,0 269 | 1159,0 270 | 1160,1 271 | 1161,0 272 | 1162,0 273 | 1163,0 274 | 1164,1 275 | 1165,1 276 | 1166,0 277 | 1167,1 278 | 1168,0 279 | 1169,0 280 | 1170,0 281 | 1171,0 282 | 1172,1 283 | 1173,0 284 | 1174,1 285 | 1175,1 286 | 1176,1 287 | 1177,0 288 | 1178,0 289 | 1179,0 290 | 1180,0 291 | 1181,0 292 | 1182,0 293 | 1183,1 294 | 1184,0 295 | 1185,0 296 | 1186,0 297 | 1187,0 298 | 1188,1 299 | 1189,0 300 | 1190,0 301 | 1191,0 302 | 1192,0 303 | 1193,0 304 | 1194,0 305 | 1195,0 306 | 1196,1 307 | 1197,1 308 | 1198,0 309 | 1199,0 310 | 1200,0 311 | 1201,1 312 | 1202,0 313 | 1203,0 314 | 1204,0 315 | 1205,1 316 | 1206,1 317 | 1207,1 318 | 1208,0 319 | 1209,0 320 | 1210,0 321 | 1211,0 322 | 1212,0 323 | 1213,0 324 | 1214,0 325 | 1215,0 326 | 1216,1 327 | 1217,0 328 | 1218,1 329 | 1219,0 330 | 1220,0 331 | 1221,0 332 | 1222,1 333 | 1223,0 334 | 1224,0 335 | 1225,1 336 | 1226,0 337 | 1227,0 338 | 1228,0 339 | 1229,0 340 | 1230,0 341 | 1231,0 342 | 1232,0 343 | 1233,0 344 | 1234,0 345 | 1235,1 346 | 1236,0 347 | 1237,1 348 | 1238,0 349 | 1239,1 350 | 1240,0 351 | 1241,1 352 | 1242,1 353 | 1243,0 354 | 1244,0 355 | 1245,0 356 | 1246,1 357 | 1247,0 358 | 1248,1 359 | 1249,0 360 | 1250,0 361 | 1251,1 362 | 
1252,0 363 | 1253,1 364 | 1254,1 365 | 1255,0 366 | 1256,1 367 | 1257,1 368 | 1258,0 369 | 1259,1 370 | 1260,1 371 | 1261,0 372 | 1262,0 373 | 1263,1 374 | 1264,0 375 | 1265,0 376 | 1266,1 377 | 1267,1 378 | 1268,1 379 | 1269,0 380 | 1270,0 381 | 1271,0 382 | 1272,0 383 | 1273,0 384 | 1274,1 385 | 1275,1 386 | 1276,0 387 | 1277,1 388 | 1278,0 389 | 1279,0 390 | 1280,0 391 | 1281,0 392 | 1282,0 393 | 1283,1 394 | 1284,0 395 | 1285,0 396 | 1286,0 397 | 1287,1 398 | 1288,0 399 | 1289,1 400 | 1290,0 401 | 1291,0 402 | 1292,1 403 | 1293,0 404 | 1294,1 405 | 1295,0 406 | 1296,0 407 | 1297,0 408 | 1298,0 409 | 1299,0 410 | 1300,1 411 | 1301,1 412 | 1302,1 413 | 1303,1 414 | 1304,1 415 | 1305,0 416 | 1306,1 417 | 1307,0 418 | 1308,0 419 | 1309,0 420 | -------------------------------------------------------------------------------- /data/titanic/genderclassmodel.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Survived 2 | 892,0 3 | 893,1 4 | 894,0 5 | 895,0 6 | 896,1 7 | 897,0 8 | 898,1 9 | 899,0 10 | 900,1 11 | 901,0 12 | 902,0 13 | 903,0 14 | 904,1 15 | 905,0 16 | 906,1 17 | 907,1 18 | 908,0 19 | 909,0 20 | 910,1 21 | 911,1 22 | 912,0 23 | 913,0 24 | 914,1 25 | 915,0 26 | 916,1 27 | 917,0 28 | 918,1 29 | 919,0 30 | 920,0 31 | 921,0 32 | 922,0 33 | 923,0 34 | 924,0 35 | 925,0 36 | 926,0 37 | 927,0 38 | 928,1 39 | 929,1 40 | 930,0 41 | 931,0 42 | 932,0 43 | 933,0 44 | 934,0 45 | 935,1 46 | 936,1 47 | 937,0 48 | 938,0 49 | 939,0 50 | 940,1 51 | 941,1 52 | 942,0 53 | 943,0 54 | 944,1 55 | 945,1 56 | 946,0 57 | 947,0 58 | 948,0 59 | 949,0 60 | 950,0 61 | 951,1 62 | 952,0 63 | 953,0 64 | 954,0 65 | 955,1 66 | 956,0 67 | 957,1 68 | 958,1 69 | 959,0 70 | 960,0 71 | 961,1 72 | 962,1 73 | 963,0 74 | 964,1 75 | 965,0 76 | 966,1 77 | 967,0 78 | 968,0 79 | 969,1 80 | 970,0 81 | 971,1 82 | 972,0 83 | 973,0 84 | 974,0 85 | 975,0 86 | 976,0 87 | 977,0 88 | 978,1 89 | 979,1 90 | 980,1 91 | 981,0 92 | 982,1 93 | 983,0 94 | 984,1 95 | 985,0 96 | 986,0 97 | 987,0 98 | 988,1 99 | 989,0 100 | 990,1 101 | 991,0 102 | 992,1 103 | 993,0 104 | 994,0 105 | 995,0 106 | 996,1 107 | 997,0 108 | 998,0 109 | 999,0 110 | 1000,0 111 | 1001,0 112 | 1002,0 113 | 1003,1 114 | 1004,1 115 | 1005,1 116 | 1006,1 117 | 1007,0 118 | 1008,0 119 | 1009,1 120 | 1010,0 121 | 1011,1 122 | 1012,1 123 | 1013,0 124 | 1014,1 125 | 1015,0 126 | 1016,0 127 | 1017,1 128 | 1018,0 129 | 1019,0 130 | 1020,0 131 | 1021,0 132 | 1022,0 133 | 1023,0 134 | 1024,0 135 | 1025,0 136 | 1026,0 137 | 1027,0 138 | 1028,0 139 | 1029,0 140 | 1030,1 141 | 1031,0 142 | 1032,0 143 | 1033,1 144 | 1034,0 145 | 1035,0 146 | 1036,0 147 | 1037,0 148 | 1038,0 149 | 1039,0 150 | 1040,0 151 | 1041,0 152 | 1042,1 153 | 1043,0 154 | 1044,0 155 | 1045,1 156 | 1046,0 157 | 1047,0 158 | 1048,1 159 | 1049,1 160 | 1050,0 161 | 1051,1 162 | 1052,1 163 | 1053,0 164 | 1054,1 165 | 1055,0 166 | 1056,0 167 | 1057,0 168 | 1058,0 169 | 1059,0 170 | 1060,1 171 | 1061,1 172 | 1062,0 173 | 1063,0 174 | 1064,0 175 | 1065,0 176 | 1066,0 177 | 1067,1 178 | 1068,1 179 | 1069,0 180 | 1070,1 181 | 1071,1 182 | 1072,0 183 | 1073,0 184 | 1074,1 185 | 1075,0 186 | 1076,1 187 | 1077,0 188 | 1078,1 189 | 1079,0 190 | 1080,0 191 | 1081,0 192 | 1082,0 193 | 1083,0 194 | 1084,0 195 | 1085,0 196 | 1086,0 197 | 1087,0 198 | 1088,0 199 | 1089,1 200 | 1090,0 201 | 1091,1 202 | 1092,1 203 | 1093,0 204 | 1094,0 205 | 1095,1 206 | 1096,0 207 | 1097,0 208 | 1098,1 209 | 1099,0 210 | 1100,1 211 | 1101,0 212 | 1102,0 213 | 1103,0 214 | 1104,0 215 | 1105,1 216 | 
1106,1 217 | 1107,0 218 | 1108,1 219 | 1109,0 220 | 1110,1 221 | 1111,0 222 | 1112,1 223 | 1113,0 224 | 1114,1 225 | 1115,0 226 | 1116,1 227 | 1117,1 228 | 1118,0 229 | 1119,1 230 | 1120,0 231 | 1121,0 232 | 1122,0 233 | 1123,1 234 | 1124,0 235 | 1125,0 236 | 1126,0 237 | 1127,0 238 | 1128,0 239 | 1129,0 240 | 1130,1 241 | 1131,1 242 | 1132,1 243 | 1133,1 244 | 1134,0 245 | 1135,0 246 | 1136,0 247 | 1137,0 248 | 1138,1 249 | 1139,0 250 | 1140,1 251 | 1141,1 252 | 1142,1 253 | 1143,0 254 | 1144,0 255 | 1145,0 256 | 1146,0 257 | 1147,0 258 | 1148,0 259 | 1149,0 260 | 1150,1 261 | 1151,0 262 | 1152,0 263 | 1153,0 264 | 1154,1 265 | 1155,1 266 | 1156,0 267 | 1157,0 268 | 1158,0 269 | 1159,0 270 | 1160,1 271 | 1161,0 272 | 1162,0 273 | 1163,0 274 | 1164,1 275 | 1165,1 276 | 1166,0 277 | 1167,1 278 | 1168,0 279 | 1169,0 280 | 1170,0 281 | 1171,0 282 | 1172,1 283 | 1173,0 284 | 1174,1 285 | 1175,1 286 | 1176,0 287 | 1177,0 288 | 1178,0 289 | 1179,0 290 | 1180,0 291 | 1181,0 292 | 1182,0 293 | 1183,1 294 | 1184,0 295 | 1185,0 296 | 1186,0 297 | 1187,0 298 | 1188,1 299 | 1189,0 300 | 1190,0 301 | 1191,0 302 | 1192,0 303 | 1193,0 304 | 1194,0 305 | 1195,0 306 | 1196,1 307 | 1197,1 308 | 1198,0 309 | 1199,0 310 | 1200,0 311 | 1201,1 312 | 1202,0 313 | 1203,0 314 | 1204,0 315 | 1205,1 316 | 1206,1 317 | 1207,1 318 | 1208,0 319 | 1209,0 320 | 1210,0 321 | 1211,0 322 | 1212,0 323 | 1213,0 324 | 1214,0 325 | 1215,0 326 | 1216,1 327 | 1217,0 328 | 1218,1 329 | 1219,0 330 | 1220,0 331 | 1221,0 332 | 1222,1 333 | 1223,0 334 | 1224,0 335 | 1225,1 336 | 1226,0 337 | 1227,0 338 | 1228,0 339 | 1229,0 340 | 1230,0 341 | 1231,0 342 | 1232,0 343 | 1233,0 344 | 1234,0 345 | 1235,1 346 | 1236,0 347 | 1237,1 348 | 1238,0 349 | 1239,1 350 | 1240,0 351 | 1241,1 352 | 1242,1 353 | 1243,0 354 | 1244,0 355 | 1245,0 356 | 1246,0 357 | 1247,0 358 | 1248,1 359 | 1249,0 360 | 1250,0 361 | 1251,1 362 | 1252,0 363 | 1253,1 364 | 1254,1 365 | 1255,0 366 | 1256,1 367 | 1257,0 368 | 1258,0 369 | 1259,0 370 | 1260,1 371 | 1261,0 372 | 1262,0 373 | 1263,1 374 | 1264,0 375 | 1265,0 376 | 1266,1 377 | 1267,1 378 | 1268,1 379 | 1269,0 380 | 1270,0 381 | 1271,0 382 | 1272,0 383 | 1273,0 384 | 1274,1 385 | 1275,1 386 | 1276,0 387 | 1277,1 388 | 1278,0 389 | 1279,0 390 | 1280,0 391 | 1281,0 392 | 1282,0 393 | 1283,1 394 | 1284,0 395 | 1285,0 396 | 1286,0 397 | 1287,1 398 | 1288,0 399 | 1289,1 400 | 1290,0 401 | 1291,0 402 | 1292,1 403 | 1293,0 404 | 1294,1 405 | 1295,0 406 | 1296,0 407 | 1297,0 408 | 1298,0 409 | 1299,0 410 | 1300,1 411 | 1301,1 412 | 1302,1 413 | 1303,1 414 | 1304,1 415 | 1305,0 416 | 1306,1 417 | 1307,0 418 | 1308,0 419 | 1309,0 420 | -------------------------------------------------------------------------------- /scikit-learn/fig_code/ML_flow_chart.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tutorial Diagrams 3 | ----------------- 4 | 5 | This script plots the flow-charts used in the scikit-learn tutorials. 
6 | """ 7 | 8 | import numpy as np 9 | import pylab as pl 10 | from matplotlib.patches import Circle, Rectangle, Polygon, Arrow, FancyArrow 11 | 12 | def create_base(box_bg = '#CCCCCC', 13 | arrow1 = '#88CCFF', 14 | arrow2 = '#88FF88', 15 | supervised=True): 16 | fig = pl.figure(figsize=(9, 6), facecolor='w') 17 | ax = pl.axes((0, 0, 1, 1), 18 | xticks=[], yticks=[], frameon=False) 19 | ax.set_xlim(0, 9) 20 | ax.set_ylim(0, 6) 21 | 22 | patches = [Rectangle((0.3, 3.6), 1.5, 1.8, zorder=1, fc=box_bg), 23 | Rectangle((0.5, 3.8), 1.5, 1.8, zorder=2, fc=box_bg), 24 | Rectangle((0.7, 4.0), 1.5, 1.8, zorder=3, fc=box_bg), 25 | 26 | Rectangle((2.9, 3.6), 0.2, 1.8, fc=box_bg), 27 | Rectangle((3.1, 3.8), 0.2, 1.8, fc=box_bg), 28 | Rectangle((3.3, 4.0), 0.2, 1.8, fc=box_bg), 29 | 30 | Rectangle((0.3, 0.2), 1.5, 1.8, fc=box_bg), 31 | 32 | Rectangle((2.9, 0.2), 0.2, 1.8, fc=box_bg), 33 | 34 | Circle((5.5, 3.5), 1.0, fc=box_bg), 35 | 36 | Polygon([[5.5, 1.7], 37 | [6.1, 1.1], 38 | [5.5, 0.5], 39 | [4.9, 1.1]], fc=box_bg), 40 | 41 | FancyArrow(2.3, 4.6, 0.35, 0, fc=arrow1, 42 | width=0.25, head_width=0.5, head_length=0.2), 43 | 44 | FancyArrow(3.75, 4.2, 0.5, -0.2, fc=arrow1, 45 | width=0.25, head_width=0.5, head_length=0.2), 46 | 47 | FancyArrow(5.5, 2.4, 0, -0.4, fc=arrow1, 48 | width=0.25, head_width=0.5, head_length=0.2), 49 | 50 | FancyArrow(2.0, 1.1, 0.5, 0, fc=arrow2, 51 | width=0.25, head_width=0.5, head_length=0.2), 52 | 53 | FancyArrow(3.3, 1.1, 1.3, 0, fc=arrow2, 54 | width=0.25, head_width=0.5, head_length=0.2), 55 | 56 | FancyArrow(6.2, 1.1, 0.8, 0, fc=arrow2, 57 | width=0.25, head_width=0.5, head_length=0.2)] 58 | 59 | if supervised: 60 | patches += [Rectangle((0.3, 2.4), 1.5, 0.5, zorder=1, fc=box_bg), 61 | Rectangle((0.5, 2.6), 1.5, 0.5, zorder=2, fc=box_bg), 62 | Rectangle((0.7, 2.8), 1.5, 0.5, zorder=3, fc=box_bg), 63 | FancyArrow(2.3, 2.9, 2.0, 0, fc=arrow1, 64 | width=0.25, head_width=0.5, head_length=0.2), 65 | Rectangle((7.3, 0.85), 1.5, 0.5, fc=box_bg)] 66 | else: 67 | patches += [Rectangle((7.3, 0.2), 1.5, 1.8, fc=box_bg)] 68 | 69 | for p in patches: 70 | ax.add_patch(p) 71 | 72 | pl.text(1.45, 4.9, "Training\nText,\nDocuments,\nImages,\netc.", 73 | ha='center', va='center', fontsize=14) 74 | 75 | pl.text(3.6, 4.9, "Feature\nVectors", 76 | ha='left', va='center', fontsize=14) 77 | 78 | pl.text(5.5, 3.5, "Machine\nLearning\nAlgorithm", 79 | ha='center', va='center', fontsize=14) 80 | 81 | pl.text(1.05, 1.1, "New Text,\nDocument,\nImage,\netc.", 82 | ha='center', va='center', fontsize=14) 83 | 84 | pl.text(3.3, 1.7, "Feature\nVector", 85 | ha='left', va='center', fontsize=14) 86 | 87 | pl.text(5.5, 1.1, "Predictive\nModel", 88 | ha='center', va='center', fontsize=12) 89 | 90 | if supervised: 91 | pl.text(1.45, 3.05, "Labels", 92 | ha='center', va='center', fontsize=14) 93 | 94 | pl.text(8.05, 1.1, "Expected\nLabel", 95 | ha='center', va='center', fontsize=14) 96 | pl.text(8.8, 5.8, "Supervised Learning Model", 97 | ha='right', va='top', fontsize=18) 98 | 99 | else: 100 | pl.text(8.05, 1.1, 101 | "Likelihood\nor Cluster ID\nor Better\nRepresentation", 102 | ha='center', va='center', fontsize=12) 103 | pl.text(8.8, 5.8, "Unsupervised Learning Model", 104 | ha='right', va='top', fontsize=18) 105 | 106 | 107 | 108 | def plot_supervised_chart(annotate=False): 109 | create_base(supervised=True) 110 | if annotate: 111 | fontdict = dict(color='r', weight='bold', size=14) 112 | pl.text(1.9, 4.55, 'X = vec.fit_transform(input)', 113 | fontdict=fontdict, 114 | rotation=20, ha='left', 
va='bottom') 115 | pl.text(3.7, 3.2, 'clf.fit(X, y)', 116 | fontdict=fontdict, 117 | rotation=20, ha='left', va='bottom') 118 | pl.text(1.7, 1.5, 'X_new = vec.transform(input)', 119 | fontdict=fontdict, 120 | rotation=20, ha='left', va='bottom') 121 | pl.text(6.1, 1.5, 'y_new = clf.predict(X_new)', 122 | fontdict=fontdict, 123 | rotation=20, ha='left', va='bottom') 124 | 125 | def plot_unsupervised_chart(): 126 | create_base(supervised=False) 127 | 128 | 129 | if __name__ == '__main__': 130 | plot_supervised_chart(False) 131 | plot_supervised_chart(True) 132 | plot_unsupervised_chart() 133 | pl.show() 134 | 135 | 136 | -------------------------------------------------------------------------------- /spark/hdfs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# HDFS" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Run an HDFS command:" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "collapsed": false 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "!hdfs" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "Run a file system command on the file systems (FsShell):" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "!hdfs dfs" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "List the user's home directory:" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "!hdfs dfs -ls" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "List the HDFS root directory:" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "!hdfs dfs -ls /" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Copy a local file to the user's directory on HDFS:" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "!hdfs dfs -put file.txt file.txt" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "Display the contents of the specified HDFS file:" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "!hdfs dfs -cat file.txt" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "Print the last 10 lines of the file to the terminal:" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "!hdfs dfs -cat file.txt | tail -n 10" 141 | ] 
142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "View a directory and all of its files:" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "collapsed": false 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "!hdfs dfs -cat dir/* | less" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "Copy an HDFS file to local:" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "collapsed": false 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "!hdfs dfs -get file.txt file.txt" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "Create a directory on HDFS:" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": false 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "!hdfs dfs -mkdir dir" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "Recursively delete the specified directory and all of its contents:" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": { 208 | "collapsed": false 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "!hdfs dfs -rm -r dir" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "Specify HDFS file in Spark (paths are relative to the user's home HDFS directory):" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": { 226 | "collapsed": false 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "data = sc.textFile (\"hdfs://hdfs-host:port/path/file.txt\")" 231 | ] 232 | } 233 | ], 234 | "metadata": { 235 | "kernelspec": { 236 | "display_name": "Python 2", 237 | "language": "python", 238 | "name": "python2" 239 | }, 240 | "language_info": { 241 | "codemirror_mode": { 242 | "name": "ipython", 243 | "version": 2 244 | }, 245 | "file_extension": ".py", 246 | "mimetype": "text/x-python", 247 | "name": "python", 248 | "nbconvert_exporter": "python", 249 | "pygments_lexer": "ipython2", 250 | "version": "2.7.10" 251 | } 252 | }, 253 | "nbformat": 4, 254 | "nbformat_minor": 0 255 | } 256 | -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/intro_theano/utils.py: -------------------------------------------------------------------------------- 1 | """ This file contains different utility functions that are not connected 2 | in anyway to the networks presented in the tutorials, but rather help in 3 | processing the outputs into a more understandable way. 4 | 5 | For example ``tile_raster_images`` helps in generating a easy to grasp 6 | image from a set of samples or weights. 7 | """ 8 | 9 | 10 | import numpy 11 | from six.moves import xrange 12 | 13 | 14 | def scale_to_unit_interval(ndar, eps=1e-8): 15 | """ Scales all values in the ndarray ndar to be between 0 and 1 """ 16 | ndar = ndar.copy() 17 | ndar -= ndar.min() 18 | ndar *= 1.0 / (ndar.max() + eps) 19 | return ndar 20 | 21 | 22 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0), 23 | scale_rows_to_unit_interval=True, 24 | output_pixel_vals=True): 25 | """ 26 | Transform an array with one flattened image per row, into an array in 27 | which images are reshaped and layed out like tiles on a floor. 
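A minimal usage sketch (the array shapes here are illustrative, not taken from any tutorial): tiling 100 flattened 28x28 images into a 10x10 grid with one pixel of spacing yields a 289x289 uint8 array: >>> import numpy >>> samples = numpy.random.rand(100, 28 * 28) >>> tiled = tile_raster_images(samples, img_shape=(28, 28), tile_shape=(10, 10), tile_spacing=(1, 1)) >>> tiled.shape (289, 289)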
28 | 29 | This function is useful for visualizing datasets whose rows are images, 30 | and also columns of matrices for transforming those rows 31 | (such as the first layer of a neural net). 32 | 33 | :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can 34 | be 2-D ndarrays or None; 35 | :param X: a 2-D array in which every row is a flattened image. 36 | 37 | :type img_shape: tuple; (height, width) 38 | :param img_shape: the original shape of each image 39 | 40 | :type tile_shape: tuple; (rows, cols) 41 | :param tile_shape: the number of images to tile (rows, cols) 42 | 43 | :param output_pixel_vals: if output should be pixel values (i.e. int8 44 | values) or floats 45 | 46 | :param scale_rows_to_unit_interval: if the values need to be scaled before 47 | being plotted to [0,1] or not 48 | 49 | 50 | :returns: array suitable for viewing as an image. 51 | (See:`Image.fromarray`.) 52 | :rtype: a 2-d array with same dtype as X. 53 | 54 | """ 55 | 56 | assert len(img_shape) == 2 57 | assert len(tile_shape) == 2 58 | assert len(tile_spacing) == 2 59 | 60 | # The expression below can be re-written in a more C style as 61 | # follows : 62 | # 63 | # out_shape = [0,0] 64 | # out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] - 65 | # tile_spacing[0] 66 | # out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] - 67 | # tile_spacing[1] 68 | out_shape = [ 69 | (ishp + tsp) * tshp - tsp 70 | for ishp, tshp, tsp in zip(img_shape, tile_shape, tile_spacing) 71 | ] 72 | 73 | if isinstance(X, tuple): 74 | assert len(X) == 4 75 | # Create an output numpy ndarray to store the image 76 | if output_pixel_vals: 77 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), 78 | dtype='uint8') 79 | else: 80 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), 81 | dtype=X.dtype) 82 | 83 | #colors default to 0, alpha defaults to 1 (opaque) 84 | if output_pixel_vals: 85 | channel_defaults = [0, 0, 0, 255] 86 | else: 87 | channel_defaults = [0., 0., 0., 1.] 
88 | 89 | for i in xrange(4): 90 | if X[i] is None: 91 | # if channel is None, fill it with zeros of the correct 92 | # dtype 93 | dt = out_array.dtype 94 | if output_pixel_vals: 95 | dt = 'uint8' 96 | out_array[:, :, i] = numpy.zeros( 97 | out_shape, 98 | dtype=dt 99 | ) + channel_defaults[i] 100 | else: 101 | # use a recurrent call to compute the channel and store it 102 | # in the output 103 | out_array[:, :, i] = tile_raster_images( 104 | X[i], img_shape, tile_shape, tile_spacing, 105 | scale_rows_to_unit_interval, output_pixel_vals) 106 | return out_array 107 | 108 | else: 109 | # if we are dealing with only one channel 110 | H, W = img_shape 111 | Hs, Ws = tile_spacing 112 | 113 | # generate a matrix to store the output 114 | dt = X.dtype 115 | if output_pixel_vals: 116 | dt = 'uint8' 117 | out_array = numpy.zeros(out_shape, dtype=dt) 118 | 119 | for tile_row in xrange(tile_shape[0]): 120 | for tile_col in xrange(tile_shape[1]): 121 | if tile_row * tile_shape[1] + tile_col < X.shape[0]: 122 | this_x = X[tile_row * tile_shape[1] + tile_col] 123 | if scale_rows_to_unit_interval: 124 | # if we should scale values to be between 0 and 1 125 | # do this by calling the `scale_to_unit_interval` 126 | # function 127 | this_img = scale_to_unit_interval( 128 | this_x.reshape(img_shape)) 129 | else: 130 | this_img = this_x.reshape(img_shape) 131 | # add the slice to the corresponding position in the 132 | # output array 133 | c = 1 134 | if output_pixel_vals: 135 | c = 255 136 | out_array[ 137 | tile_row * (H + Hs): tile_row * (H + Hs) + H, 138 | tile_col * (W + Ws): tile_col * (W + Ws) + W 139 | ] = this_img * c 140 | return out_array 141 | -------------------------------------------------------------------------------- /data/titanic/results-rf.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Survived 2 | 892,0.0 3 | 893,0.0 4 | 894,0.0 5 | 895,1.0 6 | 896,1.0 7 | 897,0.0 8 | 898,0.0 9 | 899,0.0 10 | 900,1.0 11 | 901,0.0 12 | 902,0.0 13 | 903,0.0 14 | 904,1.0 15 | 905,0.0 16 | 906,1.0 17 | 907,1.0 18 | 908,0.0 19 | 909,1.0 20 | 910,1.0 21 | 911,1.0 22 | 912,0.0 23 | 913,1.0 24 | 914,1.0 25 | 915,1.0 26 | 916,1.0 27 | 917,0.0 28 | 918,1.0 29 | 919,1.0 30 | 920,1.0 31 | 921,0.0 32 | 922,0.0 33 | 923,0.0 34 | 924,1.0 35 | 925,0.0 36 | 926,1.0 37 | 927,1.0 38 | 928,0.0 39 | 929,0.0 40 | 930,0.0 41 | 931,1.0 42 | 932,0.0 43 | 933,1.0 44 | 934,0.0 45 | 935,1.0 46 | 936,1.0 47 | 937,0.0 48 | 938,1.0 49 | 939,0.0 50 | 940,1.0 51 | 941,1.0 52 | 942,0.0 53 | 943,0.0 54 | 944,1.0 55 | 945,1.0 56 | 946,0.0 57 | 947,0.0 58 | 948,0.0 59 | 949,0.0 60 | 950,0.0 61 | 951,1.0 62 | 952,0.0 63 | 953,0.0 64 | 954,0.0 65 | 955,1.0 66 | 956,1.0 67 | 957,1.0 68 | 958,1.0 69 | 959,0.0 70 | 960,0.0 71 | 961,1.0 72 | 962,1.0 73 | 963,0.0 74 | 964,0.0 75 | 965,0.0 76 | 966,1.0 77 | 967,0.0 78 | 968,0.0 79 | 969,1.0 80 | 970,0.0 81 | 971,1.0 82 | 972,1.0 83 | 973,0.0 84 | 974,0.0 85 | 975,0.0 86 | 976,0.0 87 | 977,0.0 88 | 978,1.0 89 | 979,0.0 90 | 980,0.0 91 | 981,1.0 92 | 982,1.0 93 | 983,0.0 94 | 984,1.0 95 | 985,0.0 96 | 986,0.0 97 | 987,0.0 98 | 988,1.0 99 | 989,0.0 100 | 990,0.0 101 | 991,0.0 102 | 992,1.0 103 | 993,0.0 104 | 994,0.0 105 | 995,0.0 106 | 996,1.0 107 | 997,0.0 108 | 998,0.0 109 | 999,0.0 110 | 1000,0.0 111 | 1001,0.0 112 | 1002,0.0 113 | 1003,0.0 114 | 1004,1.0 115 | 1005,0.0 116 | 1006,1.0 117 | 1007,0.0 118 | 1008,0.0 119 | 1009,1.0 120 | 1010,0.0 121 | 1011,1.0 122 | 1012,1.0 123 | 1013,0.0 124 | 1014,1.0 125 | 1015,0.0 126 | 1016,0.0 127 | 
1017,1.0 128 | 1018,0.0 129 | 1019,0.0 130 | 1020,0.0 131 | 1021,0.0 132 | 1022,1.0 133 | 1023,0.0 134 | 1024,0.0 135 | 1025,0.0 136 | 1026,0.0 137 | 1027,0.0 138 | 1028,0.0 139 | 1029,0.0 140 | 1030,0.0 141 | 1031,0.0 142 | 1032,0.0 143 | 1033,1.0 144 | 1034,0.0 145 | 1035,0.0 146 | 1036,1.0 147 | 1037,0.0 148 | 1038,0.0 149 | 1039,0.0 150 | 1040,1.0 151 | 1041,0.0 152 | 1042,1.0 153 | 1043,0.0 154 | 1044,0.0 155 | 1045,1.0 156 | 1046,0.0 157 | 1047,0.0 158 | 1048,1.0 159 | 1049,0.0 160 | 1050,1.0 161 | 1051,1.0 162 | 1052,0.0 163 | 1053,1.0 164 | 1054,1.0 165 | 1055,0.0 166 | 1056,0.0 167 | 1057,1.0 168 | 1058,0.0 169 | 1059,0.0 170 | 1060,1.0 171 | 1061,0.0 172 | 1062,0.0 173 | 1063,0.0 174 | 1064,0.0 175 | 1065,0.0 176 | 1066,0.0 177 | 1067,1.0 178 | 1068,1.0 179 | 1069,0.0 180 | 1070,1.0 181 | 1071,1.0 182 | 1072,0.0 183 | 1073,0.0 184 | 1074,1.0 185 | 1075,0.0 186 | 1076,1.0 187 | 1077,0.0 188 | 1078,1.0 189 | 1079,0.0 190 | 1080,0.0 191 | 1081,0.0 192 | 1082,0.0 193 | 1083,0.0 194 | 1084,1.0 195 | 1085,0.0 196 | 1086,1.0 197 | 1087,0.0 198 | 1088,1.0 199 | 1089,0.0 200 | 1090,0.0 201 | 1091,0.0 202 | 1092,0.0 203 | 1093,1.0 204 | 1094,0.0 205 | 1095,1.0 206 | 1096,0.0 207 | 1097,0.0 208 | 1098,0.0 209 | 1099,0.0 210 | 1100,1.0 211 | 1101,0.0 212 | 1102,0.0 213 | 1103,0.0 214 | 1104,0.0 215 | 1105,1.0 216 | 1106,0.0 217 | 1107,0.0 218 | 1108,0.0 219 | 1109,0.0 220 | 1110,1.0 221 | 1111,0.0 222 | 1112,1.0 223 | 1113,0.0 224 | 1114,1.0 225 | 1115,1.0 226 | 1116,0.0 227 | 1117,1.0 228 | 1118,0.0 229 | 1119,0.0 230 | 1120,0.0 231 | 1121,0.0 232 | 1122,0.0 233 | 1123,1.0 234 | 1124,0.0 235 | 1125,0.0 236 | 1126,1.0 237 | 1127,0.0 238 | 1128,0.0 239 | 1129,1.0 240 | 1130,1.0 241 | 1131,1.0 242 | 1132,0.0 243 | 1133,1.0 244 | 1134,0.0 245 | 1135,0.0 246 | 1136,0.0 247 | 1137,0.0 248 | 1138,1.0 249 | 1139,0.0 250 | 1140,1.0 251 | 1141,0.0 252 | 1142,1.0 253 | 1143,0.0 254 | 1144,0.0 255 | 1145,0.0 256 | 1146,0.0 257 | 1147,0.0 258 | 1148,0.0 259 | 1149,0.0 260 | 1150,1.0 261 | 1151,0.0 262 | 1152,0.0 263 | 1153,0.0 264 | 1154,1.0 265 | 1155,1.0 266 | 1156,0.0 267 | 1157,0.0 268 | 1158,0.0 269 | 1159,0.0 270 | 1160,0.0 271 | 1161,0.0 272 | 1162,0.0 273 | 1163,0.0 274 | 1164,1.0 275 | 1165,0.0 276 | 1166,0.0 277 | 1167,1.0 278 | 1168,0.0 279 | 1169,0.0 280 | 1170,0.0 281 | 1171,0.0 282 | 1172,0.0 283 | 1173,1.0 284 | 1174,0.0 285 | 1175,0.0 286 | 1176,1.0 287 | 1177,0.0 288 | 1178,0.0 289 | 1179,0.0 290 | 1180,0.0 291 | 1181,0.0 292 | 1182,0.0 293 | 1183,0.0 294 | 1184,0.0 295 | 1185,0.0 296 | 1186,0.0 297 | 1187,0.0 298 | 1188,1.0 299 | 1189,0.0 300 | 1190,0.0 301 | 1191,0.0 302 | 1192,0.0 303 | 1193,0.0 304 | 1194,0.0 305 | 1195,0.0 306 | 1196,0.0 307 | 1197,1.0 308 | 1198,1.0 309 | 1199,1.0 310 | 1200,0.0 311 | 1201,0.0 312 | 1202,0.0 313 | 1203,1.0 314 | 1204,0.0 315 | 1205,0.0 316 | 1206,1.0 317 | 1207,1.0 318 | 1208,0.0 319 | 1209,0.0 320 | 1210,0.0 321 | 1211,0.0 322 | 1212,0.0 323 | 1213,0.0 324 | 1214,0.0 325 | 1215,1.0 326 | 1216,1.0 327 | 1217,0.0 328 | 1218,1.0 329 | 1219,0.0 330 | 1220,0.0 331 | 1221,0.0 332 | 1222,1.0 333 | 1223,1.0 334 | 1224,0.0 335 | 1225,1.0 336 | 1226,0.0 337 | 1227,0.0 338 | 1228,1.0 339 | 1229,0.0 340 | 1230,0.0 341 | 1231,0.0 342 | 1232,0.0 343 | 1233,0.0 344 | 1234,0.0 345 | 1235,1.0 346 | 1236,0.0 347 | 1237,0.0 348 | 1238,0.0 349 | 1239,1.0 350 | 1240,0.0 351 | 1241,1.0 352 | 1242,1.0 353 | 1243,0.0 354 | 1244,0.0 355 | 1245,0.0 356 | 1246,1.0 357 | 1247,0.0 358 | 1248,1.0 359 | 1249,0.0 360 | 1250,0.0 361 | 1251,1.0 362 | 1252,0.0 363 | 1253,1.0 364 | 
1254,1.0 365 | 1255,1.0 366 | 1256,1.0 367 | 1257,0.0 368 | 1258,0.0 369 | 1259,0.0 370 | 1260,1.0 371 | 1261,1.0 372 | 1262,0.0 373 | 1263,1.0 374 | 1264,0.0 375 | 1265,0.0 376 | 1266,1.0 377 | 1267,1.0 378 | 1268,0.0 379 | 1269,0.0 380 | 1270,0.0 381 | 1271,0.0 382 | 1272,0.0 383 | 1273,0.0 384 | 1274,1.0 385 | 1275,1.0 386 | 1276,0.0 387 | 1277,1.0 388 | 1278,0.0 389 | 1279,0.0 390 | 1280,0.0 391 | 1281,0.0 392 | 1282,0.0 393 | 1283,1.0 394 | 1284,0.0 395 | 1285,0.0 396 | 1286,0.0 397 | 1287,1.0 398 | 1288,0.0 399 | 1289,1.0 400 | 1290,0.0 401 | 1291,0.0 402 | 1292,1.0 403 | 1293,0.0 404 | 1294,1.0 405 | 1295,0.0 406 | 1296,0.0 407 | 1297,0.0 408 | 1298,0.0 409 | 1299,0.0 410 | 1300,0.0 411 | 1301,1.0 412 | 1302,0.0 413 | 1303,1.0 414 | 1304,0.0 415 | 1305,0.0 416 | 1306,1.0 417 | 1307,0.0 418 | 1308,0.0 419 | 1309,0.0 420 | -------------------------------------------------------------------------------- /data/titanic/genderclassmodel.py: -------------------------------------------------------------------------------- 1 | """ Now that the user can read in a file this creates a model which uses the price, class and gender 2 | Author : AstroDave 3 | Date : 18th September 2012 4 | Revised : 28 March 2014 5 | 6 | """ 7 | 8 | 9 | import csv as csv 10 | import numpy as np 11 | 12 | csv_file_object = csv.reader(open('train.csv', 'rb')) # Load in the csv file 13 | header = csv_file_object.next() # Skip the fist line as it is a header 14 | data=[] # Create a variable to hold the data 15 | 16 | for row in csv_file_object: # Skip through each row in the csv file 17 | data.append(row) # adding each row to the data variable 18 | data = np.array(data) # Then convert from a list to an array 19 | 20 | # In order to analyse the price column I need to bin up that data 21 | # here are my binning parameters, the problem we face is some of the fares are very large 22 | # So we can either have a lot of bins with nothing in them or we can just lose some 23 | # information by just considering that anythng over 39 is simply in the last bin. 24 | # So we add a ceiling 25 | fare_ceiling = 40 26 | # then modify the data in the Fare column to = 39, if it is greater or equal to the ceiling 27 | data[ data[0::,9].astype(np.float) >= fare_ceiling, 9 ] = fare_ceiling - 1.0 28 | 29 | fare_bracket_size = 10 30 | number_of_price_brackets = fare_ceiling / fare_bracket_size 31 | number_of_classes = 3 # I know there were 1st, 2nd and 3rd classes on board. 32 | number_of_classes = len(np.unique(data[0::,2])) # But it's better practice to calculate this from the Pclass directly: 33 | # just take the length of an array of UNIQUE values in column index 2 34 | 35 | 36 | # This reference matrix will show the proportion of survivors as a sorted table of 37 | # gender, class and ticket fare. 
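# (Illustrative note: with fare_ceiling = 40 and fare_bracket_size = 10 there are 4 fare bins, so the survival_table built below has shape (2, 3, 4), indexed as [sex, class, fare bin] with sex 0 = female and 1 = male; e.g. a 2nd-class female whose fare is 35 is looked up at survival_table[0, 1, 3].)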
38 | # First initialize it with all zeros 39 | survival_table = np.zeros([2,number_of_classes,number_of_price_brackets],float) 40 | 41 | # I can now find the stats of all the women and men on board 42 | for i in xrange(number_of_classes): 43 | for j in xrange(number_of_price_brackets): 44 | 45 | women_only_stats = data[ (data[0::,4] == "female") \ 46 | & (data[0::,2].astype(np.float) == i+1) \ 47 | & (data[0:,9].astype(np.float) >= j*fare_bracket_size) \ 48 | & (data[0:,9].astype(np.float) < (j+1)*fare_bracket_size), 1] 49 | 50 | men_only_stats = data[ (data[0::,4] != "female") \ 51 | & (data[0::,2].astype(np.float) == i+1) \ 52 | & (data[0:,9].astype(np.float) >= j*fare_bracket_size) \ 53 | & (data[0:,9].astype(np.float) < (j+1)*fare_bracket_size), 1] 54 | 55 | #if i == 0 and j == 3: 56 | 57 | survival_table[0,i,j] = np.mean(women_only_stats.astype(np.float)) # Female stats 58 | survival_table[1,i,j] = np.mean(men_only_stats.astype(np.float)) # Male stats 59 | 60 | # Since in Python taking the mean of an empty array (where the denominator is 0) returns nan, 61 | # we can convert these to 0 by finding where the array does not equal itself (nan != nan) 62 | # and setting those entries to 0. 63 | survival_table[ survival_table != survival_table ] = 0. 64 | 65 | # Now I have my proportions of survivors; simply round them such that if < 0.5 66 | # I predict they don't survive, and if >= 0.5 they do 67 | survival_table[ survival_table < 0.5 ] = 0 68 | survival_table[ survival_table >= 0.5 ] = 1 69 | 70 | # Now that I have my indicator I can read in the test file and write out 71 | # if a woman then survived (1), if a man then did not survive (0) 72 | # First read in the test file 73 | test_file = open('test.csv', 'rb') 74 | test_file_object = csv.reader(test_file) 75 | header = test_file_object.next() 76 | 77 | # Also open a new file so I can write to it.
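# (Note on the binning loop below: a test row with no recorded fare cannot be converted to a float, so it falls back to a class-based bin, bin_fare = 3 - Pclass: 1st class -> bin 2, 2nd class -> bin 1, 3rd class -> bin 0.)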
78 | predictions_file = open("genderclassmodel.csv", "wb") 79 | predictions_file_object = csv.writer(predictions_file) 80 | predictions_file_object.writerow(["PassengerId", "Survived"]) 81 | 82 | # First thing to do is bin up the price file 83 | for row in test_file_object: 84 | for j in xrange(number_of_price_brackets): 85 | # If there is no fare then place the price of the ticket according to class 86 | try: 87 | row[8] = float(row[8]) # No fare recorded will come up as a string so 88 | # try to make it a float 89 | except: # If fails then just bin the fare according to the class 90 | bin_fare = 3 - float(row[1]) 91 | break # Break from the loop and move to the next row 92 | if row[8] > fare_ceiling: # Otherwise now test to see if it is higher 93 | # than the fare ceiling we set earlier 94 | bin_fare = number_of_price_brackets - 1 95 | break # And then break to the next row 96 | 97 | if row[8] >= j*fare_bracket_size\ 98 | and row[8] < (j+1)*fare_bracket_size: # If passed these tests then loop through 99 | # each bin until you find the right one 100 | # append it to the bin_fare 101 | # and move to the next loop 102 | bin_fare = j 103 | break 104 | # Now I have the binned fare, passenger class, and whether female or male, we can 105 | # just cross ref their details with our survival table 106 | if row[3] == 'female': 107 | predictions_file_object.writerow([row[0], "%d" % int(survival_table[ 0, float(row[1]) - 1, bin_fare ])]) 108 | else: 109 | predictions_file_object.writerow([row[0], "%d" % int(survival_table[ 1, float(row[1]) - 1, bin_fare])]) 110 | 111 | # Close out the files 112 | test_file.close() 113 | predictions_file.close() -------------------------------------------------------------------------------- /deep-learning/tensor-flow-examples/input_data.py: -------------------------------------------------------------------------------- 1 | """Functions for downloading and reading MNIST data.""" 2 | from __future__ import print_function 3 | import gzip 4 | import os 5 | import urllib 6 | import numpy 7 | SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/' 8 | def maybe_download(filename, work_directory): 9 | """Download the data from Yann's website, unless it's already here.""" 10 | if not os.path.exists(work_directory): 11 | os.mkdir(work_directory) 12 | filepath = os.path.join(work_directory, filename) 13 | if not os.path.exists(filepath): 14 | filepath, _ = urllib.urlretrieve(SOURCE_URL + filename, filepath) 15 | statinfo = os.stat(filepath) 16 | print('Succesfully downloaded', filename, statinfo.st_size, 'bytes.') 17 | return filepath 18 | def _read32(bytestream): 19 | dt = numpy.dtype(numpy.uint32).newbyteorder('>') 20 | return numpy.frombuffer(bytestream.read(4), dtype=dt) 21 | def extract_images(filename): 22 | """Extract the images into a 4D uint8 numpy array [index, y, x, depth].""" 23 | print('Extracting', filename) 24 | with gzip.open(filename) as bytestream: 25 | magic = _read32(bytestream) 26 | if magic != 2051: 27 | raise ValueError( 28 | 'Invalid magic number %d in MNIST image file: %s' % 29 | (magic, filename)) 30 | num_images = _read32(bytestream) 31 | rows = _read32(bytestream) 32 | cols = _read32(bytestream) 33 | buf = bytestream.read(rows * cols * num_images) 34 | data = numpy.frombuffer(buf, dtype=numpy.uint8) 35 | data = data.reshape(num_images, rows, cols, 1) 36 | return data 37 | def dense_to_one_hot(labels_dense, num_classes=10): 38 | """Convert class labels from scalars to one-hot vectors.""" 39 | num_labels = labels_dense.shape[0] 40 | index_offset = 
numpy.arange(num_labels) * num_classes 41 | labels_one_hot = numpy.zeros((num_labels, num_classes)) 42 | labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1 43 | return labels_one_hot 44 | def extract_labels(filename, one_hot=False): 45 | """Extract the labels into a 1D uint8 numpy array [index].""" 46 | print('Extracting', filename) 47 | with gzip.open(filename) as bytestream: 48 | magic = _read32(bytestream) 49 | if magic != 2049: 50 | raise ValueError( 51 | 'Invalid magic number %d in MNIST label file: %s' % 52 | (magic, filename)) 53 | num_items = _read32(bytestream) 54 | buf = bytestream.read(num_items) 55 | labels = numpy.frombuffer(buf, dtype=numpy.uint8) 56 | if one_hot: 57 | return dense_to_one_hot(labels) 58 | return labels 59 | class DataSet(object): 60 | def __init__(self, images, labels, fake_data=False): 61 | if fake_data: 62 | self._num_examples = 10000 63 | else: 64 | assert images.shape[0] == labels.shape[0], ( 65 | "images.shape: %s labels.shape: %s" % (images.shape, 66 | labels.shape)) 67 | self._num_examples = images.shape[0] 68 | # Convert shape from [num examples, rows, columns, depth] 69 | # to [num examples, rows*columns] (assuming depth == 1) 70 | assert images.shape[3] == 1 71 | images = images.reshape(images.shape[0], 72 | images.shape[1] * images.shape[2]) 73 | # Convert from [0, 255] -> [0.0, 1.0]. 74 | images = images.astype(numpy.float32) 75 | images = numpy.multiply(images, 1.0 / 255.0) 76 | self._images = images 77 | self._labels = labels 78 | self._epochs_completed = 0 79 | self._index_in_epoch = 0 80 | @property 81 | def images(self): 82 | return self._images 83 | @property 84 | def labels(self): 85 | return self._labels 86 | @property 87 | def num_examples(self): 88 | return self._num_examples 89 | @property 90 | def epochs_completed(self): 91 | return self._epochs_completed 92 | def next_batch(self, batch_size, fake_data=False): 93 | """Return the next `batch_size` examples from this data set.""" 94 | if fake_data: 95 | fake_image = [1.0 for _ in xrange(784)] 96 | fake_label = 0 97 | return [fake_image for _ in xrange(batch_size)], [ 98 | fake_label for _ in xrange(batch_size)] 99 | start = self._index_in_epoch 100 | self._index_in_epoch += batch_size 101 | if self._index_in_epoch > self._num_examples: 102 | # Finished epoch 103 | self._epochs_completed += 1 104 | # Shuffle the data 105 | perm = numpy.arange(self._num_examples) 106 | numpy.random.shuffle(perm) 107 | self._images = self._images[perm] 108 | self._labels = self._labels[perm] 109 | # Start next epoch 110 | start = 0 111 | self._index_in_epoch = batch_size 112 | assert batch_size <= self._num_examples 113 | end = self._index_in_epoch 114 | return self._images[start:end], self._labels[start:end] 115 | def read_data_sets(train_dir, fake_data=False, one_hot=False): 116 | class DataSets(object): 117 | pass 118 | data_sets = DataSets() 119 | if fake_data: 120 | data_sets.train = DataSet([], [], fake_data=True) 121 | data_sets.validation = DataSet([], [], fake_data=True) 122 | data_sets.test = DataSet([], [], fake_data=True) 123 | return data_sets 124 | TRAIN_IMAGES = 'train-images-idx3-ubyte.gz' 125 | TRAIN_LABELS = 'train-labels-idx1-ubyte.gz' 126 | TEST_IMAGES = 't10k-images-idx3-ubyte.gz' 127 | TEST_LABELS = 't10k-labels-idx1-ubyte.gz' 128 | VALIDATION_SIZE = 5000 129 | local_file = maybe_download(TRAIN_IMAGES, train_dir) 130 | train_images = extract_images(local_file) 131 | local_file = maybe_download(TRAIN_LABELS, train_dir) 132 | train_labels = extract_labels(local_file, 
one_hot=one_hot) 133 | local_file = maybe_download(TEST_IMAGES, train_dir) 134 | test_images = extract_images(local_file) 135 | local_file = maybe_download(TEST_LABELS, train_dir) 136 | test_labels = extract_labels(local_file, one_hot=one_hot) 137 | validation_images = train_images[:VALIDATION_SIZE] 138 | validation_labels = train_labels[:VALIDATION_SIZE] 139 | train_images = train_images[VALIDATION_SIZE:] 140 | train_labels = train_labels[VALIDATION_SIZE:] 141 | data_sets.train = DataSet(train_images, train_labels) 142 | data_sets.validation = DataSet(validation_images, validation_labels) 143 | data_sets.test = DataSet(test_images, test_labels) 144 | return data_sets -------------------------------------------------------------------------------- /deep-learning/tensor-flow-examples/notebooks/1_intro/basic_operations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Basic Operations in TensorFlow\n", 8 | "\n", 9 | "Credits: Forked from [TensorFlow-Examples](https://github.com/aymericdamien/TensorFlow-Examples) by Aymeric Damien\n", 10 | "\n", 11 | "## Setup\n", 12 | "\n", 13 | "Refer to the [setup instructions](http://nbviewer.ipython.org/github/donnemartin/data-science-ipython-notebooks/blob/master/deep-learning/tensor-flow-examples/Setup_TensorFlow.md)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import tensorflow as tf" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "# Basic constant operations\n", 36 | "# The value returned by the constructor represents the output\n", 37 | "# of the Constant op.\n", 38 | "a = tf.constant(2)\n", 39 | "b = tf.constant(3)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "a=2, b=3\n", 54 | "Addition with constants: 5\n", 55 | "Multiplication with constants: 6\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "# Launch the default graph.\n", 61 | "with tf.Session() as sess:\n", 62 | " print \"a=2, b=3\"\n", 63 | " print \"Addition with constants: %i\" % sess.run(a+b)\n", 64 | " print \"Multiplication with constants: %i\" % sess.run(a*b)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "# Basic Operations with variable as graph input\n", 76 | "# The value returned by the constructor represents the output\n", 77 | "# of the Variable op. 
(define as input when running session)\n", 78 | "# tf Graph input\n", 79 | "a = tf.placeholder(tf.int16)\n", 80 | "b = tf.placeholder(tf.int16)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 6, 86 | "metadata": { 87 | "collapsed": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "# Define some operations\n", 92 | "add = tf.add(a, b)\n", 93 | "mul = tf.mul(a, b)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 7, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "Addition with variables: 5\n", 108 | "Multiplication with variables: 6\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "# Launch the default graph.\n", 114 | "with tf.Session() as sess:\n", 115 | " # Run every operation with variable input\n", 116 | " print \"Addition with variables: %i\" % sess.run(add, feed_dict={a: 2, b: 3})\n", 117 | " print \"Multiplication with variables: %i\" % sess.run(mul, feed_dict={a: 2, b: 3})" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 8, 123 | "metadata": { 124 | "collapsed": true 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "# ----------------\n", 129 | "# More in details:\n", 130 | "# Matrix Multiplication from TensorFlow official tutorial\n", 131 | "\n", 132 | "# Create a Constant op that produces a 1x2 matrix. The op is\n", 133 | "# added as a node to the default graph.\n", 134 | "#\n", 135 | "# The value returned by the constructor represents the output\n", 136 | "# of the Constant op.\n", 137 | "matrix1 = tf.constant([[3., 3.]])" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 9, 143 | "metadata": { 144 | "collapsed": true 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "# Create another Constant that produces a 2x1 matrix.\n", 149 | "matrix2 = tf.constant([[2.],[2.]])" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 10, 155 | "metadata": { 156 | "collapsed": true 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "# Create a Matmul op that takes 'matrix1' and 'matrix2' as inputs.\n", 161 | "# The returned value, 'product', represents the result of the matrix\n", 162 | "# multiplication.\n", 163 | "product = tf.matmul(matrix1, matrix2)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 11, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [ 173 | { 174 | "name": "stdout", 175 | "output_type": "stream", 176 | "text": [ 177 | "[[ 12.]]\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "# To run the matmul op we call the session 'run()' method, passing 'product'\n", 183 | "# which represents the output of the matmul op. This indicates to the call\n", 184 | "# that we want to get the output of the matmul op back.\n", 185 | "#\n", 186 | "# All inputs needed by the op are run automatically by the session. 
They\n", 187 | "# typically are run in parallel.\n", 188 | "#\n", 189 | "# The call 'run(product)' thus causes the execution of threes ops in the\n", 190 | "# graph: the two constants and matmul.\n", 191 | "#\n", 192 | "# The output of the op is returned in 'result' as a numpy `ndarray` object.\n", 193 | "with tf.Session() as sess:\n", 194 | " result = sess.run(product)\n", 195 | " print result" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "collapsed": true 203 | }, 204 | "outputs": [], 205 | "source": [] 206 | } 207 | ], 208 | "metadata": { 209 | "kernelspec": { 210 | "display_name": "Python 2", 211 | "language": "python", 212 | "name": "python2" 213 | }, 214 | "language_info": { 215 | "codemirror_mode": { 216 | "name": "ipython", 217 | "version": 2 218 | }, 219 | "file_extension": ".py", 220 | "mimetype": "text/x-python", 221 | "name": "python", 222 | "nbconvert_exporter": "python", 223 | "pygments_lexer": "ipython2", 224 | "version": "2.7.5+" 225 | } 226 | }, 227 | "nbformat": 4, 228 | "nbformat_minor": 0 229 | } 230 | -------------------------------------------------------------------------------- /deep-learning/tensor-flow-examples/notebooks/2_basic_classifiers/logistic_regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Logistic Regression in TensorFlow\n", 10 | "\n", 11 | "Credits: Forked from [TensorFlow-Examples](https://github.com/aymericdamien/TensorFlow-Examples) by Aymeric Damien\n", 12 | "\n", 13 | "## Setup\n", 14 | "\n", 15 | "Refer to the [setup instructions](http://nbviewer.ipython.org/github/donnemartin/data-science-ipython-notebooks/blob/master/deep-learning/tensor-flow-examples/Setup_TensorFlow.md)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 5, 21 | "metadata": { 22 | "collapsed": false 23 | }, 24 | "outputs": [ 25 | { 26 | "name": "stdout", 27 | "output_type": "stream", 28 | "text": [ 29 | "Extracting /tmp/data/train-images-idx3-ubyte.gz\n", 30 | "Extracting /tmp/data/train-labels-idx1-ubyte.gz\n", 31 | "Extracting /tmp/data/t10k-images-idx3-ubyte.gz\n", 32 | "Extracting /tmp/data/t10k-labels-idx1-ubyte.gz\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "# Import MINST data\n", 38 | "import input_data\n", 39 | "mnist = input_data.read_data_sets(\"/tmp/data/\", one_hot=True)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 6, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "import tensorflow as tf" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 7, 56 | "metadata": { 57 | "collapsed": true 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "# Parameters\n", 62 | "learning_rate = 0.01\n", 63 | "training_epochs = 25\n", 64 | "batch_size = 100\n", 65 | "display_step = 1" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 8, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "# tf Graph Input\n", 77 | "x = tf.placeholder(\"float\", [None, 784]) # mnist data image of shape 28*28=784\n", 78 | "y = tf.placeholder(\"float\", [None, 10]) # 0-9 digits recognition => 10 classes" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 9, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "# Create model\n", 90 | 
"\n", 91 | "# Set model weights\n", 92 | "W = tf.Variable(tf.zeros([784, 10]))\n", 93 | "b = tf.Variable(tf.zeros([10]))" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 10, 99 | "metadata": { 100 | "collapsed": true 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "# Construct model\n", 105 | "activation = tf.nn.softmax(tf.matmul(x, W) + b) # Softmax" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 11, 111 | "metadata": { 112 | "collapsed": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "# Minimize error using cross entropy\n", 117 | "# Cross entropy\n", 118 | "cost = -tf.reduce_sum(y*tf.log(activation)) \n", 119 | "# Gradient Descent\n", 120 | "optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost) " 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 12, 126 | "metadata": { 127 | "collapsed": true 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "# Initializing the variables\n", 132 | "init = tf.initialize_all_variables()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 13, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "Epoch: 0001 cost= 29.860479714\n", 147 | "Epoch: 0002 cost= 22.080549484\n", 148 | "Epoch: 0003 cost= 21.237104595\n", 149 | "Epoch: 0004 cost= 20.460196280\n", 150 | "Epoch: 0005 cost= 20.185128237\n", 151 | "Epoch: 0006 cost= 19.940297202\n", 152 | "Epoch: 0007 cost= 19.645111119\n", 153 | "Epoch: 0008 cost= 19.507218031\n", 154 | "Epoch: 0009 cost= 19.389794492\n", 155 | "Epoch: 0010 cost= 19.177005816\n", 156 | "Epoch: 0011 cost= 19.082493615\n", 157 | "Epoch: 0012 cost= 19.072873598\n", 158 | "Epoch: 0013 cost= 18.938005402\n", 159 | "Epoch: 0014 cost= 18.891806430\n", 160 | "Epoch: 0015 cost= 18.839480221\n", 161 | "Epoch: 0016 cost= 18.769349510\n", 162 | "Epoch: 0017 cost= 18.590865587\n", 163 | "Epoch: 0018 cost= 18.623413677\n", 164 | "Epoch: 0019 cost= 18.546149085\n", 165 | "Epoch: 0020 cost= 18.432274895\n", 166 | "Epoch: 0021 cost= 18.358189004\n", 167 | "Epoch: 0022 cost= 18.380014628\n", 168 | "Epoch: 0023 cost= 18.499993471\n", 169 | "Epoch: 0024 cost= 18.386477311\n", 170 | "Epoch: 0025 cost= 18.258080609\n", 171 | "Optimization Finished!\n", 172 | "Accuracy: 0.9048\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "# Launch the graph\n", 178 | "with tf.Session() as sess:\n", 179 | " sess.run(init)\n", 180 | "\n", 181 | " # Training cycle\n", 182 | " for epoch in range(training_epochs):\n", 183 | " avg_cost = 0.\n", 184 | " total_batch = int(mnist.train.num_examples/batch_size)\n", 185 | " # Loop over all batches\n", 186 | " for i in range(total_batch):\n", 187 | " batch_xs, batch_ys = mnist.train.next_batch(batch_size)\n", 188 | " # Fit training using batch data\n", 189 | " sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})\n", 190 | " # Compute average loss\n", 191 | " avg_cost += sess.run(cost, feed_dict={x: batch_xs, y: batch_ys})/total_batch\n", 192 | " # Display logs per epoch step\n", 193 | " if epoch % display_step == 0:\n", 194 | " print \"Epoch:\", '%04d' % (epoch+1), \"cost=\", \"{:.9f}\".format(avg_cost)\n", 195 | "\n", 196 | " print \"Optimization Finished!\"\n", 197 | "\n", 198 | " # Test model\n", 199 | " correct_prediction = tf.equal(tf.argmax(activation, 1), tf.argmax(y, 1))\n", 200 | " # Calculate accuracy\n", 201 | " accuracy = 
tf.reduce_mean(tf.cast(correct_prediction, \"float\"))\n", 202 | " print \"Accuracy:\", accuracy.eval({x: mnist.test.images, y: mnist.test.labels})" 203 | ] 204 | } 205 | ], 206 | "metadata": { 207 | "kernelspec": { 208 | "display_name": "Python 3", 209 | "language": "python", 210 | "name": "python3" 211 | }, 212 | "language_info": { 213 | "codemirror_mode": { 214 | "name": "ipython", 215 | "version": 3 216 | }, 217 | "file_extension": ".py", 218 | "mimetype": "text/x-python", 219 | "name": "python", 220 | "nbconvert_exporter": "python", 221 | "pygments_lexer": "ipython3", 222 | "version": "3.4.3" 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 0 227 | } 228 | -------------------------------------------------------------------------------- /python-data/datetime.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Dates and Times" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "* Basics\n", 22 | "* strftime\n", 23 | "* strptime\n", 24 | "* timedelta" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Basics" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 1, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "from datetime import datetime, date, time" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": { 49 | "collapsed": false 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "year = 2015\n", 54 | "month = 1\n", 55 | "day = 20\n", 56 | "hour = 7\n", 57 | "minute = 28\n", 58 | "second = 15" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "dt = datetime(year, month, day, hour, minute, second)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/plain": [ 82 | "(7, 28, 15)" 83 | ] 84 | }, 85 | "execution_count": 4, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "dt.hour, dt.minute, dt.second" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "Extract the equivalent date object:" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 5, 104 | "metadata": { 105 | "collapsed": false 106 | }, 107 | "outputs": [ 108 | { 109 | "data": { 110 | "text/plain": [ 111 | "datetime.date(2015, 1, 20)" 112 | ] 113 | }, 114 | "execution_count": 5, 115 | "metadata": {}, 116 | "output_type": "execute_result" 117 | } 118 | ], 119 | "source": [ 120 | "dt.date()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Extract the equivalent time object:" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 6, 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "datetime.time(7, 28, 15)" 141 | ] 142 | }, 143 | 
"execution_count": 6, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "dt.time()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "When aggregating or grouping time series data, it is sometimes useful to replace fields of a series of datetimes such as zeroing out the minute and second fields:" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 7, 162 | "metadata": { 163 | "collapsed": false 164 | }, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/plain": [ 169 | "datetime.datetime(2015, 1, 20, 7, 0)" 170 | ] 171 | }, 172 | "execution_count": 7, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "dt.replace(minute=0, second=0)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "## strftime" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "Format a datetime string:" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 8, 198 | "metadata": { 199 | "collapsed": false 200 | }, 201 | "outputs": [ 202 | { 203 | "data": { 204 | "text/plain": [ 205 | "'01/20/2015 07:28'" 206 | ] 207 | }, 208 | "execution_count": 8, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "dt.strftime('%m/%d/%Y %H:%M')" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "## strptime" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "Convert a string into a datetime object:" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 9, 234 | "metadata": { 235 | "collapsed": false 236 | }, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "datetime.datetime(2015, 1, 20, 0, 0)" 242 | ] 243 | }, 244 | "execution_count": 9, 245 | "metadata": {}, 246 | "output_type": "execute_result" 247 | } 248 | ], 249 | "source": [ 250 | "datetime.strptime('20150120', '%Y%m%d')" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "## timedelta" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "Get the current datetime:" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 10, 270 | "metadata": { 271 | "collapsed": false 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "dt_now = datetime.now()" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "Subtract two datetime fields to create a timedelta:" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 11, 288 | "metadata": { 289 | "collapsed": false 290 | }, 291 | "outputs": [ 292 | { 293 | "data": { 294 | "text/plain": [ 295 | "datetime.timedelta(6, 40171, 885211)" 296 | ] 297 | }, 298 | "execution_count": 11, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "delta = dt_now - dt\n", 305 | "delta" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "Add a datetime and a timedelta to get a new datetime:" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 12, 318 | "metadata": { 319 | "collapsed": false 320 | }, 321 | "outputs": [ 322 | { 323 | "data": { 324 | 
"text/plain": [ 325 | "datetime.datetime(2015, 1, 26, 18, 37, 46, 885211)" 326 | ] 327 | }, 328 | "execution_count": 12, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | } 332 | ], 333 | "source": [ 334 | "dt + delta" 335 | ] 336 | } 337 | ], 338 | "metadata": { 339 | "kernelspec": { 340 | "display_name": "Python 2", 341 | "language": "python", 342 | "name": "python2" 343 | }, 344 | "language_info": { 345 | "codemirror_mode": { 346 | "name": "ipython", 347 | "version": 2 348 | }, 349 | "file_extension": ".py", 350 | "mimetype": "text/x-python", 351 | "name": "python", 352 | "nbconvert_exporter": "python", 353 | "pygments_lexer": "ipython2", 354 | "version": "2.7.10" 355 | } 356 | }, 357 | "nbformat": 4, 358 | "nbformat_minor": 0 359 | } 360 | -------------------------------------------------------------------------------- /deep-learning/tensor-flow-examples/notebooks/3_neural_networks/multilayer_perceptron.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Multilayer Perceptron in TensorFlow\n", 8 | "\n", 9 | "Credits: Forked from [TensorFlow-Examples](https://github.com/aymericdamien/TensorFlow-Examples) by Aymeric Damien\n", 10 | "\n", 11 | "## Setup\n", 12 | "\n", 13 | "Refer to the [setup instructions](http://nbviewer.ipython.org/github/donnemartin/data-science-ipython-notebooks/blob/master/deep-learning/tensor-flow-examples/Setup_TensorFlow.md)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [ 23 | { 24 | "name": "stdout", 25 | "output_type": "stream", 26 | "text": [ 27 | "Extracting /tmp/data/train-images-idx3-ubyte.gz\n", 28 | "Extracting /tmp/data/train-labels-idx1-ubyte.gz\n", 29 | "Extracting /tmp/data/t10k-images-idx3-ubyte.gz\n", 30 | "Extracting /tmp/data/t10k-labels-idx1-ubyte.gz\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "# Import MINST data\n", 36 | "import input_data\n", 37 | "mnist = input_data.read_data_sets(\"/tmp/data/\", one_hot=True)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "import tensorflow as tf" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "# Parameters\n", 60 | "learning_rate = 0.001\n", 61 | "training_epochs = 15\n", 62 | "batch_size = 100\n", 63 | "display_step = 1" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 5, 69 | "metadata": { 70 | "collapsed": true 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "# Network Parameters\n", 75 | "n_hidden_1 = 256 # 1st layer num features\n", 76 | "n_hidden_2 = 256 # 2nd layer num features\n", 77 | "n_input = 784 # MNIST data input (img shape: 28*28)\n", 78 | "n_classes = 10 # MNIST total classes (0-9 digits)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 6, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "# tf Graph input\n", 90 | "x = tf.placeholder(\"float\", [None, n_input])\n", 91 | "y = tf.placeholder(\"float\", [None, n_classes])" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 7, 97 | "metadata": { 98 | "collapsed": true 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "# Create model\n", 103 | "def 
multilayer_perceptron(_X, _weights, _biases):\n", 104 | " #Hidden layer with RELU activation\n", 105 | " layer_1 = tf.nn.relu(tf.add(tf.matmul(_X, _weights['h1']), _biases['b1'])) \n", 106 | " #Hidden layer with RELU activation\n", 107 | " layer_2 = tf.nn.relu(tf.add(tf.matmul(layer_1, _weights['h2']), _biases['b2'])) \n", 108 | " return tf.matmul(layer_2, weights['out']) + biases['out']" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 8, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "# Store layers weight & bias\n", 120 | "weights = {\n", 121 | " 'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),\n", 122 | " 'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),\n", 123 | " 'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))\n", 124 | "}\n", 125 | "biases = {\n", 126 | " 'b1': tf.Variable(tf.random_normal([n_hidden_1])),\n", 127 | " 'b2': tf.Variable(tf.random_normal([n_hidden_2])),\n", 128 | " 'out': tf.Variable(tf.random_normal([n_classes]))\n", 129 | "}" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 9, 135 | "metadata": { 136 | "collapsed": true 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "# Construct model\n", 141 | "pred = multilayer_perceptron(x, weights, biases)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 10, 147 | "metadata": { 148 | "collapsed": true 149 | }, 150 | "outputs": [], 151 | "source": [ 152 | "# Define loss and optimizer\n", 153 | "# Softmax loss\n", 154 | "cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y)) \n", 155 | "# Adam Optimizer\n", 156 | "optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost) " 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 11, 162 | "metadata": { 163 | "collapsed": true 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "# Initializing the variables\n", 168 | "init = tf.initialize_all_variables()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 12, 174 | "metadata": { 175 | "collapsed": false 176 | }, 177 | "outputs": [ 178 | { 179 | "name": "stdout", 180 | "output_type": "stream", 181 | "text": [ 182 | "Epoch: 0001 cost= 160.113980416\n", 183 | "Epoch: 0002 cost= 38.665780694\n", 184 | "Epoch: 0003 cost= 24.118004577\n", 185 | "Epoch: 0004 cost= 16.440921303\n", 186 | "Epoch: 0005 cost= 11.689460141\n", 187 | "Epoch: 0006 cost= 8.469423468\n", 188 | "Epoch: 0007 cost= 6.223237230\n", 189 | "Epoch: 0008 cost= 4.560174118\n", 190 | "Epoch: 0009 cost= 3.250516910\n", 191 | "Epoch: 0010 cost= 2.359658795\n", 192 | "Epoch: 0011 cost= 1.694081847\n", 193 | "Epoch: 0012 cost= 1.167997509\n", 194 | "Epoch: 0013 cost= 0.872986831\n", 195 | "Epoch: 0014 cost= 0.630616366\n", 196 | "Epoch: 0015 cost= 0.487381571\n", 197 | "Optimization Finished!\n", 198 | "Accuracy: 0.9462\n" 199 | ] 200 | } 201 | ], 202 | "source": [ 203 | "# Launch the graph\n", 204 | "with tf.Session() as sess:\n", 205 | " sess.run(init)\n", 206 | "\n", 207 | " # Training cycle\n", 208 | " for epoch in range(training_epochs):\n", 209 | " avg_cost = 0.\n", 210 | " total_batch = int(mnist.train.num_examples/batch_size)\n", 211 | " # Loop over all batches\n", 212 | " for i in range(total_batch):\n", 213 | " batch_xs, batch_ys = mnist.train.next_batch(batch_size)\n", 214 | " # Fit training using batch data\n", 215 | " sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})\n", 216 | " # Compute 
average loss\n", 217 | " avg_cost += sess.run(cost, feed_dict={x: batch_xs, y: batch_ys})/total_batch\n", 218 | " # Display logs per epoch step\n", 219 | " if epoch % display_step == 0:\n", 220 | " print \"Epoch:\", '%04d' % (epoch+1), \"cost=\", \"{:.9f}\".format(avg_cost)\n", 221 | "\n", 222 | " print \"Optimization Finished!\"\n", 223 | "\n", 224 | " # Test model\n", 225 | " correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))\n", 226 | " # Calculate accuracy\n", 227 | " accuracy = tf.reduce_mean(tf.cast(correct_prediction, \"float\"))\n", 228 | " print \"Accuracy:\", accuracy.eval({x: mnist.test.images, y: mnist.test.labels})" 229 | ] 230 | } 231 | ], 232 | "metadata": { 233 | "kernelspec": { 234 | "display_name": "Python 3", 235 | "language": "python", 236 | "name": "python3" 237 | }, 238 | "language_info": { 239 | "codemirror_mode": { 240 | "name": "ipython", 241 | "version": 3 242 | }, 243 | "file_extension": ".py", 244 | "mimetype": "text/x-python", 245 | "name": "python", 246 | "nbconvert_exporter": "python", 247 | "pygments_lexer": "ipython3", 248 | "version": "3.4.3" 249 | } 250 | }, 251 | "nbformat": 4, 252 | "nbformat_minor": 0 253 | } 254 | -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/rnn_tutorial/rnn_precompile.py: -------------------------------------------------------------------------------- 1 | """This file is only here to speed up the execution of notebooks. 2 | 3 | It contains a subset of the code defined in simple_rnn.ipynb and 4 | lstm_text.ipynb, in particular the code compiling Theano function. 5 | Executing this script first will populate the cache of compiled C code, 6 | which will make subsequent compilations faster. 7 | 8 | The use case is to run this script in the background when a demo VM 9 | such as the one for NVIDIA's qwikLABS, so that the compilation phase 10 | started from the notebooks is faster. 
11 | 12 | """ 13 | import numpy 14 | 15 | import theano 16 | import theano.tensor as T 17 | 18 | from theano import config 19 | from theano.tensor.nnet import categorical_crossentropy 20 | 21 | 22 | floatX = theano.config.floatX 23 | 24 | 25 | # simple_rnn.ipynb 26 | 27 | class SimpleRNN(object): 28 | def __init__(self, input_dim, recurrent_dim): 29 | w_xh = numpy.random.normal(0, .01, (input_dim, recurrent_dim)) 30 | w_hh = numpy.random.normal(0, .02, (recurrent_dim, recurrent_dim)) 31 | self.w_xh = theano.shared(numpy.asarray(w_xh, dtype=floatX), name='w_xh') 32 | self.w_hh = theano.shared(numpy.asarray(w_hh, dtype=floatX), name='w_hh') 33 | self.b_h = theano.shared(numpy.zeros((recurrent_dim,), dtype=floatX), name='b_h') 34 | self.parameters = [self.w_xh, self.w_hh, self.b_h] 35 | 36 | def _step(self, input_t, previous): 37 | return T.tanh(T.dot(previous, self.w_hh) + input_t) 38 | 39 | def __call__(self, x): 40 | x_w_xh = T.dot(x, self.w_xh) + self.b_h 41 | result, updates = theano.scan(self._step, 42 | sequences=[x_w_xh], 43 | outputs_info=[T.zeros_like(self.b_h)]) 44 | return result 45 | 46 | 47 | w_ho_np = numpy.random.normal(0, .01, (15, 1)) 48 | w_ho = theano.shared(numpy.asarray(w_ho_np, dtype=floatX), name='w_ho') 49 | b_o = theano.shared(numpy.zeros((1,), dtype=floatX), name='b_o') 50 | 51 | x = T.matrix('x') 52 | my_rnn = SimpleRNN(1, 15) 53 | hidden = my_rnn(x) 54 | prediction = T.dot(hidden, w_ho) + b_o 55 | parameters = my_rnn.parameters + [w_ho, b_o] 56 | l2 = sum((p**2).sum() for p in parameters) 57 | mse = T.mean((prediction[:-1] - x[1:])**2) 58 | cost = mse + .0001 * l2 59 | gradient = T.grad(cost, wrt=parameters) 60 | 61 | lr = .3 62 | updates = [(par, par - lr * gra) for par, gra in zip(parameters, gradient)] 63 | update_model = theano.function([x], cost, updates=updates) 64 | get_cost = theano.function([x], mse) 65 | predict = theano.function([x], prediction) 66 | get_hidden = theano.function([x], hidden) 67 | get_gradient = theano.function([x], gradient) 68 | 69 | predict = theano.function([x], prediction) 70 | 71 | # Generating sequences 72 | 73 | x_t = T.vector() 74 | h_p = T.vector() 75 | preactivation = T.dot(x_t, my_rnn.w_xh) + my_rnn.b_h 76 | h_t = my_rnn._step(preactivation, h_p) 77 | o_t = T.dot(h_t, w_ho) + b_o 78 | 79 | single_step = theano.function([x_t, h_p], [o_t, h_t]) 80 | 81 | # lstm_text.ipynb 82 | 83 | def gauss_weight(rng, ndim_in, ndim_out=None, sd=.005): 84 | if ndim_out is None: 85 | ndim_out = ndim_in 86 | W = rng.randn(ndim_in, ndim_out) * sd 87 | return numpy.asarray(W, dtype=config.floatX) 88 | 89 | 90 | def index_dot(indices, w): 91 | return w[indices.flatten()] 92 | 93 | 94 | class LstmLayer: 95 | 96 | def __init__(self, rng, input, mask, n_in, n_h): 97 | 98 | # Init params 99 | self.W_i = theano.shared(gauss_weight(rng, n_in, n_h), 'W_i', borrow=True) 100 | self.W_f = theano.shared(gauss_weight(rng, n_in, n_h), 'W_f', borrow=True) 101 | self.W_c = theano.shared(gauss_weight(rng, n_in, n_h), 'W_c', borrow=True) 102 | self.W_o = theano.shared(gauss_weight(rng, n_in, n_h), 'W_o', borrow=True) 103 | 104 | self.U_i = theano.shared(gauss_weight(rng, n_h), 'U_i', borrow=True) 105 | self.U_f = theano.shared(gauss_weight(rng, n_h), 'U_f', borrow=True) 106 | self.U_c = theano.shared(gauss_weight(rng, n_h), 'U_c', borrow=True) 107 | self.U_o = theano.shared(gauss_weight(rng, n_h), 'U_o', borrow=True) 108 | 109 | self.b_i = theano.shared(numpy.zeros((n_h,), dtype=config.floatX), 110 | 'b_i', borrow=True) 111 | self.b_f = 
theano.shared(numpy.zeros((n_h,), dtype=config.floatX), 112 | 'b_f', borrow=True) 113 | self.b_c = theano.shared(numpy.zeros((n_h,), dtype=config.floatX), 114 | 'b_c', borrow=True) 115 | self.b_o = theano.shared(numpy.zeros((n_h,), dtype=config.floatX), 116 | 'b_o', borrow=True) 117 | 118 | self.params = [self.W_i, self.W_f, self.W_c, self.W_o, 119 | self.U_i, self.U_f, self.U_c, self.U_o, 120 | self.b_i, self.b_f, self.b_c, self.b_o] 121 | 122 | outputs_info = [T.zeros((input.shape[1], n_h)), 123 | T.zeros((input.shape[1], n_h))] 124 | 125 | rval, updates = theano.scan(self._step, 126 | sequences=[mask, input], 127 | outputs_info=outputs_info) 128 | 129 | # self.output is in the format (length, batchsize, n_h) 130 | self.output = rval[0] 131 | 132 | def _step(self, m_, x_, h_, c_): 133 | 134 | i_preact = (index_dot(x_, self.W_i) + 135 | T.dot(h_, self.U_i) + self.b_i) 136 | i = T.nnet.sigmoid(i_preact) 137 | 138 | f_preact = (index_dot(x_, self.W_f) + 139 | T.dot(h_, self.U_f) + self.b_f) 140 | f = T.nnet.sigmoid(f_preact) 141 | 142 | o_preact = (index_dot(x_, self.W_o) + 143 | T.dot(h_, self.U_o) + self.b_o) 144 | o = T.nnet.sigmoid(o_preact) 145 | 146 | c_preact = (index_dot(x_, self.W_c) + 147 | T.dot(h_, self.U_c) + self.b_c) 148 | c = T.tanh(c_preact) 149 | 150 | c = f * c_ + i * c 151 | c = m_[:, None] * c + (1. - m_)[:, None] * c_ 152 | 153 | h = o * T.tanh(c) 154 | h = m_[:, None] * h + (1. - m_)[:, None] * h_ 155 | 156 | return h, c 157 | 158 | 159 | def sequence_categorical_crossentropy(prediction, targets, mask): 160 | prediction_flat = prediction.reshape(((prediction.shape[0] * 161 | prediction.shape[1]), 162 | prediction.shape[2]), ndim=2) 163 | targets_flat = targets.flatten() 164 | mask_flat = mask.flatten() 165 | ce = categorical_crossentropy(prediction_flat, targets_flat) 166 | return T.sum(ce * mask_flat) 167 | 168 | 169 | class LogisticRegression(object): 170 | 171 | def __init__(self, rng, input, n_in, n_out): 172 | 173 | W = gauss_weight(rng, n_in, n_out) 174 | self.W = theano.shared(value=numpy.asarray(W, dtype=theano.config.floatX), 175 | name='W', borrow=True) 176 | # initialize the biases b as a vector of n_out 0s 177 | self.b = theano.shared(value=numpy.zeros((n_out,), 178 | dtype=theano.config.floatX), 179 | name='b', borrow=True) 180 | 181 | # compute vector of class-membership probabilities in symbolic form 182 | energy = T.dot(input, self.W) + self.b 183 | energy_exp = T.exp(energy - T.max(energy, axis=2, keepdims=True)) 184 | pmf = energy_exp / energy_exp.sum(axis=2, keepdims=True) 185 | self.p_y_given_x = pmf 186 | self.params = [self.W, self.b] 187 | 188 | batch_size = 100 189 | n_h = 50 190 | 191 | # The Theano graph 192 | # Set the random number generator' seeds for consistency 193 | rng = numpy.random.RandomState(12345) 194 | 195 | x = T.lmatrix('x') 196 | mask = T.matrix('mask') 197 | 198 | # Construct an LSTM layer and a logistic regression layer 199 | recurrent_layer = LstmLayer(rng=rng, input=x, mask=mask, n_in=111, n_h=n_h) 200 | logreg_layer = LogisticRegression(rng=rng, input=recurrent_layer.output[:-1], 201 | n_in=n_h, n_out=111) 202 | 203 | # define a cost variable to optimize 204 | cost = sequence_categorical_crossentropy(logreg_layer.p_y_given_x, 205 | x[1:], 206 | mask[1:]) / batch_size 207 | 208 | # create a list of all model parameters to be fit by gradient descent 209 | params = logreg_layer.params + recurrent_layer.params 210 | 211 | # create a list of gradients for all model parameters 212 | grads = T.grad(cost, params) 213 | 214 | 
learning_rate = 0.1 215 | updates = [ 216 | (param_i, param_i - learning_rate * grad_i) 217 | for param_i, grad_i in zip(params, grads) 218 | ] 219 | 220 | update_model = theano.function([x, mask], cost, updates=updates) 221 | 222 | evaluate_model = theano.function([x, mask], cost) 223 | 224 | # Generating Sequences 225 | x_t = T.iscalar() 226 | h_p = T.vector() 227 | c_p = T.vector() 228 | h_t, c_t = recurrent_layer._step(T.ones(1), x_t, h_p, c_p) 229 | energy = T.dot(h_t, logreg_layer.W) + logreg_layer.b 230 | 231 | energy_exp = T.exp(energy - T.max(energy, axis=1, keepdims=True)) 232 | 233 | output = energy_exp / energy_exp.sum(axis=1, keepdims=True) 234 | single_step = theano.function([x_t, h_p, c_p], [output, h_t, c_t]) 235 | -------------------------------------------------------------------------------- /scikit-learn/fig_code/figures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import warnings 4 | 5 | 6 | def plot_venn_diagram(): 7 | fig, ax = plt.subplots(subplot_kw=dict(frameon=False, xticks=[], yticks=[])) 8 | ax.add_patch(plt.Circle((0.3, 0.3), 0.3, fc='red', alpha=0.5)) 9 | ax.add_patch(plt.Circle((0.6, 0.3), 0.3, fc='blue', alpha=0.5)) 10 | ax.add_patch(plt.Rectangle((-0.1, -0.1), 1.1, 0.8, fc='none', ec='black')) 11 | ax.text(0.2, 0.3, '$x$', size=30, ha='center', va='center') 12 | ax.text(0.7, 0.3, '$y$', size=30, ha='center', va='center') 13 | ax.text(0.0, 0.6, '$I$', size=30) 14 | ax.axis('equal') 15 | 16 | 17 | def plot_example_decision_tree(): 18 | fig = plt.figure(figsize=(10, 4)) 19 | ax = fig.add_axes([0, 0, 0.8, 1], frameon=False, xticks=[], yticks=[]) 20 | ax.set_title('Example Decision Tree: Animal Classification', size=24) 21 | 22 | def text(ax, x, y, t, size=20, **kwargs): 23 | ax.text(x, y, t, 24 | ha='center', va='center', size=size, 25 | bbox=dict(boxstyle='round', ec='k', fc='w'), **kwargs) 26 | 27 | text(ax, 0.5, 0.9, "How big is\nthe animal?", 20) 28 | text(ax, 0.3, 0.6, "Does the animal\nhave horns?", 18) 29 | text(ax, 0.7, 0.6, "Does the animal\nhave two legs?", 18) 30 | text(ax, 0.12, 0.3, "Are the horns\nlonger than 10cm?", 14) 31 | text(ax, 0.38, 0.3, "Is the animal\nwearing a collar?", 14) 32 | text(ax, 0.62, 0.3, "Does the animal\nhave wings?", 14) 33 | text(ax, 0.88, 0.3, "Does the animal\nhave a tail?", 14) 34 | 35 | text(ax, 0.4, 0.75, "> 1m", 12, alpha=0.4) 36 | text(ax, 0.6, 0.75, "< 1m", 12, alpha=0.4) 37 | 38 | text(ax, 0.21, 0.45, "yes", 12, alpha=0.4) 39 | text(ax, 0.34, 0.45, "no", 12, alpha=0.4) 40 | 41 | text(ax, 0.66, 0.45, "yes", 12, alpha=0.4) 42 | text(ax, 0.79, 0.45, "no", 12, alpha=0.4) 43 | 44 | ax.plot([0.3, 0.5, 0.7], [0.6, 0.9, 0.6], '-k') 45 | ax.plot([0.12, 0.3, 0.38], [0.3, 0.6, 0.3], '-k') 46 | ax.plot([0.62, 0.7, 0.88], [0.3, 0.6, 0.3], '-k') 47 | ax.plot([0.0, 0.12, 0.20], [0.0, 0.3, 0.0], '--k') 48 | ax.plot([0.28, 0.38, 0.48], [0.0, 0.3, 0.0], '--k') 49 | ax.plot([0.52, 0.62, 0.72], [0.0, 0.3, 0.0], '--k') 50 | ax.plot([0.8, 0.88, 1.0], [0.0, 0.3, 0.0], '--k') 51 | ax.axis([0, 1, 0, 1]) 52 | 53 | 54 | def visualize_tree(estimator, X, y, boundaries=True, 55 | xlim=None, ylim=None): 56 | estimator.fit(X, y) 57 | 58 | if xlim is None: 59 | xlim = (X[:, 0].min() - 0.1, X[:, 0].max() + 0.1) 60 | if ylim is None: 61 | ylim = (X[:, 1].min() - 0.1, X[:, 1].max() + 0.1) 62 | 63 | x_min, x_max = xlim 64 | y_min, y_max = ylim 65 | xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), 66 | np.linspace(y_min, y_max, 100)) 67 | Z = 
estimator.predict(np.c_[xx.ravel(), yy.ravel()]) 68 | 69 | # Put the result into a color plot 70 | Z = Z.reshape(xx.shape) 71 | plt.figure() 72 | plt.pcolormesh(xx, yy, Z, alpha=0.2, cmap='rainbow') 73 | plt.clim(y.min(), y.max()) 74 | 75 | # Plot also the training points 76 | plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='rainbow') 77 | plt.axis('off') 78 | 79 | plt.xlim(x_min, x_max) 80 | plt.ylim(y_min, y_max) 81 | plt.clim(y.min(), y.max()) 82 | 83 | # Plot the decision boundaries 84 | def plot_boundaries(i, xlim, ylim): 85 | if i < 0: 86 | return 87 | 88 | tree = estimator.tree_ 89 | 90 | if tree.feature[i] == 0: 91 | plt.plot([tree.threshold[i], tree.threshold[i]], ylim, '-k') 92 | plot_boundaries(tree.children_left[i], 93 | [xlim[0], tree.threshold[i]], ylim) 94 | plot_boundaries(tree.children_right[i], 95 | [tree.threshold[i], xlim[1]], ylim) 96 | 97 | elif tree.feature[i] == 1: 98 | plt.plot(xlim, [tree.threshold[i], tree.threshold[i]], '-k') 99 | plot_boundaries(tree.children_left[i], xlim, 100 | [ylim[0], tree.threshold[i]]) 101 | plot_boundaries(tree.children_right[i], xlim, 102 | [tree.threshold[i], ylim[1]]) 103 | 104 | if boundaries: 105 | plot_boundaries(0, plt.xlim(), plt.ylim()) 106 | 107 | 108 | def plot_tree_interactive(X, y): 109 | from sklearn.tree import DecisionTreeClassifier 110 | 111 | def interactive_tree(depth=1): 112 | clf = DecisionTreeClassifier(max_depth=depth, random_state=0) 113 | visualize_tree(clf, X, y) 114 | 115 | from IPython.html.widgets import interact 116 | return interact(interactive_tree, depth=[1, 5]) 117 | 118 | 119 | def plot_kmeans_interactive(min_clusters=1, max_clusters=6): 120 | from IPython.html.widgets import interact 121 | from sklearn.metrics.pairwise import euclidean_distances 122 | from sklearn.datasets.samples_generator import make_blobs 123 | 124 | with warnings.catch_warnings(): 125 | warnings.filterwarnings('ignore') 126 | 127 | X, y = make_blobs(n_samples=300, centers=4, 128 | random_state=0, cluster_std=0.60) 129 | 130 | def _kmeans_step(frame=0, n_clusters=4): 131 | rng = np.random.RandomState(2) 132 | labels = np.zeros(X.shape[0]) 133 | centers = rng.randn(n_clusters, 2) 134 | 135 | nsteps = frame // 3 136 | 137 | for i in range(nsteps + 1): 138 | old_centers = centers 139 | if i < nsteps or frame % 3 > 0: 140 | dist = euclidean_distances(X, centers) 141 | labels = dist.argmin(1) 142 | 143 | if i < nsteps or frame % 3 > 1: 144 | centers = np.array([X[labels == j].mean(0) 145 | for j in range(n_clusters)]) 146 | nans = np.isnan(centers) 147 | centers[nans] = old_centers[nans] 148 | 149 | 150 | # plot the data and cluster centers 151 | plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='rainbow', 152 | vmin=0, vmax=n_clusters - 1); 153 | plt.scatter(old_centers[:, 0], old_centers[:, 1], marker='o', 154 | c=np.arange(n_clusters), 155 | s=200, cmap='rainbow') 156 | plt.scatter(old_centers[:, 0], old_centers[:, 1], marker='o', 157 | c='black', s=50) 158 | 159 | # plot new centers if third frame 160 | if frame % 3 == 2: 161 | for i in range(n_clusters): 162 | plt.annotate('', centers[i], old_centers[i], 163 | arrowprops=dict(arrowstyle='->', linewidth=1)) 164 | plt.scatter(centers[:, 0], centers[:, 1], marker='o', 165 | c=np.arange(n_clusters), 166 | s=200, cmap='rainbow') 167 | plt.scatter(centers[:, 0], centers[:, 1], marker='o', 168 | c='black', s=50) 169 | 170 | plt.xlim(-4, 4) 171 | plt.ylim(-2, 10) 172 | 173 | if frame % 3 == 1: 174 | plt.text(3.8, 9.5, "1. 
Reassign points to nearest centroid", 175 | ha='right', va='top', size=14) 176 | elif frame % 3 == 2: 177 | plt.text(3.8, 9.5, "2. Update centroids to cluster means", 178 | ha='right', va='top', size=14) 179 | 180 | 181 | return interact(_kmeans_step, frame=[0, 50], 182 | n_clusters=[min_clusters, max_clusters]) 183 | 184 | 185 | def plot_image_components(x, coefficients=None, mean=0, components=None, 186 | imshape=(8, 8), n_components=6, fontsize=12): 187 | if coefficients is None: 188 | coefficients = x 189 | 190 | if components is None: 191 | components = np.eye(len(coefficients), len(x)) 192 | 193 | mean = np.zeros_like(x) + mean 194 | 195 | 196 | fig = plt.figure(figsize=(1.2 * (5 + n_components), 1.2 * 2)) 197 | g = plt.GridSpec(2, 5 + n_components, hspace=0.3) 198 | 199 | def show(i, j, x, title=None): 200 | ax = fig.add_subplot(g[i, j], xticks=[], yticks=[]) 201 | ax.imshow(x.reshape(imshape), interpolation='nearest') 202 | if title: 203 | ax.set_title(title, fontsize=fontsize) 204 | 205 | show(slice(2), slice(2), x, "True") 206 | 207 | approx = mean.copy() 208 | show(0, 2, np.zeros_like(x) + mean, r'$\mu$') 209 | show(1, 2, approx, r'$1 \cdot \mu$') 210 | 211 | for i in range(0, n_components): 212 | approx = approx + coefficients[i] * components[i] 213 | show(0, i + 3, components[i], r'$c_{0}$'.format(i + 1)) 214 | show(1, i + 3, approx, 215 | r"${0:.2f} \cdot c_{1}$".format(coefficients[i], i + 1)) 216 | plt.gca().text(0, 1.05, '$+$', ha='right', va='bottom', 217 | transform=plt.gca().transAxes, fontsize=fontsize) 218 | 219 | show(slice(2), slice(-2, None), approx, "Approx") 220 | 221 | 222 | def plot_pca_interactive(data, n_components=6): 223 | from sklearn.decomposition import PCA 224 | from IPython.html.widgets import interact 225 | 226 | pca = PCA(n_components=n_components) 227 | Xproj = pca.fit_transform(data) 228 | 229 | def show_decomp(i=0): 230 | plot_image_components(data[i], Xproj[i], 231 | pca.mean_, pca.components_) 232 | 233 | interact(show_decomp, i=(0, data.shape[0] - 1)); 234 | -------------------------------------------------------------------------------- /deep-learning/tensor-flow-exercises/3_regularization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "kR-4eNdK6lYS" 8 | }, 9 | "source": [ 10 | "Deep Learning with TensorFlow\n", 11 | "=============\n", 12 | "\n", 13 | "Credits: Forked from [TensorFlow](https://github.com/tensorflow/tensorflow) by Google\n", 14 | "\n", 15 | "Setup\n", 16 | "------------\n", 17 | "\n", 18 | "Refer to the [setup instructions](https://github.com/donnemartin/data-science-ipython-notebooks/tree/feature/deep-learning/deep-learning/tensor-flow-exercises/README.md).\n", 19 | "\n", 20 | "Exercise 3\n", 21 | "------------\n", 22 | "\n", 23 | "Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.\n", 24 | "\n", 25 | "The goal of this exercise is to explore regularization techniques." 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "cellView": "both", 33 | "colab": { 34 | "autoexec": { 35 | "startup": false, 36 | "wait_interval": 0 37 | } 38 | }, 39 | "colab_type": "code", 40 | "collapsed": true, 41 | "id": "JLpLa8Jt7Vu4" 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "# These are all the modules we'll be using later. 
Make sure you can import them\n", 46 | "# before proceeding further.\n", 47 | "import cPickle as pickle\n", 48 | "import numpy as np\n", 49 | "import tensorflow as tf" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": { 55 | "colab_type": "text", 56 | "id": "1HrCK6e17WzV" 57 | }, 58 | "source": [ 59 | "First reload the data we generated in _notmist.ipynb_." 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "cellView": "both", 67 | "colab": { 68 | "autoexec": { 69 | "startup": false, 70 | "wait_interval": 0 71 | }, 72 | "output_extras": [ 73 | { 74 | "item_id": 1 75 | } 76 | ] 77 | }, 78 | "colab_type": "code", 79 | "collapsed": false, 80 | "executionInfo": { 81 | "elapsed": 11777, 82 | "status": "ok", 83 | "timestamp": 1449849322348, 84 | "user": { 85 | "color": "", 86 | "displayName": "", 87 | "isAnonymous": false, 88 | "isMe": true, 89 | "permissionId": "", 90 | "photoUrl": "", 91 | "sessionId": "0", 92 | "userId": "" 93 | }, 94 | "user_tz": 480 95 | }, 96 | "id": "y3-cj1bpmuxc", 97 | "outputId": "e03576f1-ebbe-4838-c388-f1777bcc9873" 98 | }, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "Training set (200000, 28, 28) (200000,)\n", 105 | "Validation set (10000, 28, 28) (10000,)\n", 106 | "Test set (18724, 28, 28) (18724,)\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "pickle_file = 'notMNIST.pickle'\n", 112 | "\n", 113 | "with open(pickle_file, 'rb') as f:\n", 114 | " save = pickle.load(f)\n", 115 | " train_dataset = save['train_dataset']\n", 116 | " train_labels = save['train_labels']\n", 117 | " valid_dataset = save['valid_dataset']\n", 118 | " valid_labels = save['valid_labels']\n", 119 | " test_dataset = save['test_dataset']\n", 120 | " test_labels = save['test_labels']\n", 121 | " del save # hint to help gc free up memory\n", 122 | " print 'Training set', train_dataset.shape, train_labels.shape\n", 123 | " print 'Validation set', valid_dataset.shape, valid_labels.shape\n", 124 | " print 'Test set', test_dataset.shape, test_labels.shape" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": { 130 | "colab_type": "text", 131 | "id": "L7aHrm6nGDMB" 132 | }, 133 | "source": [ 134 | "Reformat into a shape that's more adapted to the models we're going to train:\n", 135 | "- data as a flat matrix,\n", 136 | "- labels as float 1-hot encodings." 
137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "cellView": "both", 144 | "colab": { 145 | "autoexec": { 146 | "startup": false, 147 | "wait_interval": 0 148 | }, 149 | "output_extras": [ 150 | { 151 | "item_id": 1 152 | } 153 | ] 154 | }, 155 | "colab_type": "code", 156 | "collapsed": false, 157 | "executionInfo": { 158 | "elapsed": 11728, 159 | "status": "ok", 160 | "timestamp": 1449849322356, 161 | "user": { 162 | "color": "", 163 | "displayName": "", 164 | "isAnonymous": false, 165 | "isMe": true, 166 | "permissionId": "", 167 | "photoUrl": "", 168 | "sessionId": "0", 169 | "userId": "" 170 | }, 171 | "user_tz": 480 172 | }, 173 | "id": "IRSyYiIIGIzS", 174 | "outputId": "3f8996ee-3574-4f44-c953-5c8a04636582" 175 | }, 176 | "outputs": [ 177 | { 178 | "name": "stdout", 179 | "output_type": "stream", 180 | "text": [ 181 | "Training set (200000, 784) (200000, 10)\n", 182 | "Validation set (10000, 784) (10000, 10)\n", 183 | "Test set (18724, 784) (18724, 10)\n" 184 | ] 185 | } 186 | ], 187 | "source": [ 188 | "image_size = 28\n", 189 | "num_labels = 10\n", 190 | "\n", 191 | "def reformat(dataset, labels):\n", 192 | " dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)\n", 193 | " # Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]\n", 194 | " labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)\n", 195 | " return dataset, labels\n", 196 | "train_dataset, train_labels = reformat(train_dataset, train_labels)\n", 197 | "valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)\n", 198 | "test_dataset, test_labels = reformat(test_dataset, test_labels)\n", 199 | "print 'Training set', train_dataset.shape, train_labels.shape\n", 200 | "print 'Validation set', valid_dataset.shape, valid_labels.shape\n", 201 | "print 'Test set', test_dataset.shape, test_labels.shape" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": { 208 | "cellView": "both", 209 | "colab": { 210 | "autoexec": { 211 | "startup": false, 212 | "wait_interval": 0 213 | } 214 | }, 215 | "colab_type": "code", 216 | "collapsed": true, 217 | "id": "RajPLaL_ZW6w" 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "def accuracy(predictions, labels):\n", 222 | " return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))\n", 223 | " / predictions.shape[0])" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": { 229 | "colab_type": "text", 230 | "id": "sgLbUAQ1CW-1" 231 | }, 232 | "source": [ 233 | "---\n", 234 | "Problem 1\n", 235 | "---------\n", 236 | "\n", 237 | "Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compue the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.\n", 238 | "\n", 239 | "---" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": { 245 | "colab_type": "text", 246 | "id": "na8xX2yHZzNF" 247 | }, 248 | "source": [ 249 | "---\n", 250 | "Problem 2\n", 251 | "---------\n", 252 | "Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. 
What happens?\n", 253 | "\n", 254 | "---" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": { 260 | "colab_type": "text", 261 | "id": "ww3SCBUdlkRc" 262 | }, 263 | "source": [ 264 | "---\n", 265 | "Problem 3\n", 266 | "---------\n", 267 | "Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.\n", 268 | "\n", 269 | "What happens to our extreme overfitting case?\n", 270 | "\n", 271 | "---" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": { 277 | "colab_type": "text", 278 | "id": "-b1hTz3VWZjw" 279 | }, 280 | "source": [ 281 | "---\n", 282 | "Problem 4\n", 283 | "---------\n", 284 | "\n", 285 | "Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).\n", 286 | "\n", 287 | "One avenue you can explore is to add multiple layers.\n", 288 | "\n", 289 | "Another one is to use learning rate decay:\n", 290 | "\n", 291 | " global_step = tf.Variable(0) # count the number of steps taken.\n", 292 | " learning_rate = tf.train.exponential_decay(0.5, step, ...)\n", 293 | " optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)\n", 294 | " \n", 295 | " ---\n" 296 | ] 297 | } 298 | ], 299 | "metadata": { 300 | "colabVersion": "0.3.2", 301 | "colab_default_view": {}, 302 | "colab_views": {}, 303 | "kernelspec": { 304 | "display_name": "Python 3", 305 | "language": "python", 306 | "name": "python3" 307 | }, 308 | "language_info": { 309 | "codemirror_mode": { 310 | "name": "ipython", 311 | "version": 3 312 | }, 313 | "file_extension": ".py", 314 | "mimetype": "text/x-python", 315 | "name": "python", 316 | "nbconvert_exporter": "python", 317 | "pygments_lexer": "ipython3", 318 | "version": "3.4.3" 319 | } 320 | }, 321 | "nbformat": 4, 322 | "nbformat_minor": 0 323 | } 324 | -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/rnn_tutorial/lstm_text.py: -------------------------------------------------------------------------------- 1 | import cPickle as pkl 2 | import time 3 | 4 | import numpy 5 | import theano 6 | from theano import config 7 | import theano.tensor as T 8 | from theano.tensor.nnet import categorical_crossentropy 9 | 10 | from fuel.datasets import TextFile 11 | from fuel.streams import DataStream 12 | from fuel.schemes import ConstantScheme 13 | from fuel.transformers import Batch, Padding 14 | 15 | 16 | # These files can be downloaded from 17 | # http://www-etud.iro.umontreal.ca/~brakelp/train.txt.gz 18 | # http://www-etud.iro.umontreal.ca/~brakelp/dictionary.pkl 19 | # don't forget to change the paths and gunzip train.txt.gz 20 | TRAIN_FILE = '/u/brakelp/temp/traindata.txt' 21 | VAL_FILE = '/u/brakelp/temp/valdata.txt' 22 | DICT_FILE = '/u/brakelp/temp/dictionary.pkl' 23 | 24 | 25 | def sequence_categorical_crossentropy(prediction, targets, mask): 26 | prediction_flat = prediction.reshape(((prediction.shape[0] * 27 | prediction.shape[1]), 28 | prediction.shape[2]), ndim=2) 29 | targets_flat = targets.flatten() 30 | mask_flat = mask.flatten() 31 | ce = categorical_crossentropy(prediction_flat, targets_flat) 32 | 
return T.sum(ce * mask_flat) 33 | 34 | 35 | def gauss_weight(ndim_in, ndim_out=None, sd=.005): 36 | if ndim_out is None: 37 | ndim_out = ndim_in 38 | W = numpy.random.randn(ndim_in, ndim_out) * sd 39 | return numpy.asarray(W, dtype=config.floatX) 40 | 41 | 42 | class LogisticRegression(object): 43 | """Multi-class Logistic Regression Class 44 | 45 | The logistic regression is fully described by a weight matrix :math:`W` 46 | and bias vector :math:`b`. Classification is done by projecting data 47 | points onto a set of hyperplanes, the distance to which is used to 48 | determine a class membership probability. 49 | """ 50 | 51 | def __init__(self, input, n_in, n_out): 52 | """ Initialize the parameters of the logistic regression 53 | 54 | :type input: theano.tensor.TensorType 55 | :param input: symbolic variable that describes the input of the 56 | architecture (one minibatch) 57 | 58 | :type n_in: int 59 | :param n_in: number of input units, the dimension of the space in 60 | which the datapoints lie 61 | 62 | :type n_out: int 63 | :param n_out: number of output units, the dimension of the space in 64 | which the labels lie 65 | 66 | """ 67 | 68 | # initialize with 0 the weights W as a matrix of shape (n_in, n_out) 69 | self.W = theano.shared(value=numpy.zeros((n_in, n_out), 70 | dtype=theano.config.floatX), 71 | name='W', borrow=True) 72 | # initialize the baises b as a vector of n_out 0s 73 | self.b = theano.shared(value=numpy.zeros((n_out,), 74 | dtype=theano.config.floatX), 75 | name='b', borrow=True) 76 | 77 | # compute vector of class-membership probabilities in symbolic form 78 | energy = T.dot(input, self.W) + self.b 79 | energy_exp = T.exp(energy - T.max(energy, 2)[:, :, None]) 80 | pmf = energy_exp / energy_exp.sum(2)[:, :, None] 81 | self.p_y_given_x = pmf 82 | 83 | # compute prediction as class whose probability is maximal in 84 | # symbolic form 85 | self.y_pred = T.argmax(self.p_y_given_x, axis=1) 86 | 87 | # parameters of the model 88 | self.params = [self.W, self.b] 89 | 90 | 91 | def index_dot(indices, w): 92 | return w[indices.flatten()] 93 | 94 | 95 | class LstmLayer: 96 | 97 | def __init__(self, rng, input, mask, n_in, n_h): 98 | 99 | # Init params 100 | self.W_i = theano.shared(gauss_weight(n_in, n_h), 'W_i', borrow=True) 101 | self.W_f = theano.shared(gauss_weight(n_in, n_h), 'W_f', borrow=True) 102 | self.W_c = theano.shared(gauss_weight(n_in, n_h), 'W_c', borrow=True) 103 | self.W_o = theano.shared(gauss_weight(n_in, n_h), 'W_o', borrow=True) 104 | 105 | self.U_i = theano.shared(gauss_weight(n_h), 'U_i', borrow=True) 106 | self.U_f = theano.shared(gauss_weight(n_h), 'U_f', borrow=True) 107 | self.U_c = theano.shared(gauss_weight(n_h), 'U_c', borrow=True) 108 | self.U_o = theano.shared(gauss_weight(n_h), 'U_o', borrow=True) 109 | 110 | self.b_i = theano.shared(numpy.zeros((n_h,), dtype=config.floatX), 111 | 'b_i', borrow=True) 112 | self.b_f = theano.shared(numpy.zeros((n_h,), dtype=config.floatX), 113 | 'b_f', borrow=True) 114 | self.b_c = theano.shared(numpy.zeros((n_h,), dtype=config.floatX), 115 | 'b_c', borrow=True) 116 | self.b_o = theano.shared(numpy.zeros((n_h,), dtype=config.floatX), 117 | 'b_o', borrow=True) 118 | 119 | self.params = [self.W_i, self.W_f, self.W_c, self.W_o, 120 | self.U_i, self.U_f, self.U_c, self.U_o, 121 | self.b_i, self.b_f, self.b_c, self.b_o] 122 | 123 | outputs_info = [T.zeros((input.shape[1], n_h)), 124 | T.zeros((input.shape[1], n_h))] 125 | 126 | rval, updates = theano.scan(self._step, 127 | sequences=[mask, input], 128 | 
outputs_info=outputs_info) 129 | 130 | # self.output is in the format (batchsize, n_h) 131 | self.output = rval[0] 132 | 133 | def _step(self, m_, x_, h_, c_): 134 | 135 | i_preact = (index_dot(x_, self.W_i) + 136 | T.dot(h_, self.U_i) + self.b_i) 137 | i = T.nnet.sigmoid(i_preact) 138 | 139 | f_preact = (index_dot(x_, self.W_f) + 140 | T.dot(h_, self.U_f) + self.b_f) 141 | f = T.nnet.sigmoid(f_preact) 142 | 143 | o_preact = (index_dot(x_, self.W_o) + 144 | T.dot(h_, self.U_o) + self.b_o) 145 | o = T.nnet.sigmoid(o_preact) 146 | 147 | c_preact = (index_dot(x_, self.W_c) + 148 | T.dot(h_, self.U_c) + self.b_c) 149 | c = T.tanh(c_preact) 150 | 151 | c = f * c_ + i * c 152 | c = m_[:, None] * c + (1. - m_)[:, None] * c_ 153 | 154 | h = o * T.tanh(c) 155 | h = m_[:, None] * h + (1. - m_)[:, None] * h_ 156 | 157 | return h, c 158 | 159 | 160 | def train_model(batch_size=100, n_h=50, n_epochs=40): 161 | 162 | # Load the datasets with Fuel 163 | dictionary = pkl.load(open(DICT_FILE, 'r')) 164 | dictionary['~'] = len(dictionary) 165 | reverse_mapping = dict((j, i) for i, j in dictionary.items()) 166 | 167 | print("Loading the data") 168 | train = TextFile(files=[TRAIN_FILE], 169 | dictionary=dictionary, 170 | unk_token='~', 171 | level='character', 172 | preprocess=str.lower, 173 | bos_token=None, 174 | eos_token=None) 175 | 176 | train_stream = DataStream.default_stream(train) 177 | 178 | # organize data in batches and pad shorter sequences with zeros 179 | train_stream = Batch(train_stream, 180 | iteration_scheme=ConstantScheme(batch_size)) 181 | train_stream = Padding(train_stream) 182 | 183 | # idem dito for the validation text 184 | val = TextFile(files=[VAL_FILE], 185 | dictionary=dictionary, 186 | unk_token='~', 187 | level='character', 188 | preprocess=str.lower, 189 | bos_token=None, 190 | eos_token=None) 191 | 192 | val_stream = DataStream.default_stream(val) 193 | 194 | # organize data in batches and pad shorter sequences with zeros 195 | val_stream = Batch(val_stream, 196 | iteration_scheme=ConstantScheme(batch_size)) 197 | val_stream = Padding(val_stream) 198 | 199 | print('Building model') 200 | 201 | # Set the random number generator' seeds for consistency 202 | rng = numpy.random.RandomState(12345) 203 | 204 | x = T.lmatrix('x') 205 | mask = T.matrix('mask') 206 | 207 | # Construct the LSTM layer 208 | recurrent_layer = LstmLayer(rng=rng, input=x, mask=mask, n_in=111, n_h=n_h) 209 | 210 | logreg_layer = LogisticRegression(input=recurrent_layer.output[:-1], 211 | n_in=n_h, n_out=111) 212 | 213 | cost = sequence_categorical_crossentropy(logreg_layer.p_y_given_x, 214 | x[1:], 215 | mask[1:]) / batch_size 216 | 217 | # create a list of all model parameters to be fit by gradient descent 218 | params = logreg_layer.params + recurrent_layer.params 219 | 220 | # create a list of gradients for all model parameters 221 | grads = T.grad(cost, params) 222 | 223 | # update_model is a function that updates the model parameters by 224 | # SGD Since this model has many parameters, it would be tedious to 225 | # manually create an update rule for each model parameter. We thus 226 | # create the updates list by automatically looping over all 227 | # (params[i], grads[i]) pairs. 
228 | learning_rate = 0.1 229 | updates = [ 230 | (param_i, param_i - learning_rate * grad_i) 231 | for param_i, grad_i in zip(params, grads) 232 | ] 233 | 234 | update_model = theano.function([x, mask], cost, updates=updates) 235 | 236 | evaluate_model = theano.function([x, mask], cost) 237 | 238 | # Define and compile a function for generating a sequence step by step. 239 | x_t = T.iscalar() 240 | h_p = T.vector() 241 | c_p = T.vector() 242 | h_t, c_t = recurrent_layer._step(T.ones(1), x_t, h_p, c_p) 243 | energy = T.dot(h_t, logreg_layer.W) + logreg_layer.b 244 | 245 | energy_exp = T.exp(energy - T.max(energy, 1)[:, None]) 246 | 247 | output = energy_exp / energy_exp.sum(1)[:, None] 248 | single_step = theano.function([x_t, h_p, c_p], [output, h_t, c_t]) 249 | 250 | start_time = time.clock() 251 | 252 | iteration = 0 253 | 254 | for epoch in range(n_epochs): 255 | print 'epoch:', epoch 256 | 257 | for x_, mask_ in train_stream.get_epoch_iterator(): 258 | iteration += 1 259 | 260 | cross_entropy = update_model(x_.T, mask_.T) 261 | 262 | 263 | # Generate some text after each 20 minibatches 264 | if iteration % 40 == 0: 265 | try: 266 | prediction = numpy.ones(111, dtype=config.floatX) / 111.0 267 | h_p = numpy.zeros((n_h,), dtype=config.floatX) 268 | c_p = numpy.zeros((n_h,), dtype=config.floatX) 269 | initial = 'the meaning of life is ' 270 | sentence = initial 271 | for char in initial: 272 | x_t = dictionary[char] 273 | prediction, h_p, c_p = single_step(x_t, h_p.flatten(), 274 | c_p.flatten()) 275 | sample = numpy.random.multinomial(1, prediction.flatten()) 276 | for i in range(450): 277 | x_t = numpy.argmax(sample) 278 | prediction, h_p, c_p = single_step(x_t, h_p.flatten(), 279 | c_p.flatten()) 280 | sentence += reverse_mapping[x_t] 281 | sample = numpy.random.multinomial(1, prediction.flatten()) 282 | print 'LSTM: "' + sentence + '"' 283 | except ValueError: 284 | print 'Something went wrong during sentence generation.' 285 | 286 | if iteration % 40 == 0: 287 | print 'epoch:', epoch, ' minibatch:', iteration 288 | val_scores = [] 289 | for x_val, mask_val in val_stream.get_epoch_iterator(): 290 | val_scores.append(evaluate_model(x_val.T, mask_val.T)) 291 | print 'Average validation CE per sentence:', numpy.mean(val_scores) 292 | 293 | end_time = time.clock() 294 | print('Optimization complete.') 295 | print('The code ran for %.2fm' % ((end_time - start_time) / 60.)) 296 | 297 | 298 | if __name__ == '__main__': 299 | train_model() 300 | --------------------------------------------------------------------------------
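Note on the regularization exercises: Problems 1 and 3 of `deep-learning/tensor-flow-exercises/3_regularization.ipynb` (dumped above) describe L2 weight penalties and dropout in prose but deliberately leave the implementation to the reader. The snippet below is a minimal sketch of one possible answer, not the notebook's reference solution. It assumes the TensorFlow 1.x graph API used elsewhere in this repository, and the names and values it introduces (`num_hidden`, `beta`, `keep_prob`, the 1024-unit hidden layer, the 0.5 learning rate) are illustrative choices rather than values taken from the exercises.

    # Sketch for Problems 1 and 3 of 3_regularization.ipynb (assumptions as noted above):
    # a one-hidden-layer network on the reformatted notMNIST data, with an L2 penalty
    # on the weight matrices and dropout on the hidden layer. TensorFlow 1.x graph API.
    import tensorflow as tf

    image_size = 28      # matches the notebook's reformat() step (flattened 28x28 images)
    num_labels = 10
    num_hidden = 1024    # assumed hidden-layer width
    beta = 1e-3          # assumed L2 strength; tune against validation accuracy

    graph = tf.Graph()
    with graph.as_default():
        # Placeholders. keep_prob is fed as e.g. 0.5 during training and 1.0 during
        # evaluation, so dropout is active only while training (Problem 3's caveat).
        x = tf.placeholder(tf.float32, shape=(None, image_size * image_size))
        y = tf.placeholder(tf.float32, shape=(None, num_labels))
        keep_prob = tf.placeholder(tf.float32)

        w1 = tf.Variable(tf.truncated_normal([image_size * image_size, num_hidden], stddev=0.1))
        b1 = tf.Variable(tf.zeros([num_hidden]))
        w2 = tf.Variable(tf.truncated_normal([num_hidden, num_labels], stddev=0.1))
        b2 = tf.Variable(tf.zeros([num_labels]))

        hidden = tf.nn.relu(tf.matmul(x, w1) + b1)
        hidden = tf.nn.dropout(hidden, keep_prob)   # Problem 3: dropout on the hidden layer only
        logits = tf.matmul(hidden, w2) + b2

        # Problem 1: add an L2 penalty on the weights to the cross-entropy loss.
        cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
        loss = cross_entropy + beta * (tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2))

        optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

At evaluation time `keep_prob` would be fed as 1.0 so the forward pass is deterministic; during training, feeding a value below 1.0 reproduces the overfitting-reduction effect the notebook asks you to observe in its few-batch overfitting experiment (Problem 2).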