├── __init__.py ├── aws └── __init__.py ├── kaggle └── __init__.py ├── numpy ├── __init__.py └── tests │ └── __init__.py ├── pandas ├── __init__.py └── tests │ └── __init__.py ├── scipy ├── __init__.py ├── tests │ └── __init__.py ├── 2002FemPreg.dat.gz ├── nsfg.py └── first.py ├── spark ├── __init__.py └── hdfs.ipynb ├── analyses ├── __init__.py └── churn_measurements.py ├── commands ├── __init__.py └── styles │ └── custom.css ├── mapreduce ├── __init__.py ├── test_mr_s3_log_parser.py └── mr_s3_log_parser.py ├── matplotlib ├── __init__.py └── tests │ └── __init__.py ├── python-data ├── __init__.py ├── tests │ ├── __init__.py │ ├── test_type_util.py │ └── test_transform_util.py ├── hello_world.txt ├── transform_util.py ├── type_util.py ├── pdb.ipynb ├── files.ipynb ├── logs.ipynb ├── unit_tests.ipynb └── datetime.ipynb ├── scikit-learn ├── __init__.py ├── tests │ └── __init__.py └── fig_code │ ├── __init__.py │ ├── linear_regression.py │ ├── data.py │ ├── sgd_separator.py │ ├── helpers.py │ ├── scikit-learn.ipynb │ ├── ML_flow_chart.py │ └── figures.py ├── .gitattributes ├── images ├── aws.png ├── cover.png ├── deep.png ├── mrjob.png ├── numpy.png ├── scipy.png ├── spark.png ├── svm.gif ├── k-means.gif ├── kaggle.png ├── pandas.png ├── python.png ├── regex-1.png ├── regex-2.png ├── theano.png ├── README.sketch ├── commands.png ├── coversmall.png ├── matplotlib.png ├── tensorflow.png ├── scikitlearn.png ├── README_1200x800.gif └── coversmall_alt.png ├── data ├── confusion_matrix.png ├── ozone.csv ├── titanic │ ├── gendermodel.py │ ├── myfirstforest.py │ ├── gendermodel.csv │ ├── genderclassmodel.csv │ ├── results-rf.csv │ └── genderclassmodel.py └── ozone_copy.csv ├── deep-learning ├── deep-dream │ ├── flowers.jpg │ └── sky1024px.jpg ├── theano-tutorial │ ├── rnn_tutorial │ │ ├── rnn_lstm.pdf │ │ ├── instruction.pdf │ │ ├── Makefile │ │ ├── synthetic.py │ │ ├── rnn_precompile.py │ │ └── lstm_text.py │ ├── intro_theano │ │ ├── intro_theano.pdf │ │ ├── Makefile │ │ └── utils.py │ └── scan_tutorial │ │ ├── scan_ex1_solution.py │ │ └── scan_ex2_solution.py ├── tensor-flow-exercises │ ├── Dockerfile │ ├── README.md │ └── 3_regularization.ipynb └── tensor-flow-examples │ ├── multigpu_basics.py │ ├── notebooks │ ├── 4_multi_gpu │ │ └── multigpu_basics.ipynb │ ├── 1_intro │ │ └── basic_operations.ipynb │ ├── 2_basic_classifiers │ │ └── logistic_regression.ipynb │ └── 3_neural_networks │ │ └── multilayer_perceptron.ipynb │ └── input_data.py ├── requirements.txt ├── LICENSE ├── .gitignore └── misc └── regex.ipynb /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aws/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kaggle/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /numpy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pandas/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scipy/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /analyses/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mapreduce/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matplotlib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /numpy/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python-data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scipy/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matplotlib/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pandas/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scikit-learn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python-data/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scikit-learn/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python-data/hello_world.txt: -------------------------------------------------------------------------------- 1 | hello world! 
-------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-language=Python 2 | -------------------------------------------------------------------------------- /images/aws.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/aws.png -------------------------------------------------------------------------------- /images/cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/cover.png -------------------------------------------------------------------------------- /images/deep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/deep.png -------------------------------------------------------------------------------- /images/mrjob.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/mrjob.png -------------------------------------------------------------------------------- /images/numpy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/numpy.png -------------------------------------------------------------------------------- /images/scipy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/scipy.png -------------------------------------------------------------------------------- /images/spark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/spark.png -------------------------------------------------------------------------------- /images/svm.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/svm.gif -------------------------------------------------------------------------------- /images/k-means.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/k-means.gif -------------------------------------------------------------------------------- /images/kaggle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/kaggle.png -------------------------------------------------------------------------------- /images/pandas.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/pandas.png -------------------------------------------------------------------------------- /images/python.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/python.png 
-------------------------------------------------------------------------------- /images/regex-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/regex-1.png -------------------------------------------------------------------------------- /images/regex-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/regex-2.png -------------------------------------------------------------------------------- /images/theano.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/theano.png -------------------------------------------------------------------------------- /images/README.sketch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/README.sketch -------------------------------------------------------------------------------- /images/commands.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/commands.png -------------------------------------------------------------------------------- /images/coversmall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/coversmall.png -------------------------------------------------------------------------------- /images/matplotlib.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/matplotlib.png -------------------------------------------------------------------------------- /images/tensorflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/tensorflow.png -------------------------------------------------------------------------------- /images/scikitlearn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/scikitlearn.png -------------------------------------------------------------------------------- /data/confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/data/confusion_matrix.png -------------------------------------------------------------------------------- /images/README_1200x800.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/README_1200x800.gif -------------------------------------------------------------------------------- /images/coversmall_alt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/images/coversmall_alt.png -------------------------------------------------------------------------------- /scipy/2002FemPreg.dat.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/scipy/2002FemPreg.dat.gz -------------------------------------------------------------------------------- /deep-learning/deep-dream/flowers.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/deep-learning/deep-dream/flowers.jpg -------------------------------------------------------------------------------- /deep-learning/deep-dream/sky1024px.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/deep-learning/deep-dream/sky1024px.jpg -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/rnn_tutorial/rnn_lstm.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/deep-learning/theano-tutorial/rnn_tutorial/rnn_lstm.pdf -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/rnn_tutorial/instruction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/deep-learning/theano-tutorial/rnn_tutorial/instruction.pdf -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/intro_theano/intro_theano.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmfn/data-science-ipython-notebooks/HEAD/deep-learning/theano-tutorial/intro_theano/intro_theano.pdf -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/intro_theano/Makefile: -------------------------------------------------------------------------------- 1 | intro_theano.pdf: slides_source/intro_theano.tex 2 | cd slides_source; pdflatex --shell-escape intro_theano.tex 3 | mv slides_source/intro_theano.pdf . 4 | -------------------------------------------------------------------------------- /scikit-learn/fig_code/__init__.py: -------------------------------------------------------------------------------- 1 | from .data import * 2 | from .figures import * 3 | 4 | from .sgd_separator import plot_sgd_separator 5 | from .linear_regression import plot_linear_regression 6 | from .helpers import plot_iris_knn 7 | -------------------------------------------------------------------------------- /deep-learning/tensor-flow-exercises/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM b.gcr.io/tensorflow/tensorflow:latest 2 | MAINTAINER Vincent Vanhoucke 3 | RUN pip install scikit-learn 4 | ADD *.ipynb /notebooks/ 5 | WORKDIR /notebooks 6 | CMD ["/run_jupyter.sh"] 7 | -------------------------------------------------------------------------------- /deep-learning/tensor-flow-exercises/README.md: -------------------------------------------------------------------------------- 1 | Exercises 2 | =========================================================== 3 | 4 | Building the Docker container 5 | ----------------------------- 6 | 7 | docker build -t $USER/exercises . 
8 | 9 | Running the container 10 | --------------------- 11 | 12 | docker run -p 8888:8888 -it --rm $USER/exercises 13 | -------------------------------------------------------------------------------- /python-data/tests/test_type_util.py: -------------------------------------------------------------------------------- 1 | from nose.tools import assert_equal 2 | from ..type_util import TypeUtil 3 | 4 | 5 | class TestUtil(): 6 | 7 | def test_is_iterable(self): 8 | assert_equal(TypeUtil.is_iterable('foo'), True) 9 | assert_equal(TypeUtil.is_iterable(7), False) 10 | 11 | def test_convert_to_list(self): 12 | assert_equal(isinstance(TypeUtil.convert_to_list('foo'), list), True) 13 | assert_equal(isinstance(TypeUtil.convert_to_list(7), list), False) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | backports.ssl-match-hostname==3.4.0.2 2 | certifi==2015.4.28 3 | functools32==3.2.3.post1 4 | gnureadline==6.3.3 5 | ipython==3.2.0 6 | Jinja2==2.7.3 7 | jsonschema==2.5.1 8 | MarkupSafe==0.23 9 | matplotlib==1.4.3 10 | mistune==0.6 11 | mock==1.0.1 12 | nose==1.3.7 13 | numpy==1.9.2 14 | pandas==0.16.2 15 | ptyprocess==0.5 16 | Pygments==2.0.2 17 | pyparsing==2.0.3 18 | python-dateutil==2.4.2 19 | pytz==2015.4 20 | pyzmq==14.7.0 21 | scikit-learn==0.16.1 22 | scipy==0.15.1 23 | seaborn==0.6.0 24 | six==1.9.0 25 | sympy==0.7.6 26 | terminado==0.5 27 | tornado==4.2 28 | wheel==0.24.0 29 | -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/rnn_tutorial/Makefile: -------------------------------------------------------------------------------- 1 | all: instruction.pdf rnn_lstm.pdf 2 | 3 | instruction.pdf: slides_source/instruction.tex 4 | cd slides_source; pdflatex --shell-escape instruction.tex 5 | cd slides_source; pdflatex --shell-escape instruction.tex 6 | cd slides_source; pdflatex --shell-escape instruction.tex 7 | mv slides_source/instruction.pdf . 8 | 9 | rnn_lstm.pdf: slides_source/rnn_lstm.tex 10 | cd slides_source; pdflatex --shell-escape rnn_lstm.tex 11 | cd slides_source; pdflatex --shell-escape rnn_lstm.tex 12 | cd slides_source; pdflatex --shell-escape rnn_lstm.tex 13 | mv slides_source/rnn_lstm.pdf . 14 | -------------------------------------------------------------------------------- /python-data/transform_util.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | class TransformUtil: 5 | 6 | @classmethod 7 | def remove_punctuation(cls, value): 8 | """Removes !, #, and ?. 9 | """ 10 | return re.sub('[!#?]', '', value) 11 | 12 | @classmethod 13 | def clean_strings(cls, strings, ops): 14 | """General purpose method to clean strings. 15 | 16 | Pass in a sequence of strings and the operations to perform. 17 | """ 18 | result = [] 19 | for value in strings: 20 | for function in ops: 21 | value = function(value) 22 | result.append(value) 23 | return result -------------------------------------------------------------------------------- /python-data/type_util.py: -------------------------------------------------------------------------------- 1 | class TypeUtil: 2 | 3 | @classmethod 4 | def is_iterable(cls, obj): 5 | """Determines if obj is iterable. 6 | 7 | Useful when writing functions that can accept multiple types of 8 | input (list, tuple, ndarray, iterator). Pairs well with 9 | convert_to_list. 
10 | """ 11 | try: 12 | iter(obj) 13 | return True 14 | except TypeError: 15 | return False 16 | 17 | @classmethod 18 | def convert_to_list(cls, obj): 19 | """Converts obj to a list if it is not a list and it is iterable, 20 | else returns the original obj. 21 | """ 22 | if not isinstance(obj, list) and cls.is_iterable(obj): 23 | obj = list(obj) 24 | return obj -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This repository contains a variety of content; some developed by Donne Martin, and some from third-parties. The third-party content is distributed under the license provided by those parties. 2 | 3 | The content developed by Donne Martin is distributed under the following license: 4 | 5 | Copyright 2015 Donne Martin 6 | 7 | Licensed under the Apache License, Version 2.0 (the "License"); 8 | you may not use this file except in compliance with the License. 9 | You may obtain a copy of the License at 10 | 11 | http://www.apache.org/licenses/LICENSE-2.0 12 | 13 | Unless required by applicable law or agreed to in writing, software 14 | distributed under the License is distributed on an "AS IS" BASIS, 15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | See the License for the specific language governing permissions and 17 | limitations under the License. -------------------------------------------------------------------------------- /scikit-learn/fig_code/linear_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.linear_model import LinearRegression 4 | 5 | 6 | def plot_linear_regression(): 7 | a = 0.5 8 | b = 1.0 9 | 10 | # x from 0 to 10 11 | x = 30 * np.random.random(20) 12 | 13 | # y = a*x + b with noise 14 | y = a * x + b + np.random.normal(size=x.shape) 15 | 16 | # create a linear regression classifier 17 | clf = LinearRegression() 18 | clf.fit(x[:, None], y) 19 | 20 | # predict y from the data 21 | x_new = np.linspace(0, 30, 100) 22 | y_new = clf.predict(x_new[:, None]) 23 | 24 | # plot the results 25 | ax = plt.axes() 26 | ax.scatter(x, y) 27 | ax.plot(x_new, y_new) 28 | 29 | ax.set_xlabel('x') 30 | ax.set_ylabel('y') 31 | 32 | ax.axis('tight') 33 | 34 | 35 | if __name__ == '__main__': 36 | plot_linear_regression() 37 | plt.show() 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | 56 | # IPython notebook 57 | .ipynb_checkpoints 58 | 59 | # Repo scratch directory 60 | scratch/ -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/scan_tutorial/scan_ex1_solution.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | import numpy as np 4 | 5 | coefficients = T.vector("coefficients") 6 | x = T.scalar("x") 7 | max_coefficients_supported = 10000 8 | 9 | 10 | def step(coeff, power, prior_value, free_var): 11 | return prior_value + (coeff * (free_var ** power)) 12 | 13 | # Generate the components of the polynomial 14 | full_range = T.arange(max_coefficients_supported) 15 | outputs_info = np.zeros((), dtype=theano.config.floatX) 16 | 17 | components, updates = theano.scan(fn=step, 18 | sequences=[coefficients, full_range], 19 | outputs_info=outputs_info, 20 | non_sequences=x) 21 | 22 | polynomial = components[-1] 23 | calculate_polynomial = theano.function(inputs=[coefficients, x], 24 | outputs=polynomial, 25 | updates=updates) 26 | 27 | test_coeff = np.asarray([1, 0, 2], dtype=theano.config.floatX) 28 | print(calculate_polynomial(test_coeff, 3)) 29 | -------------------------------------------------------------------------------- /python-data/tests/test_transform_util.py: -------------------------------------------------------------------------------- 1 | from nose.tools import assert_equal 2 | from ..transform_util import TransformUtil 3 | 4 | 5 | class TestTransformUtil(): 6 | 7 | states = [' Alabama ', 'Georgia!', 'Georgia', 'georgia', \ 8 | 'FlOrIda', 'south carolina##', 'West virginia?'] 9 | 10 | expected_output = ['Alabama', 11 | 'Georgia', 12 | 'Georgia', 13 | 'Georgia', 14 | 'Florida', 15 | 'South Carolina', 16 | 'West Virginia'] 17 | 18 | def test_remove_punctuation(self): 19 | assert_equal(TransformUtil.remove_punctuation('!#?'), '') 20 | 21 | def test_map_remove_punctuation(self): 22 | # Map applies a function to a collection 23 | output = map(TransformUtil.remove_punctuation, self.states) 24 | assert_equal('!#?' 
not in output, True) 25 | 26 | def test_clean_strings(self): 27 | clean_ops = [str.strip, TransformUtil.remove_punctuation, str.title] 28 | output = TransformUtil.clean_strings(self.states, clean_ops) 29 | assert_equal(output, self.expected_output) -------------------------------------------------------------------------------- /scikit-learn/fig_code/data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def linear_data_sample(N=40, rseed=0, m=3, b=-2): 5 | rng = np.random.RandomState(rseed) 6 | 7 | x = 10 * rng.rand(N) 8 | dy = m / 2 * (1 + rng.rand(N)) 9 | y = m * x + b + dy * rng.randn(N) 10 | 11 | return (x, y, dy) 12 | 13 | 14 | def linear_data_sample_big_errs(N=40, rseed=0, m=3, b=-2): 15 | rng = np.random.RandomState(rseed) 16 | 17 | x = 10 * rng.rand(N) 18 | dy = m / 2 * (1 + rng.rand(N)) 19 | dy[20:25] *= 10 20 | y = m * x + b + dy * rng.randn(N) 21 | 22 | return (x, y, dy) 23 | 24 | 25 | def sample_light_curve(phased=True): 26 | from astroML.datasets import fetch_LINEAR_sample 27 | data = fetch_LINEAR_sample() 28 | t, y, dy = data[18525697].T 29 | 30 | if phased: 31 | P_best = 0.580313015651 32 | t /= P_best 33 | 34 | return (t, y, dy) 35 | 36 | 37 | def sample_light_curve_2(phased=True): 38 | from astroML.datasets import fetch_LINEAR_sample 39 | data = fetch_LINEAR_sample() 40 | t, y, dy = data[10022663].T 41 | 42 | if phased: 43 | P_best = 0.61596079804 44 | t /= P_best 45 | 46 | return (t, y, dy) 47 | 48 | -------------------------------------------------------------------------------- /scikit-learn/fig_code/sgd_separator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.linear_model import SGDClassifier 4 | from sklearn.datasets.samples_generator import make_blobs 5 | 6 | def plot_sgd_separator(): 7 | # we create 50 separable points 8 | X, Y = make_blobs(n_samples=50, centers=2, 9 | random_state=0, cluster_std=0.60) 10 | 11 | # fit the model 12 | clf = SGDClassifier(loss="hinge", alpha=0.01, 13 | n_iter=200, fit_intercept=True) 14 | clf.fit(X, Y) 15 | 16 | # plot the line, the points, and the nearest vectors to the plane 17 | xx = np.linspace(-1, 5, 10) 18 | yy = np.linspace(-1, 5, 10) 19 | 20 | X1, X2 = np.meshgrid(xx, yy) 21 | Z = np.empty(X1.shape) 22 | for (i, j), val in np.ndenumerate(X1): 23 | x1 = val 24 | x2 = X2[i, j] 25 | p = clf.decision_function([x1, x2]) 26 | Z[i, j] = p[0] 27 | levels = [-1.0, 0.0, 1.0] 28 | linestyles = ['dashed', 'solid', 'dashed'] 29 | colors = 'k' 30 | 31 | ax = plt.axes() 32 | ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles) 33 | ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired) 34 | 35 | ax.axis('tight') 36 | 37 | 38 | if __name__ == '__main__': 39 | plot_sgd_separator() 40 | plt.show() 41 | -------------------------------------------------------------------------------- /misc/regex.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Regex\n", 8 | "\n", 9 | "Credits: Material from [Regular Expressions Cheat Sheet](http://www.cheatography.com/davechild/cheat-sheets/regular-expressions/) by Dave Child\n", 10 | "\n", 11 | "Use with http://www.regexr.com to generate regular expressions." 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "
(embedded regex cheat sheet image markup was stripped from this cell)
" 23 | ] 24 | } 25 | ], 26 | "metadata": { 27 | "kernelspec": { 28 | "display_name": "Python 2", 29 | "language": "python", 30 | "name": "python2" 31 | }, 32 | "language_info": { 33 | "codemirror_mode": { 34 | "name": "ipython", 35 | "version": 2 36 | }, 37 | "file_extension": ".py", 38 | "mimetype": "text/x-python", 39 | "name": "python", 40 | "nbconvert_exporter": "python", 41 | "pygments_lexer": "ipython2", 42 | "version": "2.7.10" 43 | } 44 | }, 45 | "nbformat": 4, 46 | "nbformat_minor": 0 47 | } 48 | -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/scan_tutorial/scan_ex2_solution.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | import numpy as np 4 | 5 | probabilities = T.vector() 6 | nb_samples = T.iscalar() 7 | 8 | rng = T.shared_randomstreams.RandomStreams(1234) 9 | 10 | 11 | def sample_from_pvect(pvect): 12 | """ Provided utility function: given a symbolic vector of 13 | probabilities (which MUST sum to 1), sample one element 14 | and return its index. 15 | """ 16 | onehot_sample = rng.multinomial(n=1, pvals=pvect) 17 | sample = onehot_sample.argmax() 18 | return sample 19 | 20 | 21 | def set_p_to_zero(pvect, i): 22 | """ Provided utility function: given a symbolic vector of 23 | probabilities and an index 'i', set the probability of the 24 | i-th element to 0 and renormalize the probabilities so they 25 | sum to 1. 26 | """ 27 | new_pvect = T.set_subtensor(pvect[i], 0.) 28 | new_pvect = new_pvect / new_pvect.sum() 29 | return new_pvect 30 | 31 | 32 | def step(p): 33 | sample = sample_from_pvect(p) 34 | new_p = set_p_to_zero(p, sample) 35 | return new_p, sample 36 | 37 | output, updates = theano.scan(fn=step, 38 | outputs_info=[probabilities, None], 39 | n_steps=nb_samples) 40 | 41 | modified_probabilities, samples = output 42 | 43 | f = theano.function(inputs=[probabilities, nb_samples], 44 | outputs=[samples], 45 | updates=updates) 46 | 47 | # Testing the function 48 | test_probs = np.asarray([0.6, 0.3, 0.1], dtype=theano.config.floatX) 49 | for i in range(10): 50 | print(f(test_probs, 2)) 51 | -------------------------------------------------------------------------------- /commands/styles/custom.css: -------------------------------------------------------------------------------- 1 | 46 | -------------------------------------------------------------------------------- /deep-learning/tensor-flow-examples/multigpu_basics.py: -------------------------------------------------------------------------------- 1 | #Multi GPU Basic example 2 | ''' 3 | This tutorial requires your machine to have 2 GPUs 4 | "/cpu:0": The CPU of your machine. 
5 | "/gpu:0": The first GPU of your machine 6 | "/gpu:1": The second GPU of your machine 7 | ''' 8 | 9 | import numpy as np 10 | import tensorflow as tf 11 | import datetime 12 | 13 | #Processing Units logs 14 | log_device_placement = True 15 | 16 | #num of multiplications to perform 17 | n = 10 18 | 19 | ''' 20 | Example: compute A^n + B^n on 2 GPUs 21 | Results on 8 cores with 2 GTX-980: 22 | * Single GPU computation time: 0:00:11.277449 23 | * Multi GPU computation time: 0:00:07.131701 24 | ''' 25 | #Create random large matrix 26 | A = np.random.rand(1e4, 1e4).astype('float32') 27 | B = np.random.rand(1e4, 1e4).astype('float32') 28 | 29 | # Creates a graph to store results 30 | c1 = [] 31 | c2 = [] 32 | 33 | def matpow(M, n): 34 | if n < 1: #Abstract cases where n < 1 35 | return M 36 | else: 37 | return tf.matmul(M, matpow(M, n-1)) 38 | 39 | ''' 40 | Single GPU computing 41 | ''' 42 | with tf.device('/gpu:0'): 43 | a = tf.constant(A) 44 | b = tf.constant(B) 45 | #compute A^n and B^n and store results in c1 46 | c1.append(matpow(a, n)) 47 | c1.append(matpow(b, n)) 48 | 49 | with tf.device('/cpu:0'): 50 | sum = tf.add_n(c1) #Addition of all elements in c1, i.e. A^n + B^n 51 | 52 | t1_1 = datetime.datetime.now() 53 | with tf.Session(config=tf.ConfigProto(log_device_placement=log_device_placement)) as sess: 54 | # Runs the op. 55 | sess.run(sum) 56 | t2_1 = datetime.datetime.now() 57 | 58 | 59 | ''' 60 | Multi GPU computing 61 | ''' 62 | #GPU:0 computes A^n 63 | with tf.device('/gpu:0'): 64 | #compute A^n and store result in c2 65 | a = tf.constant(A) 66 | c2.append(matpow(a, n)) 67 | 68 | #GPU:1 computes B^n 69 | with tf.device('/gpu:1'): 70 | #compute B^n and store result in c2 71 | b = tf.constant(B) 72 | c2.append(matpow(b, n)) 73 | 74 | with tf.device('/cpu:0'): 75 | sum = tf.add_n(c2) #Addition of all elements in c2, i.e. A^n + B^n 76 | 77 | t1_2 = datetime.datetime.now() 78 | with tf.Session(config=tf.ConfigProto(log_device_placement=log_device_placement)) as sess: 79 | # Runs the op. 80 | sess.run(sum) 81 | t2_2 = datetime.datetime.now() 82 | 83 | 84 | print "Single GPU computation time: " + str(t2_1-t1_1) 85 | print "Multi GPU computation time: " + str(t2_2-t1_2) -------------------------------------------------------------------------------- /scikit-learn/fig_code/helpers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Small helpers for code that is not shown in the notebooks 3 | """ 4 | 5 | from sklearn import neighbors, datasets, linear_model 6 | import pylab as pl 7 | import numpy as np 8 | from matplotlib.colors import ListedColormap 9 | 10 | # Create color maps for 3-class classification problem, as with iris 11 | cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF']) 12 | cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF']) 13 | 14 | def plot_iris_knn(): 15 | iris = datasets.load_iris() 16 | X = iris.data[:, :2] # we only take the first two features. 
We could 17 | # avoid this ugly slicing by using a two-dim dataset 18 | y = iris.target 19 | 20 | knn = neighbors.KNeighborsClassifier(n_neighbors=5) 21 | knn.fit(X, y) 22 | 23 | x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1 24 | y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1 25 | xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), 26 | np.linspace(y_min, y_max, 100)) 27 | Z = knn.predict(np.c_[xx.ravel(), yy.ravel()]) 28 | 29 | # Put the result into a color plot 30 | Z = Z.reshape(xx.shape) 31 | pl.figure() 32 | pl.pcolormesh(xx, yy, Z, cmap=cmap_light) 33 | 34 | # Plot also the training points 35 | pl.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold) 36 | pl.xlabel('sepal length (cm)') 37 | pl.ylabel('sepal width (cm)') 38 | pl.axis('tight') 39 | 40 | 41 | def plot_polynomial_regression(): 42 | rng = np.random.RandomState(0) 43 | x = 2*rng.rand(100) - 1 44 | 45 | f = lambda t: 1.2 * t**2 + .1 * t**3 - .4 * t **5 - .5 * t ** 9 46 | y = f(x) + .4 * rng.normal(size=100) 47 | 48 | x_test = np.linspace(-1, 1, 100) 49 | 50 | pl.figure() 51 | pl.scatter(x, y, s=4) 52 | 53 | X = np.array([x**i for i in range(5)]).T 54 | X_test = np.array([x_test**i for i in range(5)]).T 55 | regr = linear_model.LinearRegression() 56 | regr.fit(X, y) 57 | pl.plot(x_test, regr.predict(X_test), label='4th order') 58 | 59 | X = np.array([x**i for i in range(10)]).T 60 | X_test = np.array([x_test**i for i in range(10)]).T 61 | regr = linear_model.LinearRegression() 62 | regr.fit(X, y) 63 | pl.plot(x_test, regr.predict(X_test), label='9th order') 64 | 65 | pl.legend(loc='best') 66 | pl.axis('tight') 67 | pl.title('Fitting a 4th and a 9th order polynomial') 68 | 69 | pl.figure() 70 | pl.scatter(x, y, s=4) 71 | pl.plot(x_test, f(x_test), label="truth") 72 | pl.axis('tight') 73 | pl.title('Ground truth (9th order polynomial)') 74 | 75 | 76 | -------------------------------------------------------------------------------- /python-data/pdb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# PDB\n", 15 | "\n", 16 | "The pdb module defines an interactive source code debugger for Python programs. 
Below are frequently used commands:" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "# Run pdb when this line is hit\n", 28 | "import pdb; pdb.set_trace()\n", 29 | "\n", 30 | "# Run pdb when the script is run\n", 31 | "python -m pdb script.py\n", 32 | "\n", 33 | "# Help\n", 34 | "h[elp]\n", 35 | "\n", 36 | "# Show current content\n", 37 | "l[ist]\n", 38 | "\n", 39 | "# Examine variables\n", 40 | "p[rint]\n", 41 | "\n", 42 | "# Pretty print\n", 43 | "pp\n", 44 | "\n", 45 | "# Go to next line\n", 46 | "n[ext]\n", 47 | "\n", 48 | "# Step into\n", 49 | "s[tep]\n", 50 | "\n", 51 | "# Continue execution until the line with the line number greater \n", 52 | "# than the current one is reached or when returning from current frame.\n", 53 | "until\n", 54 | "\n", 55 | "# Return\n", 56 | "r[eturn]\n", 57 | "\n", 58 | "# See all breakpoints\n", 59 | "b to see all breakpoints\n", 60 | "\n", 61 | "# Set breakpoint at line 16\n", 62 | "b 16 \n", 63 | "\n", 64 | "# Clear breakpoint 1\n", 65 | "cl[ear] 1\n", 66 | "\n", 67 | "# Continue\n", 68 | "c[ontinue]\n", 69 | "\n", 70 | "# Conditional breakpoints, line 11\n", 71 | "b 11, this_year == 2015\n", 72 | "\n", 73 | "# Stack location\n", 74 | "w[here]\n", 75 | "\n", 76 | "# Go up in stack\n", 77 | "u[p]\n", 78 | "\n", 79 | "# Go down in stack\n", 80 | "d[own]\n", 81 | "\n", 82 | "# Longlist shows full method of where you're in (Python 3)\n", 83 | "ll\n", 84 | "\n", 85 | "# Quit\n", 86 | "q[uit]" 87 | ] 88 | } 89 | ], 90 | "metadata": { 91 | "kernelspec": { 92 | "display_name": "Python 2", 93 | "language": "python", 94 | "name": "python2" 95 | }, 96 | "language_info": { 97 | "codemirror_mode": { 98 | "name": "ipython", 99 | "version": 2 100 | }, 101 | "file_extension": ".py", 102 | "mimetype": "text/x-python", 103 | "name": "python", 104 | "nbconvert_exporter": "python", 105 | "pygments_lexer": "ipython2", 106 | "version": "2.7.10" 107 | } 108 | }, 109 | "nbformat": 4, 110 | "nbformat_minor": 0 111 | } 112 | -------------------------------------------------------------------------------- /analyses/churn_measurements.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | 4 | __author__ = "Eric Chiang" 5 | __email__ = "eric[at]yhathq.com" 6 | 7 | """ 8 | 9 | Measurements inspired by Philip Tetlock's "Expert Political Judgment" 10 | 11 | Equations take from Yaniv, Yates, & Smith (1991): 12 | "Measures of Descrimination Skill in Probabilistic Judgement" 13 | 14 | """ 15 | 16 | 17 | def calibration(prob,outcome,n_bins=10): 18 | """Calibration measurement for a set of predictions. 19 | 20 | When predicting events at a given probability, how far is frequency 21 | of positive outcomes from that probability? 22 | NOTE: Lower scores are better 23 | 24 | prob: array_like, float 25 | Probability estimates for a set of events 26 | 27 | outcome: array_like, bool 28 | If event predicted occurred 29 | 30 | n_bins: int 31 | Number of judgement categories to prefrom calculation over. 32 | Prediction are binned based on probability, since "descrete" 33 | probabilities aren't required. 34 | 35 | """ 36 | prob = np.array(prob) 37 | outcome = np.array(outcome) 38 | 39 | c = 0.0 40 | # Construct bins 41 | judgement_bins = np.arange(n_bins + 1) / n_bins 42 | # Which bin is each prediction in? 
43 | bin_num = np.digitize(prob,judgement_bins) 44 | for j_bin in np.unique(bin_num): 45 | # Is event in bin 46 | in_bin = bin_num == j_bin 47 | # Predicted probability taken as average of preds in bin 48 | predicted_prob = np.mean(prob[in_bin]) 49 | # How often did events in this bin actually happen? 50 | true_bin_prob = np.mean(outcome[in_bin]) 51 | # Squared distance between predicted and true times num of obs 52 | c += np.sum(in_bin) * ((predicted_prob - true_bin_prob) ** 2) 53 | return c / len(prob) 54 | 55 | def discrimination(prob,outcome,n_bins=10): 56 | """Discrimination measurement for a set of predictions. 57 | 58 | For each judgement category, how far from the base probability 59 | is the true frequency of that bin? 60 | NOTE: High scores are better 61 | 62 | prob: array_like, float 63 | Probability estimates for a set of events 64 | 65 | outcome: array_like, bool 66 | If event predicted occurred 67 | 68 | n_bins: int 69 | Number of judgement categories to prefrom calculation over. 70 | Prediction are binned based on probability, since "descrete" 71 | probabilities aren't required. 72 | 73 | """ 74 | prob = np.array(prob) 75 | outcome = np.array(outcome) 76 | 77 | d = 0.0 78 | # Base frequency of outcomes 79 | base_prob = np.mean(outcome) 80 | # Construct bins 81 | judgement_bins = np.arange(n_bins + 1) / n_bins 82 | # Which bin is each prediction in? 83 | bin_num = np.digitize(prob,judgement_bins) 84 | for j_bin in np.unique(bin_num): 85 | in_bin = bin_num == j_bin 86 | true_bin_prob = np.mean(outcome[in_bin]) 87 | # Squared distance between true and base times num of obs 88 | d += np.sum(in_bin) * ((true_bin_prob - base_prob) ** 2) 89 | return d / len(prob) 90 | -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/rnn_tutorial/synthetic.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import numpy as np 3 | 4 | 5 | def mackey_glass(sample_len=1000, tau=17, seed=None, n_samples = 1): 6 | ''' 7 | mackey_glass(sample_len=1000, tau=17, seed = None, n_samples = 1) -> input 8 | Generate the Mackey Glass time-series. Parameters are: 9 | - sample_len: length of the time-series in timesteps. Default is 1000. 10 | - tau: delay of the MG - system. Commonly used values are tau=17 (mild 11 | chaos) and tau=30 (moderate chaos). Default is 17. 12 | - seed: to seed the random generator, can be used to generate the same 13 | timeseries at each invocation. 
14 | - n_samples : number of samples to generate 15 | ''' 16 | delta_t = 10 17 | history_len = tau * delta_t 18 | # Initial conditions for the history of the system 19 | timeseries = 1.2 20 | 21 | if seed is not None: 22 | np.random.seed(seed) 23 | 24 | samples = [] 25 | 26 | for _ in range(n_samples): 27 | history = collections.deque(1.2 * np.ones(history_len) + 0.2 * \ 28 | (np.random.rand(history_len) - 0.5)) 29 | # Preallocate the array for the time-series 30 | inp = np.zeros((sample_len,1)) 31 | 32 | for timestep in range(sample_len): 33 | for _ in range(delta_t): 34 | xtau = history.popleft() 35 | history.append(timeseries) 36 | timeseries = history[-1] + (0.2 * xtau / (1.0 + xtau ** 10) - \ 37 | 0.1 * history[-1]) / delta_t 38 | inp[timestep] = timeseries 39 | 40 | # Squash timeseries through tanh 41 | inp = np.tanh(inp - 1) 42 | samples.append(inp) 43 | return samples 44 | 45 | 46 | def mso(sample_len=1000, n_samples = 1): 47 | ''' 48 | mso(sample_len=1000, n_samples = 1) -> input 49 | Generate the Multiple Sinewave Oscillator time-series, a sum of two sines 50 | with incommensurable periods. Parameters are: 51 | - sample_len: length of the time-series in timesteps 52 | - n_samples: number of samples to generate 53 | ''' 54 | signals = [] 55 | for _ in range(n_samples): 56 | phase = np.random.rand() 57 | x = np.atleast_2d(np.arange(sample_len)).T 58 | signals.append(np.sin(0.2 * x + phase) + np.sin(0.311 * x + phase)) 59 | return signals 60 | 61 | 62 | def lorentz(sample_len=1000, sigma=10, rho=28, beta=8 / 3, step=0.01): 63 | """This function generates a Lorentz time series of length sample_len, 64 | with standard parameters sigma, rho and beta. 65 | """ 66 | 67 | x = np.zeros([sample_len]) 68 | y = np.zeros([sample_len]) 69 | z = np.zeros([sample_len]) 70 | 71 | # Initial conditions taken from 'Chaos and Time Series Analysis', J. 
Sprott 72 | x[0] = 0; 73 | y[0] = -0.01; 74 | z[0] = 9; 75 | 76 | for t in range(sample_len - 1): 77 | x[t + 1] = x[t] + sigma * (y[t] - x[t]) * step 78 | y[t + 1] = y[t] + (x[t] * (rho - z[t]) - y[t]) * step 79 | z[t + 1] = z[t] + (x[t] * y[t] - beta * z[t]) * step 80 | 81 | x.shape += (1,) 82 | y.shape += (1,) 83 | z.shape += (1,) 84 | 85 | return np.concatenate((x, y, z), axis=1) 86 | -------------------------------------------------------------------------------- /scikit-learn/fig_code/scikit-learn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:29899a15bea89b9d8275879798b23011cecabc0eff03dd41bb606324221e0bc3" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "# scikit-learn" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "collapsed": false, 21 | "input": [ 22 | "%matplotlib inline\n", 23 | "\n", 24 | "# set seaborn plot defaults.\n", 25 | "# This can be safely commented out\n", 26 | "import seaborn; seaborn.set()" 27 | ], 28 | "language": "python", 29 | "metadata": {}, 30 | "outputs": [], 31 | "prompt_number": 3 32 | }, 33 | { 34 | "cell_type": "code", 35 | "collapsed": false, 36 | "input": [ 37 | "# Import the example plot from the figures directory\n", 38 | "from fig_code import plot_sgd_separator\n", 39 | "plot_sgd_separator()" 40 | ], 41 | "language": "python", 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "ename": "ImportError", 46 | "evalue": "No module named fig_code", 47 | "output_type": "pyerr", 48 | "traceback": [ 49 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", 50 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Import the example plot from the figures directory\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mfig_code\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mplot_sgd_separator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mplot_sgd_separator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 51 | "\u001b[0;31mImportError\u001b[0m: No module named fig_code" 52 | ] 53 | } 54 | ], 55 | "prompt_number": 4 56 | }, 57 | { 58 | "cell_type": "code", 59 | "collapsed": false, 60 | "input": [], 61 | "language": "python", 62 | "metadata": {}, 63 | "outputs": [] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "collapsed": false, 68 | "input": [], 69 | "language": "python", 70 | "metadata": {}, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "collapsed": false, 76 | "input": [], 77 | "language": "python", 78 | "metadata": {}, 79 | "outputs": [] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "collapsed": false, 84 | "input": [], 85 | "language": "python", 86 | "metadata": {}, 87 | "outputs": [] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "collapsed": false, 92 | "input": [], 93 | "language": "python", 94 | "metadata": {}, 95 | "outputs": [] 96 | } 97 | ], 98 | "metadata": {} 99 | } 100 | ] 101 | } -------------------------------------------------------------------------------- /mapreduce/test_mr_s3_log_parser.py: -------------------------------------------------------------------------------- 1 | 2 | from 
StringIO import StringIO 3 | import unittest2 as unittest 4 | from mr_s3_log_parser import MrS3LogParser 5 | 6 | 7 | class MrTestsUtil: 8 | 9 | def run_mr_sandbox(self, mr_job, stdin): 10 | # inline runs the job in the same process so small jobs tend to 11 | # run faster and stack traces are simpler 12 | # --no-conf prevents options from local mrjob.conf from polluting 13 | # the testing environment 14 | # "-" reads from standard in 15 | mr_job.sandbox(stdin=stdin) 16 | 17 | # make_runner ensures job cleanup is performed regardless of 18 | # success or failure 19 | with mr_job.make_runner() as runner: 20 | runner.run() 21 | for line in runner.stream_output(): 22 | key, value = mr_job.parse_output_line(line) 23 | yield value 24 | 25 | 26 | class TestMrS3LogParser(unittest.TestCase): 27 | 28 | mr_job = None 29 | mr_tests_util = None 30 | 31 | RAW_LOG_LINE_INVALID = \ 32 | '00000fe9688b6e57f75bd2b7f7c1610689e8f01000000' \ 33 | '00000388225bcc00000 ' \ 34 | 's3-storage [22/Jul/2013:21:03:27 +0000] ' \ 35 | '00.111.222.33 ' \ 36 | 37 | RAW_LOG_LINE_VALID = \ 38 | '00000fe9688b6e57f75bd2b7f7c1610689e8f01000000' \ 39 | '00000388225bcc00000 ' \ 40 | 's3-storage [22/Jul/2013:21:03:27 +0000] ' \ 41 | '00.111.222.33 ' \ 42 | 'arn:aws:sts::000005646931:federated-user/user 00000AB825500000 ' \ 43 | 'REST.HEAD.OBJECT user/file.pdf ' \ 44 | '"HEAD /user/file.pdf?versionId=00000XMHZJp6DjM9x500000' \ 45 | '00000SDZk ' \ 46 | 'HTTP/1.1" 200 - - 4000272 18 - "-" ' \ 47 | '"Boto/2.5.1 (darwin) USER-AGENT/1.0.14.0" ' \ 48 | '00000XMHZJp6DjM9x5JVEAMo8MG00000' 49 | 50 | DATE_TIME_ZONE_INVALID = "AB/Jul/2013:21:04:17 +0000" 51 | DATE_TIME_ZONE_VALID = "22/Jul/2013:21:04:17 +0000" 52 | DATE_VALID = "2013-07-22" 53 | DATE_TIME_VALID = "2013-07-22 21:04:17" 54 | TIME_ZONE_VALID = "+0000" 55 | 56 | def __init__(self, *args, **kwargs): 57 | super(TestMrS3LogParser, self).__init__(*args, **kwargs) 58 | self.mr_job = MrS3LogParser(['-r', 'inline', '--no-conf', '-']) 59 | self.mr_tests_util = MrTestsUtil() 60 | 61 | def test_invalid_log_lines(self): 62 | stdin = StringIO(self.RAW_LOG_LINE_INVALID) 63 | 64 | for result in self.mr_tests_util.run_mr_sandbox(self.mr_job, stdin): 65 | self.assertEqual(result.find("Error"), 0) 66 | 67 | def test_valid_log_lines(self): 68 | stdin = StringIO(self.RAW_LOG_LINE_VALID) 69 | 70 | for result in self.mr_tests_util.run_mr_sandbox(self.mr_job, stdin): 71 | self.assertEqual(result.find("Error"), -1) 72 | 73 | def test_clean_date_time_zone(self): 74 | date, date_time, time_zone_parsed = \ 75 | self.mr_job.clean_date_time_zone(self.DATE_TIME_ZONE_VALID) 76 | self.assertEqual(date, self.DATE_VALID) 77 | self.assertEqual(date_time, self.DATE_TIME_VALID) 78 | self.assertEqual(time_zone_parsed, self.TIME_ZONE_VALID) 79 | 80 | # Use a lambda to delay the calling of clean_date_time_zone so that 81 | # assertRaises has enough time to handle it properly 82 | self.assertRaises(ValueError, 83 | lambda: self.mr_job.clean_date_time_zone( 84 | self.DATE_TIME_ZONE_INVALID)) 85 | 86 | if __name__ == '__main__': 87 | unittest.main() 88 | -------------------------------------------------------------------------------- /scipy/nsfg.py: -------------------------------------------------------------------------------- 1 | """This file contains code for use with "Think Stats", 2 | by Allen B. Downey, available from greenteapress.com 3 | 4 | Copyright 2010 Allen B. 
Downey 5 | License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html 6 | """ 7 | 8 | from __future__ import print_function 9 | 10 | from collections import defaultdict 11 | import numpy as np 12 | import sys 13 | 14 | import thinkstats2 15 | 16 | 17 | def ReadFemPreg(dct_file='2002FemPreg.dct', 18 | dat_file='2002FemPreg.dat.gz'): 19 | """Reads the NSFG pregnancy data. 20 | 21 | dct_file: string file name 22 | dat_file: string file name 23 | 24 | returns: DataFrame 25 | """ 26 | dct = thinkstats2.ReadStataDct(dct_file) 27 | df = dct.ReadFixedWidth(dat_file, compression='gzip') 28 | CleanFemPreg(df) 29 | return df 30 | 31 | 32 | def CleanFemPreg(df): 33 | """Recodes variables from the pregnancy frame. 34 | 35 | df: DataFrame 36 | """ 37 | # mother's age is encoded in centiyears; convert to years 38 | df.agepreg /= 100.0 39 | 40 | # birthwgt_lb contains at least one bogus value (51 lbs) 41 | # replace with NaN 42 | df.birthwgt_lb[df.birthwgt_lb > 20] = np.nan 43 | 44 | # replace 'not ascertained', 'refused', 'don't know' with NaN 45 | na_vals = [97, 98, 99] 46 | df.birthwgt_lb.replace(na_vals, np.nan, inplace=True) 47 | df.birthwgt_oz.replace(na_vals, np.nan, inplace=True) 48 | df.hpagelb.replace(na_vals, np.nan, inplace=True) 49 | 50 | df.babysex.replace([7, 9], np.nan, inplace=True) 51 | df.nbrnaliv.replace([9], np.nan, inplace=True) 52 | 53 | # birthweight is stored in two columns, lbs and oz. 54 | # convert to a single column in lb 55 | # NOTE: creating a new column requires dictionary syntax, 56 | # not attribute assignment (like df.totalwgt_lb) 57 | df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0 58 | 59 | # due to a bug in ReadStataDct, the last variable gets clipped; 60 | # so for now set it to NaN 61 | df.cmintvw = np.nan 62 | 63 | 64 | def MakePregMap(df): 65 | """Make a map from caseid to list of preg indices. 66 | 67 | df: DataFrame 68 | 69 | returns: dict that maps from caseid to list of indices into preg df 70 | """ 71 | d = defaultdict(list) 72 | for index, caseid in df.caseid.iteritems(): 73 | d[caseid].append(index) 74 | return d 75 | 76 | 77 | def main(script): 78 | """Tests the functions in this module. 79 | 80 | script: string script name 81 | """ 82 | df = ReadFemPreg() 83 | print(df.shape) 84 | 85 | assert len(df) == 13593 86 | 87 | assert df.caseid[13592] == 12571 88 | assert df.pregordr.value_counts()[1] == 5033 89 | assert df.nbrnaliv.value_counts()[1] == 8981 90 | assert df.babysex.value_counts()[1] == 4641 91 | assert df.birthwgt_lb.value_counts()[7] == 3049 92 | assert df.birthwgt_oz.value_counts()[0] == 1037 93 | assert df.prglngth.value_counts()[39] == 4744 94 | assert df.outcome.value_counts()[1] == 9148 95 | assert df.birthord.value_counts()[1] == 4413 96 | assert df.agepreg.value_counts()[22.75] == 100 97 | assert df.totalwgt_lb.value_counts()[7.5] == 302 98 | 99 | weights = df.finalwgt.value_counts() 100 | key = max(weights.keys()) 101 | assert df.finalwgt.value_counts()[key] == 6 102 | 103 | print('%s: All tests passed.' 
% script) 104 | 105 | if __name__ == '__main__': 106 | main(*sys.argv) 107 | -------------------------------------------------------------------------------- /data/ozone.csv: -------------------------------------------------------------------------------- 1 | "Ozone","Solar.R","Wind","Temp","Month","Day" 2 | 41,190,7.4,67,5,1 3 | 36,118,8,72,5,2 4 | 12,149,12.6,74,5,3 5 | 18,313,11.5,62,5,4 6 | NA,NA,14.3,56,5,5 7 | 28,NA,14.9,66,5,6 8 | 23,299,8.6,65,5,7 9 | 19,99,13.8,59,5,8 10 | 8,19,20.1,61,5,9 11 | NA,194,8.6,69,5,10 12 | 7,NA,6.9,74,5,11 13 | 16,256,9.7,69,5,12 14 | 11,290,9.2,66,5,13 15 | 14,274,10.9,68,5,14 16 | 18,65,13.2,58,5,15 17 | 14,334,11.5,64,5,16 18 | 34,307,12,66,5,17 19 | 6,78,18.4,57,5,18 20 | 30,322,11.5,68,5,19 21 | 11,44,9.7,62,5,20 22 | 1,8,9.7,59,5,21 23 | 11,320,16.6,73,5,22 24 | 4,25,9.7,61,5,23 25 | 32,92,12,61,5,24 26 | NA,66,16.6,57,5,25 27 | NA,266,14.9,58,5,26 28 | NA,NA,8,57,5,27 29 | 23,13,12,67,5,28 30 | 45,252,14.9,81,5,29 31 | 115,223,5.7,79,5,30 32 | 37,279,7.4,76,5,31 33 | NA,286,8.6,78,6,1 34 | NA,287,9.7,74,6,2 35 | NA,242,16.1,67,6,3 36 | NA,186,9.2,84,6,4 37 | NA,220,8.6,85,6,5 38 | NA,264,14.3,79,6,6 39 | 29,127,9.7,82,6,7 40 | NA,273,6.9,87,6,8 41 | 71,291,13.8,90,6,9 42 | 39,323,11.5,87,6,10 43 | NA,259,10.9,93,6,11 44 | NA,250,9.2,92,6,12 45 | 23,148,8,82,6,13 46 | NA,332,13.8,80,6,14 47 | NA,322,11.5,79,6,15 48 | 21,191,14.9,77,6,16 49 | 37,284,20.7,72,6,17 50 | 20,37,9.2,65,6,18 51 | 12,120,11.5,73,6,19 52 | 13,137,10.3,76,6,20 53 | NA,150,6.3,77,6,21 54 | NA,59,1.7,76,6,22 55 | NA,91,4.6,76,6,23 56 | NA,250,6.3,76,6,24 57 | NA,135,8,75,6,25 58 | NA,127,8,78,6,26 59 | NA,47,10.3,73,6,27 60 | NA,98,11.5,80,6,28 61 | NA,31,14.9,77,6,29 62 | NA,138,8,83,6,30 63 | 135,269,4.1,84,7,1 64 | 49,248,9.2,85,7,2 65 | 32,236,9.2,81,7,3 66 | NA,101,10.9,84,7,4 67 | 64,175,4.6,83,7,5 68 | 40,314,10.9,83,7,6 69 | 77,276,5.1,88,7,7 70 | 97,267,6.3,92,7,8 71 | 97,272,5.7,92,7,9 72 | 85,175,7.4,89,7,10 73 | NA,139,8.6,82,7,11 74 | 10,264,14.3,73,7,12 75 | 27,175,14.9,81,7,13 76 | NA,291,14.9,91,7,14 77 | 7,48,14.3,80,7,15 78 | 48,260,6.9,81,7,16 79 | 35,274,10.3,82,7,17 80 | 61,285,6.3,84,7,18 81 | 79,187,5.1,87,7,19 82 | 63,220,11.5,85,7,20 83 | 16,7,6.9,74,7,21 84 | NA,258,9.7,81,7,22 85 | NA,295,11.5,82,7,23 86 | 80,294,8.6,86,7,24 87 | 108,223,8,85,7,25 88 | 20,81,8.6,82,7,26 89 | 52,82,12,86,7,27 90 | 82,213,7.4,88,7,28 91 | 50,275,7.4,86,7,29 92 | 64,253,7.4,83,7,30 93 | 59,254,9.2,81,7,31 94 | 39,83,6.9,81,8,1 95 | 9,24,13.8,81,8,2 96 | 16,77,7.4,82,8,3 97 | 78,NA,6.9,86,8,4 98 | 35,NA,7.4,85,8,5 99 | 66,NA,4.6,87,8,6 100 | 122,255,4,89,8,7 101 | 89,229,10.3,90,8,8 102 | 110,207,8,90,8,9 103 | NA,222,8.6,92,8,10 104 | NA,137,11.5,86,8,11 105 | 44,192,11.5,86,8,12 106 | 28,273,11.5,82,8,13 107 | 65,157,9.7,80,8,14 108 | NA,64,11.5,79,8,15 109 | 22,71,10.3,77,8,16 110 | 59,51,6.3,79,8,17 111 | 23,115,7.4,76,8,18 112 | 31,244,10.9,78,8,19 113 | 44,190,10.3,78,8,20 114 | 21,259,15.5,77,8,21 115 | 9,36,14.3,72,8,22 116 | NA,255,12.6,75,8,23 117 | 45,212,9.7,79,8,24 118 | 168,238,3.4,81,8,25 119 | 73,215,8,86,8,26 120 | NA,153,5.7,88,8,27 121 | 76,203,9.7,97,8,28 122 | 118,225,2.3,94,8,29 123 | 84,237,6.3,96,8,30 124 | 85,188,6.3,94,8,31 125 | 96,167,6.9,91,9,1 126 | 78,197,5.1,92,9,2 127 | 73,183,2.8,93,9,3 128 | 91,189,4.6,93,9,4 129 | 47,95,7.4,87,9,5 130 | 32,92,15.5,84,9,6 131 | 20,252,10.9,80,9,7 132 | 23,220,10.3,78,9,8 133 | 21,230,10.9,75,9,9 134 | 24,259,9.7,73,9,10 135 | 44,236,14.9,81,9,11 136 | 21,259,15.5,76,9,12 137 | 28,238,6.3,77,9,13 138 | 
9,24,10.9,71,9,14 139 | 13,112,11.5,71,9,15 140 | 46,237,6.9,78,9,16 141 | 18,224,13.8,67,9,17 142 | 13,27,10.3,76,9,18 143 | 24,238,10.3,68,9,19 144 | 16,201,8,82,9,20 145 | 13,238,12.6,64,9,21 146 | 23,14,9.2,71,9,22 147 | 36,139,10.3,81,9,23 148 | 7,49,10.3,69,9,24 149 | 14,20,16.6,63,9,25 150 | 30,193,6.9,70,9,26 151 | NA,145,13.2,77,9,27 152 | 14,191,14.3,75,9,28 153 | 18,131,8,76,9,29 154 | 20,223,11.5,68,9,30 155 | -------------------------------------------------------------------------------- /data/titanic/gendermodel.py: -------------------------------------------------------------------------------- 1 | """ This simple code is designed to teach a basic user to read in the files in python, simply find what proportion of males and females survived and make a predictive model based on this 2 | Author : AstroDave 3 | Date : 18 September 2012 4 | Revised: 28 March 2014 5 | 6 | """ 7 | 8 | 9 | import csv as csv 10 | import numpy as np 11 | 12 | csv_file_object = csv.reader(open('train.csv', 'rb')) # Load in the csv file 13 | header = csv_file_object.next() # Skip the first line as it is a header 14 | data=[] # Create a variable to hold the data 15 | 16 | for row in csv_file_object: # Step through each row in the csv file, 17 | data.append(row[0:]) # adding each row to the data variable 18 | data = np.array(data) # Then convert from a list to an array. 19 | 20 | # Now I have an array of 12 columns and 891 rows 21 | # I can access any element I want, so the entire first column would 22 | # be data[0::,0].astype(np.float) -- This means all of the rows (from start to end), in column 0 23 | # I have to add the .astype() command, because 24 | # when appending the rows, python thought it was a string - so needed to convert 25 | 26 | # Set some variables 27 | number_passengers = np.size(data[0::,1].astype(np.float)) 28 | number_survived = np.sum(data[0::,1].astype(np.float)) 29 | proportion_survivors = number_survived / number_passengers 30 | 31 | # I can now find the stats of all the women on board, 32 | # by making an array that lists True/False whether each row is female 33 | women_only_stats = data[0::,4] == "female" # This finds where all the women are 34 | men_only_stats = data[0::,4] != "female" # This finds where all the men are (note != means 'not equal') 35 | 36 | # I can now filter the whole data, to find statistics for just women, by just placing 37 | # women_only_stats as a "mask" on my full data -- Use it in place of the '0::' part of the array index. 38 | # You can test it by placing it there, and requesting column index [4], and the output should all read 'female' 39 | # e.g. try typing this: data[women_only_stats,4] 40 | women_onboard = data[women_only_stats,1].astype(np.float) 41 | men_onboard = data[men_only_stats,1].astype(np.float) 42 | 43 | # and derive some statistics about them 44 | proportion_women_survived = np.sum(women_onboard) / np.size(women_onboard) 45 | proportion_men_survived = np.sum(men_onboard) / np.size(men_onboard) 46 | 47 | print 'Proportion of women who survived is %s' % proportion_women_survived 48 | print 'Proportion of men who survived is %s' % proportion_men_survived 49 | 50 | # Now that I have my indicator that women were much more likely to survive, 51 | # I am done with the training set.
52 | # Now I will read in the test file and write out my simplistic prediction: 53 | # if female, then model that she survived (1) 54 | # if male, then model that he did not survive (0) 55 | 56 | # First, read in test.csv 57 | test_file = open('test.csv', 'rb') 58 | test_file_object = csv.reader(test_file) 59 | header = test_file_object.next() 60 | 61 | # Also open the a new file so I can write to it. Call it something descriptive 62 | # Finally, loop through each row in the train file, and look in column index [3] (which is 'Sex') 63 | # Write out the PassengerId, and my prediction. 64 | 65 | predictions_file = open("gendermodel.csv", "wb") 66 | predictions_file_object = csv.writer(predictions_file) 67 | predictions_file_object.writerow(["PassengerId", "Survived"]) # write the column headers 68 | for row in test_file_object: # For each row in test file, 69 | if row[3] == 'female': # is it a female, if yes then 70 | predictions_file_object.writerow([row[0], "1"]) # write the PassengerId, and predict 1 71 | else: # or else if male, 72 | predictions_file_object.writerow([row[0], "0"]) # write the PassengerId, and predict 0. 73 | test_file.close() # Close out the files. 74 | predictions_file.close() 75 | 76 | -------------------------------------------------------------------------------- /data/ozone_copy.csv: -------------------------------------------------------------------------------- 1 | 41.0,190.0,7.4,67,5,1 2 | 36.0,118.0,8.0,72,5,2 3 | 12.0,149.0,12.6,74,5,3 4 | 18.0,313.0,11.5,62,5,4 5 | ,,14.3,56,5,5 6 | 28.0,,14.9,66,5,6 7 | 23.0,299.0,8.6,65,5,7 8 | 19.0,99.0,13.8,59,5,8 9 | 8.0,19.0,20.1,61,5,9 10 | ,194.0,8.6,69,5,10 11 | 7.0,,6.9,74,5,11 12 | 16.0,256.0,9.7,69,5,12 13 | 11.0,290.0,9.2,66,5,13 14 | 14.0,274.0,10.9,68,5,14 15 | 18.0,65.0,13.2,58,5,15 16 | 14.0,334.0,11.5,64,5,16 17 | 34.0,307.0,12.0,66,5,17 18 | 6.0,78.0,18.4,57,5,18 19 | 30.0,322.0,11.5,68,5,19 20 | 11.0,44.0,9.7,62,5,20 21 | 1.0,8.0,9.7,59,5,21 22 | 11.0,320.0,16.6,73,5,22 23 | 4.0,25.0,9.7,61,5,23 24 | 32.0,92.0,12.0,61,5,24 25 | ,66.0,16.6,57,5,25 26 | ,266.0,14.9,58,5,26 27 | ,,8.0,57,5,27 28 | 23.0,13.0,12.0,67,5,28 29 | 45.0,252.0,14.9,81,5,29 30 | 115.0,223.0,5.7,79,5,30 31 | 37.0,279.0,7.4,76,5,31 32 | ,286.0,8.6,78,6,1 33 | ,287.0,9.7,74,6,2 34 | ,242.0,16.1,67,6,3 35 | ,186.0,9.2,84,6,4 36 | ,220.0,8.6,85,6,5 37 | ,264.0,14.3,79,6,6 38 | 29.0,127.0,9.7,82,6,7 39 | ,273.0,6.9,87,6,8 40 | 71.0,291.0,13.8,90,6,9 41 | 39.0,323.0,11.5,87,6,10 42 | ,259.0,10.9,93,6,11 43 | ,250.0,9.2,92,6,12 44 | 23.0,148.0,8.0,82,6,13 45 | ,332.0,13.8,80,6,14 46 | ,322.0,11.5,79,6,15 47 | 21.0,191.0,14.9,77,6,16 48 | 37.0,284.0,20.7,72,6,17 49 | 20.0,37.0,9.2,65,6,18 50 | 12.0,120.0,11.5,73,6,19 51 | 13.0,137.0,10.3,76,6,20 52 | ,150.0,6.3,77,6,21 53 | ,59.0,1.7,76,6,22 54 | ,91.0,4.6,76,6,23 55 | ,250.0,6.3,76,6,24 56 | ,135.0,8.0,75,6,25 57 | ,127.0,8.0,78,6,26 58 | ,47.0,10.3,73,6,27 59 | ,98.0,11.5,80,6,28 60 | ,31.0,14.9,77,6,29 61 | ,138.0,8.0,83,6,30 62 | 135.0,269.0,4.1,84,7,1 63 | 49.0,248.0,9.2,85,7,2 64 | 32.0,236.0,9.2,81,7,3 65 | ,101.0,10.9,84,7,4 66 | 64.0,175.0,4.6,83,7,5 67 | 40.0,314.0,10.9,83,7,6 68 | 77.0,276.0,5.1,88,7,7 69 | 97.0,267.0,6.3,92,7,8 70 | 97.0,272.0,5.7,92,7,9 71 | 85.0,175.0,7.4,89,7,10 72 | ,139.0,8.6,82,7,11 73 | 10.0,264.0,14.3,73,7,12 74 | 27.0,175.0,14.9,81,7,13 75 | ,291.0,14.9,91,7,14 76 | 7.0,48.0,14.3,80,7,15 77 | 48.0,260.0,6.9,81,7,16 78 | 35.0,274.0,10.3,82,7,17 79 | 61.0,285.0,6.3,84,7,18 80 | 79.0,187.0,5.1,87,7,19 81 | 63.0,220.0,11.5,85,7,20 82 | 16.0,7.0,6.9,74,7,21 83 | 
,258.0,9.7,81,7,22 84 | ,295.0,11.5,82,7,23 85 | 80.0,294.0,8.6,86,7,24 86 | 108.0,223.0,8.0,85,7,25 87 | 20.0,81.0,8.6,82,7,26 88 | 52.0,82.0,12.0,86,7,27 89 | 82.0,213.0,7.4,88,7,28 90 | 50.0,275.0,7.4,86,7,29 91 | 64.0,253.0,7.4,83,7,30 92 | 59.0,254.0,9.2,81,7,31 93 | 39.0,83.0,6.9,81,8,1 94 | 9.0,24.0,13.8,81,8,2 95 | 16.0,77.0,7.4,82,8,3 96 | 78.0,,6.9,86,8,4 97 | 35.0,,7.4,85,8,5 98 | 66.0,,4.6,87,8,6 99 | 122.0,255.0,4.0,89,8,7 100 | 89.0,229.0,10.3,90,8,8 101 | 110.0,207.0,8.0,90,8,9 102 | ,222.0,8.6,92,8,10 103 | ,137.0,11.5,86,8,11 104 | 44.0,192.0,11.5,86,8,12 105 | 28.0,273.0,11.5,82,8,13 106 | 65.0,157.0,9.7,80,8,14 107 | ,64.0,11.5,79,8,15 108 | 22.0,71.0,10.3,77,8,16 109 | 59.0,51.0,6.3,79,8,17 110 | 23.0,115.0,7.4,76,8,18 111 | 31.0,244.0,10.9,78,8,19 112 | 44.0,190.0,10.3,78,8,20 113 | 21.0,259.0,15.5,77,8,21 114 | 9.0,36.0,14.3,72,8,22 115 | ,255.0,12.6,75,8,23 116 | 45.0,212.0,9.7,79,8,24 117 | 168.0,238.0,3.4,81,8,25 118 | 73.0,215.0,8.0,86,8,26 119 | ,153.0,5.7,88,8,27 120 | 76.0,203.0,9.7,97,8,28 121 | 118.0,225.0,2.3,94,8,29 122 | 84.0,237.0,6.3,96,8,30 123 | 85.0,188.0,6.3,94,8,31 124 | 96.0,167.0,6.9,91,9,1 125 | 78.0,197.0,5.1,92,9,2 126 | 73.0,183.0,2.8,93,9,3 127 | 91.0,189.0,4.6,93,9,4 128 | 47.0,95.0,7.4,87,9,5 129 | 32.0,92.0,15.5,84,9,6 130 | 20.0,252.0,10.9,80,9,7 131 | 23.0,220.0,10.3,78,9,8 132 | 21.0,230.0,10.9,75,9,9 133 | 24.0,259.0,9.7,73,9,10 134 | 44.0,236.0,14.9,81,9,11 135 | 21.0,259.0,15.5,76,9,12 136 | 28.0,238.0,6.3,77,9,13 137 | 9.0,24.0,10.9,71,9,14 138 | 13.0,112.0,11.5,71,9,15 139 | 46.0,237.0,6.9,78,9,16 140 | 18.0,224.0,13.8,67,9,17 141 | 13.0,27.0,10.3,76,9,18 142 | 24.0,238.0,10.3,68,9,19 143 | 16.0,201.0,8.0,82,9,20 144 | 13.0,238.0,12.6,64,9,21 145 | 23.0,14.0,9.2,71,9,22 146 | 36.0,139.0,10.3,81,9,23 147 | 7.0,49.0,10.3,69,9,24 148 | 14.0,20.0,16.6,63,9,25 149 | 30.0,193.0,6.9,70,9,26 150 | ,145.0,13.2,77,9,27 151 | 14.0,191.0,14.3,75,9,28 152 | 18.0,131.0,8.0,76,9,29 153 | 20.0,223.0,11.5,68,9,30 154 | -------------------------------------------------------------------------------- /python-data/files.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Files\n", 15 | "\n", 16 | "* Read a File\n", 17 | "* Write a File\n", 18 | "* Read and Write UTF-8" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Read a File\n", 26 | "\n", 27 | "Open a file in read-only mode.\n", 28 | "Iterate over the file lines. rstrip removes the EOL markers." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "class TypeUtil:\n", 43 | "\n", 44 | " @classmethod\n", 45 | " def is_iterable(cls, obj):\n", 46 | " \"\"\"Determines if obj is iterable.\n", 47 | "\n", 48 | " Useful when writing functions that can accept multiple types of\n", 49 | " input (list, tuple, ndarray, iterator). 
Pairs well with\n", 50 | " convert_to_list.\n", 51 | " \"\"\"\n", 52 | " try:\n", 53 | " iter(obj)\n", 54 | " return True\n", 55 | " except TypeError:\n", 56 | " return False\n", 57 | "\n", 58 | " @classmethod\n", 59 | " def convert_to_list(cls, obj):\n", 60 | " \"\"\"Converts obj to a list if it is not a list and it is iterable,\n", 61 | " else returns the original obj.\n", 62 | " \"\"\"\n", 63 | " if not isinstance(obj, list) and cls.is_iterable(obj):\n", 64 | " obj = list(obj)\n", 65 | " return obj\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "old_file_path = 'type_util.py'\n", 71 | "with open(old_file_path, 'r') as old_file:\n", 72 | " for line in old_file:\n", 73 | " print(line.rstrip())" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "## Write to a file\n", 81 | "\n", 82 | "Create a new file overwriting any previous file with the same name, write text, then close the file:" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 2, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "new_file_path = 'hello_world.txt'\n", 94 | "with open(new_file_path, 'w') as new_file:\n", 95 | " new_file.write('hello world!')" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## Read and Write UTF-8" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 3, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "import codecs\n", 114 | "with codecs.open(\"hello_world_new.txt\", \"a\", \"utf-8\") as new_file:\n", 115 | " with codecs.open(\"hello_world.txt\", \"r\", \"utf-8\") as old_file: \n", 116 | " for line in old_file:\n", 117 | " new_file.write(line + '\\n')" 118 | ] 119 | } 120 | ], 121 | "metadata": { 122 | "kernelspec": { 123 | "display_name": "Python 2", 124 | "language": "python", 125 | "name": "python2" 126 | }, 127 | "language_info": { 128 | "codemirror_mode": { 129 | "name": "ipython", 130 | "version": 2 131 | }, 132 | "file_extension": ".py", 133 | "mimetype": "text/x-python", 134 | "name": "python", 135 | "nbconvert_exporter": "python", 136 | "pygments_lexer": "ipython2", 137 | "version": "2.7.10" 138 | } 139 | }, 140 | "nbformat": 4, 141 | "nbformat_minor": 0 142 | } 143 | -------------------------------------------------------------------------------- /data/titanic/myfirstforest.py: -------------------------------------------------------------------------------- 1 | """ Writing my first randomforest code. 2 | Author : AstroDave 3 | Date : 23rd September 2012 4 | Revised: 15 April 2014 5 | please see packages.python.org/milk/randomforests.html for more 6 | 7 | """ 8 | import pandas as pd 9 | import numpy as np 10 | import csv as csv 11 | from sklearn.ensemble import RandomForestClassifier 12 | 13 | # Data cleanup 14 | # TRAIN DATA 15 | train_df = pd.read_csv('train.csv', header=0) # Load the train file into a dataframe 16 | 17 | # I need to convert all strings to integer classifiers. 18 | # I need to fill in the missing values of the data and make it complete. 19 | 20 | # female = 0, Male = 1 21 | train_df['Gender'] = train_df['Sex'].map( {'female': 0, 'male': 1} ).astype(int) 22 | 23 | # Embarked from 'C', 'Q', 'S' 24 | # Note this is not ideal: in translating categories to numbers, Port "2" is not 2 times greater than Port "1", etc. 
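# [Editor's sketch -- not part of the original script.] The Ports_dict mapping built
# further below is exactly the ordinal encoding the comment above warns about. A
# hypothetical alternative, using only pandas (already imported as pd), is one-hot
# encoding, so no ordering is implied between ports:
#
#   embarked_dummies = pd.get_dummies(train_df['Embarked'], prefix='Embarked')
#   train_df = train_df.join(embarked_dummies)
#
# For a tree-based model like the random forest used here the ordinal encoding is
# usually harmless, which is presumably why the author kept it.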
25 | 26 | # All missing Embarked -> just make them embark from most common place 27 | if len(train_df.Embarked[ train_df.Embarked.isnull() ]) > 0: 28 | train_df.Embarked[ train_df.Embarked.isnull() ] = train_df.Embarked.dropna().mode().values 29 | 30 | Ports = list(enumerate(np.unique(train_df['Embarked']))) # determine all values of Embarked, 31 | Ports_dict = { name : i for i, name in Ports } # set up a dictionary in the form Ports : index 32 | train_df.Embarked = train_df.Embarked.map( lambda x: Ports_dict[x]).astype(int) # Convert all Embark strings to int 33 | 34 | # All the ages with no data -> make the median of all Ages 35 | median_age = train_df['Age'].dropna().median() 36 | if len(train_df.Age[ train_df.Age.isnull() ]) > 0: 37 | train_df.loc[ (train_df.Age.isnull()), 'Age'] = median_age 38 | 39 | # Remove the Name column, Cabin, Ticket, and Sex (since I copied and filled it to Gender) 40 | train_df = train_df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId'], axis=1) 41 | 42 | 43 | # TEST DATA 44 | test_df = pd.read_csv('test.csv', header=0) # Load the test file into a dataframe 45 | 46 | # I need to do the same with the test data now, so that the columns are the same as the training data 47 | # I need to convert all strings to integer classifiers: 48 | # female = 0, Male = 1 49 | test_df['Gender'] = test_df['Sex'].map( {'female': 0, 'male': 1} ).astype(int) 50 | 51 | # Embarked from 'C', 'Q', 'S' 52 | # All missing Embarked -> just make them embark from most common place 53 | if len(test_df.Embarked[ test_df.Embarked.isnull() ]) > 0: 54 | test_df.Embarked[ test_df.Embarked.isnull() ] = test_df.Embarked.dropna().mode().values 55 | # Again convert all Embarked strings to int 56 | test_df.Embarked = test_df.Embarked.map( lambda x: Ports_dict[x]).astype(int) 57 | 58 | 59 | # All the ages with no data -> make the median of all Ages 60 | median_age = test_df['Age'].dropna().median() 61 | if len(test_df.Age[ test_df.Age.isnull() ]) > 0: 62 | test_df.loc[ (test_df.Age.isnull()), 'Age'] = median_age 63 | 64 | # All the missing Fares -> assume median of their respective class 65 | if len(test_df.Fare[ test_df.Fare.isnull() ]) > 0: 66 | median_fare = np.zeros(3) 67 | for f in range(0,3): # loop 0 to 2 68 | median_fare[f] = test_df[ test_df.Pclass == f+1 ]['Fare'].dropna().median() 69 | for f in range(0,3): # loop 0 to 2 70 | test_df.loc[ (test_df.Fare.isnull()) & (test_df.Pclass == f+1 ), 'Fare'] = median_fare[f] 71 | 72 | # Collect the test data's PassengerIds before dropping it 73 | ids = test_df['PassengerId'].values 74 | # Remove the Name column, Cabin, Ticket, and Sex (since I copied and filled it to Gender) 75 | test_df = test_df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId'], axis=1) 76 | 77 | 78 | # The data is now ready to go. So lets fit to the train, then predict to the test! 79 | # Convert back to a numpy array 80 | train_data = train_df.values 81 | test_data = test_df.values 82 | 83 | 84 | print 'Training...' 85 | forest = RandomForestClassifier(n_estimators=100) 86 | forest = forest.fit( train_data[0::,1::], train_data[0::,0] ) 87 | 88 | print 'Predicting...' 89 | output = forest.predict(test_data).astype(int) 90 | 91 | 92 | predictions_file = open("myfirstforest.csv", "wb") 93 | open_file_object = csv.writer(predictions_file) 94 | open_file_object.writerow(["PassengerId","Survived"]) 95 | open_file_object.writerows(zip(ids, output)) 96 | predictions_file.close() 97 | print 'Done.' 
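# [Editor's sketch -- not part of the original script.] A minimal way to sanity-check
# the forest before writing predictions is k-fold cross-validation on the training
# data. This assumes the scikit-learn of this era (sklearn.cross_validation; newer
# releases moved the same helper to sklearn.model_selection):
#
#   from sklearn.cross_validation import cross_val_score
#   cv_scores = cross_val_score(RandomForestClassifier(n_estimators=100),
#                               train_data[0::, 1::], train_data[0::, 0], cv=5)
#   print 'CV accuracy: %.3f +/- %.3f' % (cv_scores.mean(), cv_scores.std())
#
# This only estimates out-of-sample accuracy; it does not change the submission file.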
98 | -------------------------------------------------------------------------------- /scipy/first.py: -------------------------------------------------------------------------------- 1 | """This file contains code used in "Think Stats", 2 | by Allen B. Downey, available from greenteapress.com 3 | 4 | Copyright 2014 Allen B. Downey 5 | License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html 6 | """ 7 | 8 | from __future__ import print_function 9 | 10 | import math 11 | import numpy as np 12 | 13 | import nsfg 14 | import thinkstats2 15 | import thinkplot 16 | 17 | 18 | def MakeFrames(): 19 | """Reads pregnancy data and partitions first babies and others. 20 | 21 | returns: DataFrames (all live births, first babies, others) 22 | """ 23 | preg = nsfg.ReadFemPreg() 24 | 25 | live = preg[preg.outcome == 1] 26 | firsts = live[live.birthord == 1] 27 | others = live[live.birthord != 1] 28 | 29 | assert len(live) == 9148 30 | assert len(firsts) == 4413 31 | assert len(others) == 4735 32 | 33 | return live, firsts, others 34 | 35 | 36 | def Summarize(live, firsts, others): 37 | """Print various summary statistics.""" 38 | 39 | mean = live.prglngth.mean() 40 | var = live.prglngth.var() 41 | std = live.prglngth.std() 42 | 43 | print('Live mean', mean) 44 | print('Live variance', var) 45 | print('Live std', std) 46 | 47 | mean1 = firsts.prglngth.mean() 48 | mean2 = others.prglngth.mean() 49 | 50 | var1 = firsts.prglngth.var() 51 | var2 = others.prglngth.var() 52 | 53 | print('Mean') 54 | print('First babies', mean1) 55 | print('Others', mean2) 56 | 57 | print('Variance') 58 | print('First babies', var1) 59 | print('Others', var2) 60 | 61 | print('Difference in weeks', mean1 - mean2) 62 | print('Difference in hours', (mean1 - mean2) * 7 * 24) 63 | 64 | print('Difference relative to 39 weeks', (mean1 - mean2) / 39 * 100) 65 | 66 | d = thinkstats2.CohenEffectSize(firsts.prglngth, others.prglngth) 67 | print('Cohen d', d) 68 | 69 | 70 | def PrintExtremes(live): 71 | """Plots the histogram of pregnancy lengths and prints the extremes. 
72 | 73 | live: DataFrame of live births 74 | """ 75 | hist = thinkstats2.Hist(live.prglngth) 76 | thinkplot.Hist(hist, label='live births') 77 | 78 | thinkplot.Save(root='first_nsfg_hist_live', 79 | title='Histogram', 80 | xlabel='weeks', 81 | ylabel='frequency') 82 | 83 | print('Shortest lengths:') 84 | for weeks, freq in hist.Smallest(10): 85 | print(weeks, freq) 86 | 87 | print('Longest lengths:') 88 | for weeks, freq in hist.Largest(10): 89 | print(weeks, freq) 90 | 91 | 92 | def MakeHists(live): 93 | """Plot Hists for live births 94 | 95 | live: DataFrame 96 | others: DataFrame 97 | """ 98 | hist = thinkstats2.Hist(live.birthwgt_lb, label='birthwgt_lb') 99 | thinkplot.Hist(hist) 100 | thinkplot.Save(root='first_wgt_lb_hist', 101 | xlabel='pounds', 102 | ylabel='frequency', 103 | axis=[-1, 14, 0, 3200]) 104 | 105 | hist = thinkstats2.Hist(live.birthwgt_oz, label='birthwgt_oz') 106 | thinkplot.Hist(hist) 107 | thinkplot.Save(root='first_wgt_oz_hist', 108 | xlabel='ounces', 109 | ylabel='frequency', 110 | axis=[-1, 16, 0, 1200]) 111 | 112 | hist = thinkstats2.Hist(np.floor(live.agepreg), label='agepreg') 113 | thinkplot.Hist(hist) 114 | thinkplot.Save(root='first_agepreg_hist', 115 | xlabel='years', 116 | ylabel='frequency') 117 | 118 | hist = thinkstats2.Hist(live.prglngth, label='prglngth') 119 | thinkplot.Hist(hist) 120 | thinkplot.Save(root='first_prglngth_hist', 121 | xlabel='weeks', 122 | ylabel='frequency', 123 | axis=[-1, 53, 0, 5000]) 124 | 125 | 126 | def MakeComparison(firsts, others): 127 | """Plots histograms of pregnancy length for first babies and others. 128 | 129 | firsts: DataFrame 130 | others: DataFrame 131 | """ 132 | first_hist = thinkstats2.Hist(firsts.prglngth, label='first') 133 | other_hist = thinkstats2.Hist(others.prglngth, label='other') 134 | 135 | width = 0.45 136 | thinkplot.PrePlot(2) 137 | thinkplot.Hist(first_hist, align='right', width=width) 138 | thinkplot.Hist(other_hist, align='left', width=width) 139 | 140 | thinkplot.Save(root='first_nsfg_hist', 141 | title='Histogram', 142 | xlabel='weeks', 143 | ylabel='frequency', 144 | axis=[27, 46, 0, 2700]) 145 | 146 | 147 | def main(script): 148 | live, firsts, others = MakeFrames() 149 | 150 | MakeHists(live) 151 | PrintExtremes(live) 152 | MakeComparison(firsts, others) 153 | Summarize(live, firsts, others) 154 | 155 | 156 | if __name__ == '__main__': 157 | import sys 158 | main(*sys.argv) 159 | 160 | 161 | -------------------------------------------------------------------------------- /mapreduce/mr_s3_log_parser.py: -------------------------------------------------------------------------------- 1 | 2 | import time 3 | from mrjob.job import MRJob 4 | from mrjob.protocol import RawValueProtocol, ReprProtocol 5 | import re 6 | 7 | 8 | class MrS3LogParser(MRJob): 9 | """Parses the logs from S3 based on the S3 logging format: 10 | http://docs.aws.amazon.com/AmazonS3/latest/dev/LogFormat.html 11 | 12 | Aggregates a user's daily requests by user agent and operation 13 | 14 | Outputs date_time, requester, user_agent, operation, count 15 | """ 16 | 17 | LOGPATS = r'(\S+) (\S+) \[(.*?)\] (\S+) (\S+) ' \ 18 | r'(\S+) (\S+) (\S+) ("([^"]+)"|-) ' \ 19 | r'(\S+) (\S+) (\S+) (\S+) (\S+) (\S+) ' \ 20 | r'("([^"]+)"|-) ("([^"]+)"|-)' 21 | NUM_ENTRIES_PER_LINE = 17 22 | logpat = re.compile(LOGPATS) 23 | 24 | (S3_LOG_BUCKET_OWNER, 25 | S3_LOG_BUCKET, 26 | S3_LOG_DATE_TIME, 27 | S3_LOG_IP, 28 | S3_LOG_REQUESTER_ID, 29 | S3_LOG_REQUEST_ID, 30 | S3_LOG_OPERATION, 31 | S3_LOG_KEY, 32 | S3_LOG_HTTP_METHOD, 33 | 
S3_LOG_HTTP_STATUS, 34 | S3_LOG_S3_ERROR, 35 | S3_LOG_BYTES_SENT, 36 | S3_LOG_OBJECT_SIZE, 37 | S3_LOG_TOTAL_TIME, 38 | S3_LOG_TURN_AROUND_TIME, 39 | S3_LOG_REFERER, 40 | S3_LOG_USER_AGENT) = range(NUM_ENTRIES_PER_LINE) 41 | 42 | DELIMITER = '\t' 43 | 44 | # We use RawValueProtocol for input to be format agnostic 45 | # and avoid any type of parsing errors 46 | INPUT_PROTOCOL = RawValueProtocol 47 | 48 | # We use RawValueProtocol for output so we can output raw lines 49 | # instead of (k, v) pairs 50 | OUTPUT_PROTOCOL = RawValueProtocol 51 | 52 | # Encode the intermediate records using repr() instead of JSON, so the 53 | # record doesn't get Unicode-encoded 54 | INTERNAL_PROTOCOL = ReprProtocol 55 | 56 | def clean_date_time_zone(self, raw_date_time_zone): 57 | """Converts entry 22/Jul/2013:21:04:17 +0000 to the format 58 | 'YYYY-MM-DD HH:MM:SS' which is more suitable for loading into 59 | a database such as Redshift or RDS 60 | 61 | Note: requires the chars "[ ]" to be stripped prior to input 62 | Returns the converted datetime annd timezone 63 | or None for both values if failed 64 | 65 | TODO: Needs to combine timezone with date as one field 66 | """ 67 | date_time = None 68 | time_zone_parsed = None 69 | 70 | # TODO: Probably cleaner to parse this with a regex 71 | date_parsed = raw_date_time_zone[:raw_date_time_zone.find(":")] 72 | time_parsed = raw_date_time_zone[raw_date_time_zone.find(":") + 1: 73 | raw_date_time_zone.find("+") - 1] 74 | time_zone_parsed = raw_date_time_zone[raw_date_time_zone.find("+"):] 75 | 76 | try: 77 | date_struct = time.strptime(date_parsed, "%d/%b/%Y") 78 | converted_date = time.strftime("%Y-%m-%d", date_struct) 79 | date_time = converted_date + " " + time_parsed 80 | 81 | # Throws a ValueError exception if the operation fails that is 82 | # caught by the calling function and is handled appropriately 83 | except ValueError as error: 84 | raise ValueError(error) 85 | else: 86 | return converted_date, date_time, time_zone_parsed 87 | 88 | def mapper(self, _, line): 89 | line = line.strip() 90 | match = self.logpat.search(line) 91 | 92 | date_time = None 93 | requester = None 94 | user_agent = None 95 | operation = None 96 | 97 | try: 98 | for n in range(self.NUM_ENTRIES_PER_LINE): 99 | group = match.group(1 + n) 100 | 101 | if n == self.S3_LOG_DATE_TIME: 102 | date, date_time, time_zone_parsed = \ 103 | self.clean_date_time_zone(group) 104 | # Leave the following line of code if 105 | # you want to aggregate by date 106 | date_time = date + " 00:00:00" 107 | elif n == self.S3_LOG_REQUESTER_ID: 108 | requester = group 109 | elif n == self.S3_LOG_USER_AGENT: 110 | user_agent = group 111 | elif n == self.S3_LOG_OPERATION: 112 | operation = group 113 | else: 114 | pass 115 | 116 | except Exception: 117 | yield (("Error while parsing line: %s", line), 1) 118 | else: 119 | yield ((date_time, requester, user_agent, operation), 1) 120 | 121 | def reducer(self, key, values): 122 | output = list(key) 123 | output = self.DELIMITER.join(output) + \ 124 | self.DELIMITER + \ 125 | str(sum(values)) 126 | 127 | yield None, output 128 | 129 | def steps(self): 130 | return [ 131 | self.mr(mapper=self.mapper, 132 | reducer=self.reducer) 133 | ] 134 | 135 | 136 | if __name__ == '__main__': 137 | MrS3LogParser.run() -------------------------------------------------------------------------------- /python-data/logs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | 
"source": [ 7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Logging in Python\n", 15 | "* Logging with RotatingFileHandler\n", 16 | "* Logging with TimedRotatingFileHandler " 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## Logging with RotatingFileHandler\n", 24 | "\n", 25 | "The logging discussion is taken from the [Python Logging Cookbook](https://docs.python.org/2/howto/logging-cookbook.html#using-file-rotation):\n", 26 | "\n", 27 | "Sometimes you want to let a log file grow to a certain size, then open a new file and log to that. You may want to keep a certain number of these files, and when that many files have been created, rotate the files so that the number of files and the size of the files both remain bounded. For this usage pattern, the logging package provides a RotatingFileHandler.\n", 28 | "\n", 29 | "The most current file is always logging_rotatingfile_example.out, and each time it reaches the size limit it is renamed with the suffix .1. Each of the existing backup files is renamed to increment the suffix (.1 becomes .2, etc.) and the .6 file is erased.\n", 30 | "\n", 31 | "The following code snippet is taken from [here](http://www.blog.pythonlibrary.org/2014/02/11/python-how-to-create-rotating-logs/)." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import logging\n", 43 | "import time\n", 44 | " \n", 45 | "from logging.handlers import RotatingFileHandler\n", 46 | " \n", 47 | "#----------------------------------------------------------------------\n", 48 | "def create_rotating_log(path):\n", 49 | " \"\"\"\n", 50 | " Creates a rotating log\n", 51 | " \"\"\"\n", 52 | " logger = logging.getLogger(\"Rotating Log\")\n", 53 | " logger.setLevel(logging.INFO)\n", 54 | " \n", 55 | " # add a rotating handler\n", 56 | " handler = RotatingFileHandler(path, maxBytes=20,\n", 57 | " backupCount=5)\n", 58 | " logger.addHandler(handler)\n", 59 | " \n", 60 | " for i in range(10):\n", 61 | " logger.info(\"This is test log line %s\" % i)\n", 62 | " time.sleep(1.5)\n", 63 | " \n", 64 | "#----------------------------------------------------------------------\n", 65 | "if __name__ == \"__main__\":\n", 66 | " log_file = \"test.log\"\n", 67 | " create_rotating_log(log_file)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "## Logging with TimedRotatingFileHandler\n", 75 | "\n", 76 | "The following code snippet is taken from [here](http://www.blog.pythonlibrary.org/2014/02/11/python-how-to-create-rotating-logs/)." 
77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "collapsed": false 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "import logging\n", 88 | "import time\n", 89 | " \n", 90 | "from logging.handlers import TimedRotatingFileHandler\n", 91 | " \n", 92 | "#----------------------------------------------------------------------\n", 93 | "def create_timed_rotating_log(path):\n", 94 | " \"\"\"\"\"\"\n", 95 | " logger = logging.getLogger(\"Rotating Log\")\n", 96 | " logger.setLevel(logging.INFO)\n", 97 | " \n", 98 | " # Rotate log based on when parameter:\n", 99 | " # second (s)\n", 100 | " # minute (m)\n", 101 | " # hour (h)\n", 102 | " # day (d)\n", 103 | " # w0-w6 (weekday, 0=Monday)\n", 104 | " # midnight\n", 105 | " handler = TimedRotatingFileHandler(path,\n", 106 | " when=\"m\",\n", 107 | " interval=1,\n", 108 | " backupCount=5)\n", 109 | " logger.addHandler(handler)\n", 110 | " \n", 111 | " for i in range(20):\n", 112 | " logger.info(\"This is a test!\")\n", 113 | " time.sleep(1.5)\n", 114 | " \n", 115 | "#----------------------------------------------------------------------\n", 116 | "if __name__ == \"__main__\":\n", 117 | " log_file = \"timed_test.log\"\n", 118 | " create_timed_rotating_log(log_file)" 119 | ] 120 | } 121 | ], 122 | "metadata": { 123 | "kernelspec": { 124 | "display_name": "Python 2", 125 | "language": "python", 126 | "name": "python2" 127 | }, 128 | "language_info": { 129 | "codemirror_mode": { 130 | "name": "ipython", 131 | "version": 2 132 | }, 133 | "file_extension": ".py", 134 | "mimetype": "text/x-python", 135 | "name": "python", 136 | "nbconvert_exporter": "python", 137 | "pygments_lexer": "ipython2", 138 | "version": "2.7.10" 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 0 143 | } 144 | -------------------------------------------------------------------------------- /python-data/unit_tests.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Nose Unit Tests with IPython Notebook" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Nose\n", 22 | "\n", 23 | "Testing is a vital part of software development. Nose extends unittest to make testing easier." 
24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Install Nose\n", 31 | "\n", 32 | "Run the following command line:" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "!pip install nose" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Create the Code\n", 51 | "\n", 52 | "Save your code to a file with the %%file magic:" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 1, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "Overwriting type_util.py\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "%%file type_util.py\n", 72 | "class TypeUtil:\n", 73 | "\n", 74 | " @classmethod\n", 75 | " def is_iterable(cls, obj):\n", 76 | " \"\"\"Determines if obj is iterable.\n", 77 | "\n", 78 | " Useful when writing functions that can accept multiple types of\n", 79 | " input (list, tuple, ndarray, iterator). Pairs well with\n", 80 | " convert_to_list.\n", 81 | " \"\"\"\n", 82 | " try:\n", 83 | " iter(obj)\n", 84 | " return True\n", 85 | " except TypeError:\n", 86 | " return False\n", 87 | "\n", 88 | " @classmethod\n", 89 | " def convert_to_list(cls, obj):\n", 90 | " \"\"\"Converts obj to a list if it is not a list and it is iterable, \n", 91 | " else returns the original obj.\n", 92 | " \"\"\"\n", 93 | " if not isinstance(obj, list) and cls.is_iterable(obj):\n", 94 | " obj = list(obj)\n", 95 | " return obj\n" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## Create the Nose Tests\n", 103 | "\n", 104 | "Save your test to a file with the %%file magic:" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 2, 110 | "metadata": { 111 | "collapsed": false 112 | }, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "Overwriting tests/test_type_util.py\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "%%file tests/test_type_util.py\n", 124 | "from nose.tools import assert_equal\n", 125 | "from ..type_util import TypeUtil\n", 126 | "\n", 127 | "\n", 128 | "class TestUtil():\n", 129 | "\n", 130 | " def test_is_iterable(self):\n", 131 | " assert_equal(TypeUtil.is_iterable('foo'), True)\n", 132 | " assert_equal(TypeUtil.is_iterable(7), False)\n", 133 | "\n", 134 | " def test_convert_to_list(self):\n", 135 | " assert_equal(isinstance(TypeUtil.convert_to_list('foo'), list), True)\n", 136 | " assert_equal(isinstance(TypeUtil.convert_to_list(7), list), False)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "## Run the Nose Tests\n", 144 | "\n", 145 | "Run the following command line:" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 3, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [ 155 | { 156 | "name": "stdout", 157 | "output_type": "stream", 158 | "text": [ 159 | "core.tests.test_type_util.TestUtil.test_convert_to_list ... ok\r\n", 160 | "core.tests.test_type_util.TestUtil.test_is_iterable ... 
ok\r\n", 161 | "\r\n", 162 | "----------------------------------------------------------------------\r\n", 163 | "Ran 2 tests in 0.001s\r\n", 164 | "\r\n", 165 | "OK\r\n" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "!nosetests tests/test_type_util.py -v" 171 | ] 172 | } 173 | ], 174 | "metadata": { 175 | "kernelspec": { 176 | "display_name": "Python 2", 177 | "language": "python", 178 | "name": "python2" 179 | }, 180 | "language_info": { 181 | "codemirror_mode": { 182 | "name": "ipython", 183 | "version": 2 184 | }, 185 | "file_extension": ".py", 186 | "mimetype": "text/x-python", 187 | "name": "python", 188 | "nbconvert_exporter": "python", 189 | "pygments_lexer": "ipython2", 190 | "version": "2.7.10" 191 | } 192 | }, 193 | "nbformat": 4, 194 | "nbformat_minor": 0 195 | } 196 | -------------------------------------------------------------------------------- /deep-learning/tensor-flow-examples/notebooks/4_multi_gpu/multigpu_basics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Basic Multi GPU Computation in TensorFlow\n", 8 | "\n", 9 | "Credits: Forked from [TensorFlow-Examples](https://github.com/aymericdamien/TensorFlow-Examples) by Aymeric Damien\n", 10 | "\n", 11 | "## Setup\n", 12 | "\n", 13 | "Refer to the [setup instructions](http://nbviewer.ipython.org/github/donnemartin/data-science-ipython-notebooks/blob/master/deep-learning/tensor-flow-examples/Setup_TensorFlow.md)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "source": [ 22 | "This tutorial requires your machine to have 2 GPUs\n", 23 | "* \"/cpu:0\": The CPU of your machine.\n", 24 | "* \"/gpu:0\": The first GPU of your machine\n", 25 | "* \"/gpu:1\": The second GPU of your machine\n", 26 | "* For this example, we are using 2 GTX-980" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import numpy as np\n", 38 | "import tensorflow as tf\n", 39 | "import datetime" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "#Processing Units logs\n", 51 | "log_device_placement = True\n", 52 | "\n", 53 | "#num of multiplications to perform\n", 54 | "n = 10" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "# Example: compute A^n + B^n on 2 GPUs\n", 66 | "\n", 67 | "# Create random large matrix\n", 68 | "A = np.random.rand(1e4, 1e4).astype('float32')\n", 69 | "B = np.random.rand(1e4, 1e4).astype('float32')\n", 70 | "\n", 71 | "# Creates a graph to store results\n", 72 | "c1 = []\n", 73 | "c2 = []\n", 74 | "\n", 75 | "# Define matrix power\n", 76 | "def matpow(M, n):\n", 77 | " if n < 1: #Abstract cases where n < 1\n", 78 | " return M\n", 79 | " else:\n", 80 | " return tf.matmul(M, matpow(M, n-1))" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 6, 86 | "metadata": { 87 | "collapsed": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "# Single GPU computing\n", 92 | "\n", 93 | "with tf.device('/gpu:0'):\n", 94 | " a = tf.constant(A)\n", 95 | " b = tf.constant(B)\n", 96 | " #compute A^n and B^n and store results in c1\n", 97 | " c1.append(matpow(a, n))\n", 98 | 
" c1.append(matpow(b, n))\n", 99 | "\n", 100 | "with tf.device('/cpu:0'):\n", 101 | " sum = tf.add_n(c1) #Addition of all elements in c1, i.e. A^n + B^n\n", 102 | "\n", 103 | "t1_1 = datetime.datetime.now()\n", 104 | "with tf.Session(config=tf.ConfigProto(log_device_placement=log_device_placement)) as sess:\n", 105 | " # Runs the op.\n", 106 | " sess.run(sum)\n", 107 | "t2_1 = datetime.datetime.now()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 7, 113 | "metadata": { 114 | "collapsed": true 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "# Multi GPU computing\n", 119 | "# GPU:0 computes A^n\n", 120 | "with tf.device('/gpu:0'):\n", 121 | " #compute A^n and store result in c2\n", 122 | " a = tf.constant(A)\n", 123 | " c2.append(matpow(a, n))\n", 124 | "\n", 125 | "#GPU:1 computes B^n\n", 126 | "with tf.device('/gpu:1'):\n", 127 | " #compute B^n and store result in c2\n", 128 | " b = tf.constant(B)\n", 129 | " c2.append(matpow(b, n))\n", 130 | "\n", 131 | "with tf.device('/cpu:0'):\n", 132 | " sum = tf.add_n(c2) #Addition of all elements in c2, i.e. A^n + B^n\n", 133 | "\n", 134 | "t1_2 = datetime.datetime.now()\n", 135 | "with tf.Session(config=tf.ConfigProto(log_device_placement=log_device_placement)) as sess:\n", 136 | " # Runs the op.\n", 137 | " sess.run(sum)\n", 138 | "t2_2 = datetime.datetime.now()" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 8, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "Single GPU computation time: 0:00:11.833497\n", 153 | "Multi GPU computation time: 0:00:07.085913\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "print \"Single GPU computation time: \" + str(t2_1-t1_1)\n", 159 | "print \"Multi GPU computation time: \" + str(t2_2-t1_2)" 160 | ] 161 | } 162 | ], 163 | "metadata": { 164 | "kernelspec": { 165 | "display_name": "Python 3", 166 | "language": "python", 167 | "name": "python3" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | "file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.4.3" 180 | } 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 0 184 | } 185 | -------------------------------------------------------------------------------- /data/titanic/gendermodel.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Survived 2 | 892,0 3 | 893,1 4 | 894,0 5 | 895,0 6 | 896,1 7 | 897,0 8 | 898,1 9 | 899,0 10 | 900,1 11 | 901,0 12 | 902,0 13 | 903,0 14 | 904,1 15 | 905,0 16 | 906,1 17 | 907,1 18 | 908,0 19 | 909,0 20 | 910,1 21 | 911,1 22 | 912,0 23 | 913,0 24 | 914,1 25 | 915,0 26 | 916,1 27 | 917,0 28 | 918,1 29 | 919,0 30 | 920,0 31 | 921,0 32 | 922,0 33 | 923,0 34 | 924,1 35 | 925,1 36 | 926,0 37 | 927,0 38 | 928,1 39 | 929,1 40 | 930,0 41 | 931,0 42 | 932,0 43 | 933,0 44 | 934,0 45 | 935,1 46 | 936,1 47 | 937,0 48 | 938,0 49 | 939,0 50 | 940,1 51 | 941,1 52 | 942,0 53 | 943,0 54 | 944,1 55 | 945,1 56 | 946,0 57 | 947,0 58 | 948,0 59 | 949,0 60 | 950,0 61 | 951,1 62 | 952,0 63 | 953,0 64 | 954,0 65 | 955,1 66 | 956,0 67 | 957,1 68 | 958,1 69 | 959,0 70 | 960,0 71 | 961,1 72 | 962,1 73 | 963,0 74 | 964,1 75 | 965,0 76 | 966,1 77 | 967,0 78 | 968,0 79 | 969,1 80 | 970,0 81 | 971,1 82 | 972,0 83 | 973,0 84 | 974,0 85 | 975,0 86 | 
976,0 87 | 977,0 88 | 978,1 89 | 979,1 90 | 980,1 91 | 981,0 92 | 982,1 93 | 983,0 94 | 984,1 95 | 985,0 96 | 986,0 97 | 987,0 98 | 988,1 99 | 989,0 100 | 990,1 101 | 991,0 102 | 992,1 103 | 993,0 104 | 994,0 105 | 995,0 106 | 996,1 107 | 997,0 108 | 998,0 109 | 999,0 110 | 1000,0 111 | 1001,0 112 | 1002,0 113 | 1003,1 114 | 1004,1 115 | 1005,1 116 | 1006,1 117 | 1007,0 118 | 1008,0 119 | 1009,1 120 | 1010,0 121 | 1011,1 122 | 1012,1 123 | 1013,0 124 | 1014,1 125 | 1015,0 126 | 1016,0 127 | 1017,1 128 | 1018,0 129 | 1019,1 130 | 1020,0 131 | 1021,0 132 | 1022,0 133 | 1023,0 134 | 1024,1 135 | 1025,0 136 | 1026,0 137 | 1027,0 138 | 1028,0 139 | 1029,0 140 | 1030,1 141 | 1031,0 142 | 1032,1 143 | 1033,1 144 | 1034,0 145 | 1035,0 146 | 1036,0 147 | 1037,0 148 | 1038,0 149 | 1039,0 150 | 1040,0 151 | 1041,0 152 | 1042,1 153 | 1043,0 154 | 1044,0 155 | 1045,1 156 | 1046,0 157 | 1047,0 158 | 1048,1 159 | 1049,1 160 | 1050,0 161 | 1051,1 162 | 1052,1 163 | 1053,0 164 | 1054,1 165 | 1055,0 166 | 1056,0 167 | 1057,1 168 | 1058,0 169 | 1059,0 170 | 1060,1 171 | 1061,1 172 | 1062,0 173 | 1063,0 174 | 1064,0 175 | 1065,0 176 | 1066,0 177 | 1067,1 178 | 1068,1 179 | 1069,0 180 | 1070,1 181 | 1071,1 182 | 1072,0 183 | 1073,0 184 | 1074,1 185 | 1075,0 186 | 1076,1 187 | 1077,0 188 | 1078,1 189 | 1079,0 190 | 1080,1 191 | 1081,0 192 | 1082,0 193 | 1083,0 194 | 1084,0 195 | 1085,0 196 | 1086,0 197 | 1087,0 198 | 1088,0 199 | 1089,1 200 | 1090,0 201 | 1091,1 202 | 1092,1 203 | 1093,0 204 | 1094,0 205 | 1095,1 206 | 1096,0 207 | 1097,0 208 | 1098,1 209 | 1099,0 210 | 1100,1 211 | 1101,0 212 | 1102,0 213 | 1103,0 214 | 1104,0 215 | 1105,1 216 | 1106,1 217 | 1107,0 218 | 1108,1 219 | 1109,0 220 | 1110,1 221 | 1111,0 222 | 1112,1 223 | 1113,0 224 | 1114,1 225 | 1115,0 226 | 1116,1 227 | 1117,1 228 | 1118,0 229 | 1119,1 230 | 1120,0 231 | 1121,0 232 | 1122,0 233 | 1123,1 234 | 1124,0 235 | 1125,0 236 | 1126,0 237 | 1127,0 238 | 1128,0 239 | 1129,0 240 | 1130,1 241 | 1131,1 242 | 1132,1 243 | 1133,1 244 | 1134,0 245 | 1135,0 246 | 1136,0 247 | 1137,0 248 | 1138,1 249 | 1139,0 250 | 1140,1 251 | 1141,1 252 | 1142,1 253 | 1143,0 254 | 1144,0 255 | 1145,0 256 | 1146,0 257 | 1147,0 258 | 1148,0 259 | 1149,0 260 | 1150,1 261 | 1151,0 262 | 1152,0 263 | 1153,0 264 | 1154,1 265 | 1155,1 266 | 1156,0 267 | 1157,0 268 | 1158,0 269 | 1159,0 270 | 1160,1 271 | 1161,0 272 | 1162,0 273 | 1163,0 274 | 1164,1 275 | 1165,1 276 | 1166,0 277 | 1167,1 278 | 1168,0 279 | 1169,0 280 | 1170,0 281 | 1171,0 282 | 1172,1 283 | 1173,0 284 | 1174,1 285 | 1175,1 286 | 1176,1 287 | 1177,0 288 | 1178,0 289 | 1179,0 290 | 1180,0 291 | 1181,0 292 | 1182,0 293 | 1183,1 294 | 1184,0 295 | 1185,0 296 | 1186,0 297 | 1187,0 298 | 1188,1 299 | 1189,0 300 | 1190,0 301 | 1191,0 302 | 1192,0 303 | 1193,0 304 | 1194,0 305 | 1195,0 306 | 1196,1 307 | 1197,1 308 | 1198,0 309 | 1199,0 310 | 1200,0 311 | 1201,1 312 | 1202,0 313 | 1203,0 314 | 1204,0 315 | 1205,1 316 | 1206,1 317 | 1207,1 318 | 1208,0 319 | 1209,0 320 | 1210,0 321 | 1211,0 322 | 1212,0 323 | 1213,0 324 | 1214,0 325 | 1215,0 326 | 1216,1 327 | 1217,0 328 | 1218,1 329 | 1219,0 330 | 1220,0 331 | 1221,0 332 | 1222,1 333 | 1223,0 334 | 1224,0 335 | 1225,1 336 | 1226,0 337 | 1227,0 338 | 1228,0 339 | 1229,0 340 | 1230,0 341 | 1231,0 342 | 1232,0 343 | 1233,0 344 | 1234,0 345 | 1235,1 346 | 1236,0 347 | 1237,1 348 | 1238,0 349 | 1239,1 350 | 1240,0 351 | 1241,1 352 | 1242,1 353 | 1243,0 354 | 1244,0 355 | 1245,0 356 | 1246,1 357 | 1247,0 358 | 1248,1 359 | 1249,0 360 | 1250,0 361 | 1251,1 362 | 
1252,0 363 | 1253,1 364 | 1254,1 365 | 1255,0 366 | 1256,1 367 | 1257,1 368 | 1258,0 369 | 1259,1 370 | 1260,1 371 | 1261,0 372 | 1262,0 373 | 1263,1 374 | 1264,0 375 | 1265,0 376 | 1266,1 377 | 1267,1 378 | 1268,1 379 | 1269,0 380 | 1270,0 381 | 1271,0 382 | 1272,0 383 | 1273,0 384 | 1274,1 385 | 1275,1 386 | 1276,0 387 | 1277,1 388 | 1278,0 389 | 1279,0 390 | 1280,0 391 | 1281,0 392 | 1282,0 393 | 1283,1 394 | 1284,0 395 | 1285,0 396 | 1286,0 397 | 1287,1 398 | 1288,0 399 | 1289,1 400 | 1290,0 401 | 1291,0 402 | 1292,1 403 | 1293,0 404 | 1294,1 405 | 1295,0 406 | 1296,0 407 | 1297,0 408 | 1298,0 409 | 1299,0 410 | 1300,1 411 | 1301,1 412 | 1302,1 413 | 1303,1 414 | 1304,1 415 | 1305,0 416 | 1306,1 417 | 1307,0 418 | 1308,0 419 | 1309,0 420 | -------------------------------------------------------------------------------- /data/titanic/genderclassmodel.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Survived 2 | 892,0 3 | 893,1 4 | 894,0 5 | 895,0 6 | 896,1 7 | 897,0 8 | 898,1 9 | 899,0 10 | 900,1 11 | 901,0 12 | 902,0 13 | 903,0 14 | 904,1 15 | 905,0 16 | 906,1 17 | 907,1 18 | 908,0 19 | 909,0 20 | 910,1 21 | 911,1 22 | 912,0 23 | 913,0 24 | 914,1 25 | 915,0 26 | 916,1 27 | 917,0 28 | 918,1 29 | 919,0 30 | 920,0 31 | 921,0 32 | 922,0 33 | 923,0 34 | 924,0 35 | 925,0 36 | 926,0 37 | 927,0 38 | 928,1 39 | 929,1 40 | 930,0 41 | 931,0 42 | 932,0 43 | 933,0 44 | 934,0 45 | 935,1 46 | 936,1 47 | 937,0 48 | 938,0 49 | 939,0 50 | 940,1 51 | 941,1 52 | 942,0 53 | 943,0 54 | 944,1 55 | 945,1 56 | 946,0 57 | 947,0 58 | 948,0 59 | 949,0 60 | 950,0 61 | 951,1 62 | 952,0 63 | 953,0 64 | 954,0 65 | 955,1 66 | 956,0 67 | 957,1 68 | 958,1 69 | 959,0 70 | 960,0 71 | 961,1 72 | 962,1 73 | 963,0 74 | 964,1 75 | 965,0 76 | 966,1 77 | 967,0 78 | 968,0 79 | 969,1 80 | 970,0 81 | 971,1 82 | 972,0 83 | 973,0 84 | 974,0 85 | 975,0 86 | 976,0 87 | 977,0 88 | 978,1 89 | 979,1 90 | 980,1 91 | 981,0 92 | 982,1 93 | 983,0 94 | 984,1 95 | 985,0 96 | 986,0 97 | 987,0 98 | 988,1 99 | 989,0 100 | 990,1 101 | 991,0 102 | 992,1 103 | 993,0 104 | 994,0 105 | 995,0 106 | 996,1 107 | 997,0 108 | 998,0 109 | 999,0 110 | 1000,0 111 | 1001,0 112 | 1002,0 113 | 1003,1 114 | 1004,1 115 | 1005,1 116 | 1006,1 117 | 1007,0 118 | 1008,0 119 | 1009,1 120 | 1010,0 121 | 1011,1 122 | 1012,1 123 | 1013,0 124 | 1014,1 125 | 1015,0 126 | 1016,0 127 | 1017,1 128 | 1018,0 129 | 1019,0 130 | 1020,0 131 | 1021,0 132 | 1022,0 133 | 1023,0 134 | 1024,0 135 | 1025,0 136 | 1026,0 137 | 1027,0 138 | 1028,0 139 | 1029,0 140 | 1030,1 141 | 1031,0 142 | 1032,0 143 | 1033,1 144 | 1034,0 145 | 1035,0 146 | 1036,0 147 | 1037,0 148 | 1038,0 149 | 1039,0 150 | 1040,0 151 | 1041,0 152 | 1042,1 153 | 1043,0 154 | 1044,0 155 | 1045,1 156 | 1046,0 157 | 1047,0 158 | 1048,1 159 | 1049,1 160 | 1050,0 161 | 1051,1 162 | 1052,1 163 | 1053,0 164 | 1054,1 165 | 1055,0 166 | 1056,0 167 | 1057,0 168 | 1058,0 169 | 1059,0 170 | 1060,1 171 | 1061,1 172 | 1062,0 173 | 1063,0 174 | 1064,0 175 | 1065,0 176 | 1066,0 177 | 1067,1 178 | 1068,1 179 | 1069,0 180 | 1070,1 181 | 1071,1 182 | 1072,0 183 | 1073,0 184 | 1074,1 185 | 1075,0 186 | 1076,1 187 | 1077,0 188 | 1078,1 189 | 1079,0 190 | 1080,0 191 | 1081,0 192 | 1082,0 193 | 1083,0 194 | 1084,0 195 | 1085,0 196 | 1086,0 197 | 1087,0 198 | 1088,0 199 | 1089,1 200 | 1090,0 201 | 1091,1 202 | 1092,1 203 | 1093,0 204 | 1094,0 205 | 1095,1 206 | 1096,0 207 | 1097,0 208 | 1098,1 209 | 1099,0 210 | 1100,1 211 | 1101,0 212 | 1102,0 213 | 1103,0 214 | 1104,0 215 | 1105,1 216 | 
1106,1 217 | 1107,0 218 | 1108,1 219 | 1109,0 220 | 1110,1 221 | 1111,0 222 | 1112,1 223 | 1113,0 224 | 1114,1 225 | 1115,0 226 | 1116,1 227 | 1117,1 228 | 1118,0 229 | 1119,1 230 | 1120,0 231 | 1121,0 232 | 1122,0 233 | 1123,1 234 | 1124,0 235 | 1125,0 236 | 1126,0 237 | 1127,0 238 | 1128,0 239 | 1129,0 240 | 1130,1 241 | 1131,1 242 | 1132,1 243 | 1133,1 244 | 1134,0 245 | 1135,0 246 | 1136,0 247 | 1137,0 248 | 1138,1 249 | 1139,0 250 | 1140,1 251 | 1141,1 252 | 1142,1 253 | 1143,0 254 | 1144,0 255 | 1145,0 256 | 1146,0 257 | 1147,0 258 | 1148,0 259 | 1149,0 260 | 1150,1 261 | 1151,0 262 | 1152,0 263 | 1153,0 264 | 1154,1 265 | 1155,1 266 | 1156,0 267 | 1157,0 268 | 1158,0 269 | 1159,0 270 | 1160,1 271 | 1161,0 272 | 1162,0 273 | 1163,0 274 | 1164,1 275 | 1165,1 276 | 1166,0 277 | 1167,1 278 | 1168,0 279 | 1169,0 280 | 1170,0 281 | 1171,0 282 | 1172,1 283 | 1173,0 284 | 1174,1 285 | 1175,1 286 | 1176,0 287 | 1177,0 288 | 1178,0 289 | 1179,0 290 | 1180,0 291 | 1181,0 292 | 1182,0 293 | 1183,1 294 | 1184,0 295 | 1185,0 296 | 1186,0 297 | 1187,0 298 | 1188,1 299 | 1189,0 300 | 1190,0 301 | 1191,0 302 | 1192,0 303 | 1193,0 304 | 1194,0 305 | 1195,0 306 | 1196,1 307 | 1197,1 308 | 1198,0 309 | 1199,0 310 | 1200,0 311 | 1201,1 312 | 1202,0 313 | 1203,0 314 | 1204,0 315 | 1205,1 316 | 1206,1 317 | 1207,1 318 | 1208,0 319 | 1209,0 320 | 1210,0 321 | 1211,0 322 | 1212,0 323 | 1213,0 324 | 1214,0 325 | 1215,0 326 | 1216,1 327 | 1217,0 328 | 1218,1 329 | 1219,0 330 | 1220,0 331 | 1221,0 332 | 1222,1 333 | 1223,0 334 | 1224,0 335 | 1225,1 336 | 1226,0 337 | 1227,0 338 | 1228,0 339 | 1229,0 340 | 1230,0 341 | 1231,0 342 | 1232,0 343 | 1233,0 344 | 1234,0 345 | 1235,1 346 | 1236,0 347 | 1237,1 348 | 1238,0 349 | 1239,1 350 | 1240,0 351 | 1241,1 352 | 1242,1 353 | 1243,0 354 | 1244,0 355 | 1245,0 356 | 1246,0 357 | 1247,0 358 | 1248,1 359 | 1249,0 360 | 1250,0 361 | 1251,1 362 | 1252,0 363 | 1253,1 364 | 1254,1 365 | 1255,0 366 | 1256,1 367 | 1257,0 368 | 1258,0 369 | 1259,0 370 | 1260,1 371 | 1261,0 372 | 1262,0 373 | 1263,1 374 | 1264,0 375 | 1265,0 376 | 1266,1 377 | 1267,1 378 | 1268,1 379 | 1269,0 380 | 1270,0 381 | 1271,0 382 | 1272,0 383 | 1273,0 384 | 1274,1 385 | 1275,1 386 | 1276,0 387 | 1277,1 388 | 1278,0 389 | 1279,0 390 | 1280,0 391 | 1281,0 392 | 1282,0 393 | 1283,1 394 | 1284,0 395 | 1285,0 396 | 1286,0 397 | 1287,1 398 | 1288,0 399 | 1289,1 400 | 1290,0 401 | 1291,0 402 | 1292,1 403 | 1293,0 404 | 1294,1 405 | 1295,0 406 | 1296,0 407 | 1297,0 408 | 1298,0 409 | 1299,0 410 | 1300,1 411 | 1301,1 412 | 1302,1 413 | 1303,1 414 | 1304,1 415 | 1305,0 416 | 1306,1 417 | 1307,0 418 | 1308,0 419 | 1309,0 420 | -------------------------------------------------------------------------------- /scikit-learn/fig_code/ML_flow_chart.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tutorial Diagrams 3 | ----------------- 4 | 5 | This script plots the flow-charts used in the scikit-learn tutorials. 
6 | """ 7 | 8 | import numpy as np 9 | import pylab as pl 10 | from matplotlib.patches import Circle, Rectangle, Polygon, Arrow, FancyArrow 11 | 12 | def create_base(box_bg = '#CCCCCC', 13 | arrow1 = '#88CCFF', 14 | arrow2 = '#88FF88', 15 | supervised=True): 16 | fig = pl.figure(figsize=(9, 6), facecolor='w') 17 | ax = pl.axes((0, 0, 1, 1), 18 | xticks=[], yticks=[], frameon=False) 19 | ax.set_xlim(0, 9) 20 | ax.set_ylim(0, 6) 21 | 22 | patches = [Rectangle((0.3, 3.6), 1.5, 1.8, zorder=1, fc=box_bg), 23 | Rectangle((0.5, 3.8), 1.5, 1.8, zorder=2, fc=box_bg), 24 | Rectangle((0.7, 4.0), 1.5, 1.8, zorder=3, fc=box_bg), 25 | 26 | Rectangle((2.9, 3.6), 0.2, 1.8, fc=box_bg), 27 | Rectangle((3.1, 3.8), 0.2, 1.8, fc=box_bg), 28 | Rectangle((3.3, 4.0), 0.2, 1.8, fc=box_bg), 29 | 30 | Rectangle((0.3, 0.2), 1.5, 1.8, fc=box_bg), 31 | 32 | Rectangle((2.9, 0.2), 0.2, 1.8, fc=box_bg), 33 | 34 | Circle((5.5, 3.5), 1.0, fc=box_bg), 35 | 36 | Polygon([[5.5, 1.7], 37 | [6.1, 1.1], 38 | [5.5, 0.5], 39 | [4.9, 1.1]], fc=box_bg), 40 | 41 | FancyArrow(2.3, 4.6, 0.35, 0, fc=arrow1, 42 | width=0.25, head_width=0.5, head_length=0.2), 43 | 44 | FancyArrow(3.75, 4.2, 0.5, -0.2, fc=arrow1, 45 | width=0.25, head_width=0.5, head_length=0.2), 46 | 47 | FancyArrow(5.5, 2.4, 0, -0.4, fc=arrow1, 48 | width=0.25, head_width=0.5, head_length=0.2), 49 | 50 | FancyArrow(2.0, 1.1, 0.5, 0, fc=arrow2, 51 | width=0.25, head_width=0.5, head_length=0.2), 52 | 53 | FancyArrow(3.3, 1.1, 1.3, 0, fc=arrow2, 54 | width=0.25, head_width=0.5, head_length=0.2), 55 | 56 | FancyArrow(6.2, 1.1, 0.8, 0, fc=arrow2, 57 | width=0.25, head_width=0.5, head_length=0.2)] 58 | 59 | if supervised: 60 | patches += [Rectangle((0.3, 2.4), 1.5, 0.5, zorder=1, fc=box_bg), 61 | Rectangle((0.5, 2.6), 1.5, 0.5, zorder=2, fc=box_bg), 62 | Rectangle((0.7, 2.8), 1.5, 0.5, zorder=3, fc=box_bg), 63 | FancyArrow(2.3, 2.9, 2.0, 0, fc=arrow1, 64 | width=0.25, head_width=0.5, head_length=0.2), 65 | Rectangle((7.3, 0.85), 1.5, 0.5, fc=box_bg)] 66 | else: 67 | patches += [Rectangle((7.3, 0.2), 1.5, 1.8, fc=box_bg)] 68 | 69 | for p in patches: 70 | ax.add_patch(p) 71 | 72 | pl.text(1.45, 4.9, "Training\nText,\nDocuments,\nImages,\netc.", 73 | ha='center', va='center', fontsize=14) 74 | 75 | pl.text(3.6, 4.9, "Feature\nVectors", 76 | ha='left', va='center', fontsize=14) 77 | 78 | pl.text(5.5, 3.5, "Machine\nLearning\nAlgorithm", 79 | ha='center', va='center', fontsize=14) 80 | 81 | pl.text(1.05, 1.1, "New Text,\nDocument,\nImage,\netc.", 82 | ha='center', va='center', fontsize=14) 83 | 84 | pl.text(3.3, 1.7, "Feature\nVector", 85 | ha='left', va='center', fontsize=14) 86 | 87 | pl.text(5.5, 1.1, "Predictive\nModel", 88 | ha='center', va='center', fontsize=12) 89 | 90 | if supervised: 91 | pl.text(1.45, 3.05, "Labels", 92 | ha='center', va='center', fontsize=14) 93 | 94 | pl.text(8.05, 1.1, "Expected\nLabel", 95 | ha='center', va='center', fontsize=14) 96 | pl.text(8.8, 5.8, "Supervised Learning Model", 97 | ha='right', va='top', fontsize=18) 98 | 99 | else: 100 | pl.text(8.05, 1.1, 101 | "Likelihood\nor Cluster ID\nor Better\nRepresentation", 102 | ha='center', va='center', fontsize=12) 103 | pl.text(8.8, 5.8, "Unsupervised Learning Model", 104 | ha='right', va='top', fontsize=18) 105 | 106 | 107 | 108 | def plot_supervised_chart(annotate=False): 109 | create_base(supervised=True) 110 | if annotate: 111 | fontdict = dict(color='r', weight='bold', size=14) 112 | pl.text(1.9, 4.55, 'X = vec.fit_transform(input)', 113 | fontdict=fontdict, 114 | rotation=20, ha='left', 
va='bottom') 115 | pl.text(3.7, 3.2, 'clf.fit(X, y)', 116 | fontdict=fontdict, 117 | rotation=20, ha='left', va='bottom') 118 | pl.text(1.7, 1.5, 'X_new = vec.transform(input)', 119 | fontdict=fontdict, 120 | rotation=20, ha='left', va='bottom') 121 | pl.text(6.1, 1.5, 'y_new = clf.predict(X_new)', 122 | fontdict=fontdict, 123 | rotation=20, ha='left', va='bottom') 124 | 125 | def plot_unsupervised_chart(): 126 | create_base(supervised=False) 127 | 128 | 129 | if __name__ == '__main__': 130 | plot_supervised_chart(False) 131 | plot_supervised_chart(True) 132 | plot_unsupervised_chart() 133 | pl.show() 134 | 135 | 136 | -------------------------------------------------------------------------------- /spark/hdfs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# HDFS" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Run an HDFS command:" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "collapsed": false 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "!hdfs" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "Run a file system command on the file systems (FsShell):" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "!hdfs dfs" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "List the user's home directory:" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "!hdfs dfs -ls" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "List the HDFS root directory:" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "!hdfs dfs -ls /" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Copy a local file to the user's directory on HDFS:" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "!hdfs dfs -put file.txt file.txt" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "Display the contents of the specified HDFS file:" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "!hdfs dfs -cat file.txt" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "Print the last 10 lines of the file to the terminal:" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "!hdfs dfs -cat file.txt | tail -n 10" 141 | ] 
142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "View a directory and all of its files:" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "collapsed": false 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "!hdfs dfs -cat dir/* | less" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "Copy an HDFS file to local:" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "collapsed": false 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "!hdfs dfs -get file.txt file.txt" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "Create a directory on HDFS:" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": false 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "!hdfs dfs -mkdir dir" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "Recursively delete the specified directory and all of its contents:" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": { 208 | "collapsed": false 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "!hdfs dfs -rm -r dir" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "Specify HDFS file in Spark (paths are relative to the user's home HDFS directory):" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": { 226 | "collapsed": false 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "data = sc.textFile (\"hdfs://hdfs-host:port/path/file.txt\")" 231 | ] 232 | } 233 | ], 234 | "metadata": { 235 | "kernelspec": { 236 | "display_name": "Python 2", 237 | "language": "python", 238 | "name": "python2" 239 | }, 240 | "language_info": { 241 | "codemirror_mode": { 242 | "name": "ipython", 243 | "version": 2 244 | }, 245 | "file_extension": ".py", 246 | "mimetype": "text/x-python", 247 | "name": "python", 248 | "nbconvert_exporter": "python", 249 | "pygments_lexer": "ipython2", 250 | "version": "2.7.10" 251 | } 252 | }, 253 | "nbformat": 4, 254 | "nbformat_minor": 0 255 | } 256 | -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/intro_theano/utils.py: -------------------------------------------------------------------------------- 1 | """ This file contains different utility functions that are not connected 2 | in anyway to the networks presented in the tutorials, but rather help in 3 | processing the outputs into a more understandable way. 4 | 5 | For example ``tile_raster_images`` helps in generating a easy to grasp 6 | image from a set of samples or weights. 7 | """ 8 | 9 | 10 | import numpy 11 | from six.moves import xrange 12 | 13 | 14 | def scale_to_unit_interval(ndar, eps=1e-8): 15 | """ Scales all values in the ndarray ndar to be between 0 and 1 """ 16 | ndar = ndar.copy() 17 | ndar -= ndar.min() 18 | ndar *= 1.0 / (ndar.max() + eps) 19 | return ndar 20 | 21 | 22 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0), 23 | scale_rows_to_unit_interval=True, 24 | output_pixel_vals=True): 25 | """ 26 | Transform an array with one flattened image per row, into an array in 27 | which images are reshaped and layed out like tiles on a floor. 
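A minimal usage sketch (the array shapes here are illustrative, not taken from any tutorial): tiling 100 flattened 28x28 images into a 10x10 grid with one pixel of spacing yields a 289x289 uint8 array: >>> import numpy >>> samples = numpy.random.rand(100, 28 * 28) >>> tiled = tile_raster_images(samples, img_shape=(28, 28), tile_shape=(10, 10), tile_spacing=(1, 1)) >>> tiled.shape (289, 289)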
28 | 29 | This function is useful for visualizing datasets whose rows are images, 30 | and also columns of matrices for transforming those rows 31 | (such as the first layer of a neural net). 32 | 33 | :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can 34 | be 2-D ndarrays or None; 35 | :param X: a 2-D array in which every row is a flattened image. 36 | 37 | :type img_shape: tuple; (height, width) 38 | :param img_shape: the original shape of each image 39 | 40 | :type tile_shape: tuple; (rows, cols) 41 | :param tile_shape: the number of images to tile (rows, cols) 42 | 43 | :param output_pixel_vals: if output should be pixel values (i.e. int8 44 | values) or floats 45 | 46 | :param scale_rows_to_unit_interval: if the values need to be scaled before 47 | being plotted to [0,1] or not 48 | 49 | 50 | :returns: array suitable for viewing as an image. 51 | (See:`Image.fromarray`.) 52 | :rtype: a 2-d array with same dtype as X. 53 | 54 | """ 55 | 56 | assert len(img_shape) == 2 57 | assert len(tile_shape) == 2 58 | assert len(tile_spacing) == 2 59 | 60 | # The expression below can be re-written in a more C style as 61 | # follows : 62 | # 63 | # out_shape = [0,0] 64 | # out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] - 65 | # tile_spacing[0] 66 | # out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] - 67 | # tile_spacing[1] 68 | out_shape = [ 69 | (ishp + tsp) * tshp - tsp 70 | for ishp, tshp, tsp in zip(img_shape, tile_shape, tile_spacing) 71 | ] 72 | 73 | if isinstance(X, tuple): 74 | assert len(X) == 4 75 | # Create an output numpy ndarray to store the image 76 | if output_pixel_vals: 77 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), 78 | dtype='uint8') 79 | else: 80 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), 81 | dtype=X.dtype) 82 | 83 | #colors default to 0, alpha defaults to 1 (opaque) 84 | if output_pixel_vals: 85 | channel_defaults = [0, 0, 0, 255] 86 | else: 87 | channel_defaults = [0., 0., 0., 1.] 
88 | 89 | for i in xrange(4): 90 | if X[i] is None: 91 | # if channel is None, fill it with zeros of the correct 92 | # dtype 93 | dt = out_array.dtype 94 | if output_pixel_vals: 95 | dt = 'uint8' 96 | out_array[:, :, i] = numpy.zeros( 97 | out_shape, 98 | dtype=dt 99 | ) + channel_defaults[i] 100 | else: 101 | # use a recurrent call to compute the channel and store it 102 | # in the output 103 | out_array[:, :, i] = tile_raster_images( 104 | X[i], img_shape, tile_shape, tile_spacing, 105 | scale_rows_to_unit_interval, output_pixel_vals) 106 | return out_array 107 | 108 | else: 109 | # if we are dealing with only one channel 110 | H, W = img_shape 111 | Hs, Ws = tile_spacing 112 | 113 | # generate a matrix to store the output 114 | dt = X.dtype 115 | if output_pixel_vals: 116 | dt = 'uint8' 117 | out_array = numpy.zeros(out_shape, dtype=dt) 118 | 119 | for tile_row in xrange(tile_shape[0]): 120 | for tile_col in xrange(tile_shape[1]): 121 | if tile_row * tile_shape[1] + tile_col < X.shape[0]: 122 | this_x = X[tile_row * tile_shape[1] + tile_col] 123 | if scale_rows_to_unit_interval: 124 | # if we should scale values to be between 0 and 1 125 | # do this by calling the `scale_to_unit_interval` 126 | # function 127 | this_img = scale_to_unit_interval( 128 | this_x.reshape(img_shape)) 129 | else: 130 | this_img = this_x.reshape(img_shape) 131 | # add the slice to the corresponding position in the 132 | # output array 133 | c = 1 134 | if output_pixel_vals: 135 | c = 255 136 | out_array[ 137 | tile_row * (H + Hs): tile_row * (H + Hs) + H, 138 | tile_col * (W + Ws): tile_col * (W + Ws) + W 139 | ] = this_img * c 140 | return out_array 141 | -------------------------------------------------------------------------------- /data/titanic/results-rf.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Survived 2 | 892,0.0 3 | 893,0.0 4 | 894,0.0 5 | 895,1.0 6 | 896,1.0 7 | 897,0.0 8 | 898,0.0 9 | 899,0.0 10 | 900,1.0 11 | 901,0.0 12 | 902,0.0 13 | 903,0.0 14 | 904,1.0 15 | 905,0.0 16 | 906,1.0 17 | 907,1.0 18 | 908,0.0 19 | 909,1.0 20 | 910,1.0 21 | 911,1.0 22 | 912,0.0 23 | 913,1.0 24 | 914,1.0 25 | 915,1.0 26 | 916,1.0 27 | 917,0.0 28 | 918,1.0 29 | 919,1.0 30 | 920,1.0 31 | 921,0.0 32 | 922,0.0 33 | 923,0.0 34 | 924,1.0 35 | 925,0.0 36 | 926,1.0 37 | 927,1.0 38 | 928,0.0 39 | 929,0.0 40 | 930,0.0 41 | 931,1.0 42 | 932,0.0 43 | 933,1.0 44 | 934,0.0 45 | 935,1.0 46 | 936,1.0 47 | 937,0.0 48 | 938,1.0 49 | 939,0.0 50 | 940,1.0 51 | 941,1.0 52 | 942,0.0 53 | 943,0.0 54 | 944,1.0 55 | 945,1.0 56 | 946,0.0 57 | 947,0.0 58 | 948,0.0 59 | 949,0.0 60 | 950,0.0 61 | 951,1.0 62 | 952,0.0 63 | 953,0.0 64 | 954,0.0 65 | 955,1.0 66 | 956,1.0 67 | 957,1.0 68 | 958,1.0 69 | 959,0.0 70 | 960,0.0 71 | 961,1.0 72 | 962,1.0 73 | 963,0.0 74 | 964,0.0 75 | 965,0.0 76 | 966,1.0 77 | 967,0.0 78 | 968,0.0 79 | 969,1.0 80 | 970,0.0 81 | 971,1.0 82 | 972,1.0 83 | 973,0.0 84 | 974,0.0 85 | 975,0.0 86 | 976,0.0 87 | 977,0.0 88 | 978,1.0 89 | 979,0.0 90 | 980,0.0 91 | 981,1.0 92 | 982,1.0 93 | 983,0.0 94 | 984,1.0 95 | 985,0.0 96 | 986,0.0 97 | 987,0.0 98 | 988,1.0 99 | 989,0.0 100 | 990,0.0 101 | 991,0.0 102 | 992,1.0 103 | 993,0.0 104 | 994,0.0 105 | 995,0.0 106 | 996,1.0 107 | 997,0.0 108 | 998,0.0 109 | 999,0.0 110 | 1000,0.0 111 | 1001,0.0 112 | 1002,0.0 113 | 1003,0.0 114 | 1004,1.0 115 | 1005,0.0 116 | 1006,1.0 117 | 1007,0.0 118 | 1008,0.0 119 | 1009,1.0 120 | 1010,0.0 121 | 1011,1.0 122 | 1012,1.0 123 | 1013,0.0 124 | 1014,1.0 125 | 1015,0.0 126 | 1016,0.0 127 | 
1017,1.0 128 | 1018,0.0 129 | 1019,0.0 130 | 1020,0.0 131 | 1021,0.0 132 | 1022,1.0 133 | 1023,0.0 134 | 1024,0.0 135 | 1025,0.0 136 | 1026,0.0 137 | 1027,0.0 138 | 1028,0.0 139 | 1029,0.0 140 | 1030,0.0 141 | 1031,0.0 142 | 1032,0.0 143 | 1033,1.0 144 | 1034,0.0 145 | 1035,0.0 146 | 1036,1.0 147 | 1037,0.0 148 | 1038,0.0 149 | 1039,0.0 150 | 1040,1.0 151 | 1041,0.0 152 | 1042,1.0 153 | 1043,0.0 154 | 1044,0.0 155 | 1045,1.0 156 | 1046,0.0 157 | 1047,0.0 158 | 1048,1.0 159 | 1049,0.0 160 | 1050,1.0 161 | 1051,1.0 162 | 1052,0.0 163 | 1053,1.0 164 | 1054,1.0 165 | 1055,0.0 166 | 1056,0.0 167 | 1057,1.0 168 | 1058,0.0 169 | 1059,0.0 170 | 1060,1.0 171 | 1061,0.0 172 | 1062,0.0 173 | 1063,0.0 174 | 1064,0.0 175 | 1065,0.0 176 | 1066,0.0 177 | 1067,1.0 178 | 1068,1.0 179 | 1069,0.0 180 | 1070,1.0 181 | 1071,1.0 182 | 1072,0.0 183 | 1073,0.0 184 | 1074,1.0 185 | 1075,0.0 186 | 1076,1.0 187 | 1077,0.0 188 | 1078,1.0 189 | 1079,0.0 190 | 1080,0.0 191 | 1081,0.0 192 | 1082,0.0 193 | 1083,0.0 194 | 1084,1.0 195 | 1085,0.0 196 | 1086,1.0 197 | 1087,0.0 198 | 1088,1.0 199 | 1089,0.0 200 | 1090,0.0 201 | 1091,0.0 202 | 1092,0.0 203 | 1093,1.0 204 | 1094,0.0 205 | 1095,1.0 206 | 1096,0.0 207 | 1097,0.0 208 | 1098,0.0 209 | 1099,0.0 210 | 1100,1.0 211 | 1101,0.0 212 | 1102,0.0 213 | 1103,0.0 214 | 1104,0.0 215 | 1105,1.0 216 | 1106,0.0 217 | 1107,0.0 218 | 1108,0.0 219 | 1109,0.0 220 | 1110,1.0 221 | 1111,0.0 222 | 1112,1.0 223 | 1113,0.0 224 | 1114,1.0 225 | 1115,1.0 226 | 1116,0.0 227 | 1117,1.0 228 | 1118,0.0 229 | 1119,0.0 230 | 1120,0.0 231 | 1121,0.0 232 | 1122,0.0 233 | 1123,1.0 234 | 1124,0.0 235 | 1125,0.0 236 | 1126,1.0 237 | 1127,0.0 238 | 1128,0.0 239 | 1129,1.0 240 | 1130,1.0 241 | 1131,1.0 242 | 1132,0.0 243 | 1133,1.0 244 | 1134,0.0 245 | 1135,0.0 246 | 1136,0.0 247 | 1137,0.0 248 | 1138,1.0 249 | 1139,0.0 250 | 1140,1.0 251 | 1141,0.0 252 | 1142,1.0 253 | 1143,0.0 254 | 1144,0.0 255 | 1145,0.0 256 | 1146,0.0 257 | 1147,0.0 258 | 1148,0.0 259 | 1149,0.0 260 | 1150,1.0 261 | 1151,0.0 262 | 1152,0.0 263 | 1153,0.0 264 | 1154,1.0 265 | 1155,1.0 266 | 1156,0.0 267 | 1157,0.0 268 | 1158,0.0 269 | 1159,0.0 270 | 1160,0.0 271 | 1161,0.0 272 | 1162,0.0 273 | 1163,0.0 274 | 1164,1.0 275 | 1165,0.0 276 | 1166,0.0 277 | 1167,1.0 278 | 1168,0.0 279 | 1169,0.0 280 | 1170,0.0 281 | 1171,0.0 282 | 1172,0.0 283 | 1173,1.0 284 | 1174,0.0 285 | 1175,0.0 286 | 1176,1.0 287 | 1177,0.0 288 | 1178,0.0 289 | 1179,0.0 290 | 1180,0.0 291 | 1181,0.0 292 | 1182,0.0 293 | 1183,0.0 294 | 1184,0.0 295 | 1185,0.0 296 | 1186,0.0 297 | 1187,0.0 298 | 1188,1.0 299 | 1189,0.0 300 | 1190,0.0 301 | 1191,0.0 302 | 1192,0.0 303 | 1193,0.0 304 | 1194,0.0 305 | 1195,0.0 306 | 1196,0.0 307 | 1197,1.0 308 | 1198,1.0 309 | 1199,1.0 310 | 1200,0.0 311 | 1201,0.0 312 | 1202,0.0 313 | 1203,1.0 314 | 1204,0.0 315 | 1205,0.0 316 | 1206,1.0 317 | 1207,1.0 318 | 1208,0.0 319 | 1209,0.0 320 | 1210,0.0 321 | 1211,0.0 322 | 1212,0.0 323 | 1213,0.0 324 | 1214,0.0 325 | 1215,1.0 326 | 1216,1.0 327 | 1217,0.0 328 | 1218,1.0 329 | 1219,0.0 330 | 1220,0.0 331 | 1221,0.0 332 | 1222,1.0 333 | 1223,1.0 334 | 1224,0.0 335 | 1225,1.0 336 | 1226,0.0 337 | 1227,0.0 338 | 1228,1.0 339 | 1229,0.0 340 | 1230,0.0 341 | 1231,0.0 342 | 1232,0.0 343 | 1233,0.0 344 | 1234,0.0 345 | 1235,1.0 346 | 1236,0.0 347 | 1237,0.0 348 | 1238,0.0 349 | 1239,1.0 350 | 1240,0.0 351 | 1241,1.0 352 | 1242,1.0 353 | 1243,0.0 354 | 1244,0.0 355 | 1245,0.0 356 | 1246,1.0 357 | 1247,0.0 358 | 1248,1.0 359 | 1249,0.0 360 | 1250,0.0 361 | 1251,1.0 362 | 1252,0.0 363 | 1253,1.0 364 | 
1254,1.0 365 | 1255,1.0 366 | 1256,1.0 367 | 1257,0.0 368 | 1258,0.0 369 | 1259,0.0 370 | 1260,1.0 371 | 1261,1.0 372 | 1262,0.0 373 | 1263,1.0 374 | 1264,0.0 375 | 1265,0.0 376 | 1266,1.0 377 | 1267,1.0 378 | 1268,0.0 379 | 1269,0.0 380 | 1270,0.0 381 | 1271,0.0 382 | 1272,0.0 383 | 1273,0.0 384 | 1274,1.0 385 | 1275,1.0 386 | 1276,0.0 387 | 1277,1.0 388 | 1278,0.0 389 | 1279,0.0 390 | 1280,0.0 391 | 1281,0.0 392 | 1282,0.0 393 | 1283,1.0 394 | 1284,0.0 395 | 1285,0.0 396 | 1286,0.0 397 | 1287,1.0 398 | 1288,0.0 399 | 1289,1.0 400 | 1290,0.0 401 | 1291,0.0 402 | 1292,1.0 403 | 1293,0.0 404 | 1294,1.0 405 | 1295,0.0 406 | 1296,0.0 407 | 1297,0.0 408 | 1298,0.0 409 | 1299,0.0 410 | 1300,0.0 411 | 1301,1.0 412 | 1302,0.0 413 | 1303,1.0 414 | 1304,0.0 415 | 1305,0.0 416 | 1306,1.0 417 | 1307,0.0 418 | 1308,0.0 419 | 1309,0.0 420 | -------------------------------------------------------------------------------- /data/titanic/genderclassmodel.py: -------------------------------------------------------------------------------- 1 | """ Now that the user can read in a file this creates a model which uses the price, class and gender 2 | Author : AstroDave 3 | Date : 18th September 2012 4 | Revised : 28 March 2014 5 | 6 | """ 7 | 8 | 9 | import csv as csv 10 | import numpy as np 11 | 12 | csv_file_object = csv.reader(open('train.csv', 'rb')) # Load in the csv file 13 | header = csv_file_object.next() # Skip the fist line as it is a header 14 | data=[] # Create a variable to hold the data 15 | 16 | for row in csv_file_object: # Skip through each row in the csv file 17 | data.append(row) # adding each row to the data variable 18 | data = np.array(data) # Then convert from a list to an array 19 | 20 | # In order to analyse the price column I need to bin up that data 21 | # here are my binning parameters, the problem we face is some of the fares are very large 22 | # So we can either have a lot of bins with nothing in them or we can just lose some 23 | # information by just considering that anythng over 39 is simply in the last bin. 24 | # So we add a ceiling 25 | fare_ceiling = 40 26 | # then modify the data in the Fare column to = 39, if it is greater or equal to the ceiling 27 | data[ data[0::,9].astype(np.float) >= fare_ceiling, 9 ] = fare_ceiling - 1.0 28 | 29 | fare_bracket_size = 10 30 | number_of_price_brackets = fare_ceiling / fare_bracket_size 31 | number_of_classes = 3 # I know there were 1st, 2nd and 3rd classes on board. 32 | number_of_classes = len(np.unique(data[0::,2])) # But it's better practice to calculate this from the Pclass directly: 33 | # just take the length of an array of UNIQUE values in column index 2 34 | 35 | 36 | # This reference matrix will show the proportion of survivors as a sorted table of 37 | # gender, class and ticket fare. 
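# (Illustrative note: with fare_ceiling = 40 and fare_bracket_size = 10 there are 4 fare bins, so the survival_table built below has shape (2, 3, 4), indexed as [sex, class, fare bin] with sex 0 = female and 1 = male; e.g. a 2nd-class female whose fare is 35 is looked up at survival_table[0, 1, 3].)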
38 | # First initialize it with all zeros 39 | survival_table = np.zeros([2,number_of_classes,number_of_price_brackets],float) 40 | 41 | # I can now find the stats of all the women and men on board 42 | for i in xrange(number_of_classes): 43 | for j in xrange(number_of_price_brackets): 44 | 45 | women_only_stats = data[ (data[0::,4] == "female") \ 46 | & (data[0::,2].astype(np.float) == i+1) \ 47 | & (data[0:,9].astype(np.float) >= j*fare_bracket_size) \ 48 | & (data[0:,9].astype(np.float) < (j+1)*fare_bracket_size), 1] 49 | 50 | men_only_stats = data[ (data[0::,4] != "female") \ 51 | & (data[0::,2].astype(np.float) == i+1) \ 52 | & (data[0:,9].astype(np.float) >= j*fare_bracket_size) \ 53 | & (data[0:,9].astype(np.float) < (j+1)*fare_bracket_size), 1] 54 | 55 | #if i == 0 and j == 3: 56 | 57 | survival_table[0,i,j] = np.mean(women_only_stats.astype(np.float)) # Female stats 58 | survival_table[1,i,j] = np.mean(men_only_stats.astype(np.float)) # Male stats 59 | 60 | # Since in Python taking the mean of an empty array (where the denominator is 0) returns nan, 61 | # we can convert these to 0 by finding where the array does not equal itself (nan != nan) 62 | # and setting those entries to 0. 63 | survival_table[ survival_table != survival_table ] = 0. 64 | 65 | # Now I have my proportions of survivors; simply round them such that if < 0.5 66 | # I predict they don't survive, and if >= 0.5 they do 67 | survival_table[ survival_table < 0.5 ] = 0 68 | survival_table[ survival_table >= 0.5 ] = 1 69 | 70 | # Now that I have my indicator I can read in the test file and write out 71 | # if a woman then survived (1), if a man then did not survive (0) 72 | # First read in the test file 73 | test_file = open('test.csv', 'rb') 74 | test_file_object = csv.reader(test_file) 75 | header = test_file_object.next() 76 | 77 | # Also open a new file so I can write to it.
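# (Note on the binning loop below: a test row with no recorded fare cannot be converted to a float, so it falls back to a class-based bin, bin_fare = 3 - Pclass: 1st class -> bin 2, 2nd class -> bin 1, 3rd class -> bin 0.)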
78 | predictions_file = open("genderclassmodel.csv", "wb") 79 | predictions_file_object = csv.writer(predictions_file) 80 | predictions_file_object.writerow(["PassengerId", "Survived"]) 81 | 82 | # First thing to do is bin up the price file 83 | for row in test_file_object: 84 | for j in xrange(number_of_price_brackets): 85 | # If there is no fare then place the price of the ticket according to class 86 | try: 87 | row[8] = float(row[8]) # No fare recorded will come up as a string so 88 | # try to make it a float 89 | except: # If fails then just bin the fare according to the class 90 | bin_fare = 3 - float(row[1]) 91 | break # Break from the loop and move to the next row 92 | if row[8] > fare_ceiling: # Otherwise now test to see if it is higher 93 | # than the fare ceiling we set earlier 94 | bin_fare = number_of_price_brackets - 1 95 | break # And then break to the next row 96 | 97 | if row[8] >= j*fare_bracket_size\ 98 | and row[8] < (j+1)*fare_bracket_size: # If passed these tests then loop through 99 | # each bin until you find the right one 100 | # append it to the bin_fare 101 | # and move to the next loop 102 | bin_fare = j 103 | break 104 | # Now I have the binned fare, passenger class, and whether female or male, we can 105 | # just cross ref their details with our survival table 106 | if row[3] == 'female': 107 | predictions_file_object.writerow([row[0], "%d" % int(survival_table[ 0, float(row[1]) - 1, bin_fare ])]) 108 | else: 109 | predictions_file_object.writerow([row[0], "%d" % int(survival_table[ 1, float(row[1]) - 1, bin_fare])]) 110 | 111 | # Close out the files 112 | test_file.close() 113 | predictions_file.close() -------------------------------------------------------------------------------- /deep-learning/tensor-flow-examples/input_data.py: -------------------------------------------------------------------------------- 1 | """Functions for downloading and reading MNIST data.""" 2 | from __future__ import print_function 3 | import gzip 4 | import os 5 | import urllib 6 | import numpy 7 | SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/' 8 | def maybe_download(filename, work_directory): 9 | """Download the data from Yann's website, unless it's already here.""" 10 | if not os.path.exists(work_directory): 11 | os.mkdir(work_directory) 12 | filepath = os.path.join(work_directory, filename) 13 | if not os.path.exists(filepath): 14 | filepath, _ = urllib.urlretrieve(SOURCE_URL + filename, filepath) 15 | statinfo = os.stat(filepath) 16 | print('Succesfully downloaded', filename, statinfo.st_size, 'bytes.') 17 | return filepath 18 | def _read32(bytestream): 19 | dt = numpy.dtype(numpy.uint32).newbyteorder('>') 20 | return numpy.frombuffer(bytestream.read(4), dtype=dt) 21 | def extract_images(filename): 22 | """Extract the images into a 4D uint8 numpy array [index, y, x, depth].""" 23 | print('Extracting', filename) 24 | with gzip.open(filename) as bytestream: 25 | magic = _read32(bytestream) 26 | if magic != 2051: 27 | raise ValueError( 28 | 'Invalid magic number %d in MNIST image file: %s' % 29 | (magic, filename)) 30 | num_images = _read32(bytestream) 31 | rows = _read32(bytestream) 32 | cols = _read32(bytestream) 33 | buf = bytestream.read(rows * cols * num_images) 34 | data = numpy.frombuffer(buf, dtype=numpy.uint8) 35 | data = data.reshape(num_images, rows, cols, 1) 36 | return data 37 | def dense_to_one_hot(labels_dense, num_classes=10): 38 | """Convert class labels from scalars to one-hot vectors.""" 39 | num_labels = labels_dense.shape[0] 40 | index_offset = 
numpy.arange(num_labels) * num_classes 41 | labels_one_hot = numpy.zeros((num_labels, num_classes)) 42 | labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1 43 | return labels_one_hot 44 | def extract_labels(filename, one_hot=False): 45 | """Extract the labels into a 1D uint8 numpy array [index].""" 46 | print('Extracting', filename) 47 | with gzip.open(filename) as bytestream: 48 | magic = _read32(bytestream) 49 | if magic != 2049: 50 | raise ValueError( 51 | 'Invalid magic number %d in MNIST label file: %s' % 52 | (magic, filename)) 53 | num_items = _read32(bytestream) 54 | buf = bytestream.read(num_items) 55 | labels = numpy.frombuffer(buf, dtype=numpy.uint8) 56 | if one_hot: 57 | return dense_to_one_hot(labels) 58 | return labels 59 | class DataSet(object): 60 | def __init__(self, images, labels, fake_data=False): 61 | if fake_data: 62 | self._num_examples = 10000 63 | else: 64 | assert images.shape[0] == labels.shape[0], ( 65 | "images.shape: %s labels.shape: %s" % (images.shape, 66 | labels.shape)) 67 | self._num_examples = images.shape[0] 68 | # Convert shape from [num examples, rows, columns, depth] 69 | # to [num examples, rows*columns] (assuming depth == 1) 70 | assert images.shape[3] == 1 71 | images = images.reshape(images.shape[0], 72 | images.shape[1] * images.shape[2]) 73 | # Convert from [0, 255] -> [0.0, 1.0]. 74 | images = images.astype(numpy.float32) 75 | images = numpy.multiply(images, 1.0 / 255.0) 76 | self._images = images 77 | self._labels = labels 78 | self._epochs_completed = 0 79 | self._index_in_epoch = 0 80 | @property 81 | def images(self): 82 | return self._images 83 | @property 84 | def labels(self): 85 | return self._labels 86 | @property 87 | def num_examples(self): 88 | return self._num_examples 89 | @property 90 | def epochs_completed(self): 91 | return self._epochs_completed 92 | def next_batch(self, batch_size, fake_data=False): 93 | """Return the next `batch_size` examples from this data set.""" 94 | if fake_data: 95 | fake_image = [1.0 for _ in xrange(784)] 96 | fake_label = 0 97 | return [fake_image for _ in xrange(batch_size)], [ 98 | fake_label for _ in xrange(batch_size)] 99 | start = self._index_in_epoch 100 | self._index_in_epoch += batch_size 101 | if self._index_in_epoch > self._num_examples: 102 | # Finished epoch 103 | self._epochs_completed += 1 104 | # Shuffle the data 105 | perm = numpy.arange(self._num_examples) 106 | numpy.random.shuffle(perm) 107 | self._images = self._images[perm] 108 | self._labels = self._labels[perm] 109 | # Start next epoch 110 | start = 0 111 | self._index_in_epoch = batch_size 112 | assert batch_size <= self._num_examples 113 | end = self._index_in_epoch 114 | return self._images[start:end], self._labels[start:end] 115 | def read_data_sets(train_dir, fake_data=False, one_hot=False): 116 | class DataSets(object): 117 | pass 118 | data_sets = DataSets() 119 | if fake_data: 120 | data_sets.train = DataSet([], [], fake_data=True) 121 | data_sets.validation = DataSet([], [], fake_data=True) 122 | data_sets.test = DataSet([], [], fake_data=True) 123 | return data_sets 124 | TRAIN_IMAGES = 'train-images-idx3-ubyte.gz' 125 | TRAIN_LABELS = 'train-labels-idx1-ubyte.gz' 126 | TEST_IMAGES = 't10k-images-idx3-ubyte.gz' 127 | TEST_LABELS = 't10k-labels-idx1-ubyte.gz' 128 | VALIDATION_SIZE = 5000 129 | local_file = maybe_download(TRAIN_IMAGES, train_dir) 130 | train_images = extract_images(local_file) 131 | local_file = maybe_download(TRAIN_LABELS, train_dir) 132 | train_labels = extract_labels(local_file, 
one_hot=one_hot) 133 | local_file = maybe_download(TEST_IMAGES, train_dir) 134 | test_images = extract_images(local_file) 135 | local_file = maybe_download(TEST_LABELS, train_dir) 136 | test_labels = extract_labels(local_file, one_hot=one_hot) 137 | validation_images = train_images[:VALIDATION_SIZE] 138 | validation_labels = train_labels[:VALIDATION_SIZE] 139 | train_images = train_images[VALIDATION_SIZE:] 140 | train_labels = train_labels[VALIDATION_SIZE:] 141 | data_sets.train = DataSet(train_images, train_labels) 142 | data_sets.validation = DataSet(validation_images, validation_labels) 143 | data_sets.test = DataSet(test_images, test_labels) 144 | return data_sets -------------------------------------------------------------------------------- /deep-learning/tensor-flow-examples/notebooks/1_intro/basic_operations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Basic Operations in TensorFlow\n", 8 | "\n", 9 | "Credits: Forked from [TensorFlow-Examples](https://github.com/aymericdamien/TensorFlow-Examples) by Aymeric Damien\n", 10 | "\n", 11 | "## Setup\n", 12 | "\n", 13 | "Refer to the [setup instructions](http://nbviewer.ipython.org/github/donnemartin/data-science-ipython-notebooks/blob/master/deep-learning/tensor-flow-examples/Setup_TensorFlow.md)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import tensorflow as tf" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "# Basic constant operations\n", 36 | "# The value returned by the constructor represents the output\n", 37 | "# of the Constant op.\n", 38 | "a = tf.constant(2)\n", 39 | "b = tf.constant(3)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "a=2, b=3\n", 54 | "Addition with constants: 5\n", 55 | "Multiplication with constants: 6\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "# Launch the default graph.\n", 61 | "with tf.Session() as sess:\n", 62 | " print \"a=2, b=3\"\n", 63 | " print \"Addition with constants: %i\" % sess.run(a+b)\n", 64 | " print \"Multiplication with constants: %i\" % sess.run(a*b)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "# Basic Operations with variable as graph input\n", 76 | "# The value returned by the constructor represents the output\n", 77 | "# of the Variable op. 
(define as input when running session)\n", 78 | "# tf Graph input\n", 79 | "a = tf.placeholder(tf.int16)\n", 80 | "b = tf.placeholder(tf.int16)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 6, 86 | "metadata": { 87 | "collapsed": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "# Define some operations\n", 92 | "add = tf.add(a, b)\n", 93 | "mul = tf.mul(a, b)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 7, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "Addition with variables: 5\n", 108 | "Multiplication with variables: 6\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "# Launch the default graph.\n", 114 | "with tf.Session() as sess:\n", 115 | " # Run every operation with variable input\n", 116 | " print \"Addition with variables: %i\" % sess.run(add, feed_dict={a: 2, b: 3})\n", 117 | " print \"Multiplication with variables: %i\" % sess.run(mul, feed_dict={a: 2, b: 3})" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 8, 123 | "metadata": { 124 | "collapsed": true 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "# ----------------\n", 129 | "# More in details:\n", 130 | "# Matrix Multiplication from TensorFlow official tutorial\n", 131 | "\n", 132 | "# Create a Constant op that produces a 1x2 matrix. The op is\n", 133 | "# added as a node to the default graph.\n", 134 | "#\n", 135 | "# The value returned by the constructor represents the output\n", 136 | "# of the Constant op.\n", 137 | "matrix1 = tf.constant([[3., 3.]])" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 9, 143 | "metadata": { 144 | "collapsed": true 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "# Create another Constant that produces a 2x1 matrix.\n", 149 | "matrix2 = tf.constant([[2.],[2.]])" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 10, 155 | "metadata": { 156 | "collapsed": true 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "# Create a Matmul op that takes 'matrix1' and 'matrix2' as inputs.\n", 161 | "# The returned value, 'product', represents the result of the matrix\n", 162 | "# multiplication.\n", 163 | "product = tf.matmul(matrix1, matrix2)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 11, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [ 173 | { 174 | "name": "stdout", 175 | "output_type": "stream", 176 | "text": [ 177 | "[[ 12.]]\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "# To run the matmul op we call the session 'run()' method, passing 'product'\n", 183 | "# which represents the output of the matmul op. This indicates to the call\n", 184 | "# that we want to get the output of the matmul op back.\n", 185 | "#\n", 186 | "# All inputs needed by the op are run automatically by the session. 
They\n", 187 | "# typically are run in parallel.\n", 188 | "#\n", 189 | "# The call 'run(product)' thus causes the execution of threes ops in the\n", 190 | "# graph: the two constants and matmul.\n", 191 | "#\n", 192 | "# The output of the op is returned in 'result' as a numpy `ndarray` object.\n", 193 | "with tf.Session() as sess:\n", 194 | " result = sess.run(product)\n", 195 | " print result" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "collapsed": true 203 | }, 204 | "outputs": [], 205 | "source": [] 206 | } 207 | ], 208 | "metadata": { 209 | "kernelspec": { 210 | "display_name": "Python 2", 211 | "language": "python", 212 | "name": "python2" 213 | }, 214 | "language_info": { 215 | "codemirror_mode": { 216 | "name": "ipython", 217 | "version": 2 218 | }, 219 | "file_extension": ".py", 220 | "mimetype": "text/x-python", 221 | "name": "python", 222 | "nbconvert_exporter": "python", 223 | "pygments_lexer": "ipython2", 224 | "version": "2.7.5+" 225 | } 226 | }, 227 | "nbformat": 4, 228 | "nbformat_minor": 0 229 | } 230 | -------------------------------------------------------------------------------- /deep-learning/tensor-flow-examples/notebooks/2_basic_classifiers/logistic_regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Logistic Regression in TensorFlow\n", 10 | "\n", 11 | "Credits: Forked from [TensorFlow-Examples](https://github.com/aymericdamien/TensorFlow-Examples) by Aymeric Damien\n", 12 | "\n", 13 | "## Setup\n", 14 | "\n", 15 | "Refer to the [setup instructions](http://nbviewer.ipython.org/github/donnemartin/data-science-ipython-notebooks/blob/master/deep-learning/tensor-flow-examples/Setup_TensorFlow.md)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 5, 21 | "metadata": { 22 | "collapsed": false 23 | }, 24 | "outputs": [ 25 | { 26 | "name": "stdout", 27 | "output_type": "stream", 28 | "text": [ 29 | "Extracting /tmp/data/train-images-idx3-ubyte.gz\n", 30 | "Extracting /tmp/data/train-labels-idx1-ubyte.gz\n", 31 | "Extracting /tmp/data/t10k-images-idx3-ubyte.gz\n", 32 | "Extracting /tmp/data/t10k-labels-idx1-ubyte.gz\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "# Import MINST data\n", 38 | "import input_data\n", 39 | "mnist = input_data.read_data_sets(\"/tmp/data/\", one_hot=True)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 6, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "import tensorflow as tf" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 7, 56 | "metadata": { 57 | "collapsed": true 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "# Parameters\n", 62 | "learning_rate = 0.01\n", 63 | "training_epochs = 25\n", 64 | "batch_size = 100\n", 65 | "display_step = 1" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 8, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "# tf Graph Input\n", 77 | "x = tf.placeholder(\"float\", [None, 784]) # mnist data image of shape 28*28=784\n", 78 | "y = tf.placeholder(\"float\", [None, 10]) # 0-9 digits recognition => 10 classes" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 9, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "# Create model\n", 90 | 
"\n", 91 | "# Set model weights\n", 92 | "W = tf.Variable(tf.zeros([784, 10]))\n", 93 | "b = tf.Variable(tf.zeros([10]))" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 10, 99 | "metadata": { 100 | "collapsed": true 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "# Construct model\n", 105 | "activation = tf.nn.softmax(tf.matmul(x, W) + b) # Softmax" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 11, 111 | "metadata": { 112 | "collapsed": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "# Minimize error using cross entropy\n", 117 | "# Cross entropy\n", 118 | "cost = -tf.reduce_sum(y*tf.log(activation)) \n", 119 | "# Gradient Descent\n", 120 | "optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost) " 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 12, 126 | "metadata": { 127 | "collapsed": true 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "# Initializing the variables\n", 132 | "init = tf.initialize_all_variables()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 13, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "Epoch: 0001 cost= 29.860479714\n", 147 | "Epoch: 0002 cost= 22.080549484\n", 148 | "Epoch: 0003 cost= 21.237104595\n", 149 | "Epoch: 0004 cost= 20.460196280\n", 150 | "Epoch: 0005 cost= 20.185128237\n", 151 | "Epoch: 0006 cost= 19.940297202\n", 152 | "Epoch: 0007 cost= 19.645111119\n", 153 | "Epoch: 0008 cost= 19.507218031\n", 154 | "Epoch: 0009 cost= 19.389794492\n", 155 | "Epoch: 0010 cost= 19.177005816\n", 156 | "Epoch: 0011 cost= 19.082493615\n", 157 | "Epoch: 0012 cost= 19.072873598\n", 158 | "Epoch: 0013 cost= 18.938005402\n", 159 | "Epoch: 0014 cost= 18.891806430\n", 160 | "Epoch: 0015 cost= 18.839480221\n", 161 | "Epoch: 0016 cost= 18.769349510\n", 162 | "Epoch: 0017 cost= 18.590865587\n", 163 | "Epoch: 0018 cost= 18.623413677\n", 164 | "Epoch: 0019 cost= 18.546149085\n", 165 | "Epoch: 0020 cost= 18.432274895\n", 166 | "Epoch: 0021 cost= 18.358189004\n", 167 | "Epoch: 0022 cost= 18.380014628\n", 168 | "Epoch: 0023 cost= 18.499993471\n", 169 | "Epoch: 0024 cost= 18.386477311\n", 170 | "Epoch: 0025 cost= 18.258080609\n", 171 | "Optimization Finished!\n", 172 | "Accuracy: 0.9048\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "# Launch the graph\n", 178 | "with tf.Session() as sess:\n", 179 | " sess.run(init)\n", 180 | "\n", 181 | " # Training cycle\n", 182 | " for epoch in range(training_epochs):\n", 183 | " avg_cost = 0.\n", 184 | " total_batch = int(mnist.train.num_examples/batch_size)\n", 185 | " # Loop over all batches\n", 186 | " for i in range(total_batch):\n", 187 | " batch_xs, batch_ys = mnist.train.next_batch(batch_size)\n", 188 | " # Fit training using batch data\n", 189 | " sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})\n", 190 | " # Compute average loss\n", 191 | " avg_cost += sess.run(cost, feed_dict={x: batch_xs, y: batch_ys})/total_batch\n", 192 | " # Display logs per epoch step\n", 193 | " if epoch % display_step == 0:\n", 194 | " print \"Epoch:\", '%04d' % (epoch+1), \"cost=\", \"{:.9f}\".format(avg_cost)\n", 195 | "\n", 196 | " print \"Optimization Finished!\"\n", 197 | "\n", 198 | " # Test model\n", 199 | " correct_prediction = tf.equal(tf.argmax(activation, 1), tf.argmax(y, 1))\n", 200 | " # Calculate accuracy\n", 201 | " accuracy = 
tf.reduce_mean(tf.cast(correct_prediction, \"float\"))\n", 202 | " print \"Accuracy:\", accuracy.eval({x: mnist.test.images, y: mnist.test.labels})" 203 | ] 204 | } 205 | ], 206 | "metadata": { 207 | "kernelspec": { 208 | "display_name": "Python 3", 209 | "language": "python", 210 | "name": "python3" 211 | }, 212 | "language_info": { 213 | "codemirror_mode": { 214 | "name": "ipython", 215 | "version": 3 216 | }, 217 | "file_extension": ".py", 218 | "mimetype": "text/x-python", 219 | "name": "python", 220 | "nbconvert_exporter": "python", 221 | "pygments_lexer": "ipython3", 222 | "version": "3.4.3" 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 0 227 | } 228 | -------------------------------------------------------------------------------- /python-data/datetime.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Dates and Times" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "* Basics\n", 22 | "* strftime\n", 23 | "* strptime\n", 24 | "* timedelta" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Basics" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 1, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "from datetime import datetime, date, time" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": { 49 | "collapsed": false 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "year = 2015\n", 54 | "month = 1\n", 55 | "day = 20\n", 56 | "hour = 7\n", 57 | "minute = 28\n", 58 | "second = 15" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "dt = datetime(year, month, day, hour, minute, second)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/plain": [ 82 | "(7, 28, 15)" 83 | ] 84 | }, 85 | "execution_count": 4, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "dt.hour, dt.minute, dt.second" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "Extract the equivalent date object:" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 5, 104 | "metadata": { 105 | "collapsed": false 106 | }, 107 | "outputs": [ 108 | { 109 | "data": { 110 | "text/plain": [ 111 | "datetime.date(2015, 1, 20)" 112 | ] 113 | }, 114 | "execution_count": 5, 115 | "metadata": {}, 116 | "output_type": "execute_result" 117 | } 118 | ], 119 | "source": [ 120 | "dt.date()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Extract the equivalent time object:" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 6, 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "datetime.time(7, 28, 15)" 141 | ] 142 | }, 143 | 
"execution_count": 6, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "dt.time()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "When aggregating or grouping time series data, it is sometimes useful to replace fields of a series of datetimes such as zeroing out the minute and second fields:" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 7, 162 | "metadata": { 163 | "collapsed": false 164 | }, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/plain": [ 169 | "datetime.datetime(2015, 1, 20, 7, 0)" 170 | ] 171 | }, 172 | "execution_count": 7, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "dt.replace(minute=0, second=0)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "## strftime" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "Format a datetime string:" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 8, 198 | "metadata": { 199 | "collapsed": false 200 | }, 201 | "outputs": [ 202 | { 203 | "data": { 204 | "text/plain": [ 205 | "'01/20/2015 07:28'" 206 | ] 207 | }, 208 | "execution_count": 8, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "dt.strftime('%m/%d/%Y %H:%M')" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "## strptime" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "Convert a string into a datetime object:" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 9, 234 | "metadata": { 235 | "collapsed": false 236 | }, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "datetime.datetime(2015, 1, 20, 0, 0)" 242 | ] 243 | }, 244 | "execution_count": 9, 245 | "metadata": {}, 246 | "output_type": "execute_result" 247 | } 248 | ], 249 | "source": [ 250 | "datetime.strptime('20150120', '%Y%m%d')" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "## timedelta" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "Get the current datetime:" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 10, 270 | "metadata": { 271 | "collapsed": false 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "dt_now = datetime.now()" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "Subtract two datetime fields to create a timedelta:" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 11, 288 | "metadata": { 289 | "collapsed": false 290 | }, 291 | "outputs": [ 292 | { 293 | "data": { 294 | "text/plain": [ 295 | "datetime.timedelta(6, 40171, 885211)" 296 | ] 297 | }, 298 | "execution_count": 11, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "delta = dt_now - dt\n", 305 | "delta" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "Add a datetime and a timedelta to get a new datetime:" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 12, 318 | "metadata": { 319 | "collapsed": false 320 | }, 321 | "outputs": [ 322 | { 323 | "data": { 324 | 
"text/plain": [ 325 | "datetime.datetime(2015, 1, 26, 18, 37, 46, 885211)" 326 | ] 327 | }, 328 | "execution_count": 12, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | } 332 | ], 333 | "source": [ 334 | "dt + delta" 335 | ] 336 | } 337 | ], 338 | "metadata": { 339 | "kernelspec": { 340 | "display_name": "Python 2", 341 | "language": "python", 342 | "name": "python2" 343 | }, 344 | "language_info": { 345 | "codemirror_mode": { 346 | "name": "ipython", 347 | "version": 2 348 | }, 349 | "file_extension": ".py", 350 | "mimetype": "text/x-python", 351 | "name": "python", 352 | "nbconvert_exporter": "python", 353 | "pygments_lexer": "ipython2", 354 | "version": "2.7.10" 355 | } 356 | }, 357 | "nbformat": 4, 358 | "nbformat_minor": 0 359 | } 360 | -------------------------------------------------------------------------------- /deep-learning/tensor-flow-examples/notebooks/3_neural_networks/multilayer_perceptron.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Multilayer Perceptron in TensorFlow\n", 8 | "\n", 9 | "Credits: Forked from [TensorFlow-Examples](https://github.com/aymericdamien/TensorFlow-Examples) by Aymeric Damien\n", 10 | "\n", 11 | "## Setup\n", 12 | "\n", 13 | "Refer to the [setup instructions](http://nbviewer.ipython.org/github/donnemartin/data-science-ipython-notebooks/blob/master/deep-learning/tensor-flow-examples/Setup_TensorFlow.md)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [ 23 | { 24 | "name": "stdout", 25 | "output_type": "stream", 26 | "text": [ 27 | "Extracting /tmp/data/train-images-idx3-ubyte.gz\n", 28 | "Extracting /tmp/data/train-labels-idx1-ubyte.gz\n", 29 | "Extracting /tmp/data/t10k-images-idx3-ubyte.gz\n", 30 | "Extracting /tmp/data/t10k-labels-idx1-ubyte.gz\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "# Import MINST data\n", 36 | "import input_data\n", 37 | "mnist = input_data.read_data_sets(\"/tmp/data/\", one_hot=True)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "import tensorflow as tf" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "# Parameters\n", 60 | "learning_rate = 0.001\n", 61 | "training_epochs = 15\n", 62 | "batch_size = 100\n", 63 | "display_step = 1" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 5, 69 | "metadata": { 70 | "collapsed": true 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "# Network Parameters\n", 75 | "n_hidden_1 = 256 # 1st layer num features\n", 76 | "n_hidden_2 = 256 # 2nd layer num features\n", 77 | "n_input = 784 # MNIST data input (img shape: 28*28)\n", 78 | "n_classes = 10 # MNIST total classes (0-9 digits)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 6, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "# tf Graph input\n", 90 | "x = tf.placeholder(\"float\", [None, n_input])\n", 91 | "y = tf.placeholder(\"float\", [None, n_classes])" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 7, 97 | "metadata": { 98 | "collapsed": true 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "# Create model\n", 103 | "def 
multilayer_perceptron(_X, _weights, _biases):\n", 104 | " #Hidden layer with RELU activation\n", 105 | " layer_1 = tf.nn.relu(tf.add(tf.matmul(_X, _weights['h1']), _biases['b1'])) \n", 106 | " #Hidden layer with RELU activation\n", 107 | " layer_2 = tf.nn.relu(tf.add(tf.matmul(layer_1, _weights['h2']), _biases['b2'])) \n", 108 | " return tf.matmul(layer_2, weights['out']) + biases['out']" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 8, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "# Store layers weight & bias\n", 120 | "weights = {\n", 121 | " 'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),\n", 122 | " 'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),\n", 123 | " 'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))\n", 124 | "}\n", 125 | "biases = {\n", 126 | " 'b1': tf.Variable(tf.random_normal([n_hidden_1])),\n", 127 | " 'b2': tf.Variable(tf.random_normal([n_hidden_2])),\n", 128 | " 'out': tf.Variable(tf.random_normal([n_classes]))\n", 129 | "}" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 9, 135 | "metadata": { 136 | "collapsed": true 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "# Construct model\n", 141 | "pred = multilayer_perceptron(x, weights, biases)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 10, 147 | "metadata": { 148 | "collapsed": true 149 | }, 150 | "outputs": [], 151 | "source": [ 152 | "# Define loss and optimizer\n", 153 | "# Softmax loss\n", 154 | "cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y)) \n", 155 | "# Adam Optimizer\n", 156 | "optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost) " 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 11, 162 | "metadata": { 163 | "collapsed": true 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "# Initializing the variables\n", 168 | "init = tf.initialize_all_variables()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 12, 174 | "metadata": { 175 | "collapsed": false 176 | }, 177 | "outputs": [ 178 | { 179 | "name": "stdout", 180 | "output_type": "stream", 181 | "text": [ 182 | "Epoch: 0001 cost= 160.113980416\n", 183 | "Epoch: 0002 cost= 38.665780694\n", 184 | "Epoch: 0003 cost= 24.118004577\n", 185 | "Epoch: 0004 cost= 16.440921303\n", 186 | "Epoch: 0005 cost= 11.689460141\n", 187 | "Epoch: 0006 cost= 8.469423468\n", 188 | "Epoch: 0007 cost= 6.223237230\n", 189 | "Epoch: 0008 cost= 4.560174118\n", 190 | "Epoch: 0009 cost= 3.250516910\n", 191 | "Epoch: 0010 cost= 2.359658795\n", 192 | "Epoch: 0011 cost= 1.694081847\n", 193 | "Epoch: 0012 cost= 1.167997509\n", 194 | "Epoch: 0013 cost= 0.872986831\n", 195 | "Epoch: 0014 cost= 0.630616366\n", 196 | "Epoch: 0015 cost= 0.487381571\n", 197 | "Optimization Finished!\n", 198 | "Accuracy: 0.9462\n" 199 | ] 200 | } 201 | ], 202 | "source": [ 203 | "# Launch the graph\n", 204 | "with tf.Session() as sess:\n", 205 | " sess.run(init)\n", 206 | "\n", 207 | " # Training cycle\n", 208 | " for epoch in range(training_epochs):\n", 209 | " avg_cost = 0.\n", 210 | " total_batch = int(mnist.train.num_examples/batch_size)\n", 211 | " # Loop over all batches\n", 212 | " for i in range(total_batch):\n", 213 | " batch_xs, batch_ys = mnist.train.next_batch(batch_size)\n", 214 | " # Fit training using batch data\n", 215 | " sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})\n", 216 | " # Compute 
average loss\n", 217 | " avg_cost += sess.run(cost, feed_dict={x: batch_xs, y: batch_ys})/total_batch\n", 218 | " # Display logs per epoch step\n", 219 | " if epoch % display_step == 0:\n", 220 | " print \"Epoch:\", '%04d' % (epoch+1), \"cost=\", \"{:.9f}\".format(avg_cost)\n", 221 | "\n", 222 | " print \"Optimization Finished!\"\n", 223 | "\n", 224 | " # Test model\n", 225 | " correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))\n", 226 | " # Calculate accuracy\n", 227 | " accuracy = tf.reduce_mean(tf.cast(correct_prediction, \"float\"))\n", 228 | " print \"Accuracy:\", accuracy.eval({x: mnist.test.images, y: mnist.test.labels})" 229 | ] 230 | } 231 | ], 232 | "metadata": { 233 | "kernelspec": { 234 | "display_name": "Python 3", 235 | "language": "python", 236 | "name": "python3" 237 | }, 238 | "language_info": { 239 | "codemirror_mode": { 240 | "name": "ipython", 241 | "version": 3 242 | }, 243 | "file_extension": ".py", 244 | "mimetype": "text/x-python", 245 | "name": "python", 246 | "nbconvert_exporter": "python", 247 | "pygments_lexer": "ipython3", 248 | "version": "3.4.3" 249 | } 250 | }, 251 | "nbformat": 4, 252 | "nbformat_minor": 0 253 | } 254 | -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/rnn_tutorial/rnn_precompile.py: -------------------------------------------------------------------------------- 1 | """This file is only here to speed up the execution of notebooks. 2 | 3 | It contains a subset of the code defined in simple_rnn.ipynb and 4 | lstm_text.ipynb, in particular the code compiling Theano function. 5 | Executing this script first will populate the cache of compiled C code, 6 | which will make subsequent compilations faster. 7 | 8 | The use case is to run this script in the background when a demo VM 9 | such as the one for NVIDIA's qwikLABS, so that the compilation phase 10 | started from the notebooks is faster. 
11 | 12 | """ 13 | import numpy 14 | 15 | import theano 16 | import theano.tensor as T 17 | 18 | from theano import config 19 | from theano.tensor.nnet import categorical_crossentropy 20 | 21 | 22 | floatX = theano.config.floatX 23 | 24 | 25 | # simple_rnn.ipynb 26 | 27 | class SimpleRNN(object): 28 | def __init__(self, input_dim, recurrent_dim): 29 | w_xh = numpy.random.normal(0, .01, (input_dim, recurrent_dim)) 30 | w_hh = numpy.random.normal(0, .02, (recurrent_dim, recurrent_dim)) 31 | self.w_xh = theano.shared(numpy.asarray(w_xh, dtype=floatX), name='w_xh') 32 | self.w_hh = theano.shared(numpy.asarray(w_hh, dtype=floatX), name='w_hh') 33 | self.b_h = theano.shared(numpy.zeros((recurrent_dim,), dtype=floatX), name='b_h') 34 | self.parameters = [self.w_xh, self.w_hh, self.b_h] 35 | 36 | def _step(self, input_t, previous): 37 | return T.tanh(T.dot(previous, self.w_hh) + input_t) 38 | 39 | def __call__(self, x): 40 | x_w_xh = T.dot(x, self.w_xh) + self.b_h 41 | result, updates = theano.scan(self._step, 42 | sequences=[x_w_xh], 43 | outputs_info=[T.zeros_like(self.b_h)]) 44 | return result 45 | 46 | 47 | w_ho_np = numpy.random.normal(0, .01, (15, 1)) 48 | w_ho = theano.shared(numpy.asarray(w_ho_np, dtype=floatX), name='w_ho') 49 | b_o = theano.shared(numpy.zeros((1,), dtype=floatX), name='b_o') 50 | 51 | x = T.matrix('x') 52 | my_rnn = SimpleRNN(1, 15) 53 | hidden = my_rnn(x) 54 | prediction = T.dot(hidden, w_ho) + b_o 55 | parameters = my_rnn.parameters + [w_ho, b_o] 56 | l2 = sum((p**2).sum() for p in parameters) 57 | mse = T.mean((prediction[:-1] - x[1:])**2) 58 | cost = mse + .0001 * l2 59 | gradient = T.grad(cost, wrt=parameters) 60 | 61 | lr = .3 62 | updates = [(par, par - lr * gra) for par, gra in zip(parameters, gradient)] 63 | update_model = theano.function([x], cost, updates=updates) 64 | get_cost = theano.function([x], mse) 65 | predict = theano.function([x], prediction) 66 | get_hidden = theano.function([x], hidden) 67 | get_gradient = theano.function([x], gradient) 68 | 69 | predict = theano.function([x], prediction) 70 | 71 | # Generating sequences 72 | 73 | x_t = T.vector() 74 | h_p = T.vector() 75 | preactivation = T.dot(x_t, my_rnn.w_xh) + my_rnn.b_h 76 | h_t = my_rnn._step(preactivation, h_p) 77 | o_t = T.dot(h_t, w_ho) + b_o 78 | 79 | single_step = theano.function([x_t, h_p], [o_t, h_t]) 80 | 81 | # lstm_text.ipynb 82 | 83 | def gauss_weight(rng, ndim_in, ndim_out=None, sd=.005): 84 | if ndim_out is None: 85 | ndim_out = ndim_in 86 | W = rng.randn(ndim_in, ndim_out) * sd 87 | return numpy.asarray(W, dtype=config.floatX) 88 | 89 | 90 | def index_dot(indices, w): 91 | return w[indices.flatten()] 92 | 93 | 94 | class LstmLayer: 95 | 96 | def __init__(self, rng, input, mask, n_in, n_h): 97 | 98 | # Init params 99 | self.W_i = theano.shared(gauss_weight(rng, n_in, n_h), 'W_i', borrow=True) 100 | self.W_f = theano.shared(gauss_weight(rng, n_in, n_h), 'W_f', borrow=True) 101 | self.W_c = theano.shared(gauss_weight(rng, n_in, n_h), 'W_c', borrow=True) 102 | self.W_o = theano.shared(gauss_weight(rng, n_in, n_h), 'W_o', borrow=True) 103 | 104 | self.U_i = theano.shared(gauss_weight(rng, n_h), 'U_i', borrow=True) 105 | self.U_f = theano.shared(gauss_weight(rng, n_h), 'U_f', borrow=True) 106 | self.U_c = theano.shared(gauss_weight(rng, n_h), 'U_c', borrow=True) 107 | self.U_o = theano.shared(gauss_weight(rng, n_h), 'U_o', borrow=True) 108 | 109 | self.b_i = theano.shared(numpy.zeros((n_h,), dtype=config.floatX), 110 | 'b_i', borrow=True) 111 | self.b_f = 
theano.shared(numpy.zeros((n_h,), dtype=config.floatX), 112 | 'b_f', borrow=True) 113 | self.b_c = theano.shared(numpy.zeros((n_h,), dtype=config.floatX), 114 | 'b_c', borrow=True) 115 | self.b_o = theano.shared(numpy.zeros((n_h,), dtype=config.floatX), 116 | 'b_o', borrow=True) 117 | 118 | self.params = [self.W_i, self.W_f, self.W_c, self.W_o, 119 | self.U_i, self.U_f, self.U_c, self.U_o, 120 | self.b_i, self.b_f, self.b_c, self.b_o] 121 | 122 | outputs_info = [T.zeros((input.shape[1], n_h)), 123 | T.zeros((input.shape[1], n_h))] 124 | 125 | rval, updates = theano.scan(self._step, 126 | sequences=[mask, input], 127 | outputs_info=outputs_info) 128 | 129 | # self.output is in the format (length, batchsize, n_h) 130 | self.output = rval[0] 131 | 132 | def _step(self, m_, x_, h_, c_): 133 | 134 | i_preact = (index_dot(x_, self.W_i) + 135 | T.dot(h_, self.U_i) + self.b_i) 136 | i = T.nnet.sigmoid(i_preact) 137 | 138 | f_preact = (index_dot(x_, self.W_f) + 139 | T.dot(h_, self.U_f) + self.b_f) 140 | f = T.nnet.sigmoid(f_preact) 141 | 142 | o_preact = (index_dot(x_, self.W_o) + 143 | T.dot(h_, self.U_o) + self.b_o) 144 | o = T.nnet.sigmoid(o_preact) 145 | 146 | c_preact = (index_dot(x_, self.W_c) + 147 | T.dot(h_, self.U_c) + self.b_c) 148 | c = T.tanh(c_preact) 149 | 150 | c = f * c_ + i * c 151 | c = m_[:, None] * c + (1. - m_)[:, None] * c_ 152 | 153 | h = o * T.tanh(c) 154 | h = m_[:, None] * h + (1. - m_)[:, None] * h_ 155 | 156 | return h, c 157 | 158 | 159 | def sequence_categorical_crossentropy(prediction, targets, mask): 160 | prediction_flat = prediction.reshape(((prediction.shape[0] * 161 | prediction.shape[1]), 162 | prediction.shape[2]), ndim=2) 163 | targets_flat = targets.flatten() 164 | mask_flat = mask.flatten() 165 | ce = categorical_crossentropy(prediction_flat, targets_flat) 166 | return T.sum(ce * mask_flat) 167 | 168 | 169 | class LogisticRegression(object): 170 | 171 | def __init__(self, rng, input, n_in, n_out): 172 | 173 | W = gauss_weight(rng, n_in, n_out) 174 | self.W = theano.shared(value=numpy.asarray(W, dtype=theano.config.floatX), 175 | name='W', borrow=True) 176 | # initialize the biases b as a vector of n_out 0s 177 | self.b = theano.shared(value=numpy.zeros((n_out,), 178 | dtype=theano.config.floatX), 179 | name='b', borrow=True) 180 | 181 | # compute vector of class-membership probabilities in symbolic form 182 | energy = T.dot(input, self.W) + self.b 183 | energy_exp = T.exp(energy - T.max(energy, axis=2, keepdims=True)) 184 | pmf = energy_exp / energy_exp.sum(axis=2, keepdims=True) 185 | self.p_y_given_x = pmf 186 | self.params = [self.W, self.b] 187 | 188 | batch_size = 100 189 | n_h = 50 190 | 191 | # The Theano graph 192 | # Set the random number generator' seeds for consistency 193 | rng = numpy.random.RandomState(12345) 194 | 195 | x = T.lmatrix('x') 196 | mask = T.matrix('mask') 197 | 198 | # Construct an LSTM layer and a logistic regression layer 199 | recurrent_layer = LstmLayer(rng=rng, input=x, mask=mask, n_in=111, n_h=n_h) 200 | logreg_layer = LogisticRegression(rng=rng, input=recurrent_layer.output[:-1], 201 | n_in=n_h, n_out=111) 202 | 203 | # define a cost variable to optimize 204 | cost = sequence_categorical_crossentropy(logreg_layer.p_y_given_x, 205 | x[1:], 206 | mask[1:]) / batch_size 207 | 208 | # create a list of all model parameters to be fit by gradient descent 209 | params = logreg_layer.params + recurrent_layer.params 210 | 211 | # create a list of gradients for all model parameters 212 | grads = T.grad(cost, params) 213 | 214 | 
learning_rate = 0.1 215 | updates = [ 216 | (param_i, param_i - learning_rate * grad_i) 217 | for param_i, grad_i in zip(params, grads) 218 | ] 219 | 220 | update_model = theano.function([x, mask], cost, updates=updates) 221 | 222 | evaluate_model = theano.function([x, mask], cost) 223 | 224 | # Generating Sequences 225 | x_t = T.iscalar() 226 | h_p = T.vector() 227 | c_p = T.vector() 228 | h_t, c_t = recurrent_layer._step(T.ones(1), x_t, h_p, c_p) 229 | energy = T.dot(h_t, logreg_layer.W) + logreg_layer.b 230 | 231 | energy_exp = T.exp(energy - T.max(energy, axis=1, keepdims=True)) 232 | 233 | output = energy_exp / energy_exp.sum(axis=1, keepdims=True) 234 | single_step = theano.function([x_t, h_p, c_p], [output, h_t, c_t]) 235 | -------------------------------------------------------------------------------- /scikit-learn/fig_code/figures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import warnings 4 | 5 | 6 | def plot_venn_diagram(): 7 | fig, ax = plt.subplots(subplot_kw=dict(frameon=False, xticks=[], yticks=[])) 8 | ax.add_patch(plt.Circle((0.3, 0.3), 0.3, fc='red', alpha=0.5)) 9 | ax.add_patch(plt.Circle((0.6, 0.3), 0.3, fc='blue', alpha=0.5)) 10 | ax.add_patch(plt.Rectangle((-0.1, -0.1), 1.1, 0.8, fc='none', ec='black')) 11 | ax.text(0.2, 0.3, '$x$', size=30, ha='center', va='center') 12 | ax.text(0.7, 0.3, '$y$', size=30, ha='center', va='center') 13 | ax.text(0.0, 0.6, '$I$', size=30) 14 | ax.axis('equal') 15 | 16 | 17 | def plot_example_decision_tree(): 18 | fig = plt.figure(figsize=(10, 4)) 19 | ax = fig.add_axes([0, 0, 0.8, 1], frameon=False, xticks=[], yticks=[]) 20 | ax.set_title('Example Decision Tree: Animal Classification', size=24) 21 | 22 | def text(ax, x, y, t, size=20, **kwargs): 23 | ax.text(x, y, t, 24 | ha='center', va='center', size=size, 25 | bbox=dict(boxstyle='round', ec='k', fc='w'), **kwargs) 26 | 27 | text(ax, 0.5, 0.9, "How big is\nthe animal?", 20) 28 | text(ax, 0.3, 0.6, "Does the animal\nhave horns?", 18) 29 | text(ax, 0.7, 0.6, "Does the animal\nhave two legs?", 18) 30 | text(ax, 0.12, 0.3, "Are the horns\nlonger than 10cm?", 14) 31 | text(ax, 0.38, 0.3, "Is the animal\nwearing a collar?", 14) 32 | text(ax, 0.62, 0.3, "Does the animal\nhave wings?", 14) 33 | text(ax, 0.88, 0.3, "Does the animal\nhave a tail?", 14) 34 | 35 | text(ax, 0.4, 0.75, "> 1m", 12, alpha=0.4) 36 | text(ax, 0.6, 0.75, "< 1m", 12, alpha=0.4) 37 | 38 | text(ax, 0.21, 0.45, "yes", 12, alpha=0.4) 39 | text(ax, 0.34, 0.45, "no", 12, alpha=0.4) 40 | 41 | text(ax, 0.66, 0.45, "yes", 12, alpha=0.4) 42 | text(ax, 0.79, 0.45, "no", 12, alpha=0.4) 43 | 44 | ax.plot([0.3, 0.5, 0.7], [0.6, 0.9, 0.6], '-k') 45 | ax.plot([0.12, 0.3, 0.38], [0.3, 0.6, 0.3], '-k') 46 | ax.plot([0.62, 0.7, 0.88], [0.3, 0.6, 0.3], '-k') 47 | ax.plot([0.0, 0.12, 0.20], [0.0, 0.3, 0.0], '--k') 48 | ax.plot([0.28, 0.38, 0.48], [0.0, 0.3, 0.0], '--k') 49 | ax.plot([0.52, 0.62, 0.72], [0.0, 0.3, 0.0], '--k') 50 | ax.plot([0.8, 0.88, 1.0], [0.0, 0.3, 0.0], '--k') 51 | ax.axis([0, 1, 0, 1]) 52 | 53 | 54 | def visualize_tree(estimator, X, y, boundaries=True, 55 | xlim=None, ylim=None): 56 | estimator.fit(X, y) 57 | 58 | if xlim is None: 59 | xlim = (X[:, 0].min() - 0.1, X[:, 0].max() + 0.1) 60 | if ylim is None: 61 | ylim = (X[:, 1].min() - 0.1, X[:, 1].max() + 0.1) 62 | 63 | x_min, x_max = xlim 64 | y_min, y_max = ylim 65 | xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), 66 | np.linspace(y_min, y_max, 100)) 67 | Z = 
estimator.predict(np.c_[xx.ravel(), yy.ravel()]) 68 | 69 | # Put the result into a color plot 70 | Z = Z.reshape(xx.shape) 71 | plt.figure() 72 | plt.pcolormesh(xx, yy, Z, alpha=0.2, cmap='rainbow') 73 | plt.clim(y.min(), y.max()) 74 | 75 | # Plot also the training points 76 | plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='rainbow') 77 | plt.axis('off') 78 | 79 | plt.xlim(x_min, x_max) 80 | plt.ylim(y_min, y_max) 81 | plt.clim(y.min(), y.max()) 82 | 83 | # Plot the decision boundaries 84 | def plot_boundaries(i, xlim, ylim): 85 | if i < 0: 86 | return 87 | 88 | tree = estimator.tree_ 89 | 90 | if tree.feature[i] == 0: 91 | plt.plot([tree.threshold[i], tree.threshold[i]], ylim, '-k') 92 | plot_boundaries(tree.children_left[i], 93 | [xlim[0], tree.threshold[i]], ylim) 94 | plot_boundaries(tree.children_right[i], 95 | [tree.threshold[i], xlim[1]], ylim) 96 | 97 | elif tree.feature[i] == 1: 98 | plt.plot(xlim, [tree.threshold[i], tree.threshold[i]], '-k') 99 | plot_boundaries(tree.children_left[i], xlim, 100 | [ylim[0], tree.threshold[i]]) 101 | plot_boundaries(tree.children_right[i], xlim, 102 | [tree.threshold[i], ylim[1]]) 103 | 104 | if boundaries: 105 | plot_boundaries(0, plt.xlim(), plt.ylim()) 106 | 107 | 108 | def plot_tree_interactive(X, y): 109 | from sklearn.tree import DecisionTreeClassifier 110 | 111 | def interactive_tree(depth=1): 112 | clf = DecisionTreeClassifier(max_depth=depth, random_state=0) 113 | visualize_tree(clf, X, y) 114 | 115 | from IPython.html.widgets import interact 116 | return interact(interactive_tree, depth=[1, 5]) 117 | 118 | 119 | def plot_kmeans_interactive(min_clusters=1, max_clusters=6): 120 | from IPython.html.widgets import interact 121 | from sklearn.metrics.pairwise import euclidean_distances 122 | from sklearn.datasets.samples_generator import make_blobs 123 | 124 | with warnings.catch_warnings(): 125 | warnings.filterwarnings('ignore') 126 | 127 | X, y = make_blobs(n_samples=300, centers=4, 128 | random_state=0, cluster_std=0.60) 129 | 130 | def _kmeans_step(frame=0, n_clusters=4): 131 | rng = np.random.RandomState(2) 132 | labels = np.zeros(X.shape[0]) 133 | centers = rng.randn(n_clusters, 2) 134 | 135 | nsteps = frame // 3 136 | 137 | for i in range(nsteps + 1): 138 | old_centers = centers 139 | if i < nsteps or frame % 3 > 0: 140 | dist = euclidean_distances(X, centers) 141 | labels = dist.argmin(1) 142 | 143 | if i < nsteps or frame % 3 > 1: 144 | centers = np.array([X[labels == j].mean(0) 145 | for j in range(n_clusters)]) 146 | nans = np.isnan(centers) 147 | centers[nans] = old_centers[nans] 148 | 149 | 150 | # plot the data and cluster centers 151 | plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='rainbow', 152 | vmin=0, vmax=n_clusters - 1); 153 | plt.scatter(old_centers[:, 0], old_centers[:, 1], marker='o', 154 | c=np.arange(n_clusters), 155 | s=200, cmap='rainbow') 156 | plt.scatter(old_centers[:, 0], old_centers[:, 1], marker='o', 157 | c='black', s=50) 158 | 159 | # plot new centers if third frame 160 | if frame % 3 == 2: 161 | for i in range(n_clusters): 162 | plt.annotate('', centers[i], old_centers[i], 163 | arrowprops=dict(arrowstyle='->', linewidth=1)) 164 | plt.scatter(centers[:, 0], centers[:, 1], marker='o', 165 | c=np.arange(n_clusters), 166 | s=200, cmap='rainbow') 167 | plt.scatter(centers[:, 0], centers[:, 1], marker='o', 168 | c='black', s=50) 169 | 170 | plt.xlim(-4, 4) 171 | plt.ylim(-2, 10) 172 | 173 | if frame % 3 == 1: 174 | plt.text(3.8, 9.5, "1. 
Reassign points to nearest centroid", 175 | ha='right', va='top', size=14) 176 | elif frame % 3 == 2: 177 | plt.text(3.8, 9.5, "2. Update centroids to cluster means", 178 | ha='right', va='top', size=14) 179 | 180 | 181 | return interact(_kmeans_step, frame=[0, 50], 182 | n_clusters=[min_clusters, max_clusters]) 183 | 184 | 185 | def plot_image_components(x, coefficients=None, mean=0, components=None, 186 | imshape=(8, 8), n_components=6, fontsize=12): 187 | if coefficients is None: 188 | coefficients = x 189 | 190 | if components is None: 191 | components = np.eye(len(coefficients), len(x)) 192 | 193 | mean = np.zeros_like(x) + mean 194 | 195 | 196 | fig = plt.figure(figsize=(1.2 * (5 + n_components), 1.2 * 2)) 197 | g = plt.GridSpec(2, 5 + n_components, hspace=0.3) 198 | 199 | def show(i, j, x, title=None): 200 | ax = fig.add_subplot(g[i, j], xticks=[], yticks=[]) 201 | ax.imshow(x.reshape(imshape), interpolation='nearest') 202 | if title: 203 | ax.set_title(title, fontsize=fontsize) 204 | 205 | show(slice(2), slice(2), x, "True") 206 | 207 | approx = mean.copy() 208 | show(0, 2, np.zeros_like(x) + mean, r'$\mu$') 209 | show(1, 2, approx, r'$1 \cdot \mu$') 210 | 211 | for i in range(0, n_components): 212 | approx = approx + coefficients[i] * components[i] 213 | show(0, i + 3, components[i], r'$c_{0}$'.format(i + 1)) 214 | show(1, i + 3, approx, 215 | r"${0:.2f} \cdot c_{1}$".format(coefficients[i], i + 1)) 216 | plt.gca().text(0, 1.05, '$+$', ha='right', va='bottom', 217 | transform=plt.gca().transAxes, fontsize=fontsize) 218 | 219 | show(slice(2), slice(-2, None), approx, "Approx") 220 | 221 | 222 | def plot_pca_interactive(data, n_components=6): 223 | from sklearn.decomposition import PCA 224 | from IPython.html.widgets import interact 225 | 226 | pca = PCA(n_components=n_components) 227 | Xproj = pca.fit_transform(data) 228 | 229 | def show_decomp(i=0): 230 | plot_image_components(data[i], Xproj[i], 231 | pca.mean_, pca.components_) 232 | 233 | interact(show_decomp, i=(0, data.shape[0] - 1)); 234 | -------------------------------------------------------------------------------- /deep-learning/tensor-flow-exercises/3_regularization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "kR-4eNdK6lYS" 8 | }, 9 | "source": [ 10 | "Deep Learning with TensorFlow\n", 11 | "=============\n", 12 | "\n", 13 | "Credits: Forked from [TensorFlow](https://github.com/tensorflow/tensorflow) by Google\n", 14 | "\n", 15 | "Setup\n", 16 | "------------\n", 17 | "\n", 18 | "Refer to the [setup instructions](https://github.com/donnemartin/data-science-ipython-notebooks/tree/feature/deep-learning/deep-learning/tensor-flow-exercises/README.md).\n", 19 | "\n", 20 | "Exercise 3\n", 21 | "------------\n", 22 | "\n", 23 | "Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.\n", 24 | "\n", 25 | "The goal of this exercise is to explore regularization techniques." 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "cellView": "both", 33 | "colab": { 34 | "autoexec": { 35 | "startup": false, 36 | "wait_interval": 0 37 | } 38 | }, 39 | "colab_type": "code", 40 | "collapsed": true, 41 | "id": "JLpLa8Jt7Vu4" 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "# These are all the modules we'll be using later. 
Make sure you can import them\n", 46 | "# before proceeding further.\n", 47 | "import cPickle as pickle\n", 48 | "import numpy as np\n", 49 | "import tensorflow as tf" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": { 55 | "colab_type": "text", 56 | "id": "1HrCK6e17WzV" 57 | }, 58 | "source": [ 59 | "First reload the data we generated in _notmist.ipynb_." 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "cellView": "both", 67 | "colab": { 68 | "autoexec": { 69 | "startup": false, 70 | "wait_interval": 0 71 | }, 72 | "output_extras": [ 73 | { 74 | "item_id": 1 75 | } 76 | ] 77 | }, 78 | "colab_type": "code", 79 | "collapsed": false, 80 | "executionInfo": { 81 | "elapsed": 11777, 82 | "status": "ok", 83 | "timestamp": 1449849322348, 84 | "user": { 85 | "color": "", 86 | "displayName": "", 87 | "isAnonymous": false, 88 | "isMe": true, 89 | "permissionId": "", 90 | "photoUrl": "", 91 | "sessionId": "0", 92 | "userId": "" 93 | }, 94 | "user_tz": 480 95 | }, 96 | "id": "y3-cj1bpmuxc", 97 | "outputId": "e03576f1-ebbe-4838-c388-f1777bcc9873" 98 | }, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "Training set (200000, 28, 28) (200000,)\n", 105 | "Validation set (10000, 28, 28) (10000,)\n", 106 | "Test set (18724, 28, 28) (18724,)\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "pickle_file = 'notMNIST.pickle'\n", 112 | "\n", 113 | "with open(pickle_file, 'rb') as f:\n", 114 | " save = pickle.load(f)\n", 115 | " train_dataset = save['train_dataset']\n", 116 | " train_labels = save['train_labels']\n", 117 | " valid_dataset = save['valid_dataset']\n", 118 | " valid_labels = save['valid_labels']\n", 119 | " test_dataset = save['test_dataset']\n", 120 | " test_labels = save['test_labels']\n", 121 | " del save # hint to help gc free up memory\n", 122 | " print 'Training set', train_dataset.shape, train_labels.shape\n", 123 | " print 'Validation set', valid_dataset.shape, valid_labels.shape\n", 124 | " print 'Test set', test_dataset.shape, test_labels.shape" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": { 130 | "colab_type": "text", 131 | "id": "L7aHrm6nGDMB" 132 | }, 133 | "source": [ 134 | "Reformat into a shape that's more adapted to the models we're going to train:\n", 135 | "- data as a flat matrix,\n", 136 | "- labels as float 1-hot encodings." 
137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "cellView": "both", 144 | "colab": { 145 | "autoexec": { 146 | "startup": false, 147 | "wait_interval": 0 148 | }, 149 | "output_extras": [ 150 | { 151 | "item_id": 1 152 | } 153 | ] 154 | }, 155 | "colab_type": "code", 156 | "collapsed": false, 157 | "executionInfo": { 158 | "elapsed": 11728, 159 | "status": "ok", 160 | "timestamp": 1449849322356, 161 | "user": { 162 | "color": "", 163 | "displayName": "", 164 | "isAnonymous": false, 165 | "isMe": true, 166 | "permissionId": "", 167 | "photoUrl": "", 168 | "sessionId": "0", 169 | "userId": "" 170 | }, 171 | "user_tz": 480 172 | }, 173 | "id": "IRSyYiIIGIzS", 174 | "outputId": "3f8996ee-3574-4f44-c953-5c8a04636582" 175 | }, 176 | "outputs": [ 177 | { 178 | "name": "stdout", 179 | "output_type": "stream", 180 | "text": [ 181 | "Training set (200000, 784) (200000, 10)\n", 182 | "Validation set (10000, 784) (10000, 10)\n", 183 | "Test set (18724, 784) (18724, 10)\n" 184 | ] 185 | } 186 | ], 187 | "source": [ 188 | "image_size = 28\n", 189 | "num_labels = 10\n", 190 | "\n", 191 | "def reformat(dataset, labels):\n", 192 | " dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)\n", 193 | " # Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]\n", 194 | " labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)\n", 195 | " return dataset, labels\n", 196 | "train_dataset, train_labels = reformat(train_dataset, train_labels)\n", 197 | "valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)\n", 198 | "test_dataset, test_labels = reformat(test_dataset, test_labels)\n", 199 | "print 'Training set', train_dataset.shape, train_labels.shape\n", 200 | "print 'Validation set', valid_dataset.shape, valid_labels.shape\n", 201 | "print 'Test set', test_dataset.shape, test_labels.shape" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": { 208 | "cellView": "both", 209 | "colab": { 210 | "autoexec": { 211 | "startup": false, 212 | "wait_interval": 0 213 | } 214 | }, 215 | "colab_type": "code", 216 | "collapsed": true, 217 | "id": "RajPLaL_ZW6w" 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "def accuracy(predictions, labels):\n", 222 | " return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))\n", 223 | " / predictions.shape[0])" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": { 229 | "colab_type": "text", 230 | "id": "sgLbUAQ1CW-1" 231 | }, 232 | "source": [ 233 | "---\n", 234 | "Problem 1\n", 235 | "---------\n", 236 | "\n", 237 | "Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compue the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.\n", 238 | "\n", 239 | "---" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": { 245 | "colab_type": "text", 246 | "id": "na8xX2yHZzNF" 247 | }, 248 | "source": [ 249 | "---\n", 250 | "Problem 2\n", 251 | "---------\n", 252 | "Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. 
What happens?\n", 253 | "\n", 254 | "---" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": { 260 | "colab_type": "text", 261 | "id": "ww3SCBUdlkRc" 262 | }, 263 | "source": [ 264 | "---\n", 265 | "Problem 3\n", 266 | "---------\n", 267 | "Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.\n", 268 | "\n", 269 | "What happens to our extreme overfitting case?\n", 270 | "\n", 271 | "---" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": { 277 | "colab_type": "text", 278 | "id": "-b1hTz3VWZjw" 279 | }, 280 | "source": [ 281 | "---\n", 282 | "Problem 4\n", 283 | "---------\n", 284 | "\n", 285 | "Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).\n", 286 | "\n", 287 | "One avenue you can explore is to add multiple layers.\n", 288 | "\n", 289 | "Another one is to use learning rate decay:\n", 290 | "\n", 291 | " global_step = tf.Variable(0) # count the number of steps taken.\n", 292 | " learning_rate = tf.train.exponential_decay(0.5, step, ...)\n", 293 | " optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)\n", 294 | " \n", 295 | " ---\n" 296 | ] 297 | } 298 | ], 299 | "metadata": { 300 | "colabVersion": "0.3.2", 301 | "colab_default_view": {}, 302 | "colab_views": {}, 303 | "kernelspec": { 304 | "display_name": "Python 3", 305 | "language": "python", 306 | "name": "python3" 307 | }, 308 | "language_info": { 309 | "codemirror_mode": { 310 | "name": "ipython", 311 | "version": 3 312 | }, 313 | "file_extension": ".py", 314 | "mimetype": "text/x-python", 315 | "name": "python", 316 | "nbconvert_exporter": "python", 317 | "pygments_lexer": "ipython3", 318 | "version": "3.4.3" 319 | } 320 | }, 321 | "nbformat": 4, 322 | "nbformat_minor": 0 323 | } 324 | -------------------------------------------------------------------------------- /deep-learning/theano-tutorial/rnn_tutorial/lstm_text.py: -------------------------------------------------------------------------------- 1 | import cPickle as pkl 2 | import time 3 | 4 | import numpy 5 | import theano 6 | from theano import config 7 | import theano.tensor as T 8 | from theano.tensor.nnet import categorical_crossentropy 9 | 10 | from fuel.datasets import TextFile 11 | from fuel.streams import DataStream 12 | from fuel.schemes import ConstantScheme 13 | from fuel.transformers import Batch, Padding 14 | 15 | 16 | # These files can be downloaded from 17 | # http://www-etud.iro.umontreal.ca/~brakelp/train.txt.gz 18 | # http://www-etud.iro.umontreal.ca/~brakelp/dictionary.pkl 19 | # don't forget to change the paths and gunzip train.txt.gz 20 | TRAIN_FILE = '/u/brakelp/temp/traindata.txt' 21 | VAL_FILE = '/u/brakelp/temp/valdata.txt' 22 | DICT_FILE = '/u/brakelp/temp/dictionary.pkl' 23 | 24 | 25 | def sequence_categorical_crossentropy(prediction, targets, mask): 26 | prediction_flat = prediction.reshape(((prediction.shape[0] * 27 | prediction.shape[1]), 28 | prediction.shape[2]), ndim=2) 29 | targets_flat = targets.flatten() 30 | mask_flat = mask.flatten() 31 | ce = categorical_crossentropy(prediction_flat, targets_flat) 32 | 
return T.sum(ce * mask_flat) 33 | 34 | 35 | def gauss_weight(ndim_in, ndim_out=None, sd=.005): 36 | if ndim_out is None: 37 | ndim_out = ndim_in 38 | W = numpy.random.randn(ndim_in, ndim_out) * sd 39 | return numpy.asarray(W, dtype=config.floatX) 40 | 41 | 42 | class LogisticRegression(object): 43 | """Multi-class Logistic Regression Class 44 | 45 | The logistic regression is fully described by a weight matrix :math:`W` 46 | and bias vector :math:`b`. Classification is done by projecting data 47 | points onto a set of hyperplanes, the distance to which is used to 48 | determine a class membership probability. 49 | """ 50 | 51 | def __init__(self, input, n_in, n_out): 52 | """ Initialize the parameters of the logistic regression 53 | 54 | :type input: theano.tensor.TensorType 55 | :param input: symbolic variable that describes the input of the 56 | architecture (one minibatch) 57 | 58 | :type n_in: int 59 | :param n_in: number of input units, the dimension of the space in 60 | which the datapoints lie 61 | 62 | :type n_out: int 63 | :param n_out: number of output units, the dimension of the space in 64 | which the labels lie 65 | 66 | """ 67 | 68 | # initialize with 0 the weights W as a matrix of shape (n_in, n_out) 69 | self.W = theano.shared(value=numpy.zeros((n_in, n_out), 70 | dtype=theano.config.floatX), 71 | name='W', borrow=True) 72 | # initialize the baises b as a vector of n_out 0s 73 | self.b = theano.shared(value=numpy.zeros((n_out,), 74 | dtype=theano.config.floatX), 75 | name='b', borrow=True) 76 | 77 | # compute vector of class-membership probabilities in symbolic form 78 | energy = T.dot(input, self.W) + self.b 79 | energy_exp = T.exp(energy - T.max(energy, 2)[:, :, None]) 80 | pmf = energy_exp / energy_exp.sum(2)[:, :, None] 81 | self.p_y_given_x = pmf 82 | 83 | # compute prediction as class whose probability is maximal in 84 | # symbolic form 85 | self.y_pred = T.argmax(self.p_y_given_x, axis=1) 86 | 87 | # parameters of the model 88 | self.params = [self.W, self.b] 89 | 90 | 91 | def index_dot(indices, w): 92 | return w[indices.flatten()] 93 | 94 | 95 | class LstmLayer: 96 | 97 | def __init__(self, rng, input, mask, n_in, n_h): 98 | 99 | # Init params 100 | self.W_i = theano.shared(gauss_weight(n_in, n_h), 'W_i', borrow=True) 101 | self.W_f = theano.shared(gauss_weight(n_in, n_h), 'W_f', borrow=True) 102 | self.W_c = theano.shared(gauss_weight(n_in, n_h), 'W_c', borrow=True) 103 | self.W_o = theano.shared(gauss_weight(n_in, n_h), 'W_o', borrow=True) 104 | 105 | self.U_i = theano.shared(gauss_weight(n_h), 'U_i', borrow=True) 106 | self.U_f = theano.shared(gauss_weight(n_h), 'U_f', borrow=True) 107 | self.U_c = theano.shared(gauss_weight(n_h), 'U_c', borrow=True) 108 | self.U_o = theano.shared(gauss_weight(n_h), 'U_o', borrow=True) 109 | 110 | self.b_i = theano.shared(numpy.zeros((n_h,), dtype=config.floatX), 111 | 'b_i', borrow=True) 112 | self.b_f = theano.shared(numpy.zeros((n_h,), dtype=config.floatX), 113 | 'b_f', borrow=True) 114 | self.b_c = theano.shared(numpy.zeros((n_h,), dtype=config.floatX), 115 | 'b_c', borrow=True) 116 | self.b_o = theano.shared(numpy.zeros((n_h,), dtype=config.floatX), 117 | 'b_o', borrow=True) 118 | 119 | self.params = [self.W_i, self.W_f, self.W_c, self.W_o, 120 | self.U_i, self.U_f, self.U_c, self.U_o, 121 | self.b_i, self.b_f, self.b_c, self.b_o] 122 | 123 | outputs_info = [T.zeros((input.shape[1], n_h)), 124 | T.zeros((input.shape[1], n_h))] 125 | 126 | rval, updates = theano.scan(self._step, 127 | sequences=[mask, input], 128 | 
outputs_info=outputs_info) 129 | 130 | # self.output is in the format (batchsize, n_h) 131 | self.output = rval[0] 132 | 133 | def _step(self, m_, x_, h_, c_): 134 | 135 | i_preact = (index_dot(x_, self.W_i) + 136 | T.dot(h_, self.U_i) + self.b_i) 137 | i = T.nnet.sigmoid(i_preact) 138 | 139 | f_preact = (index_dot(x_, self.W_f) + 140 | T.dot(h_, self.U_f) + self.b_f) 141 | f = T.nnet.sigmoid(f_preact) 142 | 143 | o_preact = (index_dot(x_, self.W_o) + 144 | T.dot(h_, self.U_o) + self.b_o) 145 | o = T.nnet.sigmoid(o_preact) 146 | 147 | c_preact = (index_dot(x_, self.W_c) + 148 | T.dot(h_, self.U_c) + self.b_c) 149 | c = T.tanh(c_preact) 150 | 151 | c = f * c_ + i * c 152 | c = m_[:, None] * c + (1. - m_)[:, None] * c_ 153 | 154 | h = o * T.tanh(c) 155 | h = m_[:, None] * h + (1. - m_)[:, None] * h_ 156 | 157 | return h, c 158 | 159 | 160 | def train_model(batch_size=100, n_h=50, n_epochs=40): 161 | 162 | # Load the datasets with Fuel 163 | dictionary = pkl.load(open(DICT_FILE, 'r')) 164 | dictionary['~'] = len(dictionary) 165 | reverse_mapping = dict((j, i) for i, j in dictionary.items()) 166 | 167 | print("Loading the data") 168 | train = TextFile(files=[TRAIN_FILE], 169 | dictionary=dictionary, 170 | unk_token='~', 171 | level='character', 172 | preprocess=str.lower, 173 | bos_token=None, 174 | eos_token=None) 175 | 176 | train_stream = DataStream.default_stream(train) 177 | 178 | # organize data in batches and pad shorter sequences with zeros 179 | train_stream = Batch(train_stream, 180 | iteration_scheme=ConstantScheme(batch_size)) 181 | train_stream = Padding(train_stream) 182 | 183 | # idem dito for the validation text 184 | val = TextFile(files=[VAL_FILE], 185 | dictionary=dictionary, 186 | unk_token='~', 187 | level='character', 188 | preprocess=str.lower, 189 | bos_token=None, 190 | eos_token=None) 191 | 192 | val_stream = DataStream.default_stream(val) 193 | 194 | # organize data in batches and pad shorter sequences with zeros 195 | val_stream = Batch(val_stream, 196 | iteration_scheme=ConstantScheme(batch_size)) 197 | val_stream = Padding(val_stream) 198 | 199 | print('Building model') 200 | 201 | # Set the random number generator' seeds for consistency 202 | rng = numpy.random.RandomState(12345) 203 | 204 | x = T.lmatrix('x') 205 | mask = T.matrix('mask') 206 | 207 | # Construct the LSTM layer 208 | recurrent_layer = LstmLayer(rng=rng, input=x, mask=mask, n_in=111, n_h=n_h) 209 | 210 | logreg_layer = LogisticRegression(input=recurrent_layer.output[:-1], 211 | n_in=n_h, n_out=111) 212 | 213 | cost = sequence_categorical_crossentropy(logreg_layer.p_y_given_x, 214 | x[1:], 215 | mask[1:]) / batch_size 216 | 217 | # create a list of all model parameters to be fit by gradient descent 218 | params = logreg_layer.params + recurrent_layer.params 219 | 220 | # create a list of gradients for all model parameters 221 | grads = T.grad(cost, params) 222 | 223 | # update_model is a function that updates the model parameters by 224 | # SGD Since this model has many parameters, it would be tedious to 225 | # manually create an update rule for each model parameter. We thus 226 | # create the updates list by automatically looping over all 227 | # (params[i], grads[i]) pairs. 
228 | learning_rate = 0.1 229 | updates = [ 230 | (param_i, param_i - learning_rate * grad_i) 231 | for param_i, grad_i in zip(params, grads) 232 | ] 233 | 234 | update_model = theano.function([x, mask], cost, updates=updates) 235 | 236 | evaluate_model = theano.function([x, mask], cost) 237 | 238 | # Define and compile a function for generating a sequence step by step. 239 | x_t = T.iscalar() 240 | h_p = T.vector() 241 | c_p = T.vector() 242 | h_t, c_t = recurrent_layer._step(T.ones(1), x_t, h_p, c_p) 243 | energy = T.dot(h_t, logreg_layer.W) + logreg_layer.b 244 | 245 | energy_exp = T.exp(energy - T.max(energy, 1)[:, None]) 246 | 247 | output = energy_exp / energy_exp.sum(1)[:, None] 248 | single_step = theano.function([x_t, h_p, c_p], [output, h_t, c_t]) 249 | 250 | start_time = time.clock() 251 | 252 | iteration = 0 253 | 254 | for epoch in range(n_epochs): 255 | print 'epoch:', epoch 256 | 257 | for x_, mask_ in train_stream.get_epoch_iterator(): 258 | iteration += 1 259 | 260 | cross_entropy = update_model(x_.T, mask_.T) 261 | 262 | 263 | # Generate some text after each 20 minibatches 264 | if iteration % 40 == 0: 265 | try: 266 | prediction = numpy.ones(111, dtype=config.floatX) / 111.0 267 | h_p = numpy.zeros((n_h,), dtype=config.floatX) 268 | c_p = numpy.zeros((n_h,), dtype=config.floatX) 269 | initial = 'the meaning of life is ' 270 | sentence = initial 271 | for char in initial: 272 | x_t = dictionary[char] 273 | prediction, h_p, c_p = single_step(x_t, h_p.flatten(), 274 | c_p.flatten()) 275 | sample = numpy.random.multinomial(1, prediction.flatten()) 276 | for i in range(450): 277 | x_t = numpy.argmax(sample) 278 | prediction, h_p, c_p = single_step(x_t, h_p.flatten(), 279 | c_p.flatten()) 280 | sentence += reverse_mapping[x_t] 281 | sample = numpy.random.multinomial(1, prediction.flatten()) 282 | print 'LSTM: "' + sentence + '"' 283 | except ValueError: 284 | print 'Something went wrong during sentence generation.' 285 | 286 | if iteration % 40 == 0: 287 | print 'epoch:', epoch, ' minibatch:', iteration 288 | val_scores = [] 289 | for x_val, mask_val in val_stream.get_epoch_iterator(): 290 | val_scores.append(evaluate_model(x_val.T, mask_val.T)) 291 | print 'Average validation CE per sentence:', numpy.mean(val_scores) 292 | 293 | end_time = time.clock() 294 | print('Optimization complete.') 295 | print('The code ran for %.2fm' % ((end_time - start_time) / 60.)) 296 | 297 | 298 | if __name__ == '__main__': 299 | train_model() 300 | --------------------------------------------------------------------------------
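Note on the regularization exercises: Problems 1 and 3 of `deep-learning/tensor-flow-exercises/3_regularization.ipynb` (dumped above) describe L2 weight penalties and dropout in prose but deliberately leave the implementation to the reader. The snippet below is a minimal sketch of one possible answer, not the notebook's reference solution. It assumes the TensorFlow 1.x graph API used elsewhere in this repository, and the names and values it introduces (`num_hidden`, `beta`, `keep_prob`, the 1024-unit hidden layer, the 0.5 learning rate) are illustrative choices rather than values taken from the exercises.

    # Sketch for Problems 1 and 3 of 3_regularization.ipynb (assumptions as noted above):
    # a one-hidden-layer network on the reformatted notMNIST data, with an L2 penalty
    # on the weight matrices and dropout on the hidden layer. TensorFlow 1.x graph API.
    import tensorflow as tf

    image_size = 28      # matches the notebook's reformat() step (flattened 28x28 images)
    num_labels = 10
    num_hidden = 1024    # assumed hidden-layer width
    beta = 1e-3          # assumed L2 strength; tune against validation accuracy

    graph = tf.Graph()
    with graph.as_default():
        # Placeholders. keep_prob is fed as e.g. 0.5 during training and 1.0 during
        # evaluation, so dropout is active only while training (Problem 3's caveat).
        x = tf.placeholder(tf.float32, shape=(None, image_size * image_size))
        y = tf.placeholder(tf.float32, shape=(None, num_labels))
        keep_prob = tf.placeholder(tf.float32)

        w1 = tf.Variable(tf.truncated_normal([image_size * image_size, num_hidden], stddev=0.1))
        b1 = tf.Variable(tf.zeros([num_hidden]))
        w2 = tf.Variable(tf.truncated_normal([num_hidden, num_labels], stddev=0.1))
        b2 = tf.Variable(tf.zeros([num_labels]))

        hidden = tf.nn.relu(tf.matmul(x, w1) + b1)
        hidden = tf.nn.dropout(hidden, keep_prob)   # Problem 3: dropout on the hidden layer only
        logits = tf.matmul(hidden, w2) + b2

        # Problem 1: add an L2 penalty on the weights to the cross-entropy loss.
        cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
        loss = cross_entropy + beta * (tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2))

        optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

At evaluation time `keep_prob` would be fed as 1.0 so the forward pass is deterministic; during training, feeding a value below 1.0 reproduces the overfitting-reduction effect the notebook asks you to observe in its few-batch overfitting experiment (Problem 2).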