├── images ├── MLP.png ├── ds_ai.png ├── backprop.png ├── fwd_step.png ├── github.jpg ├── ndarray.png ├── pycharm.png ├── sigmoid.png ├── tf_logo.png ├── venn_ds.png ├── Perceptron.png ├── cluster_0.png ├── cluster_1.png ├── df_inside.png ├── df_outside.png ├── ds_ai_full.png ├── edit_mode.png ├── join-inner.png ├── join-left.png ├── join-outer.png ├── join-right.png ├── reference.png ├── bkwd_step_net.png ├── command_mode.png ├── fwd_step_net.png ├── gmail_small.png ├── iris_setosa.jpg ├── ml-wordle-436.jpg ├── mlp_details.png ├── overfitting.png ├── petal_sepal.jpg ├── scikit-learn.png ├── single_layer.png ├── storage_index.png ├── twitter_small.png ├── venn_diagram.png ├── df_inside_numpy.png ├── iris_versicolor.jpg ├── iris_virginica.jpg ├── linkedin_small.png ├── menubar_toolbar.png ├── multi-layers-1.png ├── multi-layers-2.png ├── storage_simple.png ├── tensorflow_head.png ├── Perceptron and MLP.png ├── cluster_comparison.png ├── keras-logo-small.jpg ├── logistic_function.png ├── modeling_data_flow.png ├── ndarray_with_details.png ├── keras-tensorflow-logo.jpg ├── ml_supervised_example.png ├── ml_unsupervised_example.png └── scikit-learn-cheatsheet.png ├── 2_alchemist ├── data │ ├── blooth_sales_data.xlsx │ ├── blooth_sales_data_clean.xlsx │ └── sampledf.json ├── helpers.py ├── createFakeHDF.py ├── plot_clustering.py ├── 0. Introducing Pandas.ipynb ├── 5. Level Up.ipynb └── 1. Data selection & Indexing.ipynb ├── requirements.txt ├── conda-environment.yml ├── 3_mage ├── utils │ ├── plot_linear_svc_regularization.py │ ├── __init__.py │ ├── plot_interactive_forest.py │ ├── plot_kneighbors_regularization.py │ ├── plot_2d_separator.py │ ├── plot_rbf_svm_parameters.py │ └── plot_interactive_tree.py ├── 5.1. Review of Scikit-learn API.ipynb └── 1.1. Introduction to Machine Learning.ipynb ├── LICENSE ├── .gitignore ├── 0_basic_chemicals ├── 01. Data Science What is What if.ipynb └── 03. Developer tools for Data Science.ipynb ├── 1_apprentice ├── 2.3. Scipy Challenge.ipynb ├── 2.2. Scipy Sparse_Matrices.ipynb ├── 3.4 Level Up.ipynb └── 1.5. 
Numpy Challenge.ipynb ├── README.md └── 4_archmage ├── intro_to_ann.csv ├── ann.py └── 2.2.1 Keras Backend.ipynb /images/MLP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/MLP.png -------------------------------------------------------------------------------- /images/ds_ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/ds_ai.png -------------------------------------------------------------------------------- /images/backprop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/backprop.png -------------------------------------------------------------------------------- /images/fwd_step.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/fwd_step.png -------------------------------------------------------------------------------- /images/github.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/github.jpg -------------------------------------------------------------------------------- /images/ndarray.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/ndarray.png -------------------------------------------------------------------------------- /images/pycharm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/pycharm.png -------------------------------------------------------------------------------- /images/sigmoid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/sigmoid.png -------------------------------------------------------------------------------- /images/tf_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/tf_logo.png -------------------------------------------------------------------------------- /images/venn_ds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/venn_ds.png -------------------------------------------------------------------------------- /images/Perceptron.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/Perceptron.png -------------------------------------------------------------------------------- /images/cluster_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/cluster_0.png -------------------------------------------------------------------------------- /images/cluster_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/cluster_1.png -------------------------------------------------------------------------------- /images/df_inside.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/df_inside.png -------------------------------------------------------------------------------- /images/df_outside.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/df_outside.png -------------------------------------------------------------------------------- /images/ds_ai_full.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/ds_ai_full.png -------------------------------------------------------------------------------- /images/edit_mode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/edit_mode.png -------------------------------------------------------------------------------- /images/join-inner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/join-inner.png -------------------------------------------------------------------------------- /images/join-left.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/join-left.png -------------------------------------------------------------------------------- /images/join-outer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/join-outer.png -------------------------------------------------------------------------------- /images/join-right.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/join-right.png -------------------------------------------------------------------------------- /images/reference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/reference.png -------------------------------------------------------------------------------- /images/bkwd_step_net.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/bkwd_step_net.png -------------------------------------------------------------------------------- /images/command_mode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/command_mode.png -------------------------------------------------------------------------------- /images/fwd_step_net.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/fwd_step_net.png 
-------------------------------------------------------------------------------- /images/gmail_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/gmail_small.png -------------------------------------------------------------------------------- /images/iris_setosa.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/iris_setosa.jpg -------------------------------------------------------------------------------- /images/ml-wordle-436.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/ml-wordle-436.jpg -------------------------------------------------------------------------------- /images/mlp_details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/mlp_details.png -------------------------------------------------------------------------------- /images/overfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/overfitting.png -------------------------------------------------------------------------------- /images/petal_sepal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/petal_sepal.jpg -------------------------------------------------------------------------------- /images/scikit-learn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/scikit-learn.png -------------------------------------------------------------------------------- /images/single_layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/single_layer.png -------------------------------------------------------------------------------- /images/storage_index.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/storage_index.png -------------------------------------------------------------------------------- /images/twitter_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/twitter_small.png -------------------------------------------------------------------------------- /images/venn_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/venn_diagram.png -------------------------------------------------------------------------------- /images/df_inside_numpy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/df_inside_numpy.png -------------------------------------------------------------------------------- /images/iris_versicolor.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/iris_versicolor.jpg -------------------------------------------------------------------------------- /images/iris_virginica.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/iris_virginica.jpg -------------------------------------------------------------------------------- /images/linkedin_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/linkedin_small.png -------------------------------------------------------------------------------- /images/menubar_toolbar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/menubar_toolbar.png -------------------------------------------------------------------------------- /images/multi-layers-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/multi-layers-1.png -------------------------------------------------------------------------------- /images/multi-layers-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/multi-layers-2.png -------------------------------------------------------------------------------- /images/storage_simple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/storage_simple.png -------------------------------------------------------------------------------- /images/tensorflow_head.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/tensorflow_head.png -------------------------------------------------------------------------------- /images/Perceptron and MLP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/Perceptron and MLP.png -------------------------------------------------------------------------------- /images/cluster_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/cluster_comparison.png -------------------------------------------------------------------------------- /images/keras-logo-small.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/keras-logo-small.jpg -------------------------------------------------------------------------------- /images/logistic_function.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/logistic_function.png -------------------------------------------------------------------------------- /images/modeling_data_flow.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/modeling_data_flow.png -------------------------------------------------------------------------------- /images/ndarray_with_details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/ndarray_with_details.png -------------------------------------------------------------------------------- /images/keras-tensorflow-logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/keras-tensorflow-logo.jpg -------------------------------------------------------------------------------- /images/ml_supervised_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/ml_supervised_example.png -------------------------------------------------------------------------------- /images/ml_unsupervised_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/ml_unsupervised_example.png -------------------------------------------------------------------------------- /images/scikit-learn-cheatsheet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/images/scikit-learn-cheatsheet.png -------------------------------------------------------------------------------- /2_alchemist/data/blooth_sales_data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/2_alchemist/data/blooth_sales_data.xlsx -------------------------------------------------------------------------------- /2_alchemist/data/blooth_sales_data_clean.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/develer-data-science/master/2_alchemist/data/blooth_sales_data_clean.xlsx -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter~=1.0.0 2 | keras>=2.2.4 3 | matplotlib~=3.0.0 4 | bokeh>=0.12.16 5 | notebook~=5.7.0 6 | numpy~=1.15.2 7 | pandas~=0.23.4 8 | scikit-learn~=0.20.0 9 | scipy~=1.1.0 10 | tensorflow>=1.10.0 11 | Pillow==7.1.0 # needed by matplotlib to load PNG files 12 | pscript>=0.6.1 # custom JavaScript support for bokeh 13 | openpyxl>=2.5.8 # Excel writing support for pandas 14 | xlrd>=1.1.0 # Excel reading support for pandas 15 | -------------------------------------------------------------------------------- /conda-environment.yml: -------------------------------------------------------------------------------- 1 | name: develer-science 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - jupyter=1.0* 7 | - jupyterlab=0.34* 8 | - matplotlib=3.0* 9 | - notebook=5.7* 10 | - pandas=0.23* 11 | - scikit-learn=0.20* 12 | - certifi=2018* 13 | - bokeh=0.12* 14 | - pip: 15 | - keras>=2.2.4 16 | - tensorflow>=1.8 17 | - Pillow~=5.3 18 | - openpyxl>=2.5.8 19 | - pscript>=0.6.1 20 | - xlrd>=1.1.0 21 | 
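A quick sanity check that the pinned stack above resolved correctly (a minimal sketch; `sanity_check.py` is a hypothetical helper, not a file shipped with this repository):

    # sanity_check.py -- print installed versions to compare against the pins above
    import matplotlib
    import numpy
    import pandas
    import scipy
    import sklearn

    for mod in (numpy, scipy, pandas, sklearn, matplotlib):
        print(mod.__name__, mod.__version__)
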
-------------------------------------------------------------------------------- /2_alchemist/helpers.py: -------------------------------------------------------------------------------- 1 | # Helper to display pandas DataFrame output side by side 2 | 3 | from IPython.display import display_html 4 | 5 | def highlight(data): 6 | return ['background-color: yellow' for x in data] 7 | 8 | def display_side_by_side(subset, *args): 9 | html_str = '' 10 | for i, df in enumerate(args): 11 | if i: 12 | html_str += df.style.render() 13 | else: 14 | styled = df.style.apply(highlight, subset=subset)  # keep the styled object, or the highlight is lost 15 | html_str += styled.render() 16 | 17 | display_html(html_str.replace('table','table style="display:inline"'),raw=True) 18 | 19 | 20 | -------------------------------------------------------------------------------- /2_alchemist/data/sampledf.json: -------------------------------------------------------------------------------- 1 | {"0":{"0":79,"1":25,"2":37,"3":74,"4":79,"5":45,"6":12,"7":36,"8":55,"9":46},"1":{"0":19,"1":39,"2":64,"3":61,"4":60,"5":26,"6":29,"7":32,"8":53,"9":74},"2":{"0":21,"1":89,"2":31,"3":100,"4":83,"5":73,"6":18,"7":22,"8":89,"9":36},"3":{"0":99,"1":66,"2":69,"3":6,"4":85,"5":73,"6":98,"7":4,"8":13,"9":54},"4":{"0":35,"1":9,"2":61,"3":58,"4":16,"5":100,"6":62,"7":66,"8":84,"9":21},"5":{"0":59,"1":41,"2":97,"3":80,"4":5,"5":60,"6":68,"7":25,"8":87,"9":12},"6":{"0":44,"1":6,"2":5,"3":95,"4":16,"5":21,"6":92,"7":63,"8":74,"9":68},"7":{"0":25,"1":69,"2":11,"3":50,"4":69,"5":19,"6":29,"7":51,"8":3,"9":33},"8":{"0":75,"1":63,"2":76,"3":15,"4":5,"5":95,"6":74,"7":59,"8":2,"9":80},"9":{"0":58,"1":3,"2":57,"3":51,"4":20,"5":12,"6":96,"7":14,"8":64,"9":25}} -------------------------------------------------------------------------------- /3_mage/utils/plot_linear_svc_regularization.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.svm import SVC 4 | from sklearn.datasets import make_blobs 5 | from .plot_2d_separator import plot_2d_separator 6 | 7 | 8 | def plot_linear_svc_regularization(): 9 | X, y = make_blobs(centers=2, random_state=4, n_samples=30) 10 | # a carefully hand-designed dataset 11 | y[7] = 0 12 | y[27] = 0 13 | 14 | fig, axes = plt.subplots(1, 3, figsize=(12, 4)) 15 | 16 | for ax, C in zip(axes, [1e-2, 1, 1e2]): 17 | ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y]) 18 | 19 | svm = SVC(kernel='linear', C=C).fit(X, y) 20 | plot_2d_separator(svm, X, ax=ax, eps=.5) 21 | ax.set_title("C = %f" % C) 22 | -------------------------------------------------------------------------------- /3_mage/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .plot_2d_separator import plot_2d_separator 2 | from .plot_kneighbors_regularization import plot_kneighbors_regularization, \ 3 | plot_regression_datasets, make_dataset 4 | from .plot_linear_svc_regularization import plot_linear_svc_regularization 5 | from .plot_interactive_tree import plot_tree_interactive 6 | from .plot_interactive_forest import plot_forest_interactive 7 | from .plot_rbf_svm_parameters import plot_rbf_svm_parameters 8 | from .plot_rbf_svm_parameters import plot_svm_interactive 9 | 10 | __all__ = ['plot_2d_separator', 'plot_kneighbors_regularization', 11 | 'plot_linear_svc_regularization', 'plot_tree_interactive', 12 | 'plot_regression_datasets', 'make_dataset', 13 | "plot_forest_interactive", "plot_rbf_svm_parameters", 14 | "plot_svm_interactive"] 15 | 
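A minimal usage sketch for `display_side_by_side` from `helpers.py` above (run inside a Jupyter notebook; the frames and the highlighted column are made up for illustration, not taken from the course material):

    import pandas as pd
    from helpers import display_side_by_side

    left = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
    right = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})
    # Highlight column 'A' of the first frame, then render both frames inline.
    display_side_by_side(['A'], left, right)
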
-------------------------------------------------------------------------------- /2_alchemist/createFakeHDF.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pandas import HDFStore, DataFrame 3 | 4 | # Create (or open) an HDF5 file in append mode 5 | hdf = HDFStore('data/hdata.h5') 6 | 7 | df = DataFrame(np.random.rand(1000, 3), columns=('A', 'B', 'C')) 8 | # Put the dataset in the store 9 | hdf.put('d1', df, format='table', data_columns=True) 10 | print(hdf['d1'].shape) 11 | 12 | hdf.append('d1', DataFrame(np.random.rand(5, 3), 13 | columns=('A', 'B', 'C')), 14 | format='table', data_columns=True) 15 | 16 | df = DataFrame(np.random.rand(1000, 3), columns=('A', 'B', 'C')) 17 | # Put the dataset in the store 18 | hdf.put('d2', df, format='table', data_columns=True) 19 | print(hdf['d2'].shape) 20 | 21 | hdf.append('d2', DataFrame(np.random.rand(5, 3), 22 | columns=('A', 'B', 'C')), 23 | format='table', data_columns=True) 24 | hdf.close() # close the file 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Valerio Maggio 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /2_alchemist/plot_clustering.py: -------------------------------------------------------------------------------- 1 | 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_kmeans_clustering_results(c1, c2, c3, vqc1, vqc2, vqc3): 5 | 6 | # Setting plot limits 7 | x1, x2 = -10, 10 8 | y1, y2 = -10, 10 9 | 10 | fig = plt.figure() 11 | fig.subplots_adjust(hspace=0.1, wspace=0.1) 12 | 13 | ax1 = fig.add_subplot(121, aspect='equal') 14 | ax1.scatter(c1[:, 0], c1[:, 1], lw=0.5, color='#00CC00') 15 | ax1.scatter(c2[:, 0], c2[:, 1], lw=0.5, color='#028E9B') 16 | ax1.scatter(c3[:, 0], c3[:, 1], lw=0.5, color='#FF7800') 17 | ax1.xaxis.set_visible(False) 18 | ax1.yaxis.set_visible(False) 19 | ax1.set_xlim(x1, x2) 20 | ax1.set_ylim(y1, y2) 21 | ax1.text(-9, 8, 'Original') 22 | 23 | ax2 = fig.add_subplot(122, aspect='equal') 24 | ax2.scatter(vqc1[:, 0], vqc1[:, 1], lw=0.5, color='#00CC00') 25 | ax2.scatter(vqc2[:, 0], vqc2[:, 1], lw=0.5, color='#028E9B') 26 | ax2.scatter(vqc3[:, 0], vqc3[:, 1], lw=0.5, color='#FF7800') 27 | ax2.xaxis.set_visible(False) 28 | ax2.yaxis.set_visible(False) 29 | ax2.set_xlim(x1, x2) 30 | ax2.set_ylim(y1, y2) 31 | ax2.text(-9, 8, 'VQ identified') 32 | 33 | return fig -------------------------------------------------------------------------------- /3_mage/utils/plot_interactive_forest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.datasets import make_blobs 5 | from sklearn.ensemble import RandomForestClassifier 6 | 7 | 8 | X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=50) 9 | 10 | 11 | def plot_forest(max_depth=1): 12 | plt.figure() 13 | ax = plt.gca() 14 | h = 0.02 15 | 16 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 17 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 18 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 19 | 20 | if max_depth != 0: 21 | forest = RandomForestClassifier(n_estimators=20, max_depth=max_depth, 22 | random_state=1).fit(X, y) 23 | Z = forest.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] 24 | Z = Z.reshape(xx.shape) 25 | ax.contourf(xx, yy, Z, alpha=.4) 26 | ax.set_title("max_depth = %d" % max_depth) 27 | else: 28 | ax.set_title("data set") 29 | ax.scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60) 30 | ax.set_xlim(x_min, x_max) 31 | ax.set_ylim(y_min, y_max) 32 | ax.set_xticks(()) 33 | ax.set_yticks(()) 34 | 35 | 36 | def plot_forest_interactive(): 37 | from IPython.html.widgets import interactive, IntSlider 38 | slider = IntSlider(min=0, max=8, step=1, value=0) 39 | return interactive(plot_forest, max_depth=slider) 40 | -------------------------------------------------------------------------------- /3_mage/utils/plot_kneighbors_regularization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.neighbors import KNeighborsRegressor 5 | 6 | 7 | def make_dataset(n_samples=100): 8 | rnd = np.random.RandomState(42) 9 | x = np.linspace(-3, 3, n_samples) 10 | y_no_noise = np.sin(4 * x) + x 11 | y = y_no_noise + rnd.normal(size=len(x)) 12 | return x, y 13 | 14 | 15 | def plot_regression_datasets(): 16 | fig, axes = plt.subplots(1, 3, figsize=(15, 5)) 17 | for n_samples, ax in zip([10, 100, 1000], axes): 18 | x, y = make_dataset(n_samples) 19 | ax.plot(x, y, 'o',
alpha=.6) 20 | 21 | 22 | def plot_kneighbors_regularization(): 23 | rnd = np.random.RandomState(42) 24 | x = np.linspace(-3, 3, 100) 25 | y_no_noise = np.sin(4 * x) + x 26 | y = y_no_noise + rnd.normal(size=len(x)) 27 | X = x[:, np.newaxis] 28 | fig, axes = plt.subplots(1, 3, figsize=(15, 5)) 29 | 30 | x_test = np.linspace(-3, 3, 1000) 31 | 32 | for n_neighbors, ax in zip([2, 5, 20], axes.ravel()): 33 | kneighbor_regression = KNeighborsRegressor(n_neighbors=n_neighbors) 34 | kneighbor_regression.fit(X, y) 35 | ax.plot(x, y_no_noise, label="true function") 36 | ax.plot(x, y, "o", label="data") 37 | ax.plot(x_test, kneighbor_regression.predict(x_test[:, np.newaxis]), 38 | label="prediction") 39 | ax.legend() 40 | ax.set_title("n_neighbors = %d" % n_neighbors) 41 | 42 | if __name__ == "__main__": 43 | plot_kneighbors_regularization() 44 | plt.show() 45 | -------------------------------------------------------------------------------- /3_mage/utils/plot_2d_separator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None): 6 | if eps is None: 7 | eps = X.std() / 2. 8 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps 9 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps 10 | xx = np.linspace(x_min, x_max, 100) 11 | yy = np.linspace(y_min, y_max, 100) 12 | 13 | X1, X2 = np.meshgrid(xx, yy) 14 | X_grid = np.c_[X1.ravel(), X2.ravel()] 15 | try: 16 | decision_values = classifier.decision_function(X_grid) 17 | levels = [0] 18 | fill_levels = [decision_values.min(), 0, decision_values.max()] 19 | except AttributeError: 20 | # no decision_function 21 | decision_values = classifier.predict_proba(X_grid)[:, 1] 22 | levels = [.5] 23 | fill_levels = [0, .5, 1] 24 | 25 | if ax is None: 26 | ax = plt.gca() 27 | if fill: 28 | ax.contourf(X1, X2, decision_values.reshape(X1.shape), 29 | levels=fill_levels, colors=['blue', 'red']) 30 | else: 31 | ax.contour(X1, X2, decision_values.reshape(X1.shape), levels=levels, 32 | colors="black") 33 | ax.set_xlim(x_min, x_max) 34 | ax.set_ylim(y_min, y_max) 35 | ax.set_xticks(()) 36 | ax.set_yticks(()) 37 | 38 | 39 | if __name__ == '__main__': 40 | from sklearn.datasets import make_blobs 41 | from sklearn.linear_model import LogisticRegression 42 | X, y = make_blobs(centers=2, random_state=42) 43 | clf = LogisticRegression().fit(X, y) 44 | plot_2d_separator(clf, X, fill=True) 45 | plt.scatter(X[:, 0], X[:, 1], c=y) 46 | plt.show() 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /0_basic_chemicals/01. Data Science What is What if.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# AI, Machine Learning, Data Science..." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "![DS AI](../images/ds_ai.png)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## ... to make it funny" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "![DS AI full](../images/ds_ai_full.png)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "----" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "# What do you need to become a Data Scientist?" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "![Data Science Venn Conway](../images/venn_ds.png)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## ... though Venn diagrams can go really wrong ..."
57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "![venn](../images/venn_diagram.png)" 64 | ] 65 | } 66 | ], 67 | "metadata": { 68 | "kernelspec": { 69 | "display_name": "Python 3.6 (Develer Science)", 70 | "language": "python", 71 | "name": "develer-science" 72 | }, 73 | "language_info": { 74 | "codemirror_mode": { 75 | "name": "ipython", 76 | "version": 3 77 | }, 78 | "file_extension": ".py", 79 | "mimetype": "text/x-python", 80 | "name": "python", 81 | "nbconvert_exporter": "python", 82 | "pygments_lexer": "ipython3", 83 | "version": "3.6.6" 84 | } 85 | }, 86 | "nbformat": 4, 87 | "nbformat_minor": 2 88 | } 89 | -------------------------------------------------------------------------------- /3_mage/utils/plot_rbf_svm_parameters.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.svm import SVC 4 | from sklearn.datasets import make_blobs 5 | from .plot_2d_separator import plot_2d_separator 6 | 7 | 8 | def make_handcrafted_dataset(): 9 | # a carefully hand-designed dataset 10 | X, y = make_blobs(centers=2, random_state=4, n_samples=30) 11 | y[np.array([7, 27])] = 0 12 | mask = np.ones(len(X), dtype=bool)  # the builtin bool; np.bool is deprecated 13 | mask[np.array([0, 1, 5, 26])] = 0 14 | X, y = X[mask], y[mask] 15 | return X, y 16 | 17 | 18 | def plot_rbf_svm_parameters(): 19 | X, y = make_handcrafted_dataset() 20 | 21 | fig, axes = plt.subplots(1, 4, figsize=(16, 4))  # one panel per C value below 22 | for ax, C in zip(axes, [1e0, 5, 10, 100]): 23 | ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y]) 24 | 25 | svm = SVC(kernel='rbf', C=C).fit(X, y) 26 | plot_2d_separator(svm, X, ax=ax, eps=.5) 27 | ax.set_title("C = %f" % C) 28 | 29 | fig, axes = plt.subplots(1, 4, figsize=(15, 3)) 30 | for ax, gamma in zip(axes, [0.1, .5, 1, 10]): 31 | ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y]) 32 | svm = SVC(gamma=gamma, kernel='rbf', C=1).fit(X, y) 33 | plot_2d_separator(svm, X, ax=ax, eps=.5) 34 | ax.set_title("gamma = %f" % gamma) 35 | 36 | 37 | def plot_svm(log_C, log_gamma): 38 | X, y = make_handcrafted_dataset() 39 | C = 10. ** log_C 40 | gamma = 10.
** log_gamma 41 | svm = SVC(kernel='rbf', C=C, gamma=gamma).fit(X, y) 42 | ax = plt.gca() 43 | plot_2d_separator(svm, X, ax=ax, eps=.5) 44 | # plot data 45 | ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y]) 46 | # plot support vectors 47 | sv = svm.support_vectors_ 48 | ax.scatter(sv[:, 0], sv[:, 1], s=230, facecolors='none', zorder=10, linewidth=3) 49 | ax.set_title("C = %.4f gamma = %.4f" % (C, gamma)) 50 | 51 | 52 | def plot_svm_interactive(): 53 | from IPython.html.widgets import interactive, FloatSlider 54 | C_slider = FloatSlider(min=-3, max=3, step=.1, value=0, readout=False) 55 | gamma_slider = FloatSlider(min=-2, max=2, step=.1, value=0, readout=False) 56 | return interactive(plot_svm, log_C=C_slider, log_gamma=gamma_slider) 57 | -------------------------------------------------------------------------------- /3_mage/utils/plot_interactive_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.datasets import make_blobs 5 | from sklearn.tree import DecisionTreeClassifier 6 | 7 | from sklearn.externals.six import StringIO # doctest: +SKIP 8 | from sklearn.tree import export_graphviz 9 | from scipy import ndimage 10 | try: 11 | from scipy.misc import imread 12 | except ImportError: 13 | from scipy.ndimage import imread 14 | 15 | import re 16 | 17 | X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=50) 18 | 19 | 20 | def tree_image(tree, fout=None): 21 | try: 22 | import pydot 23 | except ImportError: 24 | # make a hacky white plot 25 | x = np.ones((10, 10)) 26 | x[0, 0] = 0 27 | return x 28 | dot_data = StringIO() 29 | export_graphviz(tree, out_file=dot_data) 30 | data = re.sub(r"gini = 0\.[0-9]+\\n", "", dot_data.getvalue()) 31 | data = re.sub(r"samples = [0-9]+\\n", "", data) 32 | data = re.sub(r"\\nsamples = [0-9]+", "", data) 33 | 34 | graph = pydot.graph_from_dot_data(data) 35 | if fout is None: 36 | fout = "tmp.png" 37 | graph.write_png(fout) 38 | return imread(fout) 39 | 40 | 41 | def plot_tree(max_depth=1): 42 | fig, ax = plt.subplots(1, 2, figsize=(15, 7)) 43 | h = 0.02 44 | 45 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 46 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 47 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 48 | 49 | if max_depth != 0: 50 | tree = DecisionTreeClassifier(max_depth=max_depth, random_state=1).fit(X, y) 51 | Z = tree.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] 52 | Z = Z.reshape(xx.shape) 53 | faces = tree.tree_.apply(np.c_[xx.ravel(), yy.ravel()].astype(np.float32)) 54 | faces = faces.reshape(xx.shape) 55 | border = ndimage.laplace(faces) != 0 56 | ax[0].contourf(xx, yy, Z, alpha=.4) 57 | ax[0].scatter(xx[border], yy[border], marker='.', s=1) 58 | ax[0].set_title("max_depth = %d" % max_depth) 59 | ax[1].imshow(tree_image(tree)) 60 | ax[1].axis("off") 61 | else: 62 | ax[0].set_title("data set") 63 | ax[1].set_visible(False) 64 | ax[0].scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60) 65 | ax[0].set_xlim(x_min, x_max) 66 | ax[0].set_ylim(y_min, y_max) 67 | ax[0].set_xticks(()) 68 | ax[0].set_yticks(()) 69 | 70 | 71 | def plot_tree_interactive(): 72 | from IPython.html.widgets import interactive, IntSlider 73 | slider = IntSlider(min=0, max=8, step=1, value=0) 74 | return interactive(plot_tree, max_depth=slider) 75 | -------------------------------------------------------------------------------- /0_basic_chemicals/03. 
Developer tools for Data Science.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Developer Tools and IDEs for Data Science\n", 8 | "\n", 9 | "**Note**: This is not intended to be a comprehensive guide to tools and IDEs for Data Science.\n", 10 | "The aim is simply to point out some interesting tools that are worth exploring." 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## Jupyter Notebooks \n", 18 | "\n", 19 | "\"Jupyter" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## nteract\n", 27 | "\n", 28 | "
\n", 29 | " \n", 30 | "
\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/html": [ 41 | "" 44 | ], 45 | "text/plain": [ 46 | "" 47 | ] 48 | }, 49 | "execution_count": 2, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "from IPython.display import Video\n", 56 | "\n", 57 | "Video(\"https://nteract.github.io/assets/images/video/nteract_app_demo@2x.mp4\")" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## Atom + HydroGen" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "\"Atom" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "## PyCharm IDE" 79 | ] 80 | }, 81 | { 82 | "attachments": {}, 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "\n", 87 | "\n", 88 | "\n", 89 | "### Scientific Mode\n", 90 | "\n", 91 | "[PyCharm Scientific Mode](https://www.jetbrains.com/help/pycharm/matplotlib-tutorial.html)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "## Others worthwile mentioning..." 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "* **Spyder**: [https://www.spyder-ide.org]()\n", 106 | "* **Rodeo**: [https://rodeo.yhat.com]()\n", 107 | "* **VS Code**: [https://code.visualstudio.com]()" 108 | ] 109 | } 110 | ], 111 | "metadata": { 112 | "kernelspec": { 113 | "display_name": "Python 3.6 (Develer Science)", 114 | "language": "python", 115 | "name": "develer-science" 116 | }, 117 | "language_info": { 118 | "codemirror_mode": { 119 | "name": "ipython", 120 | "version": 3 121 | }, 122 | "file_extension": ".py", 123 | "mimetype": "text/x-python", 124 | "name": "python", 125 | "nbconvert_exporter": "python", 126 | "pygments_lexer": "ipython3", 127 | "version": "3.6.6" 128 | } 129 | }, 130 | "nbformat": 4, 131 | "nbformat_minor": 2 132 | } 133 | -------------------------------------------------------------------------------- /2_alchemist/0. Introducing Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pandas\n", 8 | "\n", 9 | "Pandas is the Swiss-Multipurpose Knife for Data Analysis in Python. With Pandas dealing with data-analysis is easy and simple but there are some things you need to get your head around first as Data-Frames and Data-Series. \n", 10 | "\n", 11 | "The tutorial provides a compact introduction to Pandas for beginners for I/O, data visualisation, statistical data analysis and aggregation within Jupiter notebooks." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Content at a glance\n", 19 | "\n", 20 | "#### A Practical Start: Reading and Writing Data Across Multiple Formats \n", 21 | "\n", 22 | "* CSV\n", 23 | "* Excel\n", 24 | "* JSON\n", 25 | "* Clipboard\n", 26 | " \n", 27 | "* data\n", 28 | " * .info\n", 29 | " * .describe\n", 30 | "\n", 31 | "#### DataSeries & DataFrames / NumPy\n", 32 | "\n", 33 | "* Ode to NumPy\n", 34 | "* Data-Series\n", 35 | "* Data-Frames\n", 36 | "\n", 37 | "#### Data selection & Indexing\n", 38 | "\n", 39 | "* Data-Series: \n", 40 | " * Slicing\n", 41 | " * Access by label\n", 42 | " * Index\n", 43 | "* Data-Frames: \n", 44 | " * Slicing\n", 45 | " * Access by label\n", 46 | " * Peek into joining data\n", 47 | "* Returns a copy / inplace\n", 48 | "* Boolean indexing\n", 49 | "\n", 50 | "#### Operations\n", 51 | " \n", 52 | " * add/subtract\n", 53 | " * multiply\n", 54 | " * mention Index but don't go deep\n", 55 | "\n", 56 | "#### Data Visualisation\n", 57 | "\n", 58 | " * plot your data directly in your notebook\n", 59 | " \n", 60 | "#### Anti Patterns\n", 61 | "\n", 62 | " * a collection of (anti-)patterns when using Pandas" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "---" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "## Brief Introduction to Pandas\n", 77 | "\n", 78 | "Pandas builds on top of two main data structures: **Data Frame** and **Series**" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "### Data Frame _from the outside_\n", 86 | "\n", 87 | "" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "### Data Frame _from the inside_\n", 95 | "\n", 96 | "" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "### Data Frame vs Numpy Array" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "#### Numpy Array\n", 111 | "\n", 112 | "" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "#### Pandas Data Frame" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "" 127 | ] 128 | } 129 | ], 130 | "metadata": { 131 | "kernelspec": { 132 | "display_name": "Python 3.6 (Develer Science)", 133 | "language": "python", 134 | "name": "develer-science" 135 | }, 136 | "language_info": { 137 | "codemirror_mode": { 138 | "name": "ipython", 139 | "version": 3 140 | }, 141 | "file_extension": ".py", 142 | "mimetype": "text/x-python", 143 | "name": "python", 144 | "nbconvert_exporter": "python", 145 | "pygments_lexer": "ipython3", 146 | "version": "3.6.6" 147 | } 148 | }, 149 | "nbformat": 4, 150 | "nbformat_minor": 2 151 | } 152 | -------------------------------------------------------------------------------- /1_apprentice/2.3. Scipy Challenge.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# SciPy Challenge\n", 8 | "\n", 9 | "## SciPy at a Glance\n", 10 | "\n", 11 | "The SciPy framework builds on top of the low-level NumPy framework for multidimensional arrays, and provides a large number of higher-level scientific algorithms.
Some of the topics that SciPy covers are:\n", 12 | "\n", 13 | "* Special functions ([scipy.special](http://docs.scipy.org/doc/scipy/reference/special.html))\n", 14 | "* Integration ([scipy.integrate](http://docs.scipy.org/doc/scipy/reference/integrate.html))\n", 15 | "* Optimization ([scipy.optimize](http://docs.scipy.org/doc/scipy/reference/optimize.html))\n", 16 | "* Interpolation ([scipy.interpolate](http://docs.scipy.org/doc/scipy/reference/interpolate.html))\n", 17 | "* Fourier Transforms ([scipy.fftpack](http://docs.scipy.org/doc/scipy/reference/fftpack.html))\n", 18 | "* Signal Processing ([scipy.signal](http://docs.scipy.org/doc/scipy/reference/signal.html))\n", 19 | "* Linear Algebra ([scipy.linalg](http://docs.scipy.org/doc/scipy/reference/linalg.html))\n", 20 | "* Sparse Eigenvalue Problems ([scipy.sparse](http://docs.scipy.org/doc/scipy/reference/sparse.html))\n", 21 | "* Statistics ([scipy.stats](http://docs.scipy.org/doc/scipy/reference/stats.html))\n", 22 | "* Multi-dimensional image processing ([scipy.ndimage](http://docs.scipy.org/doc/scipy/reference/ndimage.html))\n", 23 | "* File IO ([scipy.io](http://docs.scipy.org/doc/scipy/reference/io.html))" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Sparse Matrices\n", 31 | "\n", 32 | "**Sparse Matrices** are very useful in some situations. \n", 33 | "\n", 34 | "For example, in some machine learning tasks, especially those associated\n", 35 | "with textual analysis, the data may be mostly zeros. \n", 36 | "\n", 37 | "Storing all these zeros is very inefficient. \n", 38 | "\n", 39 | "We can create and manipulate sparse matrices using the `scipy.sparse` module." 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "There exist different implementations of sparse matrices, each designed to be efficient in different scenarios:\n", 47 | "\n", 48 | "- CSR: Compressed Sparse Rows\n", 49 | "- CSC: Compressed Sparse Columns\n", 50 | "- DOK: Dictionary of Keys\n", 51 | "- LIL: List of Lists\n", 52 | "- BSR: Block Sparse Row" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Ex 1.1\n", 60 | "\n", 61 | "Create a big numpy **dense** matrix filled with random numbers in \n", 62 | "`[0, 1)`.\n", 63 | "Generate a random number within this range and substitute all the elements in the matrix **less than** this number with a zero.\n", 64 | "\n", 65 | "Save the resulting matrix as a `DOK` sparse matrix." 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## Ex 1.2\n", 82 | "\n", 83 | "Repeat the previous exercise, but this time use a `CSR` sparse matrix." 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": true 91 | }, 92 | "outputs": [], 93 | "source": [] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Ex 1.3\n", 100 | "\n", 101 | "Transform the previously generated sparse matrix back to a full dense `numpy.array`."
102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "collapsed": true 109 | }, 110 | "outputs": [], 111 | "source": [] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "## Ex 1.4 \n", 118 | "\n", 119 | "Generate two sparse matrices and sum them together, choosing the most appropriate internal representation (e.g. `LIL`, `CSR`, `DOK`...).\n", 120 | "\n", 121 | "#### Hint: Oh c'mon.. :)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": { 128 | "collapsed": true 129 | }, 130 | "outputs": [], 131 | "source": [] 132 | } 133 | ], 134 | "metadata": { 135 | "kernelspec": { 136 | "display_name": "Python 3.6 (Develer Science)", 137 | "language": "python", 138 | "name": "develer-science" 139 | }, 140 | "language_info": { 141 | "codemirror_mode": { 142 | "name": "ipython", 143 | "version": 3 144 | }, 145 | "file_extension": ".py", 146 | "mimetype": "text/x-python", 147 | "name": "python", 148 | "nbconvert_exporter": "python", 149 | "pygments_lexer": "ipython3", 150 | "version": "3.6.6" 151 | } 152 | }, 153 | "nbformat": 4, 154 | "nbformat_minor": 2 155 | } 156 | -------------------------------------------------------------------------------- /3_mage/5.1. Review of Scikit-learn API.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# A recap on Scikit-learn's estimator interface" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "slideshow": { 14 | "slide_type": "subslide" 15 | } 16 | }, 17 | "source": [ 18 | "Scikit-learn strives to have a uniform interface across all methods. Given a scikit-learn *estimator*\n", 19 | "object named `model`, the following methods are available (not all for each model):" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": { 25 | "slideshow": { 26 | "slide_type": "subslide" 27 | } 28 | }, 29 | "source": [ 30 | "- Available in **all Estimators**\n", 31 | " + `model.fit()` : fit training data. For supervised learning applications,\n", 32 | " this accepts two arguments: the data `X` and the labels `y` (e.g. `model.fit(X, y)`).\n", 33 | " For unsupervised learning applications, ``fit`` takes only a single argument,\n", 34 | " the data `X` (e.g. `model.fit(X)`)." 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "slideshow": { 41 | "slide_type": "subslide" 42 | } 43 | }, 44 | "source": [ 45 | "- Available in **supervised estimators**\n", 46 | " + `model.predict()` : given a trained model, predict the label of a new set of data.\n", 47 | " This method accepts one argument, the new data `X_new` (e.g.
`model.predict(X_new)`),\n", 48 | " and returns the learned label for each object in the array.\n", 49 | " + `model.predict_proba()` : For classification problems, some estimators also provide\n", 50 | " this method, which returns the probability that a new observation has each categorical label.\n", 51 | " In this case, the label with the highest probability is returned by `model.predict()`.\n", 52 | " " 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": { 58 | "slideshow": { 59 | "slide_type": "subslide" 60 | } 61 | }, 62 | "source": [ 63 | "- Available in **supervised estimators** (cont.)\n", 64 | " \n", 65 | " + `model.decision_function()` : For classification problems, some estimators provide an uncertainty estimate that is not a probability. For binary classification, a decision_function >= 0 means the positive class will be predicted, while < 0 means the negative class.\n", 66 | " + `model.score()` : for classification or regression problems, most (all?) estimators implement\n", 67 | " a score method. Scores are between 0 and 1, with a larger score indicating a better fit." 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": { 73 | "slideshow": { 74 | "slide_type": "subslide" 75 | } 76 | }, 77 | "source": [ 78 | "- Available in **supervised estimators** (cont.)\n", 79 | "\n", 80 | " + `model.transform()` : For feature selection algorithms, this will reduce the dataset to the selected features. For some classification and regression models such as some linear models and random forests, this method reduces the dataset to the most informative features. These classification and regression models can therefore also be used as feature selection methods." 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": { 86 | "slideshow": { 87 | "slide_type": "subslide" 88 | } 89 | }, 90 | "source": [ 91 | "- Available in **unsupervised estimators**\n", 92 | " + `model.transform()` : given an unsupervised model, transform new data into the new basis.\n", 93 | " This also accepts one argument `X_new`, and returns the new representation of the data based\n", 94 | " on the unsupervised model.\n", 95 | " + `model.fit_transform()` : some estimators implement this method,\n", 96 | " which more efficiently performs a fit and a transform on the same input data." 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": { 102 | "slideshow": { 103 | "slide_type": "subslide" 104 | } 105 | }, 106 | "source": [ 107 | "- Available in **unsupervised estimators** (cont.)\n", 108 | "\n", 109 | " + `model.predict()` : for clustering algorithms, the predict method will produce cluster labels for new data points. Not all clustering methods have this functionality.\n", 110 | " + `model.predict_proba()` : Gaussian mixture models (GMMs) provide the probability for each point to be generated by a given mixture component.\n", 111 | " + `model.score()` : Density models like KDE and GMMs provide the likelihood of the data under the model."
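, "\n", "A minimal sketch of this uniform interface for a supervised estimator (hypothetical arrays `X`, `y`, `X_new`, `y_new`; not part of the original notebook):\n", "\n", "```python\n", "from sklearn.linear_model import LogisticRegression\n", "\n", "model = LogisticRegression()\n", "model.fit(X, y)                      # supervised fit: data + labels\n", "y_pred = model.predict(X_new)        # one predicted label per new sample\n", "probas = model.predict_proba(X_new)  # per-class probabilities\n", "acc = model.score(X_new, y_new)      # mean accuracy of the predictions\n", "```"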
112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": { 117 | "slideshow": { 118 | "slide_type": "subslide" 119 | } 120 | }, 121 | "source": [ 122 | "Apart from ``fit``, the two most important functions are arguably ``predict``, which produces a target variable (a ``y``), and ``transform``, which produces a new representation of the data (an ``X``).\n", 123 | "The following table shows which of the two applies to each class of models:\n", 124 | "\n" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "\n", 132 | "\n", 133 | "\n", 134 | "\n", 135 | "\n", 136 | "\n", 137 | "\n", 138 | "
| ``model.predict`` | ``model.transform`` |
| ----------------- | ------------------------ |
| Classification    | Preprocessing            |
| Regression        | Dimensionality Reduction |
| Clustering        | Feature Extraction       |
|                   | Feature selection        |
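As a small, hedged illustration of the two columns above (using the bundled iris data only to show the call signatures; any classifier and any transformer would do):

```python
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

X, y = load_iris(return_X_y=True)

# The ``predict`` column: a classifier maps X to a target y
clf = LogisticRegression().fit(X, y)
print(clf.predict(X[:3]))           # predicted class labels

# The ``transform`` column: PCA maps X to a new representation of X
pca = PCA(n_components=2).fit(X)
print(pca.transform(X[:3]).shape)   # (3, 2)
```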
\n", 139 | "\n", 140 | "\n" 141 | ] 142 | } 143 | ], 144 | "metadata": { 145 | "kernelspec": { 146 | "display_name": "Python 3.6 (Develer Science)", 147 | "language": "python", 148 | "name": "develer-science" 149 | }, 150 | "language_info": { 151 | "codemirror_mode": { 152 | "name": "ipython", 153 | "version": 3 154 | }, 155 | "file_extension": ".py", 156 | "mimetype": "text/x-python", 157 | "name": "python", 158 | "nbconvert_exporter": "python", 159 | "pygments_lexer": "ipython3", 160 | "version": "3.6.6" 161 | } 162 | }, 163 | "nbformat": 4, 164 | "nbformat_minor": 2 165 | } 166 | -------------------------------------------------------------------------------- /3_mage/1.1. Introduction to Machine Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Introduction to Machine Learning in Python" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "subslide" 19 | } 20 | }, 21 | "source": [ 22 | "## What is Machine Learning?" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "slideshow": { 29 | "slide_type": "subslide" 30 | } 31 | }, 32 | "source": [ 33 | "### Machine Learning at Glance" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "slideshow": { 40 | "slide_type": "-" 41 | } 42 | }, 43 | "source": [ 44 | "" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "slideshow": { 51 | "slide_type": "subslide" 52 | } 53 | }, 54 | "source": [ 55 | "> Machine learning teaches machines how to carry out tasks by themselves. It is that simple.\n", 56 | "The complexity comes with the details.\n", 57 | "\n", 58 | "_W. Richert & L.P. Coelho, 2013\n", 59 | "Building Machine Learning Systems with Python_" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": { 65 | "slideshow": { 66 | "slide_type": "subslide" 67 | } 68 | }, 69 | "source": [ 70 | "Machine learning is the process to automatically **extract knowledge** from data, usually with the goal of making **predictions** on _new_, _unseen_ data. \n", 71 | "\n", 72 | "A classical example is a _spam filter_, for which the user keeps labeling incoming mails as either spam or not spam. \n", 73 | "\n", 74 | "A machine learning algorithm then \"learns\" what distinguishes spam from normal emails, and can predict for new emails whether they are spam or not." 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": { 80 | "slideshow": { 81 | "slide_type": "subslide" 82 | } 83 | }, 84 | "source": [ 85 | "Central to machine learning is the concept of **making decision automatically** from data, **without the user specifying explicit rules** how this decision should be made.\n", 86 | "\n", 87 | "For the case of emails, the user doesn't provide a list of words or characteristics that make an email spam. Instead, the user provides examples of spam and non-spam emails." 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "slideshow": { 94 | "slide_type": "subslide" 95 | } 96 | }, 97 | "source": [ 98 | "The second central concept is **generalization**. \n", 99 | "\n", 100 | "The goal of a machine learning algorithm is to predict on new, previously unseen data. We are not interested in marking an email as spam or not, that the human already labeled. 
Instead, we want to make the user's life easier by making an automatic decision for new incoming mail." 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": { 106 | "slideshow": { 107 | "slide_type": "subslide" 108 | } 109 | }, 110 | "source": [ 111 | "There are two kinds of machine learning we will talk about in these notebooks: \n", 112 | "\n", 113 | "* **Supervised learning;** \n", 114 | "* **Unsupervised learning.**" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": { 120 | "slideshow": { 121 | "slide_type": "slide" 122 | } 123 | }, 124 | "source": [ 125 | "### Supervised Learning\n", 126 | "\n", 127 | "In **Supervised Learning**, we have a dataset consisting of both input features and a desired output, such as in the spam / no-spam example.\n", 128 | "\n", 129 | "The task is to construct a model (or program) which is able to predict the desired output of an unseen object\n", 130 | "given the set of features." 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": { 136 | "slideshow": { 137 | "slide_type": "subslide" 138 | } 139 | }, 140 | "source": [ 141 | "" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": { 147 | "slideshow": { 148 | "slide_type": "subslide" 149 | } 150 | }, 151 | "source": [ 152 | "Supervised learning is further broken down into two categories, **classification** and **regression**.\n", 153 | "\n", 154 | "In classification, the label is discrete (a.k.a. _Categorical Data_, i.e. _Integer values_), such as \"spam\" or \"no spam\". \n", 155 | "\n", 156 | "In other words, it provides a clear-cut distinction between categories. \n", 157 | "\n", 158 | "In regression, the label is continuous, i.e. _Float output_." 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": { 164 | "slideshow": { 165 | "slide_type": "subslide" 166 | } 167 | }, 168 | "source": [ 169 | "### Other Examples\n", 170 | "\n", 171 | "Some more complicated examples are:\n", 172 | "\n", 173 | "- given a multicolor image of an object through a telescope, determine\n", 174 | " whether that object is a star, a quasar, or a galaxy.\n", 175 | "- given a photograph of a person, identify the person in the photo.\n", 176 | "- given a list of movies a person has watched and their personal rating\n", 177 | " of the movie, recommend a list of movies they would like.\n", 178 | "- given a person's age, education, and position, infer their salary." 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": { 184 | "slideshow": { 185 | "slide_type": "subslide" 186 | } 187 | }, 188 | "source": [ 189 | "What these tasks have in common is that there are one or more unknown\n", 190 | "quantities associated with the object which need to be determined from other\n", 191 | "observed quantities." 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": { 197 | "slideshow": { 198 | "slide_type": "subslide" 199 | } 200 | }, 201 | "source": [ 202 | "### For example\n", 203 | "\n", 204 | "* In astronomy, the task of determining whether an object is a star, a galaxy, or a quasar is a **classification problem**: the label is from three distinct categories. \n", 205 | "\n", 206 | "* On the other hand, we might wish to estimate the age of an object based on such observations: this would be a **regression problem**, because the label (age) is a continuous quantity."
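A minimal sketch of the contrast, with made-up one-dimensional data (decision trees are used here only because they expose both flavours through the same API):

```python
import numpy as np
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

X = np.arange(10).reshape(-1, 1)       # one feature, ten samples

y_class = (X.ravel() > 4).astype(int)  # discrete labels   -> classification
y_regr = 2.5 * X.ravel() + 1.0         # continuous target -> regression

clf = DecisionTreeClassifier().fit(X, y_class)
reg = DecisionTreeRegressor().fit(X, y_regr)

print(clf.predict([[7]]))  # array([1])    -- a class label
print(reg.predict([[7]]))  # array([18.5]) -- a float
```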
207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": { 212 | "slideshow": { 213 | "slide_type": "slide" 214 | } 215 | }, 216 | "source": [ 217 | "### Unsupervised Learning\n", 218 | "\n", 219 | "In **Unsupervised Learning** there is no desired output associated with the data.\n", 220 | "\n", 221 | "Instead, we are interested in extracting some form of knowledge or model from the given data.\n", 222 | "\n", 223 | "In a sense, you can think of unsupervised learning as a means of discovering labels from the data itself.\n", 224 | "\n", 225 | "Unsupervised learning comprises tasks such as *dimensionality reduction*, *clustering*, and\n", 226 | "*density estimation*. " 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": { 232 | "slideshow": { 233 | "slide_type": "subslide" 234 | } 235 | }, 236 | "source": [ 237 | "" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": { 243 | "slideshow": { 244 | "slide_type": "fragment" 245 | } 246 | }, 247 | "source": [ 248 | "Unsupervised learning is often harder to understand and to evaluate.\n", 249 | "\n", 250 | "Sometimes the two may even be combined: e.g. Unsupervised learning can be used to find useful\n", 251 | "features in heterogeneous data, and then these features can be used within a supervised\n", 252 | "framework." 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": { 258 | "slideshow": { 259 | "slide_type": "subslide" 260 | } 261 | }, 262 | "source": [ 263 | "### Other Examples\n", 264 | "\n", 265 | "Some more involved unsupervised learning problems are:\n", 266 | "\n", 267 | "- given detailed observations of distant galaxies, determine which features or combinations of\n", 268 | " features summarize best the information.\n", 269 | "- given a mixture of two sound sources (for example, a person talking over some music),\n", 270 | " separate the two (this is called the [blind source separation](http://en.wikipedia.org/wiki/Blind_signal_separation) problem).\n", 271 | "- given a large collection of news articles, find recurring topics inside these articles.\n", 272 | "- given a collection of images, cluster similar images together (for example to group them when visualizing a collection)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": { 278 | "slideshow": { 279 | "slide_type": "slide" 280 | } 281 | }, 282 | "source": [ 283 | "# Scikit-learn at a Glance" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "" 291 | ] 292 | } 293 | ], 294 | "metadata": { 295 | "celltoolbar": "Slideshow", 296 | "kernelspec": { 297 | "display_name": "Python 3.6 (Develer Science)", 298 | "language": "python", 299 | "name": "develer-science" 300 | }, 301 | "language_info": { 302 | "codemirror_mode": { 303 | "name": "ipython", 304 | "version": 3 305 | }, 306 | "file_extension": ".py", 307 | "mimetype": "text/x-python", 308 | "name": "python", 309 | "nbconvert_exporter": "python", 310 | "pygments_lexer": "ipython3", 311 | "version": "3.6.6" 312 | } 313 | }, 314 | "nbformat": 4, 315 | "nbformat_minor": 2 316 | } 317 | -------------------------------------------------------------------------------- /2_alchemist/5. 
Level Up.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# StarWars" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Introduction:\n", 15 | "\n", 16 | "This time you will create the data.\n", 17 | "```\n", 18 | "raw_data = {\"name\": ['Darth Vader', 'Princess Leia','Luke Skywalker','Yoda'],\n", 19 | " \"class\": ['Empire','Rebels','Rebels','Rebels'],\n", 20 | " \"gender\": ['M', 'F', 'M', 'J'],\n", 21 | " \"hp\": [45, 39, 44, 45],\n", 22 | " \"like\": ['yes', 'no','yes','no'] \n", 23 | " }\n", 24 | "```\n", 25 | "\n", 26 | "### Step 1. Import the necessary libraries" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [], 36 | "source": [] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "### Step 2. Create a data dictionary" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "### Step 3. Create a DataFrame object starting from the data dictionary, and print its head" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [], 68 | "source": [] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "### Step 4. Oops... it seems the DataFrame columns are in alphabetical order. Reorder the columns as name, class, gender, hp, like" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "collapsed": false 82 | }, 83 | "outputs": [], 84 | "source": [] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "### Step 5. Add another column called actor, and insert what you have in mind." 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "collapsed": false 98 | }, 99 | "outputs": [], 100 | "source": [] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "### Step 6. Present the type of each column" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "collapsed": false 114 | }, 115 | "outputs": [], 116 | "source": [] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "# Filtering and Sorting Data" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "This time we are going to pull data directly from the internet.\n", 130 | "Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.\n", 131 | "\n", 132 | "### Step 1. Import the necessary libraries" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [], 142 | "source": [] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv). 
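(Spoiler warning if you want to solve this yourself.) A hint for this step: `pandas.read_csv` can fetch a URL directly, and the file above is tab-separated, so one possible sketch, which also anticipates the variable name asked for in Step 3, is:

```python
import pandas as pd

url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv"
chipo = pd.read_csv(url, sep='\t')  # tab-separated values
chipo.head()
```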
" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "### Step 3. Assign it to a variable called chipo." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": { 162 | "collapsed": false 163 | }, 164 | "outputs": [], 165 | "source": [] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "### Step 4. How many products cost more than $10.00?" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [], 181 | "source": [] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "### Step 5. What is the price of each item? \n", 188 | "###### print a data frame with only two columns item_name and item_price" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": { 195 | "collapsed": false 196 | }, 197 | "outputs": [], 198 | "source": [] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "### Step 6. Sort by the name of the item" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "collapsed": false 212 | }, 213 | "outputs": [], 214 | "source": [] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "### Step 7. What was the quantity of the most expensive item ordered?" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "collapsed": false 228 | }, 229 | "outputs": [], 230 | "source": [] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "### Step 8. How many times were a Veggie Salad Bowl ordered?" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": { 243 | "collapsed": false 244 | }, 245 | "outputs": [], 246 | "source": [] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "### Step 9. How many times people orderd more than one Canned Soda?" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "collapsed": false 260 | }, 261 | "outputs": [], 262 | "source": [] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "# Exercises - GroupBy" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "### Introduction:\n", 276 | "\n", 277 | "GroupBy can be summarizes as Split-Apply-Combine.\n", 278 | "\n", 279 | "Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.\n", 280 | "\n", 281 | "Check out this Diagram:\n", 282 | "\n", 283 | "![Group-by Diagram](https://i.imgur.com/yjNkiwL.png?1) \n", 284 | "\n", 285 | "\n", 286 | "### Step 1. Import the necessary libraries" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": { 293 | "collapsed": false 294 | }, 295 | "outputs": [], 296 | "source": [] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/drinks.csv). " 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "### Step 3. 
Assign it to a variable called drinks." 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": { 316 | "collapsed": false 317 | }, 318 | "outputs": [], 319 | "source": [] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "### Step 4. Which continent drinks more beer on average?" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": { 332 | "collapsed": false 333 | }, 334 | "outputs": [], 335 | "source": [] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "### Step 5. For each continent print the statistics for wine consumption." 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": { 348 | "collapsed": false 349 | }, 350 | "outputs": [], 351 | "source": [] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "### Step 6. Print the mean alcohol consumption per continent for every column" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": { 364 | "collapsed": false 365 | }, 366 | "outputs": [], 367 | "source": [] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "### Step 7. Print the median alcohol consumption per continent for every column" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "collapsed": false 381 | }, 382 | "outputs": [], 383 | "source": [] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": {}, 388 | "source": [ 389 | "### Step 8. Print the mean, min and max values for spirit consumption.\n", 390 | "#### This time output a DataFrame" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "metadata": { 397 | "collapsed": false 398 | }, 399 | "outputs": [], 400 | "source": [] 401 | } 402 | ], 403 | "metadata": { 404 | "anaconda-cloud": {}, 405 | "kernelspec": { 406 | "display_name": "Python 3.6 (Develer Science)", 407 | "language": "python", 408 | "name": "develer-science" 409 | }, 410 | "language_info": { 411 | "codemirror_mode": { 412 | "name": "ipython", 413 | "version": 3 414 | }, 415 | "file_extension": ".py", 416 | "mimetype": "text/x-python", 417 | "name": "python", 418 | "nbconvert_exporter": "python", 419 | "pygments_lexer": "ipython3", 420 | "version": "3.6.6" 421 | } 422 | }, 423 | "nbformat": 4, 424 | "nbformat_minor": 2 425 | } 426 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Develer turns to Data Science 2 | 3 | ## Lecture notes for the "Data Science, the Pythonic way" @ [Develer](https://www.develer.com/) 4 | 5 | 6 | 7 | ### Author: Valerio Maggio 8 | 9 | #### _PostDoc Data Scientist @ FBK/MPBA_ 10 | 11 | #### Contacts: 12 |
- Twitter: @leriomaggio
- LinkedIn: valeriomaggio
- Email: valeriomaggio_at_gmail_dot_com
31 | 32 | # Materials: 33 | 34 | ![github](./images/github.jpg) 35 | 36 | ```shell 37 | git clone https://github.com/leriomaggio/develer-data-science.git 38 | ``` 39 | 40 | # Outline at a glance: 41 | (from _apprentice_ to _doctor strange_) 42 | 43 | - **Level I**) _Apprentice_: **Pythonic tools for Data Science** 44 | 45 | * _Dev Tools_ for Data Scientist and Jupyter notebooks 46 | * Numerical computation in Python: `numpy` 47 | * Working with data: `pandas` 48 | 49 | 50 | - **Level II**) _Alchemist_: **Data Visualisation** 51 | 52 | * Basic principles of data visualisation 53 | * Introduction to `matplotlib` 54 | * interactive data visualisation using `bokeh` 55 | 56 | 57 | - **Level III**) _Mage_: **Crash course on Machine Learning** 58 | 59 | * What is _Machine Learning_ 60 | * Introduction to `sklearn` 61 | * _Supervised_ and _**Un**supervised_ Machine learning 62 | * Robust Machine Learning: _selection bias and cross-validation_ 63 | 64 | 65 | - **Level IV**) _Arch-Mage_ : **Deep Learning & Pythonic perspectives** 66 | * What is _Deep Learning_ 67 | * Deep Learning frameworks 68 | * Introduction to Keras 69 | 70 | ### Description 71 | 72 | The course will be organised in **four** different parts, 73 | mostly covering the basics (plus some more advanced topics) 74 | related to Machine Learning and Data Science. 75 | 76 | We will start by introducing the basics of data science in Python, 77 | and the (development) tools and frameworks to be used. 78 | Then we will start working with real data (in different formats) 79 | to have a very general feeling of what does it _mean_ to be 80 | a _data scientist_. There will also be a section specifically 81 | focused on basic principles (and tools) of 82 | data visualisation. 83 | Finally, more advanced concepts will be introduced. 84 | In particular, a general introduction to Machine Learning models 85 | and settings (i.e. _supervised_ and _unsupervised_) will be 86 | provided, along with a glimpse of Deep learning models and 87 | frameworks. 88 | 89 | All these parts will be presented always considering the 90 | perspective of the developer and practitioner who wants to 91 | learn (and understand) _Data Science_ in a very practical way. 92 | For this aim, the materials will contain lots of 93 | exercises and challenges along the way to test your 94 | skills. 95 | 96 | --- 97 | 98 | # Technical Requirements 99 | 100 | This tutorial requires the following packages: 101 | 102 | - Python version 3.6 103 | - Python 3.4+ should be fine as well 104 | - likely Python 2.7 would be also fine, but *who knows*? :P 105 | - `numpy`: http://www.numpy.org/ 106 | - `scipy`: http://www.scipy.org/ 107 | - `matplotlib`: http://matplotlib.org/ 108 | - `pandas`: http://pandas.pydata.org 109 | - `scikit-learn` : http://scikit-learn.org 110 | - `jupyter` & `notebook`: http://jupyter.org 111 | 112 | Plus - for the last Deep learning section: 113 | - `keras`: http://keras.io 114 | - `tensorflow`: https://www.tensorflow.org 115 | - (optional) `torch`: http://pytorch.org 116 | 117 | The easiest way to get (most of) these is to use an all-in-one installer 118 | such as [Anaconda](https://www.anaconda.com/download/) from Continuum, 119 | which is available for multiple computer platforms, namely Linux, 120 | Windows, and OSX. 
121 | 122 | --- 123 | 124 | ### Python Version 125 | 126 | I'm currently running this tutorial with **Python 3** on **Anaconda** 127 | 128 | 129 | ```shell 130 | $ python --version 131 | Python 3.6.6 132 | ``` 133 | 134 | --- 135 | 136 | # Accessing the materials 137 | 138 | If you want to access the materials, you have several options: 139 | 140 | ## Jupyter Notebook 141 | 142 | Most of the materials in this course is provided as a collection of 143 | Jupyter Notebooks. 144 | 145 | In case you don't know **what is** a Jupyter notebook, here is a good 146 | reference for a quick introduction: 147 | [Jupyter Notebook Beginner Guide](https://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/what_is_jupyter.html). 148 | 149 | On the other hand, if you also want to know (_and you should_) **what is NOT** 150 | a Jupyter notebook - *spoiler alert:* **it is NOT an IDE** - 151 | here is a very nice reference: 152 | 153 | → [I Don't like Notebooks,](https://twitter.com/joelgrus/status/1033035196428378113) 154 | by _Joel Grus_ @ JupyterCon 2018. 155 | 156 | If you **already have all the environment setup** on your machine, 157 | all you need to do is to run the Jupyter notebook server: 158 | 159 | ```shell 160 | $ jupyter notebook 161 | ``` 162 | 163 | Alternatively, I suggest you to try the new **Jupyter Lab** environment: 164 | ```shell 165 | $ jupyter lab 166 | ``` 167 | 168 | **NOTE**: Before running Jupyter server, it is mandatory to enable 169 | the (Python) virtual environment. 170 | 171 | Please refer to the section [Setting the Environment](#setup) for 172 | detailed instructions on how to install all the required 173 | packages and libraries. 174 | 175 | 176 | ## Binder 177 | 178 | (Consider this option only if your WiFi is stable) 179 | 180 | If you don't want the hassle of setting up all the environment and 181 | libraries on your machine, or simply you want to avoid doing 182 | "_too much computation_" on your hardware setup, 183 | I strongly suggest you to use the **Binder** service. 184 | 185 | The primary goal of Binder is to turn a GitHub repo into a collection of 186 | interactive Jupyter notebooks 187 | 188 | To start using Binder, just click on the button below: 189 | [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/leriomaggio/develer-data-science/master) 190 | 191 | ## Google Colaboratory 192 | 193 | [Colaboratory](https://colab.research.google.com/) is a free Jupyter 194 | notebook environment that 195 | requires no setup and runs entirely in the Google cloud. 196 | Moreover, **GPU** and **TPU** runtime environments are available, 197 | and completely for free. 198 | (This last option will be worthwhile mentioning in the very 199 | last part of the course, when we will talk 200 | about Deep Learning networks). 201 | 202 | [Here](https://colab.research.google.com/notebooks/welcome.ipynb) 203 | is an overview of the main features offered by Colaboratory. 204 | 205 | To start using Colaboratory, just click on the button below: 206 | [![Colab](https://img.shields.io/badge/launch-colaboratory-yellow.svg)](https://colab.research.google.com/) 207 | 208 | --- 209 | 210 | 211 | # Setting the Environment 212 | 213 | In this repository, files to install the required packages are provided. 214 | The first step to setup the environment is to create a 215 | Python [Virtual Environment](https://docs.python.org/3.6/tutorial/venv.html). 
216 | 217 | Whether you are using [Anaconda](https://www.anaconda.com/download/) 218 | Python Distribution or the **Standard 219 | Python framework** (from [python.org](https://www.python.org/downloads/)), 220 | below are reported the instructions for the two cases, respectively. 221 | 222 | ## (a) Conda Environment 223 | 224 | This repository includes a `conda-environment.yml` file that is necessary 225 | to re-create the Conda virtual environment. 226 | 227 | To re-create the virtual environment: 228 | 229 | ```shell 230 | $ conda env create -f conda-environment.yml 231 | ``` 232 | 233 | Then, to **activate** the virtual environment: 234 | 235 | ```shell 236 | $ conda activate develer-science 237 | ``` 238 | 239 | ## (b) `pyenv` & `virtualenv` 240 | 241 | Alternatively, if you don't want to install (yet) another Python 242 | distribution on your machine, or you prefer not to use the full-stack Anaconda 243 | Python, I strongly suggest giving the new `pyenv` project a try. 244 | 245 | ### 1. Setup `pyenv` 246 | 247 | `pyenv` is a new package that lets you easily switch between multiple 248 | versions of Python. 249 | It is simple, unobtrusive, and follows the UNIX tradition of single-purpose 250 | tools that do one thing well. 251 | 252 | To **setup** `pyenv`, please follow the instructions reported on the 253 | [GitHub Repository](https://github.com/pyenv/pyenv) of the project, 254 | according to the specific platform and operating system. 255 | 256 | There exists a `pyenv` plugin named `pyenv-virtualenv` which comes with various 257 | features to help `pyenv` users manage virtual environments created by 258 | `virtualenv` or Anaconda. 259 | 260 | ### 2. Installing `pyenv-virtualenv` 261 | 262 | I would recommend installing `pyenv-virtualenv` as reported in 263 | the official 264 | [documentation](https://github.com/pyenv/pyenv-virtualenv/blob/master/README.md). 265 | 266 | ### 3. Setting up the virtual environment 267 | 268 | Once `pyenv` and `pyenv-virtualenv` have been correctly installed and 269 | configured, these are the instructions to 270 | set up the virtual environment for this tutorial: 271 | 272 | ```shell 273 | $ pyenv install 3.6.6 # downloads and enables Python 3.6 274 | $ pyenv virtualenv 3.6.6 develer-science # create virtual env using Py3.6 275 | $ pyenv activate develer-science # activate the environment 276 | $ pip install -r requirements.txt # install requirements 277 | 278 | ``` 279 | 280 | ### Installing Jupyter Kernel (Optional) 281 | 282 | All the notebooks in this tutorial have been saved using a Jupyter Kernel 283 | defined on the created virtual environment, named "Python 3.6 (Develer Science)". 284 | 285 | In case you get a _non-existent kernel_ warning when you open the 286 | notebooks on your machine, you need to create the corresponding 287 | `IPython` kernel: 288 | 289 | ```shell 290 | $ python -m ipykernel install --user --name develer-science --display-name "Python 3.6 (Develer Science)" 291 | ``` 292 | 293 | --- 294 | 295 | ## Test if everything is up&running 296 | 297 | ### 1. Check import 298 | 299 | 300 | ```Python 301 | >>> import numpy as np 302 | >>> import scipy as sp 303 | >>> import pandas as pd 304 | >>> import matplotlib.pyplot as plt 305 | >>> import sklearn 306 | >>> import keras 307 | Using TensorFlow backend. 308 | ``` 309 | 310 | ### 2. 
Check installed Versions 311 | 312 | 313 | ```Python 314 | >>> import numpy 315 | >>> print('numpy:', numpy.__version__) 316 | >>> import scipy 317 | >>> print('scipy:', scipy.__version__) 318 | >>> import matplotlib 319 | >>> print('matplotlib:', matplotlib.__version__) 320 | >>> import sklearn 321 | >>> print('scikit-learn:', sklearn.__version__) 322 | ``` 323 | ``` 324 | numpy: 1.15.2 325 | scipy: 1.1.0 326 | matplotlib: 3.0.0 327 | scikit-learn: 0.20.0 328 | ``` 329 | 330 |
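The version check above skips two of the packages imported in step 1; assuming `pandas` and `keras` were installed as per the requirements, a small addition would be:

```Python
>>> import pandas
>>> print('pandas:', pandas.__version__)
>>> import keras
>>> print('keras:', keras.__version__)
```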
331 |

If everything worked all the way down here, you're ready to start!

332 | -------------------------------------------------------------------------------- /4_archmage/intro_to_ann.csv: -------------------------------------------------------------------------------- 1 | Feature1,Feature2,Target 2.067788388,0.258133225,1 0.993994008,-0.609144512,1 -0.690315436,0.749920622,0 1.023582376,0.52900308,0 0.700747058,-0.496724018,1 0.955062941,0.371061016,0 -0.051023466,0.009786883,1 2.111668915,0.29146667,1 1.173019389,-0.101473076,0 -0.57794707,1.277303087,0 0.909597624,0.036308672,0 2.077734242,0.629147857,1 -0.962290964,0.827233623,0 0.947003591,-0.290186106,1 0.383209976,-0.309917567,1 -0.150287574,0.317294355,1 0.080361123,0.502094351,1 1.200125191,-0.527865179,1 -0.922386507,0.465574823,0 0.512394856,-0.033492128,1 0.660176708,0.354198518,0 0.919878779,-0.59377197,1 0.185474117,-0.214731663,1 -0.963168026,0.294003942,0 1.426699651,-0.099509079,1 0.976457825,-0.449373622,1 1.75471374,-0.261642816,1 1.168186512,-0.672740552,1 0.27774654,-0.607361346,1 0.672725148,0.666333709,0 0.002033057,0.376967935,1 0.314327033,-0.243225802,1 0.931967471,0.173111872,0 -0.409800131,0.597131669,0 -0.476941175,1.019581533,0 0.838882632,-0.071573271,0 1.033284866,-0.525249709,1 0.326116205,-0.577021121,1 0.85947284,1.340802524,0 1.551221655,-0.528586441,1 0.101201434,0.540632498,0 -0.09049836,0.749596127,1 2.028777695,0.151478964,1 0.580198862,1.033262412,0 0.192071709,-0.121336928,1 -1.462223951,0.07106442,0 1.906780301,0.506260992,1 0.695029196,-0.599016144,1 1.001086685,-0.70844148,1 1.110090768,-0.339190684,1 0.710397938,-0.460291185,1 2.042904712,-0.090700237,1 0.456990999,-0.563120595,1 0.087652349,0.095485193,1 1.332034469,-0.563468501,1 0.360617086,0.455561453,0 1.579667988,0.251685059,1 0.597723548,0.688549389,0 -0.805950346,1.010476996,0 0.271306147,0.832912579,0 1.899341837,0.044502462,1 1.527602184,-0.735397326,1 1.639754245,-0.27615701,1 -0.037638919,0.923068888,0 0.176032053,0.076067456,1 2.148428077,0.031181293,1 0.045108737,0.218221448,1 0.724493125,0.363354865,0 -0.664756351,0.643436386,0 -0.801725803,0.999480546,0 0.861801719,0.844010631,0 1.727336522,0.145706011,1 1.704197081,0.113234553,1 -0.288162751,0.584628604,1 0.327757328,0.535062913,1 0.180931746,1.191129451,0 1.090762095,-0.099058367,0 0.152277456,0.337631885,1 -0.873448576,0.616061304,0 1.000549409,0.326569244,0 -0.384538159,0.632005344,0 -0.26699789,0.155658861,1 -0.083148995,0.434163485,1 0.682670623,0.418051246,0 1.230411746,-0.810815948,1 0.164731882,1.030431426,0 -1.176887533,0.440652126,0 0.222984654,0.280817027,1 1.840889076,0.239281488,1 0.07449561,-0.282217468,1 -0.024120207,0.304116449,1 0.148422149,0.027105547,1 -1.031412466,0.380071744,0 -0.943866831,0.600079978,0 1.040815444,-0.442568799,1 1.497320582,-0.390044387,1 0.262257211,0.096298493,1 1.835052164,-0.103412045,1 0.212307547,0.256849647,1 -0.214058501,1.330767658,0 0.072106929,-0.279012748,1 0.510387152,0.89046706,0 0.859342773,-0.430765764,1 1.231053667,0.281765888,0 2.095878754,-0.124457184,1 -0.048682973,0.848149869,0 -0.069962169,0.732770866,0 0.921461668,0.537777084,0 -0.072085224,0.445207272,1 0.691680107,-0.670960694,1 1.996373123,-0.06278932,1 -0.029574955,-0.143949774,1 -1.180878055,0.287093293,0 -0.809065601,0.981521134,0 0.792498292,0.754776331,0 0.570914979,0.418499076,0 0.959679592,-0.614855395,1 0.683776074,0.718920684,0 0.268748478,-0.539059384,1 0.829946449,0.810127371,0 0.602300611,1.060983364,0 1.089894854,-0.258045447,0 1.215326407,-0.108394387,1 0.363525509,1.059532137,0 0.87411916,0.790742277,0 
-0.343226394,-0.150687513,1 -0.963432794,0.169237163,0 0.126423731,0.257156827,1 -1.266957539,0.122578452,0 1.809965093,0.521589108,1 -0.228845381,0.864591676,0 1.037129806,0.052176264,0 1.53870046,-0.054829392,1 1.843084808,0.119603176,1 0.182608949,0.360479486,1 -0.783997593,0.898602849,0 -0.927885151,0.846814488,0 -0.869158721,0.860426049,0 1.432638854,-0.684939943,1 0.585717127,0.931606566,0 1.756197562,0.520718884,1 1.591918487,0.03892334,1 0.910273683,1.120535967,0 0.043718854,-0.324789656,1 -0.20580902,0.603084516,0 0.79754503,-0.671824307,1 0.132794235,0.301552639,1 -0.088050173,0.956326244,0 -0.910303685,-0.140268609,0 1.273308231,-0.369064664,1 1.832106199,0.641188872,1 -0.073000565,1.023659911,0 0.097588494,-0.365188283,1 1.15143737,-0.378877952,1 1.006185066,0.586229182,0 -1.059323432,0.265637226,0 0.437426846,-0.184497995,1 0.24070156,0.771780065,0 1.826147629,-0.005370137,1 -0.698250117,0.518885048,0 0.033997867,1.002559399,0 -0.365756211,0.467188778,1 2.072602655,0.840845991,1 1.282207264,-0.3852855,1 0.294091067,-0.106090673,1 -0.684048777,0.498068262,0 0.569431434,-0.166176935,1 1.240374044,-0.338748665,1 0.296825382,-0.070172696,1 -1.002931362,0.137418862,0 0.772972777,0.39039837,0 0.199129842,1.169529783,0 0.272673306,1.177485462,0 1.300638002,-0.454605085,1 1.564634514,0.44804985,1 -0.040331609,1.026401238,0 1.629887767,0.244857792,1 1.143086991,-0.604189955,1 0.396320491,-0.39621675,1 -0.626659052,1.014021465,0 1.721056871,0.646281907,1 -0.604186387,0.703327277,0 0.057670035,1.272818815,0 0.454523688,-0.401695138,1 -0.670639988,0.612491482,0 0.328273511,1.160241489,0 0.510588023,1.016852601,0 0.789854793,0.132560638,0 0.500701227,0.528972528,0 0.961569333,0.454935146,0 1.80512166,0.544117663,1 0.030248855,1.07059265,0 1.73386192,0.379948472,1 -0.881264897,0.33091525,0 0.645125709,0.010547475,0 -0.567454227,1.065282884,0 0.275296535,-0.175275933,1 -0.633857944,0.464939873,0 -1.056542614,0.685542647,0 0.228879606,1.134429351,0 -0.053634126,0.322480786,1 -0.118192714,0.309408119,1 0.409426898,-0.434405407,1 1.229937385,-0.210990997,0 1.981178869,0.257237744,1 1.625934204,-0.617482966,1 -0.472291616,0.878888915,0 2.192799683,0.328102442,1 0.960354135,0.077361355,0 -0.790501055,1.31421158,0 2.237470292,0.06894621,1 1.414529152,-0.518533021,1 0.610237507,0.683805557,0 1.233649984,-0.556688892,1 0.586796523,-0.251126721,1 0.81754217,0.666592737,0 0.580221038,0.932275998,0 0.058137766,0.896289048,0 0.579846614,-0.567698327,1 1.929656615,0.156016477,1 0.992711133,-0.027168436,0 2.027815527,0.363122856,1 1.338652267,-0.823465756,1 1.598191518,-0.411064707,1 0.976142984,-0.190358151,0 1.61250291,-0.521032611,1 1.35546992,-0.490609874,1 0.499861695,1.058450639,0 1.015369625,0.207315117,0 1.018118245,0.456516401,0 1.096461863,-0.280727167,1 -1.194040968,0.416441978,0 0.568309164,0.647458129,0 0.960588057,0.208865115,0 1.449178024,-0.459902987,1 1.88199633,0.466290375,1 0.631996864,-0.309905342,1 0.654989783,-0.253093095,1 0.84516718,0.285090431,0 0.401198088,0.436221932,0 0.527232031,0.768876824,0 0.372940088,-0.3860018,1 2.066393868,-0.539216929,1 1.430582326,-0.388926174,1 0.189907716,-0.056403166,1 0.882697896,-0.734363957,1 -1.190098679,0.637042186,0 -0.894856639,0.556642259,0 0.70215789,-0.187835487,1 1.950487011,-0.345075672,1 0.022418039,1.004134807,0 0.849971596,-0.21483451,1 2.036410764,-0.025556022,1 0.275048593,-0.408398856,1 -0.108810254,0.159667885,1 0.938494136,0.64097432,0 -0.236808252,0.341747012,1 0.835324588,0.327895824,0 0.833630821,0.38695039,0 
0.602538337,-0.450144939,1 -1.316428825,0.754775796,0 1.844066074,0.193577285,1 1.064502653,-0.496454037,1 0.642658768,-0.020278558,1 1.016500344,-0.411243574,1 1.975868019,-0.0481966,1 0.993444732,-0.573538352,1 0.261224684,1.209148214,0 1.207647248,-0.681878027,1 1.029184593,0.662369558,0 -0.896113151,0.767599977,0 0.447930908,-0.476552598,1 1.193378751,-0.671368037,1 0.986163944,0.389985835,0 1.515764334,0.093743109,1 0.869459733,0.054546438,0 1.063058924,-0.003285827,1 0.094687292,-0.154667249,1 -0.215654363,1.093575329,0 -1.106925474,0.335575538,0 1.284327882,0.263986996,0 -1.337557833,-0.00412942,0 0.340293431,0.213608836,1 1.058099207,-0.020903818,0 -0.360133801,1.078035323,0 -0.644516105,0.488370431,0 -0.906095366,0.215727927,0 -0.652783156,1.173715873,0 1.022420252,0.192338596,0 -0.780675441,0.879523891,0 0.916308838,0.419944074,0 0.087696458,1.06618628,0 -0.495034594,0.721509893,0 0.604990594,-0.164890834,1 1.155923352,-0.549577539,1 0.071604266,1.697739637,0 -0.994669061,-0.007707493,0 1.480666468,-0.397497896,1 0.202685948,0.755654894,0 0.760454103,-0.180150242,0 0.029265726,0.06831265,1 -0.96321318,0.368704483,0 -1.070563993,0.553634924,0 0.046489389,0.953371901,0 0.486832393,-0.538941402,1 0.22261155,-0.15254824,1 2.179258079,0.541416792,1 -0.64634876,0.06402454,0 0.506319646,-0.86308181,1 2.114109853,-0.398015027,1 -0.03509274,0.108265231,1 -0.029109978,0.427875674,1 -0.140841702,0.661982894,0 0.266868529,-0.14631097,1 -0.998676417,0.116940821,0 0.082773572,0.961283817,0 0.941346049,-0.384150223,1 1.501332258,-0.275121139,1 0.29313361,0.37612275,1 0.539404176,0.643377656,0 0.189764846,0.00193534,1 1.556984609,-0.165952231,1 -0.24062961,0.071192873,1 0.919671265,0.383063909,0 -0.672636342,1.001715405,0 1.09642227,0.907604584,0 1.130676385,0.057702229,0 1.380338971,-0.376954849,1 0.090890973,0.118510932,1 1.858138806,0.239015185,1 0.179356967,-0.183583258,1 0.482296078,-0.651576504,1 0.761994902,0.633107619,0 2.473241953,7.86E-05,1 -0.245505241,0.336720392,0 -0.349850263,0.356718546,0 0.221741674,0.790938927,0 0.453347673,0.734276866,0 0.701948022,1.279234222,0 0.313783222,-0.62045118,1 -0.73809991,0.409950052,0 -0.650429008,0.41313687,0 0.81779875,-0.780491191,1 1.295782411,-0.615319879,1 -0.882350817,0.7161834,0 1.225622536,-0.199088478,1 1.701879096,0.109120994,1 -0.845272933,0.584193311,0 0.159783489,0.90737903,0 0.20528482,0.854610183,0 1.167600439,-0.072608591,1 -1.002638614,0.325598778,0 -0.771454661,0.289385589,0 1.657852572,-0.432481714,1 -1.140596214,0.368647611,0 0.05651217,-0.27020989,1 1.222078237,-0.107769216,1 0.166799331,-0.092400646,1 1.382139915,-0.54972713,1 -0.060441546,1.072743677,0 0.734895361,0.442755242,0 0.803896949,-0.493066015,1 1.953519644,-0.175476112,1 2.414351092,0.333630678,1 0.775111425,-0.812628459,1 -0.348238275,0.963577238,0 0.366964433,-0.310046727,1 -0.790850087,0.103205044,0 0.172920477,0.768141966,0 -0.028795956,0.686002968,0 0.796633375,0.482423152,0 0.849063798,0.672392408,0 0.361955413,0.991183959,0 0.119040405,-0.367741577,1 -0.492589106,0.625064565,0 -0.54233366,0.469779538,0 1.763243475,0.309898796,1 -1.153543233,-0.07626472,0 1.071246403,-0.310178222,0 0.906786599,0.546905393,0 2.090023124,0.414629671,1 2.083480948,0.249232463,1 1.186357975,-0.187071486,0 0.738270466,-0.307318366,1 1.891660982,-0.294302439,1 -0.77720761,0.938230073,0 0.405869823,0.838221554,0 -0.701489744,0.255676765,0 0.387436461,0.895997608,0 -0.426901469,0.796977399,0 1.252474088,0.419778642,0 -1.322590514,0.450941153,0 0.247652562,-0.108573295,1 
-0.850472738,0.854716727,0 -0.507554218,1.177745763,0 0.54236405,1.049010618,0 -0.125318758,0.557509852,1 1.370866032,-0.322147727,1 1.825410945,-0.379217248,1 1.562090065,0.210769453,1 0.434462285,0.950519364,0 0.131676796,0.935932583,0 1.229664001,0.725615425,0 0.312860044,0.989189572,0 1.90131256,0.15072306,1 0.193500004,1.050598819,0 0.03392665,0.407114286,1 1.651426238,-0.379391344,1 2.053219604,-0.039639925,1 0.243898894,-0.03794715,1 -1.113157386,0.744162073,0 -1.331684771,0.319815034,0 0.127646667,0.956519535,0 0.998491626,-0.49700126,1 0.662523004,0.055986284,0 -0.777262489,0.055346478,0 -0.731865243,0.305513652,0 -0.640417258,0.809781443,0 -0.76055756,0.491641159,0 1.151879338,-0.336581332,1 0.800560366,-0.542805923,1 0.884866314,0.583140372,0 0.925878139,0.740087536,0 0.874841419,0.430962575,0 -0.198712663,0.913528351,0 2.103667833,0.397014915,1 2.059901907,-0.231619581,1 0.473996884,-0.27048088,1 -0.57451854,0.584002067,0 -0.701419019,0.397124136,0 0.711614355,0.782885753,0 -0.496588268,1.302124802,0 0.066449298,0.784290559,0 0.538144645,-0.130769101,1 1.755682505,0.025561113,1 -0.849798595,0.931951784,0 0.333367885,-0.132341271,1 0.705602088,-0.229055609,1 -0.873354454,-0.49267613,0 -0.825327122,0.67304811,0 1.517665755,-0.523883147,1 0.120672904,-0.07052083,1 0.725710646,0.762302198,0 0.344673,0.208553298,1 0.263766538,0.049840726,1 -0.88392615,0.497464642,0 -1.156119607,0.170227029,0 1.248965737,0.369726326,0 0.952682479,-0.36811615,1 1.727901537,0.0372861,1 -0.767097925,0.709731654,0 1.165707539,0.769217807,0 0.049487533,0.720903576,0 -0.539098522,0.824388711,0 -0.982058703,0.288854587,0 1.528935488,-0.112802146,1 -0.672473504,0.686282233,0 0.895096305,0.665784299,0 -0.00760403,0.924168238,0 -0.240290739,1.021991518,0 -0.283467781,1.045292529,0 -1.026508891,0.459402069,0 1.588325794,-0.338967375,1 -0.348265559,1.213028067,0 0.77874607,-0.653521284,1 1.806149473,0.021203959,1 -0.129694598,0.725221931,0 -0.373456638,0.592067642,0 0.990046473,-0.464129679,1 1.772713799,-0.601498909,1 0.631039539,1.099329971,0 1.856709577,-0.190336881,1 0.018943698,0.21532497,1 0.172846724,0.604169382,0 0.557979808,0.757427667,0 0.080418897,0.119262443,1 -0.897248067,0.438920507,0 0.261996093,0.918375619,0 0.678953635,-0.435800274,1 1.210765492,-0.599573281,1 0.557261634,0.939216926,0 0.134711277,1.06457682,0 1.607493338,0.190921234,1 1.420577116,-0.148027678,1 1.744704015,-0.140667505,1 -0.215743207,1.230288577,0 0.750635451,-0.116643372,1 1.778321698,-0.54794534,1 0.957379288,0.795259767,0 -0.10590001,-0.197994179,1 1.851059402,-0.071373547,1 0.338727554,-0.211027921,1 1.960801446,-0.089686612,1 -0.17839517,0.727372153,1 -0.380811187,0.995282564,0 -1.215691519,0.453117521,0 -0.283985656,0.10845028,1 0.859746592,-0.59355665,1 -0.235539753,0.951239001,0 0.304995074,-0.57115376,1 -------------------------------------------------------------------------------- /4_archmage/ann.py: -------------------------------------------------------------------------------- 1 | """Python Implementation of Multi-Layer Perceptron""" 2 | 3 | import numpy as np 4 | from numpy.random import seed 5 | 6 | # ================= 7 | # Utility functions 8 | # ================= 9 | 10 | import random 11 | random.seed(123) 12 | 13 | # calculate a random number where: a <= rand < b 14 | def rand(a, b): 15 | return (b-a)*random.random() + a 16 | 17 | # Make a matrix 18 | def makeMatrix(I, J, fill=0.0): 19 | return np.zeros([I,J]) 20 | 21 | # our sigmoid function 22 | def sigmoid(x): 23 | #return math.tanh(x) 24 | return 
1/(1+np.exp(-x)) 25 | 26 | # derivative of our sigmoid function, in terms of the output (i.e. y) 27 | def dsigmoid(y): 28 | return y - y**2 29 | 30 | # ================== 31 | 32 | 33 | class MLP: 34 | """Multi Layer Perceptron 35 | 36 | Parameters 37 | ------------ 38 | ni : int 39 | Number of Input neurons 40 | n_h : int 41 | Number of Hidden neurons 42 | n_o : int 43 | Number of Output neurons 44 | 45 | Attributes 46 | ----------- 47 | ni : int 48 | Number of Input neurons 49 | n_h : int 50 | Number of Hidden neurons 51 | n_o : int 52 | Number of Output neurons 53 | 54 | ai : 1d-array (size: n_i) 55 | Activations for Input layer 56 | ah : 1d-array (size: n_h) 57 | Activations for Hidden layer 58 | ao : 1d-array (size: n_o) 59 | Activations for Output layer 60 | 61 | wi : 2d-array (shape n_i x n_h_) 62 | Weight matrix between Input and Hidden Layer. 63 | wo : 2d-array (shape n_h x n_o) 64 | Weight matrix between Hidden and Output Layer. 65 | 66 | """ 67 | 68 | def __init__(self, ni, nh, no): 69 | # number of input, hidden, and output nodes 70 | self.ni = ni + 1 # +1 for bias node 71 | self.nh = nh 72 | self.no = no 73 | 74 | # activations for nodes 75 | self.ai = [1.0]*self.ni 76 | self.ah = [1.0]*self.nh 77 | self.ao = [1.0]*self.no 78 | 79 | # create weights 80 | self.wi = makeMatrix(self.ni, self.nh) 81 | self.wo = makeMatrix(self.nh, self.no) 82 | 83 | # set them to random vaules 84 | for i in range(self.ni): 85 | for j in range(self.nh): 86 | self.wi[i][j] = rand(-0.2, 0.2) 87 | for j in range(self.nh): 88 | for k in range(self.no): 89 | self.wo[j][k] = rand(-2.0, 2.0) 90 | 91 | # last change in weights for momentum 92 | self.ci = makeMatrix(self.ni, self.nh) 93 | self.co = makeMatrix(self.nh, self.no) 94 | 95 | 96 | def backPropagate(self, targets, N, M): 97 | 98 | if len(targets) != self.no: 99 | print(targets) 100 | raise ValueError('wrong number of target values') 101 | 102 | # calculate error terms for output 103 | output_deltas = np.zeros(self.no) 104 | for k in range(self.no): 105 | error = targets[k]-self.ao[k] 106 | output_deltas[k] = dsigmoid(self.ao[k]) * error 107 | 108 | # calculate error terms for hidden 109 | hidden_deltas = np.zeros(self.nh) 110 | for j in range(self.nh): 111 | error = 0.0 112 | for k in range(self.no): 113 | error += output_deltas[k]*self.wo[j][k] 114 | hidden_deltas[j] = dsigmoid(self.ah[j]) * error 115 | 116 | # update output weights 117 | for j in range(self.nh): 118 | for k in range(self.no): 119 | change = output_deltas[k] * self.ah[j] 120 | self.wo[j][k] += N*change + M*self.co[j][k] 121 | self.co[j][k] = change 122 | 123 | # update input weights 124 | for i in range(self.ni): 125 | for j in range(self.nh): 126 | change = hidden_deltas[j]*self.ai[i] 127 | self.wi[i][j] += N*change + M*self.ci[i][j] 128 | self.ci[i][j] = change 129 | 130 | # calculate error 131 | error = 0.0 132 | for k in range(len(targets)): 133 | error += 0.5*(targets[k]-self.ao[k])**2 134 | return error 135 | 136 | 137 | def test(self, patterns): 138 | self.predict = np.empty([len(patterns), self.no]) 139 | for i, p in enumerate(patterns): 140 | self.predict[i] = self.activate(p) 141 | #self.predict[i] = self.activate(p[0]) 142 | 143 | def activate(self, inputs): 144 | 145 | if len(inputs) != self.ni-1: 146 | print(inputs) 147 | raise ValueError('wrong number of inputs') 148 | 149 | # input activations 150 | for i in range(self.ni-1): 151 | self.ai[i] = inputs[i] 152 | 153 | # hidden activations 154 | for j in range(self.nh): 155 | sum_h = 0.0 156 | for i in range(self.ni): 
157 | sum_h += self.ai[i] * self.wi[i][j] 158 | self.ah[j] = sigmoid(sum_h) 159 | 160 | # output activations 161 | for k in range(self.no): 162 | sum_o = 0.0 163 | for j in range(self.nh): 164 | sum_o += self.ah[j] * self.wo[j][k] 165 | self.ao[k] = sigmoid(sum_o) 166 | 167 | return self.ao[:] 168 | 169 | 170 | def train(self, patterns, iterations=1000, N=0.5, M=0.1): 171 | # N: learning rate 172 | # M: momentum factor 173 | patterns = list(patterns) 174 | for i in range(iterations): 175 | error = 0.0 176 | for p in patterns: 177 | inputs = p[0] 178 | targets = p[1] 179 | self.activate(inputs) 180 | error += self.backPropagate([targets], N, M) 181 | if i % 5 == 0: 182 | print('error in interation %d : %-.5f' % (i,error)) 183 | print('Final training error: %-.5f' % error) 184 | 185 | 186 | class Perceptron(object): 187 | """Perceptron classifier. 188 | 189 | Parameters 190 | ------------ 191 | eta : float 192 | Learning rate (between 0.0 and 1.0) 193 | n_iter : int 194 | Passes over the training dataset. 195 | 196 | Attributes 197 | ----------- 198 | w_ : 1d-array 199 | Weights after fitting. 200 | errors_ : list 201 | Number of misclassifications in every epoch. 202 | 203 | """ 204 | def __init__(self, eta=0.01, n_iter=10): 205 | self.eta = eta 206 | self.n_iter = n_iter 207 | 208 | def fit(self, X, y): 209 | """Fit training data. 210 | 211 | Parameters 212 | ---------- 213 | X : {array-like}, shape = [n_samples, n_features] 214 | Training vectors, where n_samples is the number of samples and 215 | n_features is the number of features. 216 | y : array-like, shape = [n_samples] 217 | Target values. 218 | 219 | Returns 220 | ------- 221 | self : object 222 | 223 | """ 224 | self.w_ = np.zeros(1 + X.shape[1]) 225 | self.errors_ = [] 226 | 227 | for _ in range(self.n_iter): 228 | errors = 0 229 | for xi, target in zip(X, y): 230 | update = self.eta * (target - self.predict(xi)) 231 | self.w_[1:] += update * xi 232 | self.w_[0] += update 233 | errors += int(update != 0.0) 234 | self.errors_.append(errors) 235 | return self 236 | 237 | def net_input(self, X): 238 | """Calculate net input""" 239 | return np.dot(X, self.w_[1:]) + self.w_[0] 240 | 241 | def predict(self, X): 242 | """Return class label after unit step""" 243 | return np.where(self.net_input(X) >= 0.0, 1, -1) 244 | 245 | 246 | class AdalineGD: 247 | """ADAptive LInear NEuron classifier. 248 | 249 | Parameters 250 | ------------ 251 | eta : float 252 | Learning rate (between 0.0 and 1.0) 253 | n_iter : int 254 | Passes over the training dataset. 255 | 256 | Attributes 257 | ----------- 258 | w_ : 1d-array 259 | Weights after fitting. 260 | errors_ : list 261 | Number of misclassifications in every epoch. 262 | 263 | """ 264 | def __init__(self, eta=0.01, n_iter=50): 265 | self.eta = eta 266 | self.n_iter = n_iter 267 | 268 | def fit(self, X, y): 269 | """ Fit training data. 270 | 271 | Parameters 272 | ---------- 273 | X : {array-like}, shape = [n_samples, n_features] 274 | Training vectors, where n_samples is the number of samples and 275 | n_features is the number of features. 276 | y : array-like, shape = [n_samples] 277 | Target values. 
278 | 279 | Returns 280 | ------- 281 | self : object 282 | """ 283 | self.w_ = np.zeros(1 + X.shape[1]) 284 | self.cost_ = [] 285 | 286 | for i in range(self.n_iter): 287 | output = self.net_input(X) 288 | errors = (y - output) 289 | self.w_[1:] += self.eta * X.T.dot(errors) 290 | self.w_[0] += self.eta * errors.sum() 291 | cost = (errors**2).sum() / 2.0 292 | self.cost_.append(cost) 293 | return self 294 | 295 | def net_input(self, X): 296 | """Calculate net input""" 297 | return np.dot(X, self.w_[1:]) + self.w_[0] 298 | 299 | def activation(self, X): 300 | """Compute linear activation""" 301 | return self.net_input(X) 302 | 303 | def predict(self, X): 304 | """Return class label after unit step""" 305 | return np.where(self.activation(X) >= 0.0, 1, -1) 306 | 307 | class AdalineSGD(object): 308 | """ADAptive LInear NEuron classifier. 309 | 310 | Parameters 311 | ------------ 312 | eta : float 313 | Learning rate (between 0.0 and 1.0) 314 | n_iter : int 315 | Passes over the training dataset. 316 | 317 | Attributes 318 | ----------- 319 | w_ : 1d-array 320 | Weights after fitting. 321 | errors_ : list 322 | Number of misclassifications in every epoch. 323 | shuffle : bool (default: True) 324 | Shuffles training data every epoch if True to prevent cycles. 325 | random_state : int (default: None) 326 | Set random state for shuffling and initializing the weights. 327 | 328 | """ 329 | def __init__(self, eta=0.01, n_iter=10, shuffle=True, random_state=None): 330 | self.eta = eta 331 | self.n_iter = n_iter 332 | self.w_initialized = False 333 | self.shuffle = shuffle 334 | if random_state: 335 | seed(random_state) 336 | 337 | def fit(self, X, y): 338 | """ Fit training data. 339 | 340 | Parameters 341 | ---------- 342 | X : {array-like}, shape = [n_samples, n_features] 343 | Training vectors, where n_samples is the number of samples and 344 | n_features is the number of features. 345 | y : array-like, shape = [n_samples] 346 | Target values. 
347 | 348 | Returns 349 | ------- 350 | self : object 351 | 352 | """ 353 | self._initialize_weights(X.shape[1]) 354 | self.cost_ = [] 355 | for i in range(self.n_iter): 356 | if self.shuffle: 357 | X, y = self._shuffle(X, y) 358 | cost = [] 359 | for xi, target in zip(X, y): 360 | cost.append(self._update_weights(xi, target)) 361 | avg_cost = sum(cost)/len(y) 362 | self.cost_.append(avg_cost) 363 | return self 364 | 365 | def partial_fit(self, X, y): 366 | """Fit training data without reinitializing the weights""" 367 | if not self.w_initialized: 368 | self._initialize_weights(X.shape[1]) 369 | if y.ravel().shape[0] > 1: 370 | for xi, target in zip(X, y): 371 | self._update_weights(xi, target) 372 | else: 373 | self._update_weights(X, y) 374 | return self 375 | 376 | def _shuffle(self, X, y): 377 | """Shuffle training data""" 378 | r = np.random.permutation(len(y)) 379 | return X[r], y[r] 380 | 381 | def _initialize_weights(self, m): 382 | """Initialize weights to zeros""" 383 | self.w_ = np.zeros(1 + m) 384 | self.w_initialized = True 385 | 386 | def _update_weights(self, xi, target): 387 | """Apply Adaline learning rule to update the weights""" 388 | output = self.net_input(xi) 389 | error = (target - output) 390 | self.w_[1:] += self.eta * xi.dot(error) 391 | self.w_[0] += self.eta * error 392 | cost = 0.5 * error**2 393 | return cost 394 | 395 | def net_input(self, X): 396 | """Calculate net input""" 397 | return np.dot(X, self.w_[1:]) + self.w_[0] 398 | 399 | def activation(self, X): 400 | """Compute linear activation""" 401 | return self.net_input(X) 402 | 403 | def predict(self, X): 404 | """Return class label after unit step""" 405 | return np.where(self.activation(X) >= 0.0, 1, -1) 406 | -------------------------------------------------------------------------------- /1_apprentice/2.2. Scipy Sparse_Matrices.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Scipy Sparse Matrices" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "subslide" 19 | } 20 | }, 21 | "source": [ 22 | "**Sparse Matrices** are very nice in some situations. \n", 23 | "\n", 24 | "For example, in some machine learning tasks, especially those associated\n", 25 | "with textual analysis, the data may be mostly zeros. \n", 26 | "\n", 27 | "Storing all these zeros is very inefficient. 
\n", 28 | "\n", 29 | "We can create and manipulate sparse matrices as follows:" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 1, 35 | "metadata": { 36 | "collapsed": true, 37 | "slideshow": { 38 | "slide_type": "skip" 39 | } 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "import numpy as np\n", 44 | "\n", 45 | "np.random.seed(42)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "metadata": { 52 | "slideshow": { 53 | "slide_type": "subslide" 54 | } 55 | }, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "[[0.37454012 0.95071431 0.73199394 0.59865848 0.15601864]\n", 62 | " [0.15599452 0.05808361 0.86617615 0.60111501 0.70807258]\n", 63 | " [0.02058449 0.96990985 0.83244264 0.21233911 0.18182497]\n", 64 | " [0.18340451 0.30424224 0.52475643 0.43194502 0.29122914]\n", 65 | " [0.61185289 0.13949386 0.29214465 0.36636184 0.45606998]\n", 66 | " [0.78517596 0.19967378 0.51423444 0.59241457 0.04645041]\n", 67 | " [0.60754485 0.17052412 0.06505159 0.94888554 0.96563203]\n", 68 | " [0.80839735 0.30461377 0.09767211 0.68423303 0.44015249]\n", 69 | " [0.12203823 0.49517691 0.03438852 0.9093204 0.25877998]\n", 70 | " [0.66252228 0.31171108 0.52006802 0.54671028 0.18485446]]\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "# Create a random array with a lot of zeros\n", 76 | "X = np.random.random((10, 5))\n", 77 | "print(X)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 3, 83 | "metadata": { 84 | "slideshow": { 85 | "slide_type": "subslide" 86 | } 87 | }, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "[[0. 0.95071431 0.73199394 0. 0. ]\n", 94 | " [0. 0. 0.86617615 0. 0.70807258]\n", 95 | " [0. 0.96990985 0.83244264 0. 0. ]\n", 96 | " [0. 0. 0. 0. 0. ]\n", 97 | " [0. 0. 0. 0. 0. ]\n", 98 | " [0.78517596 0. 0. 0. 0. ]\n", 99 | " [0. 0. 0. 0.94888554 0.96563203]\n", 100 | " [0.80839735 0. 0. 0. 0. ]\n", 101 | " [0. 0. 0. 0.9093204 0. ]\n", 102 | " [0. 0. 0. 0. 0. ]]\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "X[X < 0.7] = 0\n", 108 | "print(X)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 4, 114 | "metadata": { 115 | "slideshow": { 116 | "slide_type": "subslide" 117 | } 118 | }, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | " (0, 1)\t0.9507143064099162\n", 125 | " (0, 2)\t0.7319939418114051\n", 126 | " (1, 2)\t0.8661761457749352\n", 127 | " (1, 4)\t0.7080725777960455\n", 128 | " (2, 1)\t0.9699098521619943\n", 129 | " (2, 2)\t0.8324426408004217\n", 130 | " (5, 0)\t0.7851759613930136\n", 131 | " (6, 3)\t0.9488855372533332\n", 132 | " (6, 4)\t0.9656320330745594\n", 133 | " (7, 0)\t0.8083973481164611\n", 134 | " (8, 3)\t0.9093204020787821\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "from scipy import sparse\n", 140 | "\n", 141 | "# turn X into a csr (Compressed-Sparse-Row) matrix\n", 142 | "X_csr = sparse.csr_matrix(X)\n", 143 | "print(X_csr)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 5, 149 | "metadata": { 150 | "slideshow": { 151 | "slide_type": "subslide" 152 | } 153 | }, 154 | "outputs": [ 155 | { 156 | "name": "stdout", 157 | "output_type": "stream", 158 | "text": [ 159 | "[[0. 0.95071431 0.73199394 0. 0. ]\n", 160 | " [0. 0. 0.86617615 0. 0.70807258]\n", 161 | " [0. 0.96990985 0.83244264 0. 0. ]\n", 162 | " [0. 0. 0. 0. 0. ]\n", 163 | " [0. 0. 0. 0. 0. ]\n", 164 | " [0.78517596 0. 
0. 0. 0. ]\n", 165 | " [0. 0. 0. 0.94888554 0.96563203]\n", 166 | " [0.80839735 0. 0. 0. 0. ]\n", 167 | " [0. 0. 0. 0.9093204 0. ]\n", 168 | " [0. 0. 0. 0. 0. ]]\n" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "# convert the sparse matrix to a dense array\n", 174 | "print(X_csr.toarray())" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 6, 180 | "metadata": { 181 | "slideshow": { 182 | "slide_type": "subslide" 183 | } 184 | }, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "text/plain": [ 189 | "True" 190 | ] 191 | }, 192 | "execution_count": 6, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "# Sparse matrices support linear algebra:\n", 199 | "y = np.random.random(X_csr.shape[1])\n", 200 | "z1 = X_csr.dot(y)\n", 201 | "z2 = X.dot(y)\n", 202 | "np.allclose(z1, z2)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": { 208 | "slideshow": { 209 | "slide_type": "subslide" 210 | } 211 | }, 212 | "source": [ 213 | "* The CSR representation can be very efficient for computations, but it is not as good for adding elements. \n", 214 | "\n", 215 | "* For that, the **LIL** (List of Lists) representation is better:" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 7, 221 | "metadata": { 222 | "slideshow": { 223 | "slide_type": "fragment" 224 | } 225 | }, 226 | "outputs": [ 227 | { 228 | "name": "stdout", 229 | "output_type": "stream", 230 | "text": [ 231 | " (0, 2)\t2.0\n", 232 | " (0, 3)\t3.0\n", 233 | " (0, 4)\t4.0\n", 234 | " (1, 0)\t1.0\n", 235 | " (1, 1)\t2.0\n", 236 | " (1, 4)\t5.0\n", 237 | " (2, 0)\t2.0\n", 238 | " (2, 2)\t4.0\n", 239 | " (2, 4)\t6.0\n", 240 | " (3, 0)\t3.0\n", 241 | " (3, 3)\t6.0\n", 242 | " (3, 4)\t7.0\n", 243 | " (4, 0)\t4.0\n", 244 | " (4, 4)\t8.0\n", 245 | "[[0. 0. 2. 3. 4.]\n", 246 | " [1. 2. 0. 0. 5.]\n", 247 | " [2. 0. 4. 0. 6.]\n", 248 | " [3. 0. 0. 6. 7.]\n", 249 | " [4. 0. 0. 0. 
8.]]\n" 250 | ] 251 | } 252 | ], 253 | "source": [ 254 | "# Create an empty LIL matrix and add some items\n", 255 | "X_lil = sparse.lil_matrix((5, 5))\n", 256 | "\n", 257 | "for i, j in np.random.randint(0, 5, (15, 2)):\n", 258 | " X_lil[i, j] = i + j\n", 259 | "\n", 260 | "print(X_lil)\n", 261 | "print(X_lil.toarray())" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": { 267 | "slideshow": { 268 | "slide_type": "subslide" 269 | } 270 | }, 271 | "source": [ 272 | "* Often, once an LIL matrix is created, it is useful to convert it to a CSR format \n", 273 | " * **Note**: many scikit-learn algorithms require CSR or CSC format" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 8, 279 | "metadata": { 280 | "slideshow": { 281 | "slide_type": "fragment" 282 | } 283 | }, 284 | "outputs": [ 285 | { 286 | "name": "stdout", 287 | "output_type": "stream", 288 | "text": [ 289 | " (0, 2)\t2.0\n", 290 | " (0, 3)\t3.0\n", 291 | " (0, 4)\t4.0\n", 292 | " (1, 0)\t1.0\n", 293 | " (1, 1)\t2.0\n", 294 | " (1, 4)\t5.0\n", 295 | " (2, 0)\t2.0\n", 296 | " (2, 2)\t4.0\n", 297 | " (2, 4)\t6.0\n", 298 | " (3, 0)\t3.0\n", 299 | " (3, 3)\t6.0\n", 300 | " (3, 4)\t7.0\n", 301 | " (4, 0)\t4.0\n", 302 | " (4, 4)\t8.0\n" 303 | ] 304 | } 305 | ], 306 | "source": [ 307 | "X_csr = X_lil.tocsr()\n", 308 | "print(X_csr)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": { 314 | "slideshow": { 315 | "slide_type": "subslide" 316 | } 317 | }, 318 | "source": [ 319 | "There are several other sparse formats that can be useful for various problems:\n", 320 | "\n", 321 | "- `CSC` (compressed sparse column)\n", 322 | "- `BSR` (block sparse row)\n", 323 | "- `COO` (coordinate)\n", 324 | "- `DIA` (diagonal)\n", 325 | "- `DOK` (dictionary of keys)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": { 331 | "slideshow": { 332 | "slide_type": "slide" 333 | } 334 | }, 335 | "source": [ 336 | "## CSC - Compressed Sparse Column\n", 337 | "\n", 338 | "**Advantages of the CSC format**\n", 339 | "\n", 340 | " * efficient arithmetic operations CSC + CSC, CSC * CSC, etc.\n", 341 | " * efficient column slicing\n", 342 | " * fast matrix vector products (CSR, BSR may be faster)\n", 343 | "\n", 344 | "**Disadvantages of the CSC format**\n", 345 | "\n", 346 | " * slow row slicing operations (consider CSR)\n", 347 | " * changes to the sparsity structure are expensive (consider LIL or DOK)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": { 353 | "slideshow": { 354 | "slide_type": "subslide" 355 | } 356 | }, 357 | "source": [ 358 | "### BSR - Block Sparse Row\n", 359 | "\n", 360 | "The Block Compressed Row (`BSR`) format is very similar to the Compressed Sparse Row (`CSR`) format. \n", 361 | "\n", 362 | "BSR is appropriate for sparse matrices with *dense sub matrices* like the example below. \n", 363 | "\n", 364 | "Block matrices often arise in *vector-valued* finite element discretizations. \n", 365 | "\n", 366 | "In such cases, BSR is **considerably more efficient** than CSR and CSC for many sparse arithmetic operations." 
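The trade-offs above are easy to see in code. This is a minimal sketch (not part of the original notebook) contrasting column slicing in CSC with row slicing in CSR, reusing the same thresholded random matrix as before:

```python
import numpy as np
from scipy import sparse

np.random.seed(42)
X = np.random.random((10, 5))
X[X < 0.7] = 0  # sparsify the array

X_csr = sparse.csr_matrix(X)
X_csc = X_csr.tocsc()  # conversion between the two compressed formats is cheap

# CSC stores data column by column, so column slicing is efficient...
col = X_csc[:, 2]
# ...while CSR stores data row by row and favours row slicing
row = X_csr[2, :]

print(col.toarray().ravel())
print(row.toarray().ravel())
```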
367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 9, 372 | "metadata": { 373 | "slideshow": { 374 | "slide_type": "subslide" 375 | } 376 | }, 377 | "outputs": [ 378 | { 379 | "data": { 380 | "text/plain": [ 381 | "array([[1, 1, 0, 0, 2, 2],\n", 382 | " [1, 1, 0, 0, 2, 2],\n", 383 | " [0, 0, 0, 0, 3, 3],\n", 384 | " [0, 0, 0, 0, 3, 3],\n", 385 | " [4, 4, 5, 5, 6, 6],\n", 386 | " [4, 4, 5, 5, 6, 6]])" 387 | ] 388 | }, 389 | "execution_count": 9, 390 | "metadata": {}, 391 | "output_type": "execute_result" 392 | } 393 | ], 394 | "source": [ 395 | "from scipy.sparse import bsr_matrix\n", 396 | "\n", 397 | "indptr = np.array([0, 2, 3, 6])\n", 398 | "indices = np.array([0, 2, 2, 0, 1, 2])\n", 399 | "data = np.array([1, 2, 3, 4, 5, 6]).repeat(4).reshape(6, 2, 2)\n", 400 | "bsr_matrix((data, indices, indptr), shape=(6, 6)).toarray()" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": { 406 | "slideshow": { 407 | "slide_type": "slide" 408 | } 409 | }, 410 | "source": [ 411 | "## COO - Coordinate Sparse Matrix\n", 412 | "\n", 413 | "**Advantages of the COO format**\n", 414 | "\n", 415 | " * facilitates fast conversion among sparse formats\n", 416 | " * permits duplicate entries (see example)\n", 417 | " * very fast conversion to and from CSR/CSC formats\n", 418 | "\n", 419 | "**Disadvantages of the COO format**\n", 420 | "\n", 421 | " * does not directly support arithmetic operations and slicing\n", 422 | " \n", 423 | "**Intended Usage**\n", 424 | "\n", 425 | " * COO is a fast format for constructing sparse matrices\n", 426 | " * Once a matrix has been constructed, convert to CSR or CSC format for fast arithmetic and matrix vector\n", 427 | " operations\n", 428 | " * By default when converting to CSR or CSC format, duplicate (i,j) entries will be summed together. \n", 429 | " This facilitates efficient construction of finite element matrices and the like.\n" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "metadata": { 435 | "slideshow": { 436 | "slide_type": "slide" 437 | } 438 | }, 439 | "source": [ 440 | "## DOK - Dictionary of Keys\n", 441 | "\n", 442 | "A DOK matrix is based on a dictionary mapping `(row, column)` index pairs to values. Like the other sparse formats, it can be used in arithmetic operations: it supports addition, subtraction, multiplication, division, and matrix power.\n", 443 | "\n", 444 | "DOK allows efficient O(1) access to individual elements and is convenient for building a sparse matrix incrementally. Duplicates are not allowed. Once constructed, it can be efficiently converted to a `coo_matrix`."
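A minimal sketch (not part of the original notebook) of the COO behaviour described above, showing that duplicate `(i, j)` entries are summed when converting to CSR:

```python
import numpy as np
from scipy import sparse

# the (0, 0) coordinate appears twice on purpose
row = np.array([0, 0, 1, 2])
col = np.array([0, 0, 1, 2])
data = np.array([1.0, 2.0, 3.0, 4.0])

A_coo = sparse.coo_matrix((data, (row, col)), shape=(3, 3))

# converting to CSR sums the duplicates: entry (0, 0) becomes 1.0 + 2.0 = 3.0
print(A_coo.tocsr().toarray())
```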
445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 10, 450 | "metadata": { 451 | "slideshow": { 452 | "slide_type": "subslide" 453 | } 454 | }, 455 | "outputs": [ 456 | { 457 | "data": { 458 | "text/plain": [ 459 | "array([[0., 1., 2., 3., 4.],\n", 460 | " [0., 2., 3., 4., 5.],\n", 461 | " [0., 0., 4., 5., 6.],\n", 462 | " [0., 0., 0., 6., 7.],\n", 463 | " [0., 0., 0., 0., 8.]], dtype=float32)" 464 | ] 465 | }, 466 | "execution_count": 10, 467 | "metadata": {}, 468 | "output_type": "execute_result" 469 | } 470 | ], 471 | "source": [ 472 | "from scipy.sparse import dok_matrix\n", 473 | "S = dok_matrix((5, 5), dtype=np.float32)\n", 474 | "for i in range(5):\n", 475 | " for j in range(i, 5):\n", 476 | " S[i,j] = i+j\n", 477 | " \n", 478 | "S.toarray()" 479 | ] 480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "metadata": { 484 | "slideshow": { 485 | "slide_type": "subslide" 486 | } 487 | }, 488 | "source": [ 489 | "The ``scipy.sparse`` submodule also has a lot of functions for sparse matrices\n", 490 | "including linear algebra, sparse solvers, graph algorithms, and much more." 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [] 499 | } 500 | ], 501 | "metadata": { 502 | "kernelspec": { 503 | "display_name": "Python 3.6 (Develer Science)", 504 | "language": "python", 505 | "name": "develer-science" 506 | }, 507 | "language_info": { 508 | "codemirror_mode": { 509 | "name": "ipython", 510 | "version": 3 511 | }, 512 | "file_extension": ".py", 513 | "mimetype": "text/x-python", 514 | "name": "python", 515 | "nbconvert_exporter": "python", 516 | "pygments_lexer": "ipython3", 517 | "version": "3.6.6" 518 | } 519 | }, 520 | "nbformat": 4, 521 | "nbformat_minor": 2 522 | } 523 | -------------------------------------------------------------------------------- /1_apprentice/3.4 Level Up.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Level Up: Final Challenge on Data Viz" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## matplotlib - 2D and 3D plotting in Python" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# This line configures matplotlib to show figures embedded in the notebook, \n", 24 | "# instead of opening a new window for each figure.\n", 25 | "%matplotlib notebook" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Brief Recap to Matplotlib" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "Matplotlib is an excellent 2D and 3D graphics library for generating scientific figures. Some of the many advantages of this library include:\n", 40 | "\n", 41 | "* Easy to get started\n", 42 | "* Support for $\\LaTeX$ formatted labels and texts\n", 43 | "* Great control of every element in a figure, including figure size and DPI. 
\n", 44 | "* High-quality output in many formats, including PNG, PDF, SVG, EPS, and PGF.\n", 45 | "* GUI for interactively exploring figures *and* support for headless generation of figure files (useful for batch jobs).\n", 46 | "\n", 47 | "One of the of the key features of matplotlib that I would like to emphasize, and that I think makes matplotlib highly suitable for generating figures for scientific publications is that all aspects of the figure can be controlled *programmatically*. This is important for reproducibility and convenient when one needs to regenerate the figure with updated data or change its appearance. \n", 48 | "\n", 49 | "More information at the Matplotlib web page: http://matplotlib.org/\n", 50 | "\n", 51 | "**To get started**:\n", 52 | "import the `matplotlib.pyplot` module under the name `plt` (the tidy way):" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": { 59 | "collapsed": true 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "import matplotlib.pyplot as plt # de-facto convention" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "# 1. Matlab-like API - Basic Plotting\n", 71 | "\n", 72 | "The easiest way to get started with plotting using matplotlib is often to use the MATLAB-like API provided by matplotlib. \n", 73 | "\n", 74 | "It is designed to be compatible with MATLAB's plotting functions, so it is easy to get started with if you are familiar with MATLAB." 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## Ex. 1.1\n", 82 | "\n", 83 | "Generate two arrays, `x` and `y` so that values of `y` are any function of `x` (e.g. $y = x^2)$. Plot the resulting values.\n", 84 | "\n", 85 | "#### Hint: Take a look at `plot` to generage plot. Take also a look to `xlabel` and `ylabel`. When you're done, call `show()` to actually display the chart." 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "## Ex 1.1.1\n", 102 | "\n", 103 | "Plot the `sin` function" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": true 111 | }, 112 | "outputs": [], 113 | "source": [] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "## Ex 1.2\n", 120 | "\n", 121 | "With the same two `x` and `y` arrays of the previous exercise, generate two plots inverting axis. If you fancy, play a bit with options before moving forward (e.g. set two different colors and different tickers..)\n", 122 | "\n", 123 | "#### Hint: Take a look at `subplot` + plot options. 
" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "collapsed": true 131 | }, 132 | "outputs": [], 133 | "source": [] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "## Ex 1.3 \n", 140 | "\n", 141 | "Generate two plots of any arbitrary `x`, and `y` values whose labels are the corresponding functions written using LaTeX formulas.\n", 142 | "\n", 143 | "Example:\n", 144 | "\n", 145 | "```python\n", 146 | "fig, ax = plt.subplots()\n", 147 | "\n", 148 | "ax.plot(.....\n", 149 | "```\n", 150 | "Put `legend` inside the chart.\n", 151 | "\n", 152 | "#### Hint: Take a look at `legend`, axix and `label` plot option" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "## Colors \n", 169 | "\n", 170 | "With matplotlib, you can define the colors of lines and other graphical elements in a number of ways. \n", 171 | "\n", 172 | "First of all, we can use the MATLAB-like syntax where `'b'` means blue, `'g'` means green, etc. The MATLAB API for selecting line styles are also supported: where, for example, 'b.-' means a blue line with dots\n", 173 | "\n", 174 | "Example:\n", 175 | "\n", 176 | "```python\n", 177 | "\n", 178 | "# MATLAB style line color and style \n", 179 | "ax.plot(x, x**2, 'b.-') # blue line with dots\n", 180 | "ax.plot(x, x**3, 'g--') # green dashed line\n", 181 | "```" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## Ex 1.4\n", 189 | "\n", 190 | "Define colors of previous plots by their names or RGB hex codes and optionally provide an alpha value using the `color` and `alpha` keyword arguments" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": { 197 | "collapsed": true 198 | }, 199 | "outputs": [], 200 | "source": [] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "## Line and marker styles\n", 207 | "\n", 208 | "To change the line width, we can use the `linewidth` or `lw` keyword argument. The line style can be selected using the `linestyle` or `ls` keyword arguments" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "## Ex 1.5\n", 216 | "\n", 217 | "Generate three plots with different line widths" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": { 224 | "collapsed": true 225 | }, 226 | "outputs": [], 227 | "source": [] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "## Ex 1.6\n", 234 | "\n", 235 | "Generate two plots for ($x$, $x^2$) and ($x$, $e(x)$) using normal and logarithmic scales.\n", 236 | "\n", 237 | "#### Hint: Take a look at `set_yscale`" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "collapsed": true 245 | }, 246 | "outputs": [], 247 | "source": [] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "# 2 .Charts\n", 254 | "\n", 255 | "Matplotlib supports different chart types. For example: *Scatter plots*, *Bar plots*, *histograms*." 
256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "## Ex 2.1\n", 263 | "\n", 264 | "Generate a `numpy.array` of $10^5$ random numbers. Plot the histogram of this array.\n", 265 | "\n", 266 | "#### Hint: Take a look at `plt.hist`. Also, look at the values returned by this function." 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": { 273 | "collapsed": true 274 | }, 275 | "outputs": [], 276 | "source": [] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "The `plt.hist` docstring has more information on other customization options available. " 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "## Ex 2.2\n", 290 | "\n", 291 | "Plot the histogram of three (or more) different normal distributions.\n", 292 | "\n", 293 | "#### Hint: Take a look at the `histtype` and `alpha` parameters of `plt.hist`. \n", 294 | "\n", 295 | "#### To generate normal distributions, take a look at `np.random.normal`" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": { 302 | "collapsed": true 303 | }, 304 | "outputs": [], 305 | "source": [] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "## Ex 2.3\n", 312 | "\n", 313 | "Generate a scatter plot of an array of random numbers against itself plus some random perturbation: $y = x + c * \text{gaussian noise}$.\n", 314 | "Select colors and markers that you fancy.\n", 315 | "\n", 316 | "#### Hint: See `plt.scatter`." 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": { 323 | "collapsed": true 324 | }, 325 | "outputs": [], 326 | "source": [] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "## Ex 2.4\n", 333 | "\n", 334 | "Generate a random number distribution and create a box plot.\n", 335 | "\n", 336 | "#### Hint: See `plt.boxplot`" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": { 343 | "collapsed": true 344 | }, 345 | "outputs": [], 346 | "source": [] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "## Ex 2.5\n", 353 | "\n", 354 | "Given \n", 355 | "```python\n", 356 | "\n", 357 | "mean = [0, 0]\n", 358 | "cov = [[1, 1], [1, 2]]\n", 359 | "```\n", 360 | "Get `x` and `y` drawn from a multivariate Gaussian distribution and plot a 2D Histogram, along with a corresponding colorbar.\n", 361 | "\n", 362 | "#### Hint: See `plt.hist2d` and `plt.colorbar`\n", 363 | "\n" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": { 370 | "collapsed": true 371 | }, 372 | "outputs": [], 373 | "source": [] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": [ 379 | "# 3. Challenge: Handwritten Digits\n", 380 | "\n", 381 | "For an example of where matplotlib might be useful, let’s look at an interesting visualization of some hand-written digits data. 
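One possible solution sketch for Ex 2.5, using the given `mean` and `cov` (the bin count and colormap are arbitrary choices):

```python
import numpy as np
import matplotlib.pyplot as plt

mean = [0, 0]
cov = [[1, 1], [1, 2]]

# draw (x, y) pairs from a multivariate Gaussian distribution
x, y = np.random.multivariate_normal(mean, cov, 10000).T

plt.hist2d(x, y, bins=30, cmap='Blues')
plt.colorbar(label='counts in bin')
plt.show()
```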
\n", 382 | "\n", 383 | "This data is included in scikit-learn, and consists of nearly $2000$ $8\\times8$ thumbnails showing various hand-written digits.\n", 384 | "\n", 385 | "For now, let’s start by downloading the digits data and visualizing several of the example images with plt.imshow()" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "## Ex 3.1\n", 393 | "\n", 394 | "Download the digits dataset from `sklearn`\n", 395 | "\n", 396 | "#### Hint: Take a look at `sklearn.datasets`" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": { 403 | "collapsed": true 404 | }, 405 | "outputs": [], 406 | "source": [] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": {}, 411 | "source": [ 412 | "## Ex 3.2\n", 413 | "\n", 414 | "Generate an $8\\times8$ grid of images with a `figsize` of `(6, 6)` and show images from the dataset.\n", 415 | "\n", 416 | "#### Hint: Take a look at `plt.imshow`" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": null, 422 | "metadata": { 423 | "collapsed": true 424 | }, 425 | "outputs": [], 426 | "source": [] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "## Ex 3.3 (Manifold Learing Preview)\n", 433 | "\n", 434 | "Provided the following two-dimensional manifold learning projection of digits data:\n", 435 | "\n", 436 | "```python\n", 437 | "\n", 438 | "# project the digits into 2 dimensions using IsoMap from sklearn.manifold \n", 439 | "import Isomap\n", 440 | "iso = Isomap(n_components=2)\n", 441 | "projection = iso.fit_transform(digits.data)\n", 442 | "```\n", 443 | "Use a discrete colormap to view the results in a scatter plot, setting the ticks and clim to improve the aesthetics of the resulting colorbar\n" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "metadata": { 450 | "collapsed": true 451 | }, 452 | "outputs": [], 453 | "source": [] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | "source": [ 459 | "# There's more: 3D Plotting\n", 460 | "\n", 461 | "To use 3D graphics in matplotlib, we first need to create an instance of the `Axes3D` class. 3D axes can be added to a matplotlib figure canvas in exactly the same way as 2D axes; or, more conveniently, by passing a `projection='3d'` keyword argument to the `add_axes` or `add_subplot` methods.\n", 462 | "\n", 463 | "Start with \n", 464 | "\n", 465 | "`from mpl_toolkits.mplot3d.axes3d import Axes3D`" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": { 472 | "collapsed": true 473 | }, 474 | "outputs": [], 475 | "source": [] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": {}, 480 | "source": [ 481 | "# Bokeh\n", 482 | "\n", 483 | "Bokeh is a Python interactive visualization library that targets modern web browsers for presentation. Its goal is to provide elegant, concise construction of novel graphics in the style of D3.js, and to extend this capability with high-performance interactivity over very large or streaming datasets. 
Bokeh can help anyone who would like to quickly and easily create interactive plots, dashboards, and data applications.\n", 484 | "\n", 485 | "More on: [http://bokeh.pydata.org/en/latest/](http://bokeh.pydata.org/en/latest/)\n" 486 | ] 487 | }, 488 | { 489 | "cell_type": "markdown", 490 | "metadata": {}, 491 | "source": [ 492 | "### Using Bokeh\n", 493 | "\n", 494 | "Bokeh's APIs are slightly different from those of Matplotlib.\n", 495 | "For example, to make a line plot you create a `figure` (from `bokeh.plotting`) and call its `line()` method:\n", 496 | "\n", 497 | "```python\n", 498 | "# MISSING - PLEASE FILL HERE\n", 499 | "\n", 500 | "from bokeh.plotting import figure, show\n", 501 | "\n", 502 | "# prepare some data\n", 503 | "x = list(range(1, 6))\n", 504 | "y = [6, 7, 2, 4, 5]\n", 505 | "\n", 506 | "# create a new plot with axis label and title\n", 507 | "p = figure(title=\"Sample Line Example\", x_axis_label='x', y_axis_label='y')\n", 508 | "\n", 509 | "# add a line renderer with legend and line thickness\n", 510 | "p.line(x, y, legend='Temp.', line_width=2)\n", 511 | "\n", 512 | "# show the result\n", 513 | "show(p)\n", 514 | "``` \n", 515 | "\n", 516 | "Try this snippet out and fill in the missing part - see `bokeh.io.output_notebook`" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": null, 522 | "metadata": { 523 | "collapsed": true 524 | }, 525 | "outputs": [], 526 | "source": [] 527 | }, 528 | { 529 | "cell_type": "markdown", 530 | "metadata": {}, 531 | "source": [ 532 | "## Scatter Plot\n", 533 | "\n", 534 | "Generate a Scatter plot using the Bokeh library" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": null, 540 | "metadata": { 541 | "collapsed": true 542 | }, 543 | "outputs": [], 544 | "source": [] 545 | } 546 | ], 547 | "metadata": { 548 | "kernelspec": { 549 | "display_name": "Python 3.6 (Develer Science)", 550 | "language": "python", 551 | "name": "develer-science" 552 | }, 553 | "language_info": { 554 | "codemirror_mode": { 555 | "name": "ipython", 556 | "version": 3 557 | }, 558 | "file_extension": ".py", 559 | "mimetype": "text/x-python", 560 | "name": "python", 561 | "nbconvert_exporter": "python", 562 | "pygments_lexer": "ipython3", 563 | "version": "3.6.6" 564 | } 565 | }, 566 | "nbformat": 4, 567 | "nbformat_minor": 2 568 | } 569 | -------------------------------------------------------------------------------- /1_apprentice/1.5. Numpy Challenge.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Numpy Basics Challenge" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "# To import numpy:\n", 19 | "import numpy as np # np is de-facto standard convention" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# Brief Recap on Numpy Arrays\n", 27 | "\n", 28 | "NumPy's main object is the **homogeneous** ***multidimensional array***. It is a table of elements (usually numbers), all of the same type. \n", 29 | "\n", 30 | "In Numpy dimensions are called **axes**. \n", 31 | "\n", 32 | "The number of axes is called **rank**. \n", 33 | "\n", 34 | "The most important attributes of an ndarray object are:\n", 35 | "\n", 36 | "* **ndarray.ndim** - the number of axes (dimensions) of the array. \n", 37 | "* **ndarray.shape** - the dimensions of the array. 
For a matrix with n rows and m columns, shape will be (n,m). \n", 38 | "* **ndarray.size** - the total number of elements of the array. \n", 39 | "* **ndarray.dtype** - numpy.int32, numpy.int16, and numpy.float64 are some examples. \n", 40 | "* **ndarray.itemsize** - the size in bytes of elements of the array. For example, elements of type float64 have itemsize 8 (=64/8) " 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "## Warm Up: Creating `numpy` arrays\n", 48 | "\n", 49 | "There are a number of ways to initialize new numpy arrays, for example from Python lists or tuples!" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 2, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "array([1, 2, 3, 4])" 61 | ] 62 | }, 63 | "execution_count": 2, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "np.array([1, 2, 3, 4])" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "## Ex 1. Create an array containing integers from $2$ to $2^6$\n", 77 | "\n", 78 | "#### Hint: use the Python `range` function" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "## 1.1 Print `ndarray` attributes and properties\n", 95 | "(e.g. `type`, `dtype`, `shape`...) using the previous one" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "source": [] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Ex 2. Create a 3x3 matrix and fill it with integer numbers" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": true 119 | }, 120 | "outputs": [], 121 | "source": [] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "## Ex 3: Create a matrix of any size and fill it with random numbers\n", 128 | "\n", 129 | "### Hint: take a look at `numpy.random.rand`" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": true 137 | }, 138 | "outputs": [], 139 | "source": [] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "# 1. Using _array-generating_ functions\n", 146 | "\n", 147 | "For larger arrays it is impractical to initialize the data manually, using explicit Python lists. \n", 148 | "\n", 149 | "Instead we can use one of the many **functions** in `numpy` that generate arrays of different forms. \n", 150 | "\n", 151 | "So far, you have already used one of them: `numpy.random.rand`.\n", 152 | "\n", 153 | "Some of the most common are: \n", 154 | "\n", 155 | "* `np.arange`; \n", 156 | "* `np.linspace`; \n", 157 | "* `np.logspace`; \n", 158 | "* `np.ones`;\n", 159 | "* `np.zeros`;\n", 160 | "\n", 161 | "The following challenges will require you to use one (or many) of these functions." 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "## Ex 1.1\n", 169 | "\n", 170 | "Create an array of floating-point numbers in an arbitrary range (randomly generated), and using a decimal step (e.g. 
`0.2`) \n", 171 | "\n", 172 | "**Note**: You CAN use **three** numpy functions in this exercise. Guess the difference!" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## Ex 1.2\n", 189 | "\n", 190 | "Create a matrix of any shape full of zeros" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": { 197 | "collapsed": true 198 | }, 199 | "outputs": [], 200 | "source": [] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "## Ex 1.3\n", 207 | "\n", 208 | "Create an array of ones and put it as the main diagonal of a matrix \n", 209 | "\n", 210 | "#### Hint: Take a look at `np.diag` and `np.eye`" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": true 218 | }, 219 | "outputs": [], 220 | "source": [] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "## Ex 1.4\n", 227 | "\n", 228 | "Create an arbitrary array of numbers and put it as the first upper off-diagonal of a new matrix\n", 229 | "\n", 230 | "#### Hint: Look at the parameters of `np.diag`" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": { 237 | "collapsed": true 238 | }, 239 | "outputs": [], 240 | "source": [] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "# 2. File I/O\n", 247 | "\n", 248 | "## Comma-separated values (CSV)\n", 249 | "\n", 250 | "A very common file format for data files is comma-separated values (CSV), or related formats such as TSV (tab-separated values). To read data from such files into Numpy arrays we can use the `numpy.genfromtxt` function. For example:" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 2, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "1800 1 1 -6.1 -6.1 -6.1 1\n", 263 | "1800 1 2 -15.4 -15.4 -15.4 1\n", 264 | "1800 1 3 -15.0 -15.0 -15.0 1\n", 265 | "1800 1 4 -19.3 -19.3 -19.3 1\n", 266 | "1800 1 5 -16.8 -16.8 -16.8 1\n", 267 | "1800 1 6 -11.4 -11.4 -11.4 1\n", 268 | "1800 1 7 -7.6 -7.6 -7.6 1\n", 269 | "1800 1 8 -7.1 -7.1 -7.1 1\n", 270 | "1800 1 9 -10.1 -10.1 -10.1 1\n", 271 | "1800 1 10 -9.5 -9.5 -9.5 1\n" 272 | ] 273 | } 274 | ], 275 | "source": [ 276 | "!head files/stockholm_td_adj.dat" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "# Ex 2.1\n", 284 | "\n", 285 | "Generate a numpy array with data extracted from the csv file.\n", 286 | "\n", 287 | "#### Hint: Take a look at `np.genfromtxt`" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": { 294 | "collapsed": true 295 | }, 296 | "outputs": [], 297 | "source": [] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "## Ex 2.2 \n", 304 | "\n", 305 | "Analyse the properties of the resulting `np.ndarray` loaded from the file."
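A minimal sketch (not part of the original notebook) of the `np.genfromtxt` usage hinted at in Ex 2.1, loading the whitespace-separated temperature file shown above:

```python
import numpy as np

# by default genfromtxt splits on any whitespace,
# which matches the stockholm_td_adj.dat layout shown above
data = np.genfromtxt('files/stockholm_td_adj.dat')
print(data.shape, data.dtype)

# for a true comma-separated file you would pass delimiter=','
# data = np.genfromtxt('some_file.csv', delimiter=',')
```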
306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": { 312 | "collapsed": true 313 | }, 314 | "outputs": [], 315 | "source": [] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "## Ex 2.3\n", 322 | "\n", 323 | "Create a $100 \times 100$ matrix of random numbers, `reshape` it as a $10000 \times 1$ array and save it in a new file in the `data` folder.\n", 324 | "\n", 325 | "#### Hint: `np.save` might be useful :). The other hint is in the text !-)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": { 332 | "collapsed": true 333 | }, 334 | "outputs": [], 335 | "source": [] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "## Ex 2.4\n", 342 | "\n", 343 | "Load back the previously saved array and print out the first 20 elements\n", 344 | "\n", 345 | "#### Note: We are anticipating indexing a bit here, but you may guess it if you know a bit about Python lists" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": { 352 | "collapsed": true 353 | }, 354 | "outputs": [], 355 | "source": [] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "### Ex 2.4.1\n", 362 | "\n", 363 | "To make it a little bit harder, `reshape` the array back into a \n", 364 | "$100 \times 100$ matrix and print out the first row and the first column." 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "metadata": { 371 | "collapsed": true 372 | }, 373 | "outputs": [], 374 | "source": [] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "# 3. Indexing" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "Index slicing is the technical name for the syntax `M[lower:upper:step]` to extract part of an array" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "## Ex 3.1 \n", 395 | "\n", 396 | "Generate a three-dimensional array of any size containing random numbers drawn from a uniform distribution (_guess the numpy function in `np.random`_). Then print out separately the first entry along each of the three axes (i.e. `x, y, z`) \n", 397 | "\n", 398 | "\n", 399 | "#### Hint: Slicing with numpy arrays works quite like Python lists" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": { 406 | "collapsed": true 407 | }, 408 | "outputs": [], 409 | "source": [] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "metadata": {}, 414 | "source": [ 415 | "## Ex 3.2\n", 416 | "\n", 417 | "Create a vector and print out its elements in reverse order\n", 418 | "\n", 419 | "#### Hint: Use slicing for this exercise" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": { 426 | "collapsed": true 427 | }, 428 | "outputs": [], 429 | "source": [] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "## Ex 3.3\n", 436 | "\n", 437 | "Generate a $7 \times 7$ matrix and replace all the elements in odd rows and even columns with `1`.\n", 438 | "\n", 439 | "#### Hint: Use slicing to solve this exercise!\n", 440 | "\n", 441 | "#### Note: Take a look at the original matrix afterwards."
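One possible solution sketch for Ex 3.3, under one reading of "odd rows and even columns" (rows 1, 3, 5 and columns 0, 2, 4, 6):

```python
import numpy as np

A = np.random.random((7, 7))

# step-2 slices: odd rows start at index 1, even columns start at index 0
A[1::2, ::2] = 1
print(A)
```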
442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": { 448 | "collapsed": true 449 | }, 450 | "outputs": [], 451 | "source": [] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | "metadata": {}, 456 | "source": [ 457 | "## Ex 3.4 \n", 458 | "\n", 459 | "Generate a `10 x 10` matrix of numbers `A`. Then, generate a numpy array of integers in the range `1-9`. Pick `5` random values (with no repetition) from this array and use these values to extract rows from the original matrix `A`." 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": { 466 | "collapsed": true 467 | }, 468 | "outputs": [], 469 | "source": [] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": {}, 474 | "source": [ 475 | "## Ex 3.5 \n", 476 | "\n", 477 | "Repeat the previous exercise but this time extract columns from `A`" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": { 484 | "collapsed": true 485 | }, 486 | "outputs": [], 487 | "source": [] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "## Ex 3.6\n", 494 | "\n", 495 | "Generate an array of numbers from `0` to `20` with step `0.5`. \n", 496 | "Extract all the values greater than a randomly generated number in the same range.\n", 497 | "\n", 498 | "#### Hint: Try to write the condition as an expression and save it to a variable. Then, use this variable in square brackets to index.... this is when the magic happens!" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": null, 504 | "metadata": { 505 | "collapsed": true 506 | }, 507 | "outputs": [], 508 | "source": [] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "# 4. Basic Arithmetic\n", 515 | "\n", 516 | "Vectorizing code is the key to writing efficient numerical calculations with Python/Numpy. That means that as much as possible of a program should be formulated in terms of matrix and vector operations, like matrix-matrix multiplication.\n", 517 | "\n", 518 | "## Scalar-array operations\n", 519 | "\n", 520 | "We can use the usual arithmetic operators to multiply, add, subtract, and divide arrays with scalar numbers." 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "metadata": {}, 526 | "source": [ 527 | "## Ex 4.1 \n", 528 | "\n", 529 | "Generate a matrix of any size. Then multiply each element by `2` and subtract `1`.\n", 530 | "\n", 531 | "#### Hint: The most intuitive way to implement this is the right one!"
532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "metadata": { 538 | "collapsed": true 539 | }, 540 | "outputs": [], 541 | "source": [] 542 | }, 543 | { 544 | "cell_type": "markdown", 545 | "metadata": {}, 546 | "source": [ 547 | "## Ex 4.2\n", 548 | "\n", 549 | "Generate two square matrices of random numbers and multiply them element-wise.\n", 550 | "\n", 551 | "#### Hint: The clues from the previous exercise also apply here!-)" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "metadata": { 558 | "collapsed": true 559 | }, 560 | "outputs": [], 561 | "source": [] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": {}, 566 | "source": [ 567 | "## Ex 4.3\n", 568 | "\n", 569 | "Compute the `dot` product of two randomly generated matrices of compatible shapes.\n", 570 | "\n", 571 | "#### Hint: Since we are using Python 3 here, you should be aware of the `@` operator.. :)" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": null, 577 | "metadata": { 578 | "collapsed": true 579 | }, 580 | "outputs": [], 581 | "source": [] 582 | }, 583 | { 584 | "cell_type": "markdown", 585 | "metadata": {}, 586 | "source": [ 587 | "## Ex 4.4\n", 588 | "\n", 589 | "Generate a matrix of any size and create its transpose.\n", 590 | "\n", 591 | "#### Hint: Guess the difference between `np.transpose` and `array.T`" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "metadata": { 598 | "collapsed": true 599 | }, 600 | "outputs": [], 601 | "source": [] 602 | }, 603 | { 604 | "cell_type": "markdown", 605 | "metadata": {}, 606 | "source": [ 607 | "## Ex 4.5\n", 608 | "\n", 609 | "Generate an `ndarray` of any size and shape filled with random numbers and calculate some statistics (`mean`, `sum` along an axis, `min`, `max`)\n", 610 | "\n", 611 | "\n", 612 | "#### Hint: Clues are in the text of the exercise !-)" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": null, 618 | "metadata": { 619 | "collapsed": true 620 | }, 621 | "outputs": [], 622 | "source": [] 623 | }, 624 | { 625 | "cell_type": "markdown", 626 | "metadata": {}, 627 | "source": [ 628 | "# 5. Stacking and repeating arrays\n", 629 | "\n", 630 | "Using the functions `repeat`, `tile`, `vstack`, `hstack`, and `concatenate` we can create larger vectors and matrices from smaller ones\n", 631 | "\n" 632 | ] 633 | }, 634 | { 635 | "cell_type": "markdown", 636 | "metadata": {}, 637 | "source": [ 638 | "## Ex 5.1\n", 639 | "\n", 640 | "Generate two random matrices of the same size and concatenate them on the `y`-axis.\n", 641 | "\n", 642 | "#### Hint: You CAN use **two** functions to solve this challenge. Guess the differences." 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": null, 648 | "metadata": { 649 | "collapsed": true 650 | }, 651 | "outputs": [], 652 | "source": [] 653 | }, 654 | { 655 | "cell_type": "markdown", 656 | "metadata": {}, 657 | "source": [ 658 | "## Ex 5.2\n", 659 | "\n", 660 | "Generate two random matrices of the same size and concatenate them on the `x`-axis.\n", 661 | "\n", 662 | "#### Hint: You CAN use **two** functions to solve this challenge. Guess the differences."
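A minimal sketch (not part of the original notebook) of the stacking functions named above, showing the `vstack`/`hstack` shorthands next to their `concatenate` equivalents:

```python
import numpy as np

a = np.random.random((2, 3))
b = np.random.random((2, 3))

# stack vertically (along axis 0, the y-axis)...
v1 = np.vstack((a, b))
v2 = np.concatenate((a, b), axis=0)

# ...or horizontally (along axis 1, the x-axis)
h1 = np.hstack((a, b))
h2 = np.concatenate((a, b), axis=1)

print(v1.shape, h1.shape)  # (4, 3) (2, 6)
```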
663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": null, 668 | "metadata": { 669 | "collapsed": true 670 | }, 671 | "outputs": [], 672 | "source": [] 673 | }, 674 | { 675 | "cell_type": "markdown", 676 | "metadata": {}, 677 | "source": [ 678 | "# So, why is it useful then?\n", 679 | "\n", 680 | "So far, the `numpy.ndarray` looks very much like a Python **list** (or **nested list**). \n", 681 | "\n", 682 | "*Why not simply use Python lists for computations instead of creating a new array type?*\n", 683 | "\n", 684 | "There are several reasons:\n", 685 | "\n", 686 | "* Python lists are very general. \n", 687 | " - They can contain any kind of object. \n", 688 | " - They are dynamically typed. \n", 689 | " - They do not support mathematical functions such as matrix and dot multiplications, etc. \n", 690 | " - Implementing such functions for Python lists would not be very efficient because of the dynamic typing.\n", 691 | " \n", 692 | " \n", 693 | "* Numpy arrays are **statically typed** and **homogeneous**. \n", 694 | " - The type of the elements is determined when the array is created.\n", 695 | " \n", 696 | " \n", 697 | "* Numpy arrays are memory efficient.\n", 698 | " - Because of the static typing, mathematical functions such as multiplication and addition of `numpy` arrays can be implemented efficiently in a compiled language (C and Fortran are used)." 699 | ] 700 | }, 701 | { 702 | "cell_type": "markdown", 703 | "metadata": {}, 704 | "source": [ 705 | "### Benchmark\n", 706 | "\n", 707 | "Initialise a range of 1000 numbers as a Python `list` and as a `numpy.array`. Square all the elements and time this operation!\n", 708 | "\n", 709 | "### Hint: Use the `%timeit` IPython magic to measure timing" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": null, 715 | "metadata": { 716 | "collapsed": true 717 | }, 718 | "outputs": [], 719 | "source": [] 720 | } 721 | ], 722 | "metadata": { 723 | "kernelspec": { 724 | "display_name": "Python 3.6 (Develer Science)", 725 | "language": "python", 726 | "name": "develer-science" 727 | }, 728 | "language_info": { 729 | "codemirror_mode": { 730 | "name": "ipython", 731 | "version": 3 732 | }, 733 | "file_extension": ".py", 734 | "mimetype": "text/x-python", 735 | "name": "python", 736 | "nbconvert_exporter": "python", 737 | "pygments_lexer": "ipython3", 738 | "version": "3.6.6" 739 | } 740 | }, 741 | "nbformat": 4, 742 | "nbformat_minor": 2 743 | } 744 | -------------------------------------------------------------------------------- /2_alchemist/1. 
Data selection & Indexing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import random" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%run 'helpers.py'" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "# Data selection & Indexing" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Series" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "series = pd.Series([3, 62, 75, 83, 47, 43, 39, 16, 19, 2])" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/plain": [ 54 | "0 3\n", 55 | "1 62\n", 56 | "2 75\n", 57 | "3 83\n", 58 | "4 47\n", 59 | "5 43\n", 60 | "6 39\n", 61 | "7 16\n", 62 | "8 19\n", 63 | "9 2\n", 64 | "dtype: int64" 65 | ] 66 | }, 67 | "execution_count": 4, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "series" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "### Access by Position / Slice" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 5, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/plain": [ 91 | "3" 92 | ] 93 | }, 94 | "execution_count": 5, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [ 100 | "series[0]" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 6, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "data": { 110 | "text/plain": [ 111 | "3 83\n", 112 | "4 47\n", 113 | "5 43\n", 114 | "dtype: int64" 115 | ] 116 | }, 117 | "execution_count": 6, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ], 122 | "source": [ 123 | "series[3:6]" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 7, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "data": { 133 | "text/plain": [ 134 | "3 83\n", 135 | "4 47\n", 136 | "5 43\n", 137 | "dtype: int64" 138 | ] 139 | }, 140 | "execution_count": 7, 141 | "metadata": {}, 142 | "output_type": "execute_result" 143 | } 144 | ], 145 | "source": [ 146 | "# series[3:6]\n", 147 | "series.iloc[3:6]\n", 148 | "# note [] not ()!" 
149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "### Access by label" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 8, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "10" 167 | ] 168 | }, 169 | "execution_count": 8, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "len(series)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 9, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "# set alpha label as new index for the series\n", 185 | "series.index = [x for x in \"ABCDEFGHIJKLMNOPQRSTUVWXYZ\"][:len(series)]" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 10, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "A 3\n", 197 | "B 62\n", 198 | "C 75\n", 199 | "D 83\n", 200 | "E 47\n", 201 | "F 43\n", 202 | "G 39\n", 203 | "H 16\n", 204 | "I 19\n", 205 | "J 2\n", 206 | "dtype: int64" 207 | ] 208 | }, 209 | "execution_count": 10, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "series" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 11, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "data": { 225 | "text/plain": [ 226 | "D 83\n", 227 | "E 47\n", 228 | "F 43\n", 229 | "dtype: int64" 230 | ] 231 | }, 232 | "execution_count": 11, 233 | "metadata": {}, 234 | "output_type": "execute_result" 235 | } 236 | ], 237 | "source": [ 238 | "series[3:6]\n", 239 | "# position, pythonic" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 12, 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "data": { 249 | "text/plain": [ 250 | "D 83\n", 251 | "E 47\n", 252 | "F 43\n", 253 | "dtype: int64" 254 | ] 255 | }, 256 | "execution_count": 12, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "series['D':'F']\n", 263 | "# by label: slice includes end! 
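A minimal sketch (not part of the original notebook) contrasting the two slicing semantics just demonstrated: positional slicing excludes the end point, while label-based slicing includes it:

```python
import pandas as pd

s = pd.Series([10, 20, 30, 40], index=list('ABCD'))

print(s.iloc[1:3])     # positional: rows 1 and 2 -> 'B', 'C' (end excluded)
print(s.loc['B':'C'])  # label-based: 'C' IS included
```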
" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 13, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "ename": "SyntaxError", 273 | "evalue": "invalid syntax (, line 1)", 274 | "output_type": "error", 275 | "traceback": [ 276 | "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m series[['D':'F', 'I':'J']]\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" 277 | ] 278 | } 279 | ], 280 | "source": [ 281 | "series[['D':'F', 'I':'J']]\n", 282 | "# cannot combine multiple ranges" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "pd.concat([series['D':'F'], series['I':'J']])\n", 292 | "# concat to combine multiple ranges" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "# set alpha label as new index for the series\n", 302 | "series.index = [x for x in \"GATTACAXYZ\"][:len(series)]" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "series" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "series.loc['G']" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "series.loc['A']" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": { 336 | "scrolled": true 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "series.loc['G':'A']\n", 341 | "# non-unique values breaks slicing" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "series.loc['X':'Z']\n", 351 | "# while unique values are still slicable in a non-unique index" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "## DataFrames, 2D Data" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "df = pd.read_json('./data/sampledf.json')" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "from IPython import display" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "# visualisation of below - for presentation\n", 386 | "display.display_html(df.style.apply(highlight, subset=pd.IndexSlice[:, 2]))\n", 387 | "\n", 388 | "# column\n", 389 | "df[2]" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "df[2:4]" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "# visualisation of below - for presentation\n", 408 | "display.display_html(df.style.apply(highlight, \n", 409 | " subset=pd.IndexSlice[range(2, 4), :]))\n", 410 | "\n", 411 | "# column\n", 412 | "df[2]" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | 
"metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "df.iloc[2:, 2]" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "# visualisation of below - for presentation\n", 431 | "display.display_html(df.style.apply(highlight, subset=pd.IndexSlice[range(2, 4), range(2, 4)]))\n", 432 | "\n", 433 | "\n", 434 | "# segment\n", 435 | "df.iloc[2, :]" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "# visualisation of below - for presentation\n", 445 | "display.display_html(df.style.apply(highlight, subset=pd.IndexSlice[:, range(2, 4)]))\n", 446 | "\n", 447 | "# column slice\n", 448 | "df.iloc[:, 2:4]" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "df" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "df.index = [\"R{:02d}\".format(i) for i in range(len(df))]" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "df.columns = [\"C{:02d}\".format(i) for i in range(len(df.columns))]" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": null, 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [ 484 | "df" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "# visualisation of below - for presentation\n", 494 | "display.display_html(df.style.apply(highlight, subset=pd.IndexSlice[:, 'C05']))\n", 495 | "\n", 496 | "df['C05']" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "# visualisation of below - for presentation\n", 506 | "display.display_html(df.style.apply(highlight, subset=pd.IndexSlice['R02':'R05', :]))\n", 507 | "\n", 508 | "\n", 509 | "df['R02':'R05']" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [ 518 | "df[['C04', 'C05']]" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [ 527 | "# visualisation of below - for presentation\n", 528 | "display.display_html(df.style.apply(highlight, subset=pd.IndexSlice['R02':'R05', 'C04':'C05']))\n", 529 | "\n", 530 | "\n", 531 | "# segment\n", 532 | "df.loc['R02':'R05', 'C04':'C05']" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "df.loc['R02':'R05', 'C04':'C05']" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "metadata": {}, 547 | "source": [ 548 | "### Excercise" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "sales_data = pd.read_excel('./data/blooth_sales_data_clean.xlsx')\n", 558 | "sales_data.head(5)" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": null, 564 | "metadata": {}, 565 | "outputs": [], 566 | "source": [ 567 | "sales_data.info()" 568 | ] 569 | }, 570 | { 571 | "cell_type": "markdown", 
572 | "metadata": {}, 573 | "source": [ 574 | "Select columns two to four (three columns in total)" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": null, 580 | "metadata": {}, 581 | "outputs": [], 582 | "source": [ 583 | "# Your code here\n" 584 | ] 585 | }, 586 | { 587 | "cell_type": "markdown", 588 | "metadata": {}, 589 | "source": [ 590 | "Select the columns *birthday and name* (together)" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": null, 596 | "metadata": {}, 597 | "outputs": [], 598 | "source": [ 599 | "# Your code here\n" 600 | ] 601 | }, 602 | { 603 | "cell_type": "markdown", 604 | "metadata": {}, 605 | "source": [ 606 | "Select the rows 2 to 4 (three rows)" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": null, 612 | "metadata": {}, 613 | "outputs": [], 614 | "source": [ 615 | "# Your code here" 616 | ] 617 | }, 618 | { 619 | "cell_type": "markdown", 620 | "metadata": {}, 621 | "source": [ 622 | "Select the rows 55, 77" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": {}, 629 | "outputs": [], 630 | "source": [ 631 | "# Your code here\n" 632 | ] 633 | }, 634 | { 635 | "cell_type": "markdown", 636 | "metadata": {}, 637 | "source": [ 638 | "## Boolean Index" 639 | ] 640 | }, 641 | { 642 | "cell_type": "markdown", 643 | "metadata": {}, 644 | "source": [ 645 | "A boolean index is an array of true/false values: [1, 0, 1, 1, 0, 0, 1, …]\n", 646 | "\n", 647 | "! though the index name it's not one of the Pandas Index Types." 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "metadata": {}, 654 | "outputs": [], 655 | "source": [ 656 | "df['C04']" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": null, 662 | "metadata": {}, 663 | "outputs": [], 664 | "source": [ 665 | "df['C04'] > 60" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": null, 671 | "metadata": {}, 672 | "outputs": [], 673 | "source": [ 674 | "df[df['C04'] > 60]" 675 | ] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "execution_count": null, 680 | "metadata": {}, 681 | "outputs": [], 682 | "source": [ 683 | "df[(df['C04'] < 60) | (df['C04'] > 80)] # multiple OR" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "metadata": {}, 690 | "outputs": [], 691 | "source": [ 692 | "df[(df['C04'] < 60) & (df['C04'] % 2 == 0)] # multiple AND" 693 | ] 694 | }, 695 | { 696 | "cell_type": "markdown", 697 | "metadata": {}, 698 | "source": [ 699 | "### Excercise" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": null, 705 | "metadata": {}, 706 | "outputs": [], 707 | "source": [ 708 | "sales_data = pd.read_excel('./data/blooth_sales_data_clean.xlsx')\n", 709 | "sales_data.head(5)" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": null, 715 | "metadata": {}, 716 | "outputs": [], 717 | "source": [ 718 | "sales_data.info(5)" 719 | ] 720 | }, 721 | { 722 | "cell_type": "markdown", 723 | "metadata": {}, 724 | "source": [ 725 | "Find all rows with exactly 50 units" 726 | ] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": null, 731 | "metadata": {}, 732 | "outputs": [], 733 | "source": [ 734 | "# Your code here\n" 735 | ] 736 | }, 737 | { 738 | "cell_type": "markdown", 739 | "metadata": {}, 740 | "source": [ 741 | "Find all rows with exactly 50 playstations" 742 | ] 743 | }, 744 | { 745 | "cell_type": "code", 
746 | "execution_count": null, 747 | "metadata": {}, 748 | "outputs": [], 749 | "source": [ 750 | "# Your code here\n" 751 | ] 752 | }, 753 | { 754 | "cell_type": "markdown", 755 | "metadata": {}, 756 | "source": [ 757 | "## filter" 758 | ] 759 | }, 760 | { 761 | "cell_type": "markdown", 762 | "metadata": {}, 763 | "source": [ 764 | "Filter by label or index" 765 | ] 766 | }, 767 | { 768 | "cell_type": "code", 769 | "execution_count": null, 770 | "metadata": {}, 771 | "outputs": [], 772 | "source": [ 773 | "df.columns" 774 | ] 775 | }, 776 | { 777 | "cell_type": "code", 778 | "execution_count": null, 779 | "metadata": { 780 | "scrolled": true 781 | }, 782 | "outputs": [], 783 | "source": [ 784 | "df.filter(like='R0', axis=0) # , axis=1 per default" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": null, 790 | "metadata": { 791 | "scrolled": true 792 | }, 793 | "outputs": [], 794 | "source": [ 795 | "df.filter(regex='.0[2-4]', axis=0)" 796 | ] 797 | }, 798 | { 799 | "cell_type": "markdown", 800 | "metadata": {}, 801 | "source": [ 802 | "### Transpose with .T" 803 | ] 804 | }, 805 | { 806 | "cell_type": "code", 807 | "execution_count": null, 808 | "metadata": {}, 809 | "outputs": [], 810 | "source": [ 811 | "df.iloc[2:3]" 812 | ] 813 | }, 814 | { 815 | "cell_type": "code", 816 | "execution_count": null, 817 | "metadata": {}, 818 | "outputs": [], 819 | "source": [ 820 | "df.iloc[2:3].T" 821 | ] 822 | }, 823 | { 824 | "cell_type": "markdown", 825 | "metadata": {}, 826 | "source": [ 827 | "### Formatting with Styler" 828 | ] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": null, 833 | "metadata": {}, 834 | "outputs": [], 835 | "source": [ 836 | "df = pd.read_json('./data/sampledf.json')\n", 837 | "df" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "metadata": {}, 844 | "outputs": [], 845 | "source": [ 846 | "df.style.highlight_min()" 847 | ] 848 | }, 849 | { 850 | "cell_type": "code", 851 | "execution_count": null, 852 | "metadata": {}, 853 | "outputs": [], 854 | "source": [ 855 | "def odd_or_even(data):\n", 856 | " return [('background-color: green; color:white;' if x%2==0 else 'background-color: orange') \n", 857 | " for x in data]\n", 858 | "df.style.apply(odd_or_even)" 859 | ] 860 | } 861 | ], 862 | "metadata": { 863 | "kernelspec": { 864 | "display_name": "Python 3.6 (Develer Science)", 865 | "language": "python", 866 | "name": "develer-science" 867 | }, 868 | "language_info": { 869 | "codemirror_mode": { 870 | "name": "ipython", 871 | "version": 3 872 | }, 873 | "file_extension": ".py", 874 | "mimetype": "text/x-python", 875 | "name": "python", 876 | "nbconvert_exporter": "python", 877 | "pygments_lexer": "ipython3", 878 | "version": "3.6.6" 879 | }, 880 | "varInspector": { 881 | "cols": { 882 | "lenName": 16, 883 | "lenType": 16, 884 | "lenVar": 40 885 | }, 886 | "kernels_config": { 887 | "python": { 888 | "delete_cmd_postfix": "", 889 | "delete_cmd_prefix": "del ", 890 | "library": "var_list.py", 891 | "varRefreshCmd": "print(var_dic_list())" 892 | }, 893 | "r": { 894 | "delete_cmd_postfix": ") ", 895 | "delete_cmd_prefix": "rm(", 896 | "library": "var_list.r", 897 | "varRefreshCmd": "cat(var_dic_list()) " 898 | } 899 | }, 900 | "types_to_exclude": [ 901 | "module", 902 | "function", 903 | "builtin_function_or_method", 904 | "instance", 905 | "_Feature" 906 | ], 907 | "window_display": false 908 | } 909 | }, 910 | "nbformat": 4, 911 | "nbformat_minor": 2 912 | } 913 | 
-------------------------------------------------------------------------------- /4_archmage/2.2.1 Keras Backend.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Keras Backend\n", 8 | "\n", 9 | "In this notebook we will be using the [Keras backend module](http://keras.io/backend/), which provides an abstraction over both Theano and Tensorflow." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "Let's try to re-implement the Logistic Regression Model using the `keras.backend` APIs.\n", 17 | "\n", 18 | "The following code will look like very similar to what we would write in Theano or Tensorflow (with the *only difference* that it may run on both the two backends)." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stderr", 28 | "output_type": "stream", 29 | "text": [ 30 | "Using TensorFlow backend.\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "import keras.backend as K\n", 36 | "import numpy as np\n", 37 | "import matplotlib.pyplot as plt\n", 38 | "\n", 39 | "%matplotlib inline" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "from kaggle_data import load_data, preprocess_data, preprocess_labels" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "9 classes\n", 61 | "93 dims\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "X_train, labels = load_data('../data/kaggle_ottogroup/train.csv', train=True)\n", 67 | "X_train, scaler = preprocess_data(X_train)\n", 68 | "Y_train, encoder = preprocess_labels(labels)\n", 69 | "\n", 70 | "X_test, ids = load_data('../data/kaggle_ottogroup/test.csv', train=False)\n", 71 | "\n", 72 | "X_test, _ = preprocess_data(X_test, scaler)\n", 73 | "\n", 74 | "nb_classes = Y_train.shape[1]\n", 75 | "print(nb_classes, 'classes')\n", 76 | "\n", 77 | "dims = X_train.shape[1]\n", 78 | "print(dims, 'dims')" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 4, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "feats = dims\n", 88 | "training_steps = 25" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 5, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "x = K.placeholder(dtype=\"float\", shape=X_train.shape) \n", 98 | "target = K.placeholder(dtype=\"float\", shape=Y_train.shape)\n", 99 | "\n", 100 | "# Set model weights\n", 101 | "W = K.variable(np.random.rand(dims, nb_classes))\n", 102 | "b = K.variable(np.random.rand(nb_classes))" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 6, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "# Define model and loss\n", 112 | "y = K.dot(x, W) + b\n", 113 | "loss = K.categorical_crossentropy(y, target)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 7, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "activation = K.softmax(y) # Softmax" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 8, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "lr = K.constant(0.01)\n", 132 | "grads = K.gradients(loss, [W,b])\n", 133 | "updates = [(W, 
W-lr*grads[0]), (b, b-lr*grads[1])]" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 9, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "train = K.function(inputs=[x, target], outputs=[loss], updates=updates)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 10, 148 | "metadata": { 149 | "scrolled": false 150 | }, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "Loss: [1296.8262 -427.89313 -16.26692 ... 1968.651 -704.3656 1619.4004 ]\n", 157 | "Loss: [-28785862. -25194672. -28507862. ... -26947364. -22272130. -31571466.]\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "# Training\n", 163 | "loss_history = []\n", 164 | "for epoch in range(training_steps):\n", 165 | " current_loss = train([X_train, Y_train])[0]\n", 166 | " loss_history.append(current_loss)\n", 167 | " if epoch % 20 == 0:\n", 168 | " print(\"Loss: {}\".format(current_loss))" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 11, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "loss_history = [np.mean(lh) for lh in loss_history]" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 12, 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "data": { 187 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAERCAYAAABsNEDqAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvqOYd8AAAIABJREFUeJzt3Xl0VfX57/H3I1CJyGVQihBEQS1VIASJCIrVyjyoyIWfilSsA7KUiu0ChYuXOiyVSh1qJ+SnVhQULRX0J1Qmp7qEQgJhkiqD0IZwCxVDQaUSfO4f5ySEeBLOSXLOPsPntVYWZ+/zPXs/O5vkyf5+97O/5u6IiIhE64SgAxARkdSixCEiIjFR4hARkZgocYiISEyUOEREJCZKHCIiEpO0TRxm9pyZ7TGzjVG0fcLMCsNfn5hZSSJiFBFJRZaudRxm9gPgIPCCu3eK4XM/Abq6+01xC05EJIWl7RWHu78P7Ku4zszOMrO3zKzAzP5iZt+P8NHrgJcTEqSISAqqH3QACTYTGOvuW8zsQuB3wOVlb5rZGUA74O2A4hMRSXoZkzjM7GTgIuCPZla2+sRKza4F5rn7kUTGJiKSSjImcRDqlitx99xq2lwL3JGgeEREUlLajnFU5u7/Bj41sxEAFtKl7H0z6wA0A1YEFKKISEpI28RhZi8TSgIdzKzIzG4GrgduNrN1wCbgqgofuQ6Y6+l6m5mISB1J29txRUQkPtL2ikNEROIjLQfHTz31VD/zzDODDkNEJGUUFBT8y91bRNM2LRPHmWeeSX5+ftBhiIikDDPbGW1bdVWJiEhMlDhERCQmShwiIhKTtBzjkPR1+PBhioqKOHToUNChiKSkhg0b0qZNGxo0aFDjbShxSEopKiqicePGnHnmmVR45piIRMHd+eyzzygqKqJdu3Y13k6gXVVmNsDMPjazrWY2KcL7J5rZK+H3/2pmZ8YrlgVrd3HxtLdpN2khF097mwVrd8VrV1ILhw4d4pRTTlHSEKkBM+OUU06p9RV7YInDzOoBvwUGAucB15nZeZWa3Qx87u5nA08Av4hHLAvW7mLyaxvYVfIVDuwq+YrJr21Q8khSShoiNVcXPz9BXnF0B7a6+3Z3/xqYy7HPjiK8PCv8eh7Q2+LwW2P64o/56vCxT1L/6vARpi/+uK53JSKS8oJMHNnAPyosF4XXRWzj7qXAfuCUSBszszFmlm9m+Xv37o0pkOKSr2JaL5nt5JNPrvU2iouLGT58eJXvl5SU8Lvf/S7q9pXdeOONtGvXjtzcXLp06cLy5ctrFW9dmzFjBi+88EKttrFhwwZyc3PJzc2lefPm5cfbp0+fmLbTv39/Dhw4UG2bKVOm8M4779Qm3OPq1asXhYWFcd1HXQlycDzSlUPlJy5G0ya00n0moRn+yMvLi+nJja2bZrErQpJo3TQrls1IElqwdhfTF39McclXtG6axcT+HRjatfLfJ4nXunVr5s2bV+X7ZYnj9ttvj6p9JNOnT2f48OG88847jBkzhi1bttQqZoDS0lLq16/9r42xY8fWehudO3cu/0V74403MmTIkIjJ9XgxL168+Lj7euihh2oeaBoK8oqjCDi9wnIboLiqNmZWH2hCpXnE68LE/h3IalDvmHVZDeoxsX+Hut6VJFAix6527txJ7969ycnJoXfv3vz9738HYNu2bfTo0YMLLriAqVOnll+t7Nixg06dOgGwadMmunfvTm5uLjk5OWzZsoVJkyaxbds2cnNzmThx4jHtjxw5woQJE+jcuTM5OTn8+te/rja2nj17smvX0WMuKCjg0ksvpVu3bvTv35/du3cDsHr1anJycujZsycTJ04s39/zzz/PiBEjuOKKK+jXrx8QSkoXXHABOTk5/PznPwfgiy++YPDgwXTp0oVOnTrxyiuvADBp0iTOO+88cnJymDBhAgD33Xcfv/zlLwEoLCykR48e5OTkcPXVV/P5558DcNlll3HPPffQvXt3vve97/GXv/wl6vOxbNky+vTpw7XXXkvXrl0BuOKKK+jWrRsdO3bkmWeeKW/bpk0bSkpK2Lp1K506deLmm2+mY8eODBw4sHwQedSoUSxYsKC8/X333UfXrl3Jycnhk08+AWDPnj307t2b888/n9tvv53s7GxKSkqOiau0tJSmTZvy05/+lPPPP5++ffvy2Wef
lb8/d+5cunfvTocOHfjwww+B0P+hSy65hK5du9KtWzf++te/ArBr1y569epFbm4unTp1Km//5z//mZ49e3L++edzzTXX8MUXX0T9fYtWkIljNXCOmbUzs+8Qmn3vjUpt3gBGh18PB96Ox3wZQ7tm88iwzmQ3zcKA7KZZPDKsc1L8ZSo1l8ixq3HjxnHDDTewfv16rr/+eu68804Axo8fz/jx41m9ejWtW7eO+NkZM2Ywfvx4CgsLyc/Pp02bNkybNo2zzjqLwsJCpk+ffkz7mTNn8umnn7J27dry/VXnrbfeYujQoUCoDuYnP/kJ8+bNo6CggJtuuokpU6YA8OMf/5gZM2awYsUK6tU79g+pFStWMGvWLN5++22WLFnCli1bWLVqFYWFhRQUFPD+++/z1ltv0bp1a9atW8fGjRsZMGAA+/btY/78+WzatIn169dz7733fiu+G264gV/84hesX7+ezp07c//995e/V1payqpVq3jyySePWR+NlStX8uijj7JhwwYAZs2aRUFBAatXr+bxxx8vT1AVffzxx9x1111s2rSJrKys8mRRWcuWLVm7di233HILjz/+OABTp05lwIABrFmzhkGDBlFcXPnv4JD9+/fTo0cP1qxZQ8+ePXnwwQfL33N3Vq1axfTp03nggQcAaNWqFUuXLmXt2rXMmTOn/P/W7NmzueKKKygsLGTdunXk5OSwZ88epk2bxvLly1mzZg05OTn86le/iun7Fo3AuqrcvdTMxgGLgXrAc+6+ycweAPLd/Q3gWeBFM9tK6Erj2njFM7RrthJFmknk2NWKFSt47bXXAPjRj37E3XffXb6+7JfPyJEjy//irqhnz5489NBDFBUVMWzYMM4555xq97Vs2TLGjh1b3v3SvHnziO0mTpzI3XffzZ49e1i5ciUQ+sW4ceNG+vbtC4SuXlq1akVJSQkHDhzgoosuKo/1zTffLN9W3759y/ezZMkSlixZUv6X/MGDB9myZQuXXHIJEyZM4J577mHIkCFccskllJaW0rBhQ2655RYGDx7MkCFDjolx//79lJSUcOmllwIwevRoRowYUf7+sGHDAOjWrRs7duyo9vtSWc+ePWnbtm358hNPPMEbb4T+Ni0qKmLbtm3k5eUd85mzzz6bzp07H3efFeNatGgRAB988EF5Eh4yZAiNGzeO+Nn69euXH+OoUaMYOXJkxO2W7fs///kP48aNY926ddSvX59t27YBcMEFF3Dbbbdx6NAhhg4dSpcuXVi2bBkfffRR+Xn8+uuv6dWrVxTfrdgEWgDo7ouARZXWTa3w+hAwovLnkkGy9p3LUUGOXcVy89/IkSO58MILWbhwIf379+eZZ56hffv2VbZ396i2P336dIYNG8ZTTz3F6NGjKSgowN3p2LEjK1YcO0NypL++K2rUqNEx+588eTK33Xbbt9oVFBSwaNEiJk+eTL9+/Zg6dSqrVq1i+fLlzJ07l9/85je8/fbbx429zIknnghAvXr1KC0tjfpzlWNetmwZ77//PitXriQrK4tevXpFrGUo29/x9hkprmg7Qyqfu4rLkbb72GOPcfrppzN79mwOHz5c3t15+eWX8+6777Jw4UKuv/56Jk+ezEknncSAAQN48cUXo4qlpvSsqhpQ3UdqSOTY1UUXXcTcuXMBmDNnTvlfeT169OBPf/oTQPn7lW3fvp327dtz5513cuWVV7J+/XoaN25c5Z0+/fr1Y8aMGeW/WPbtq3rY74QTTmD8+PF88803LF68mA4dOrB3797yxHH48GE2bdpEs2bNaNy4cfmVSVWxQugupOeee46DBw8Cob72PXv2UFxczEknncSoUaOYMGECa9as4eDBg+zfv59Bgwbx5JNPfuuuoSZNmtCsWbPy8YsXX3yx/OqjLu3fv5/mzZuTlZXFpk2bWL16dZ3vo1evXrz66qsALFq0qMrzd/jw4fKr05deeum4VwT79++nVatWmBmzZs0qT1A7d+7ktNNOY8yYMdx4442sXbuWiy66iPfee4/t27cDoXGnurgpojI9cqQGqus711VH8ig7F3V9Zfjll1/Spk2b8uWf/exnPPXUU9x0001Mnz6dFi1a8Ic//AGAJ598klGjRvHYY48xePBgmjRp8q3tvfLKK8yePZsGDRpw2mmnMXXqVJo3b87FF19Mp06dGDhwIHfccUd5+1tuuYVPPvmEnJwcGjRowK233sq4ceOqjNfMuPfee3n00Ufp378/8+bN484772T//v2UlpZy11130bFjR5599lluvfVWGjVqxGWXXRYxVgglrs2bN9OzZ08gdHvy7Nmz2bp1KxMnTuSEE06gQYMG/P73v+fAgQNcddVVHDp0CHfniSee+Nb2Zs2axdixY/nyyy9p3759+feuLg0ePJiZM2fSpUsXvv/973PhhRfW+T7uv/9+Ro4cyZw5c7j88stp2bLlMVc9ZZo0acKaNWt4+OGHad68eflNBFUZN24cw4cP5+WXX6ZPnz7lVyXLly/n8ccfp0GDBuXnoGXLljz77LNcc801fP311wA8/PDDx+3+jFVazjmel5fn8ZzIqd2khRHvCTbg02mD47Zfgc2bN3PuuecGHUbUvvzyS7KysjAz5s6dy8svv8zrr78edFgRHTx4sLwbZNq0aezevTsuA6vp6tChQ9SvX5/69evzwQcfcNddd31rQrnS0lJOPfXUb91tlWiRfo7MrMDd86r4yDF0xVEDqvuQaBUUFDBu3DjcnaZNm/Lcc88FHVKVFi5cyCOPPEJpaSlnnHEGzz//fNAhpZQdO3Zw3XXXceTIEU488USefvrpoEOKG11x1EDZGEfF7qqsBvV0C28CpNoVh0gy0hVHAOLVdy7RifauIhH5trq4WFDiqCHVfQSjYcOGfPbZZ3q0ukgNlM3H0bBhw1ptR4lDUkqbNm0oKioi1gdZikhI2QyAtaHEkUAqGqy9Bg0a1GrmMhGpPSWOBKk8oF5WNAgoeYhISlHleIJosigRSRdKHAmiyaJEJF0ocSRIVcWBKhoUkVSjxJEgmixKRNKFBscTREWDIpIulDgSSEWDIpIOlDiSnGo/RCTZKHEkMdV+iEgy0uB4ElPth4gkIyWOJKbaDxFJRkocSUy1HyKSjAJJHGbW3MyWmtmW8L/Nqmh3xMwKw19vJDrOoKn2Q0SSUVBXHJOA5e5+DrA8vBzJV+6eG/66MnHhJYehXbN5ZFhnsptmYUB20yzNMigigQtk6lgz+xi4zN13m1kr4F13/9af0WZ20N1PjnX78Z46VkQk3aTC1LEt3X03QDh5fLeKdg3NLB8oBaa5+4KERZjCVPshIvEUt8RhZsuA0yK8NSWGzbR192Izaw+8bWYb3H1bFfsbA4wBaNu2bczxpgvVfohIvMUtcbh7n6reM7N/mlmrCl1Ve6rYRnH43+1m9i7QFYiYONx9JjATQl1VtQw/ZVVX+6HEISJ1IajB8TeA0eHXo4HXKzcws2ZmdmL49anAxcBHCYswRan2Q0TiLajEMQ3oa2ZbgL7hZcwsz8yeCbc5F8g3s3XAO4TGOJQ4jkO
1HyISb4EMjrv7Z0DvCOvzgVvCrz8EOic4tJQ3sX+HY8Y4QLUfIlK39JDDNKN5P0Qk3pQ40pDm/RCReFLiENV9iEhMlDgynOo+RCRWejpuhtOcHyISKyWODKe6DxGJlRJHhlPdh4jESokjw2nODxGJlQbHM5zqPkQkVkocoroPEYmJuqpERCQmuuKQGlHRoEjmUuKQmKloUCSzqatKYqaiQZHMpsQhMVPRoEhmU+KQmKloUCSzKXFIzFQ0KJLZNDguMVPRoEhmU+KQGlHRoEjmUuKQhFHth0h6UOKQhFDth0j60OC4JIRqP0TSRyCJw8xGmNkmM/vGzPKqaTfAzD42s61mNimRMUrdUu2HSPoI6opjIzAMeL+qBmZWD/gtMBA4D7jOzM5LTHhS11T7IZI+Akkc7r7Z3Y/XR9Ed2Oru2939a2AucFX8o5N4UO2HSPpI5jGObOAfFZaLwusiMrMxZpZvZvl79+6Ne3ASm6Fds3lkWGeym2ZhQHbTLB4Z1lkD4yIpKG53VZnZMuC0CG9NcffXo9lEhHVeVWN3nwnMBMjLy6uynQRHtR8i6SFuicPd+9RyE0XA6RWW2wDFtdympBDVfYgkp2Su41gNnGNm7YBdwLXAyGBDkkRR3YdI8grqdtyrzawI6AksNLPF4fWtzWwRgLuXAuOAxcBm4FV33xREvJJ4qvsQSV6BXHG4+3xgfoT1xcCgCsuLgEUJDE2ShOo+RJJXMt9VJRlMdR8iyUuJQ5KS6j5EklcyD45LBtOcHyLJS4lDkpbqPkSSkxKHpBXVfojEnxKHpA3VfogkhgbHJW2o9kMkMZQ4JG2o9kMkMZQ4JG2o9kMkMZQ4JG2o9kMkMTQ4LmlDtR8iiaHEIWlFtR8i8aeuKhERiYmuOCTjqWhQJDZKHJLRVDQoEjt1VUlGU9GgSOyUOCSjqWhQJHZKHJLRVDQoEjslDsloKhoUiZ0GxyWjqWhQJHZKHJLxVDQoEptAEoeZjQDuA84Furt7fhXtdgAHgCNAqbvnJSpGkaqo7kMyXVBXHBuBYcDTUbT9obv/K87xiERFdR8iAQ2Ou/tmd9eN8pJyVPchEmXiCHctHXddHDiwxMwKzGxMdQ3NbIyZ5ZtZ/t69exMQmmQi1X2IRH/FMTnKdeXMbJmZbYzwdVUM8V3s7ucDA4E7zOwHVTV095nunufueS1atIhhFyLRU92HyHHGOMxsIDAIyDazpyq89b+A0uo+6+59ahucuxeH/91jZvOB7sD7td2uSE1N7N/hmDEOUN2HZJ7jDY4XA/nAlUBBhfUHgJ/GKygAM2sEnODuB8Kv+wEPxHOfIsejug8RMHc/fiOzBu5+OPy6GXC6u6+v8U7NrgZ+DbQASoBCd+9vZq2BZ9x9kJm1B+aHP1IfeMndH4pm+3l5eZ6fH/EOXxERicDMCqIteYj2dtylZnZluH0hsNfM3nP3n9UkQHefz9GkUHF9MaGuMdx9O9ClJtsXSTaq/ZB0Eu3geBN3/zeh2os/uHs3oNZjGCKZoKz2Y1fJVzhHaz8WrN0VdGgiNRJt4qhvZq2A/wLejGM8ImlHtR+SbqJNHA8Ai4Ft7r46PP6wJX5hiaQP1X5IuolqjMPd/wj8scLyduB/xysokXTSumkWuyIkCdV+SKqKtnK8jZnNN7M9ZvZPM/uTmbWJd3Ai6UBzfki6ibar6g/AG0BrIBv4n/A6ETmOoV2zeWRYZ7KbZmFAdtMsHhnWWXdVScqKto6j0N1zj7cuWaiOQ0QkNvGo4/iXmY0CXg4vXwd8VpPgRCQ6qv2QZBVtV9VNhG7F/X/AbmA48ON4BSWS6VT7Icks2sTxIDDa3Vu4+3cJJZL74haVSIZT7Ycks2gTR467f1624O77gK7xCUlEVPshySzaxHFC+OGGAJhZc4KbdlYk7WneD0lm0SaOx4APzexBM3sA+BB4NH5hiWQ21X5IMou2cvwFM8sHLgcMGObuH8U1MpEMpnk/JJlF3d0UThRKFiIJMrRrthKFJKVou6pEREQADXCLpA0VDEqiKHGIpIGygsGy2o+ygkFAyUPqnLqqRNKACgYlkZQ4RNKACgYlkZQ4RNKACgYlkQJJHGY23cz+ZmbrwxNENa2i3QAz+9jMtprZpETHKZIqVDAoiRTUFcdSoJO75wCfAJMrNzCzesBvgYHAecB1ZnZeQqMUSRGaLEoSKZC7qtx9SYXFlYQe015Zd2BreH5zzGwucBUqQhSJSAWDkijJcDvuTcArEdZnA/+osFwEXJiQiEQyhGo/pCbiljjMbBlwWoS3prj76+E2U4BSYE6kTURYV+U8t2Y2BhgD0LZt25jjFck0qv2Qmopb4nD3PtW9b2ajgSFAb4888XkRcHqF5TZAcTX7mwnMhNCc4zEHLJJhqqv9UOKQ6gR1V9UA4B7gSnf/sopmq4FzzKydmX0HuBZ4I1ExiqQ71X5ITQV1V9VvgMbAUjMrNLMZAGbW2swWAbh7KTAOWAxsBl51900BxSuSdlT7ITUV1F1VZ1exvhgYVGF5EbAoUXGJZJKJ/TscM8YBqv2Q6CTDXVUiEgBNFiU1pcQhksFU+yE1ocQhIjFR7YcocYhI1FT7IaCn44pIDDTvh4ASh4jEQLUfAkocIhID1X4IKHGISAw074eABsdFJAaq/RBQ4hCRGKn2Q5Q4RCSuVPeRfpQ4RCRuVPeRnjQ4LiJxo7qP9KTEISJxo7qP9KTEISJxo7qP9KTEISJxo7qP9KTBcRGJG9V9pCclDhGJK9V9pB8lDhFJOqr9SG5KHCKSVFT7kfw0OC4iSUW1H8lPiUNEkopqP5JfIInDzKab2d/MbL2ZzTezplW022FmG8ys0MzyEx2niCSeaj+SX1BXHEuBTu6eA3wCTK6m7Q/dPdfd8xITmogESbUfyS+QxOHuS9y9NLy4EmgTRBwiknyGds3mkWGdyW6ahQHZTbN4ZFhnDYwnEXP3YAMw+x/gFXefHeG9T4HPAQeedveZ1WxnDDAGoG3btt127twZp4hFRNKPmRVE27MTt9txzWwZcFqEt6a4++vhNlOAUmBOFZu52N2Lzey7wFIz+5u7vx+pYTipzATIy8sLNhuKiKSxuCUOd+9T3ftmNhoYAvT2Ki573L04/O8eM5sPdAciJg4RyWwqGkycoO6qGgDcA1zp7l9W0aaRmTUuew30AzYmLkoRSRVlRYO7Sr7COVo0uGDtrqBDS0tB3VX1G6Axoe6nQjObAWBmrc1sUbhNS+ADM1sHrAIWuvtbwYQrIslMRYOJFcgjR9z97CrWFwODwq+3A10SGZeIpCYVDSaWKsdFJOWpaDCxlDhEJOWpaDCx9HRcEUl5mjAqsZQ4RCQtaMKoxFHiEJGMpLqPmlPiEJGMo8miakeD4yKScVT3UTtKHCKScVT3UTtKHCKScVT3UTtKHCKScVT3UTsaHBeRjKO6j9pR4hCRjKS6j5pT4hARiZJqP0KUOEREoqDaj6M0OC4iEgXVfhylxCEiEgXVfhylxCEiEgXVfhylxCEiEgXVfhylwXERkSio9uMoJQ4RkSip9iNEiUNEJI
7SsfYjsDEOM3vQzNabWaGZLTGz1lW0G21mW8JfoxMdp4hITZXVfuwq+QrnaO3HgrW7gg6tVoIcHJ/u7jnungu8CUyt3MDMmgM/By4EugM/N7NmiQ1TRKRm0rX2I7DE4e7/rrDYCPAIzfoDS919n7t/DiwFBiQiPhGR2krX2o9AxzjM7CHgBmA/8MMITbKBf1RYLgqvi7StMcAYgLZt29ZtoCIiNdC6aRa7IiSJVK/9iOsVh5ktM7ONEb6uAnD3Ke5+OjAHGBdpExHWRboywd1nunueu+e1aNGi7g5CRKSG0rX2I65XHO7eJ8qmLwELCY1nVFQEXFZhuQ3wbq0DExFJgHSt/Qisq8rMznH3LeHFK4G/RWi2GHi4woB4P2ByIuITEakL6Vj7EeQYxzQz6wB8A+wExgKYWR4w1t1vcfd9ZvYgsDr8mQfcfV8w4YqICIC5RxwySGl5eXmen58fdBgiIjELqmDQzArcPS+atqocFxFJEqkyWZSejisikiRSpWBQiUNEJEmkSsGgEoeISJJIlcmilDhERJJEqhQManBcRCRJpErBoBKHiEgSSYWCQSUOEZEUl+jaDyUOEZEUFkTthwbHRURSWBC1H0ocIiIpLIjaDyUOEZEUFkTthxKHiEgKC6L2Q4PjIiIpLIjaDyUOEZEUl+jaD3VViYhITJQ4REQkJkocIiISEyUOERGJiRKHiIjExNw96BjqnJntBXbW8OOnAv+qw3BSSSYfO2T28evYM1fZ8Z/h7i2i+UBaJo7aMLN8d88LOo4gZPKxQ2Yfv449M48danb86qoSEZGYKHGIiEhMlDi+bWbQAQQok48dMvv4deyZK+bj1xiHiIjERFccIiISEyUOERGJiRJHmJkNMLOPzWyrmU0KOp5EM7MdZrbBzArNLD/oeOLJzJ4zsz1mtrHCuuZmttTMtoT/bRZkjPFUxfHfZ2a7wue/0MwGBRljvJjZ6Wb2jpltNrNNZjY+vD7tz381xx7zudcYB2Bm9YBPgL5AEbAauM7dPwo0sAQysx1AnrunfSGUmf0AOAi84O6dwuseBfa5+7TwHw7N3P2eIOOMlyqO/z7goLv/MsjY4s3MWgGt3H2NmTUGCoChwI2k+fmv5tj/ixjPva44QroDW919u7t/DcwFrgo4JokTd38f2Fdp9VXArPDrWYR+oNJSFcefEdx9t7uvCb8+AGwGssmA81/NscdMiSMkG/hHheUiavgNTWEOLDGzAjMbE3QwAWjp7rsh9AMGfDfgeIIwzszWh7uy0q6rpjIzOxPoCvyVDDv/lY4dYjz3ShwhFmFdpvXhXezu5wMDgTvC3RmSOX4PnAXkAruBx4INJ77M7GTgT8Bd7v7voONJpAjHHvO5V+IIKQJOr7DcBigOKJZAuHtx+N89wHxC3XeZ5J/hPuCyvuA9AceTUO7+T3c/4u7fAP9NGp9/M2tA6BfnHHd/Lbw6I85/pGOvyblX4ghZDZxjZu3M7DvAtcAbAceUMGbWKDxYhpk1AvoBG6v/VNp5Axgdfj0aeD3AWBKu7Jdm2NWk6fk3MwOeBTa7++MV3kr781/Vsdfk3OuuqrDwLWhPAvWA59z9oYBDShgza0/oKgOgPvBSOh+/mb0MXEbocdL/BH4OLABeBdoCfwdGuHtaDiBXcfyXEeqqcGAHcFtZn386MbNewF+ADcA34dX/h1Bff1qf/2qO/TpiPPdKHCIiEhN1VYmISEyUOEREJCZKHCIiEhMlDhERiYkSh4iIxESJQySJmNllZvZm0HGIVEeJQ0REYqLEIVIDZjbKzFaF5y942szqmdnHw4ZxAAABkklEQVRBM3vMzNaY2XIzaxFum2tmK8MPkZtf9hA5MzvbzJaZ2brwZ84Kb/5kM5tnZn8zsznhil+RpKHEIRIjMzsXuIbQgyFzgSPA9UAjYE34YZHvEarIBngBuMfdcwhV7ZatnwP81t27ABcResAchJ5aehdwHtAeuDjuByUSg/pBByCSgnoD3YDV4YuBLEIPxfsGeCXcZjbwmpk1AZq6+3vh9bOAP4afDZbt7vMB3P0QQHh7q9y9KLxcCJwJfBD/wxKJjhKHSOwMmOXuk49ZafZ/K7Wr7nk+1XU//afC6yPo51SSjLqqRGK3HBhuZt+F8vmqzyD08zQ83GYk8IG77wc+N7NLwut/BLwXngehyMyGhrdxopmdlNCjEKkh/SUjEiN3/8jM7iU0Y+IJwGHgDuALoKOZFQD7CY2DQOgx3TPCiWE78OPw+h8BT5vZA+FtjEjgYYjUmJ6OK1JHzOygu58cdBwi8aauKhERiYmuOEREJCa64hARkZgocYiISEyUOEREJCZKHCIiEhMlDhERicn/B1Wje2YNV+NyAAAAAElFTkSuQmCC\n", 188 | "text/plain": [ 189 | "
" 190 | ] 191 | }, 192 | "metadata": { 193 | "needs_background": "light" 194 | }, 195 | "output_type": "display_data" 196 | } 197 | ], 198 | "source": [ 199 | "# plotting\n", 200 | "plt.plot(range(len(loss_history)), loss_history, 'o', label='Logistic Regression Training phase')\n", 201 | "plt.ylabel('cost')\n", 202 | "plt.xlabel('epoch')\n", 203 | "plt.legend()\n", 204 | "plt.show()" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "## Your Turn\n", 212 | "\n", 213 | "Please switch to the **Theano** backend and **restart** the notebook.\n", 214 | "\n", 215 | "You _should_ see no difference in the execution!\n", 216 | "\n", 217 | "**Reminder**: please keep in mind that you *can* execute shell commands from a notebook (pre-pending a `!` sign).\n", 218 | "Thus:\n", 219 | "\n", 220 | "```shell\n", 221 | " !cat ~/.keras/keras.json\n", 222 | "```\n", 223 | "should show you the content of your keras configuration file." 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": { 229 | "collapsed": true 230 | }, 231 | "source": [ 232 | "### Moreover\n", 233 | "\n", 234 | "Try to play a bit with the **learning reate** parameter to see how the loss history floats... " 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "---" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "## Exercise: Linear Regression\n", 249 | "To get familiar with automatic differentiation, we start by learning a simple linear regression model using Stochastic Gradient Descent (SGD).\n", 250 | "\n", 251 | "Recall that given a dataset $\\{(x_i, y_i)\\}_{i=0}^N$, with $x_i, y_i \\in \\mathbb{R}$, the objective of linear regression is to find two scalars $w$ and $b$ such that $y = w\\cdot x + b$ fits the dataset. In this tutorial we will learn $w$ and $b$ using SGD and a Mean Square Error (MSE) loss:\n", 252 | "\n", 253 | "$$\\mathcal{l} = \\frac{1}{N} \\sum_{i=0}^N (w\\cdot x_i + b - y_i)^2$$\n", 254 | "\n", 255 | "Starting from random values, parameters $w$ and $b$ will be updated at each iteration via the following rule:\n", 256 | "\n", 257 | "$$w_t = w_{t-1} - \\eta \\frac{\\partial \\mathcal{l}}{\\partial w}$$\n", 258 | "
\n", 259 | "$$b_t = b_{t-1} - \\eta \\frac{\\partial \\mathcal{l}}{\\partial b}$$\n", 260 | "\n", 261 | "where $\\eta$ is the learning rate.\n", 262 | "\n", 263 | "**NOTE:** Recall that **linear regression** is indeed a **simple neuron** with a linear activation function!!" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "### Definition: Placeholders and Variables" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "First of all, we define the necessary variables and placeholders for our computational graph. Variables maintain state across executions of the computational graph, while placeholders are ways to feed the graph with external data.\n", 278 | "\n", 279 | "For the linear regression example, we need three variables: `w`, `b`, and the learning rate for SGD, `lr`. \n", 280 | "\n", 281 | "Two placeholders `x` and `target` are created to store $x_i$ and $y_i$ values." 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 13, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "# Placeholders and variables\n", 291 | "x = K.placeholder()\n", 292 | "target = K.placeholder()\n", 293 | "w = K.variable(np.random.rand())\n", 294 | "b = K.variable(np.random.rand())" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "#### Notes:\n", 302 | "\n", 303 | "In case you're wondering what's the difference between a **placeholder** and a **variable**, in short:\n", 304 | "\n", 305 | "* Use `K.variable()` for trainable variables such as weights (`W`) and biases (`b`) for your model.\n", 306 | "* Use `K.placeholder()` to feed actual data (e.g. training examples)" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": { 312 | "collapsed": true 313 | }, 314 | "source": [ 315 | "## Model definition\n", 316 | "Now we can define the $y = w\\cdot x + b$ relation as well as the MSE loss in the computational graph." 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": { 323 | "collapsed": true 324 | }, 325 | "outputs": [], 326 | "source": [ 327 | "# Define model and loss" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": { 334 | "collapsed": true 335 | }, 336 | "outputs": [], 337 | "source": [ 338 | "# %load ../solutions/sol_2311.py" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "Then, given the gradient of MSE wrt to `w` and `b`, we can define how we update the parameters via SGD:" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "# %load ../solutions/sol_2312.py" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "The whole model can be encapsulated in a `function`, which takes as input `x` and `target`, returns the current loss value and updates its parameter according to `updates`." 
362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": { 368 | "collapsed": true 369 | }, 370 | "outputs": [], 371 | "source": [ 372 | "train = K.function(inputs=[x, target], outputs=[loss], updates=updates)" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": [ 379 | "## Training\n", 380 | "Training is now just a matter of calling the `function` we have just defined. Each time `train` is called, indeed, `w` and `b` will be updated using the SGD rule.\n", 381 | "\n", 382 | "Having generated some random training data, we will feed the `train` function for several epochs and observe the values of `w`, `b`, and loss." 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": { 389 | "collapsed": true 390 | }, 391 | "outputs": [], 392 | "source": [ 393 | "# Generate data\n", 394 | "np_x = np.random.rand(1000)\n", 395 | "np_target = 0.96*np_x + 0.24" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": { 402 | "collapsed": true 403 | }, 404 | "outputs": [], 405 | "source": [ 406 | "# Training\n", 407 | "loss_history = []\n", 408 | "for epoch in range(200):\n", 409 | " current_loss = train([np_x, np_target])[0]\n", 410 | " loss_history.append(current_loss)\n", 411 | " if epoch % 20 == 0:\n", 412 | " print(\"Loss: %.03f, w, b: [%.02f, %.02f]\" % (current_loss, K.eval(w), K.eval(b)))" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "We can also plot the loss history:" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": { 426 | "collapsed": true 427 | }, 428 | "outputs": [], 429 | "source": [ 430 | "# Plot loss history" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": null, 436 | "metadata": { 437 | "collapsed": true 438 | }, 439 | "outputs": [], 440 | "source": [ 441 | "# %load ../solutions/sol_2313.py" 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": {}, 447 | "source": [ 448 | "### Final Note:\n", 449 | "\n", 450 | "Please switch back your backend to `tensorflow` before moving on. It may be useful for next notebooks !-)" 451 | ] 452 | } 453 | ], 454 | "metadata": { 455 | "kernelspec": { 456 | "display_name": "Python 3.6 (DL Keras TF)", 457 | "language": "python", 458 | "name": "dl-keras-tf" 459 | }, 460 | "language_info": { 461 | "codemirror_mode": { 462 | "name": "ipython", 463 | "version": 3 464 | }, 465 | "file_extension": ".py", 466 | "mimetype": "text/x-python", 467 | "name": "python", 468 | "nbconvert_exporter": "python", 469 | "pygments_lexer": "ipython3", 470 | "version": "3.6.6" 471 | } 472 | }, 473 | "nbformat": 4, 474 | "nbformat_minor": 1 475 | } 476 | --------------------------------------------------------------------------------