├── 0. Basics
    ├── .ipynb_checkpoints
    │   └── 0. Basics 4. Jupyter notebook magics, shell and R-checkpoint.ipynb
    ├── 0. Basics 0. Jupyter notebook.ipynb
    ├── 0. Basics 1. Python.ipynb
    ├── 0. Basics 2. Numpy.ipynb
    ├── 0. Basics 3. Pandas.ipynb
    ├── 0. Basics 4. Jupyter notebook magics, shell and R.ipynb
    ├── 0. Basics 5. Exercise 0.ipynb
    ├── 0. Basics 5. Exercise help.ipynb
    ├── 0. Basics 6. Some data.zip
    ├── Numpy_Python_Cheat_Sheet.pdf
    └── PandasPythonForDataScience.pdf
├── 1. In and Export
    ├── 1. import and export 0. Object Storage.ipynb
    ├── 1. import and export 1. Download and upload.ipynb
    ├── 1. import and export 2. DashDB.ipynb
    ├── 1. import and export 3. Cloudant.ipynb
    ├── 1. import and export 4. Twitter.ipynb
    └── 1. import and export 5. BigInsights.ipynb
├── 2. Watson APIs
    ├── 2. Watson 0. Weather API.ipynb
    ├── 2. Watson 1. Personality Insights.ipynb
    ├── 2. Watson 2. Alchemy News.ipynb
    ├── 2. Watson 3. Alchemy language.ipynb
    ├── 2. Watson 4. Tone analyzer.ipynb
    └── 2. Watson 5. Natural language classifier.ipynb
├── 3. Visualization
    ├── 3. Visualization 0. Matplotlib.ipynb
    ├── 3. Visualization 1. Machine learning techniques.ipynb
    ├── 3. Visualization 2. Pixiedust.ipynb
    ├── 3. Visualization 3. Bokeh.ipynb
    ├── Python_Bokeh_Cheat_Sheet.pdf
    └── Python_Matplotlib_Cheat_Sheet.pdf
├── 4. Spark
    ├── 4. Spark 0. rdd-creation.ipynb
    ├── 4. Spark 1. rdd-basics.ipynb
    ├── 4. Spark 2. rdd-sampling.ipynb
    ├── 4. Spark 3. rdd-set.ipynb
    ├── 4. Spark 4. rdd-aggregations.ipynb
    ├── 4. Spark 5. rdd-key-value.ipynb
    ├── 4. Spark 6. mllib-statistics.ipynb
    ├── 4. Spark 7. mllib-logit.ipynb
    ├── 4. Spark 8. mllib-trees.ipynb
    ├── 4. Spark 9. sql-dataframes.ipynb
    ├── LICENSE
    └── README.md
├── 5. Machine Learning
    ├── 5. ML 0. Install requirements.ipynb
    ├── 5. ML 1. Introduction.ipynb
    ├── 5. ML 2. Data preparation.ipynb
    ├── 5. ML 3. Scikit Learn interface.ipynb
    ├── 5. ML 4. Bias and variance.ipynb
    ├── 5. ML 5. Model evaluation.ipynb
    ├── 5. ML 6. Ensemble methods.ipynb
    ├── 5. ML 7. Ensemble methods advanced.ipynb
    ├── 5. ML 8. Multi Model Ensembles.ipynb
    ├── 5. ML 9. Time series.ipynb
    └── Scikit_Learn_Cheat_Sheet_Python.pdf
├── 6. Deep Learning
    ├── .ipynb_checkpoints
    │   └── 6. DL 2. Convolutional networks-checkpoint.ipynb
    ├── 6. DL 0. Keras starter kit.ipynb
    ├── 6. DL 1. Fun with activation functions.ipynb
    ├── 6. DL 2. Convolutional networks.ipynb
    ├── 6. DL 3. Embedding.ipynb
    ├── 6. DL 4. Multi-input models.ipynb
    ├── 6. DL 5. Auto encoder.ipynb
    ├── 6. DL 6. Recurrent networks.ipynb
    ├── Keras.js Demos.url
    └── Keras_Cheat_Sheet_Python.pdf
├── 7. Misc
    ├── Training Dataset.arff
    ├── bias and variance.png
    ├── biasvariance.py
    ├── ensemble_explore_hastie.png
    ├── international-airline-passengers.csv
    ├── learning_curves.png
    ├── matlab_test_data_01.mat
    ├── moon phases.xlsx
    ├── test.csv
    ├── train.csv
    ├── tree.png
    └── weather.txt
├── Class intro.pptx
├── Putting data to work.pptx
├── README.md
├── agenda voting.ipynb
├── agenda.txt
└── other
    ├── Data Generation.ipynb
    ├── HELPDESK - DATA IMPORT.ipynb
    ├── Lecture-4-Matplotlib.ipynb
    ├── Lime.ipynb
    ├── PandasCheatSheet.ipynb
    ├── SF Usage 1B records.ipynb
    ├── Scikit-learn models.ipynb
    ├── recharge model.ipynb
    ├── things_in_pandas.ipynb
    └── tmp.ipynb


/0. Basics/0. Basics 0. Jupyter notebook.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "nbformat": 4, 
  3 |     "nbformat_minor": 0, 
  4 |     "cells": [
  5 |         {
  6 |             "source": "# Jupyter notebook basics", 
  7 |             "cell_type": "markdown", 
  8 |             "metadata": {
  9 |                 "collapsed": true
 10 |             }
 11 |         }, 
 12 |         {
 13 |             "source": "Notebook documents (or \u201cnotebooks\u201d, all lower case) are documents produced by the Jupyter Notebook App, which contain both computer code (e.g. python) and rich text elements (paragraph, equations, figures, links, etc...). \n\nNotebook documents are both human-readable documents containing the analysis description and the results (figures, tables, etc..) as well as executable documents which can be run to perform data analysis.", 
 14 |             "cell_type": "markdown", 
 15 |             "metadata": {}
 16 |         }, 
 17 |         {
 18 |             "source": "The Jupyter Notebook App is a server-client application that allows editing and running notebook documents via a web browser. The IBM version of Jupyter Notebook App is installed on a remote server and accessed through the internet.", 
 19 |             "cell_type": "markdown", 
 20 |             "metadata": {}
 21 |         }, 
 22 |         {
 23 |             "source": "A notebook kernel is a \u201ccomputational engine\u201d that executes the code contained in a Notebook document. The ipython kernel, referenced in this guide, executes python code. Kernels for many other languages exist (check the Kernels menu above).\n\nWhen you open a Notebook document, the associated kernel is automatically launched. When the notebook is executed (either cell-by-cell or with menu Cell -> Run All), the kernel performs the computation and produces the results. Depending on the type of computations, the kernel may consume significant CPU and RAM. Note that the RAM is not released until the kernel is shut-down", 
 24 |             "cell_type": "markdown", 
 25 |             "metadata": {}
 26 |         }, 
 27 |         {
 28 |             "source": "# Useful shortcuts in Jupter:\n\nctrl-enter: execute the active cell and stay in that cell\nshift-enter: execute the active cell and move to the next cell\n\nTry it out in the next cell:", 
 29 |             "cell_type": "markdown", 
 30 |             "metadata": {}
 31 |         }, 
 32 |         {
 33 |             "execution_count": 1, 
 34 |             "metadata": {
 35 |                 "collapsed": false, 
 36 |                 "scrolled": true
 37 |             }, 
 38 |             "outputs": [
 39 |                 {
 40 |                     "text": "Hello world\n", 
 41 |                     "name": "stdout", 
 42 |                     "output_type": "stream"
 43 |                 }
 44 |             ], 
 45 |             "cell_type": "code", 
 46 |             "source": "print (\"Hello world\")"
 47 |         }, 
 48 |         {
 49 |             "execution_count": 3, 
 50 |             "metadata": {
 51 |                 "collapsed": false, 
 52 |                 "scrolled": true
 53 |             }, 
 54 |             "outputs": [
 55 |                 {
 56 |                     "text": "Second line\n", 
 57 |                     "name": "stdout", 
 58 |                     "output_type": "stream"
 59 |                 }
 60 |             ], 
 61 |             "cell_type": "code", 
 62 |             "source": "print (\"Second line\")"
 63 |         }, 
 64 |         {
 65 |             "source": "# Inserting new cells\n\nwhen a cell is selected in blue (click in the margin to the left of the cell), it shows a blue surrounding box.\n\nType: \na (above) to create a new empty cell above the currently active cell\nb (below) to create a new empty cell below the currently active cell\n\nTry it out.", 
 66 |             "cell_type": "markdown", 
 67 |             "metadata": {}
 68 |         }, 
 69 |         {
 70 |             "source": "# markup code\n\nThe shorcut m (with the blue selection) changes the cell from computation to markdown. This allows to create rich text elements to document the code.", 
 71 |             "cell_type": "markdown", 
 72 |             "metadata": {}
 73 |         }, 
 74 |         {
 75 |             "execution_count": null, 
 76 |             "metadata": {
 77 |                 "collapsed": true
 78 |             }, 
 79 |             "outputs": [], 
 80 |             "cell_type": "code", 
 81 |             "source": "Try it out: make this text part of the markdown. "
 82 |         }, 
 83 |         {
 84 |             "execution_count": null, 
 85 |             "metadata": {
 86 |                 "collapsed": true
 87 |             }, 
 88 |             "outputs": [], 
 89 |             "cell_type": "code", 
 90 |             "source": "# Heading 1\n## Heading 2\n### Heading 3\n#### Heading 4\n\n**bold**\n\n*italics*\n\nempty line is a paragraph\n\nthis is a new parapgraph"
 91 |         }, 
 92 |         {
 93 |             "source": "# Heading 1\n## Heading 2\n### Heading 3\n#### Heading 4\n\n**bold**\n\n*italics*\n\nempty line is a paragraph\n\nthis is a new parapgraph", 
 94 |             "cell_type": "markdown", 
 95 |             "metadata": {}
 96 |         }, 
 97 |         {
 98 |             "source": "# other shotcuts\n\nThe shorcut h (with the blue selection) shows other shortcuts\nThe shortcut o (with the blue selection) will hide long output.", 
 99 |             "cell_type": "markdown", 
100 |             "metadata": {}
101 |         }, 
102 |         {
103 |             "execution_count": 5, 
104 |             "metadata": {
105 |                 "collapsed": false, 
106 |                 "scrolled": true
107 |             }, 
108 |             "outputs": [
109 |                 {
110 |                     "text": "Hello world\nHello world\nHello world\nHello world\nHello world\nHello world\nHello world\nHello world\nHello world\nHello world\n", 
111 |                     "name": "stdout", 
112 |                     "output_type": "stream"
113 |                 }
114 |             ], 
115 |             "cell_type": "code", 
116 |             "source": "# Try toggling the output\nfor i in range(10):\n    print (\"Hello world\")"
117 |         }, 
118 |         {
119 |             "source": "Help on objects:\nobj?, obj??      : Get help, or more help for object (also works as\n                   ?obj, ??obj).\n?foo.*abc*       : List names in 'foo' containing 'abc' in them.", 
120 |             "cell_type": "markdown", 
121 |             "metadata": {}
122 |         }, 
123 |         {
124 |             "execution_count": 15, 
125 |             "metadata": {
126 |                 "collapsed": false
127 |             }, 
128 |             "outputs": [], 
129 |             "cell_type": "code", 
130 |             "source": "#try it out\n?dict"
131 |         }
132 |     ], 
133 |     "metadata": {
134 |         "language_info": {
135 |             "nbconvert_exporter": "python", 
136 |             "file_extension": ".py", 
137 |             "codemirror_mode": {
138 |                 "name": "ipython", 
139 |                 "version": 2
140 |             }, 
141 |             "pygments_lexer": "ipython2", 
142 |             "version": "2.7.11", 
143 |             "mimetype": "text/x-python", 
144 |             "name": "python"
145 |         }, 
146 |         "kernelspec": {
147 |             "language": "python", 
148 |             "display_name": "Python 2 with Spark 1.6", 
149 |             "name": "python2"
150 |         }
151 |     }
152 | }


--------------------------------------------------------------------------------
/0. Basics/0. Basics 5. Exercise 0.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {},
 6 |    "source": [
 7 |     "# Moon phases"
 8 |    ]
 9 |   },
10 |   {
11 |    "cell_type": "markdown",
12 |    "metadata": {
13 |     "collapsed": false,
14 |     "scrolled": false
15 |    },
16 |    "source": [
17 |     "# This site contains the moon phases \n",
18 |     "### http://aa.usno.navy.mil/data/docs/MoonFraction.php\n",
19 |     "    \n",
20 |     "# Download this data and convert in the appropriate format (think about what appropriate means)    \n",
21 |     "### http://aa.usno.navy.mil/cgi-bin/aa_moonill2.pl?form=1&year=2017&task=00&tz=-05"
22 |    ]
23 |   }
24 |  ],
25 |  "metadata": {
26 |   "anaconda-cloud": {},
27 |   "kernelspec": {
28 |    "display_name": "Python [conda root]",
29 |    "language": "python",
30 |    "name": "conda-root-py"
31 |   },
32 |   "language_info": {
33 |    "codemirror_mode": {
34 |     "name": "ipython",
35 |     "version": 2
36 |    },
37 |    "file_extension": ".py",
38 |    "mimetype": "text/x-python",
39 |    "name": "python",
40 |    "nbconvert_exporter": "python",
41 |    "pygments_lexer": "ipython2",
42 |    "version": "2.7.12"
43 |   }
44 |  },
45 |  "nbformat": 4,
46 |  "nbformat_minor": 1
47 | }
48 | 


--------------------------------------------------------------------------------
/0. Basics/0. Basics 6. Some data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/0. Basics/0. Basics 6. Some data.zip


--------------------------------------------------------------------------------
/0. Basics/Numpy_Python_Cheat_Sheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/0. Basics/Numpy_Python_Cheat_Sheet.pdf


--------------------------------------------------------------------------------
/0. Basics/PandasPythonForDataScience.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/0. Basics/PandasPythonForDataScience.pdf


--------------------------------------------------------------------------------
/1. In and Export/1. import and export 0. Object Storage.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "cells": [
  3 |         {
  4 |             "cell_type": "markdown", 
  5 |             "metadata": {}, 
  6 |             "source": "# In-and export from Object storage"
  7 |         }, 
  8 |         {
  9 |             "cell_type": "code", 
 10 |             "metadata": {
 11 |                 "collapsed": true
 12 |             }, 
 13 |             "outputs": [], 
 14 |             "source": "#imports\nfrom io import StringIO\nimport requests\nimport json\n\nimport pandas as pd\nimport numpy as np", 
 15 |             "execution_count": 2
 16 |         }, 
 17 |         {
 18 |             "cell_type": "markdown", 
 19 |             "metadata": {}, 
 20 |             "source": "Credentials"
 21 |         }, 
 22 |         {
 23 |             "cell_type": "code", 
 24 |             "metadata": {
 25 |                 "collapsed": true
 26 |             }, 
 27 |             "outputs": [], 
 28 |             "source": "@hidden_cell\n\ncredentials_1= {\n  \"auth_url\": \"https://identity.open.softlayer.com\",\n  \"project\": \"object_storage_effacaaa_ad08_4ee4_bd59_b2e105bc9639\",\n  \"projectId\": \"e96165fa44c44a7d956507ebf4026cfb\",\n  \"region\": \"dallas\",\n  \"userId\": \"5002ddd00432452d8dd086e3f74ed3f1\",\n  \"username\": \"admin_bdffbeee0797e72fe86a4270ec23774a835cbd4b\",\n  \"password\": \"TAu!-k36VR[&xp9V\",\n  \"domainId\": \"193a321481be4f73b08e76a87e7d585a\",\n  \"container\":\"DSETraining101ObjectStorage\",\n  \"domainName\": \"1123181\",\n  \"role\": \"admin\"\n}\n", 
 29 |             "execution_count": 3
 30 |         }, 
 31 |         {
 32 |             "cell_type": "code", 
 33 |             "metadata": {
 34 |                 "collapsed": false
 35 |             }, 
 36 |             "outputs": [
 37 |                 {
 38 |                     "metadata": {}, 
 39 |                     "data": {
 40 |                         "text/plain": "         0         1         2         3         4         5         6   \\\n0  0.218387  0.681619  0.127457  0.176184  0.208149  0.431590  0.436492   \n1  0.438189  0.477055  0.292263  0.737119  0.986888  0.571864  0.952132   \n2  0.246476  0.425691  0.344394  0.302752  0.592942  0.350859  0.620692   \n3  0.303911  0.326860  0.147415  0.208054  0.371225  0.910451  0.544563   \n4  0.885194  0.156615  0.795850  0.208292  0.658539  0.738021  0.750869   \n\n         7         8         9     ...           90        91        92  \\\n0  0.346246  0.632588  0.829314    ...     0.432520  0.974076  0.894935   \n1  0.589837  0.010117  0.174665    ...     0.301537  0.552984  0.879647   \n2  0.516486  0.033306  0.304789    ...     0.131419  0.591061  0.840573   \n3  0.394877  0.424432  0.418725    ...     0.556135  0.013987  0.149688   \n4  0.233611  0.890301  0.501873    ...     0.149136  0.617435  0.659965   \n\n         93        94        95        96        97        98        99  \n0  0.410939  0.487212  0.428583  0.598402  0.706644  0.385664  0.493396  \n1  0.380063  0.300516  0.398224  0.739755  0.462363  0.046500  0.510257  \n2  0.664932  0.472257  0.998087  0.073214  0.984443  0.957306  0.745294  \n3  0.604044  0.574204  0.930515  0.797487  0.785470  0.447085  0.587239  \n4  0.096339  0.922107  0.220996  0.989596  0.604612  0.461251  0.658624  \n\n[5 rows x 100 columns]", 
 41 |                         "text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n      <th>2</th>\n      <th>3</th>\n      <th>4</th>\n      <th>5</th>\n      <th>6</th>\n      <th>7</th>\n      <th>8</th>\n      <th>9</th>\n      <th>...</th>\n      <th>90</th>\n      <th>91</th>\n      <th>92</th>\n      <th>93</th>\n      <th>94</th>\n      <th>95</th>\n      <th>96</th>\n      <th>97</th>\n      <th>98</th>\n      <th>99</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0.218387</td>\n      <td>0.681619</td>\n      <td>0.127457</td>\n      <td>0.176184</td>\n      <td>0.208149</td>\n      <td>0.431590</td>\n      <td>0.436492</td>\n      <td>0.346246</td>\n      <td>0.632588</td>\n      <td>0.829314</td>\n      <td>...</td>\n      <td>0.432520</td>\n      <td>0.974076</td>\n      <td>0.894935</td>\n      <td>0.410939</td>\n      <td>0.487212</td>\n      <td>0.428583</td>\n      <td>0.598402</td>\n      <td>0.706644</td>\n      <td>0.385664</td>\n      <td>0.493396</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>0.438189</td>\n      <td>0.477055</td>\n      <td>0.292263</td>\n      <td>0.737119</td>\n      <td>0.986888</td>\n      <td>0.571864</td>\n      <td>0.952132</td>\n      <td>0.589837</td>\n      <td>0.010117</td>\n      <td>0.174665</td>\n      <td>...</td>\n      <td>0.301537</td>\n      <td>0.552984</td>\n      <td>0.879647</td>\n      <td>0.380063</td>\n      <td>0.300516</td>\n      <td>0.398224</td>\n      <td>0.739755</td>\n      <td>0.462363</td>\n      <td>0.046500</td>\n      <td>0.510257</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>0.246476</td>\n      <td>0.425691</td>\n      <td>0.344394</td>\n      <td>0.302752</td>\n      <td>0.592942</td>\n      <td>0.350859</td>\n      <td>0.620692</td>\n      <td>0.516486</td>\n      <td>0.033306</td>\n      <td>0.304789</td>\n      <td>...</td>\n      <td>0.131419</td>\n      <td>0.591061</td>\n      <td>0.840573</td>\n      <td>0.664932</td>\n      <td>0.472257</td>\n      <td>0.998087</td>\n      <td>0.073214</td>\n      <td>0.984443</td>\n      <td>0.957306</td>\n      <td>0.745294</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>0.303911</td>\n      <td>0.326860</td>\n      <td>0.147415</td>\n      <td>0.208054</td>\n      <td>0.371225</td>\n      <td>0.910451</td>\n      <td>0.544563</td>\n      <td>0.394877</td>\n      <td>0.424432</td>\n      <td>0.418725</td>\n      <td>...</td>\n      <td>0.556135</td>\n      <td>0.013987</td>\n      <td>0.149688</td>\n      <td>0.604044</td>\n      <td>0.574204</td>\n      <td>0.930515</td>\n      <td>0.797487</td>\n      <td>0.785470</td>\n      <td>0.447085</td>\n      <td>0.587239</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>0.885194</td>\n      <td>0.156615</td>\n      <td>0.795850</td>\n      <td>0.208292</td>\n      <td>0.658539</td>\n      <td>0.738021</td>\n      <td>0.750869</td>\n      <td>0.233611</td>\n      <td>0.890301</td>\n      <td>0.501873</td>\n      <td>...</td>\n      <td>0.149136</td>\n      <td>0.617435</td>\n      <td>0.659965</td>\n      <td>0.096339</td>\n      <td>0.922107</td>\n      <td>0.220996</td>\n      <td>0.989596</td>\n      <td>0.604612</td>\n      <td>0.461251</td>\n      <td>0.658624</td>\n    </tr>\n  </tbody>\n</table>\n<p>5 rows \u00d7 100 columns</p>\n</div>"
 42 |                     }, 
 43 |                     "execution_count": 8, 
 44 |                     "output_type": "execute_result"
 45 |                 }
 46 |             ], 
 47 |             "source": "# create some data\nrandom_data=pd.DataFrame(np.random.random((1000, 100)))\nrandom_data.head()", 
 48 |             "execution_count": 8
 49 |         }, 
 50 |         {
 51 |             "cell_type": "code", 
 52 |             "metadata": {
 53 |                 "collapsed": false
 54 |             }, 
 55 |             "outputs": [
 56 |                 {
 57 |                     "text": "<Response [201]>\n", 
 58 |                     "output_type": "stream", 
 59 |                     "name": "stdout"
 60 |                 }
 61 |             ], 
 62 |             "source": "#usecase you have a file in Python (for example from a database) and you want to create a file in local object storage\n\ndef put_file(credentials, local_file_name):  \n    \"\"\"This functions returns a StringIO object containing\n    the file content from Bluemix Object Storage V3.\"\"\"\n    f = open(local_file_name,'r')\n    my_data = f.read()\n    url1 = ''.join(['https://identity.open.softlayer.com', '/v3/auth/tokens'])\n    data = {'auth': {'identity': {'methods': ['password'],\n            'password': {'user': {'name': credentials['username'],'domain': {'id': credentials['domainId']},\n            'password': credentials['password']}}}}}\n    headers1 = {'Content-Type': 'application/json'}\n    resp1 = requests.post(url=url1, data=json.dumps(data), headers=headers1)\n    resp1_body = resp1.json()\n    for e1 in resp1_body['token']['catalog']:\n        if(e1['type']=='object-store'):\n            for e2 in e1['endpoints']:\n                        if(e2['interface']=='public'and e2['region']=='dallas'):\n                            url2 = ''.join([e2['url'],'/', credentials['container'], '/', local_file_name])\n    s_subject_token = resp1.headers['x-subject-token']\n    headers2 = {'X-Auth-Token': s_subject_token, 'accept': 'application/json'}\n    resp2 = requests.put(url=url2, headers=headers2, data = my_data )\n    print resp2\n    \n    \n# step 1: store object in local file.\n# data_train is an earlier defined pandas dataframe containing data\nrandom_data.to_csv('random_data.csv',index=False)\n#step 2: move to object storage    \nput_file(credentials_1,\"random_data.csv\")  ", 
 63 |             "execution_count": 9
 64 |         }, 
 65 |         {
 66 |             "cell_type": "code", 
 67 |             "metadata": {
 68 |                 "collapsed": false
 69 |             }, 
 70 |             "outputs": [
 71 |                 {
 72 |                     "metadata": {}, 
 73 |                     "data": {
 74 |                         "text/plain": "          0         1         2         3         4         5         6  \\\n0  0.218387  0.681619  0.127457  0.176184  0.208149  0.431590  0.436492   \n1  0.438189  0.477055  0.292263  0.737119  0.986888  0.571864  0.952132   \n2  0.246476  0.425691  0.344394  0.302752  0.592942  0.350859  0.620692   \n3  0.303911  0.326860  0.147415  0.208054  0.371225  0.910451  0.544563   \n4  0.885194  0.156615  0.795850  0.208292  0.658539  0.738021  0.750869   \n\n          7         8         9    ...           90        91        92  \\\n0  0.346246  0.632588  0.829314    ...     0.432520  0.974076  0.894935   \n1  0.589837  0.010117  0.174665    ...     0.301537  0.552984  0.879647   \n2  0.516486  0.033306  0.304789    ...     0.131419  0.591061  0.840573   \n3  0.394877  0.424432  0.418725    ...     0.556135  0.013987  0.149688   \n4  0.233611  0.890301  0.501873    ...     0.149136  0.617435  0.659965   \n\n         93        94        95        96        97        98        99  \n0  0.410939  0.487212  0.428583  0.598402  0.706644  0.385664  0.493396  \n1  0.380063  0.300516  0.398224  0.739755  0.462363  0.046500  0.510257  \n2  0.664932  0.472257  0.998087  0.073214  0.984443  0.957306  0.745294  \n3  0.604044  0.574204  0.930515  0.797487  0.785470  0.447085  0.587239  \n4  0.096339  0.922107  0.220996  0.989596  0.604612  0.461251  0.658624  \n\n[5 rows x 100 columns]", 
 75 |                         "text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n      <th>2</th>\n      <th>3</th>\n      <th>4</th>\n      <th>5</th>\n      <th>6</th>\n      <th>7</th>\n      <th>8</th>\n      <th>9</th>\n      <th>...</th>\n      <th>90</th>\n      <th>91</th>\n      <th>92</th>\n      <th>93</th>\n      <th>94</th>\n      <th>95</th>\n      <th>96</th>\n      <th>97</th>\n      <th>98</th>\n      <th>99</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0.218387</td>\n      <td>0.681619</td>\n      <td>0.127457</td>\n      <td>0.176184</td>\n      <td>0.208149</td>\n      <td>0.431590</td>\n      <td>0.436492</td>\n      <td>0.346246</td>\n      <td>0.632588</td>\n      <td>0.829314</td>\n      <td>...</td>\n      <td>0.432520</td>\n      <td>0.974076</td>\n      <td>0.894935</td>\n      <td>0.410939</td>\n      <td>0.487212</td>\n      <td>0.428583</td>\n      <td>0.598402</td>\n      <td>0.706644</td>\n      <td>0.385664</td>\n      <td>0.493396</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>0.438189</td>\n      <td>0.477055</td>\n      <td>0.292263</td>\n      <td>0.737119</td>\n      <td>0.986888</td>\n      <td>0.571864</td>\n      <td>0.952132</td>\n      <td>0.589837</td>\n      <td>0.010117</td>\n      <td>0.174665</td>\n      <td>...</td>\n      <td>0.301537</td>\n      <td>0.552984</td>\n      <td>0.879647</td>\n      <td>0.380063</td>\n      <td>0.300516</td>\n      <td>0.398224</td>\n      <td>0.739755</td>\n      <td>0.462363</td>\n      <td>0.046500</td>\n      <td>0.510257</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>0.246476</td>\n      <td>0.425691</td>\n      <td>0.344394</td>\n      <td>0.302752</td>\n      <td>0.592942</td>\n      <td>0.350859</td>\n      <td>0.620692</td>\n      <td>0.516486</td>\n      <td>0.033306</td>\n      <td>0.304789</td>\n      <td>...</td>\n      <td>0.131419</td>\n      <td>0.591061</td>\n      <td>0.840573</td>\n      <td>0.664932</td>\n      <td>0.472257</td>\n      <td>0.998087</td>\n      <td>0.073214</td>\n      <td>0.984443</td>\n      <td>0.957306</td>\n      <td>0.745294</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>0.303911</td>\n      <td>0.326860</td>\n      <td>0.147415</td>\n      <td>0.208054</td>\n      <td>0.371225</td>\n      <td>0.910451</td>\n      <td>0.544563</td>\n      <td>0.394877</td>\n      <td>0.424432</td>\n      <td>0.418725</td>\n      <td>...</td>\n      <td>0.556135</td>\n      <td>0.013987</td>\n      <td>0.149688</td>\n      <td>0.604044</td>\n      <td>0.574204</td>\n      <td>0.930515</td>\n      <td>0.797487</td>\n      <td>0.785470</td>\n      <td>0.447085</td>\n      <td>0.587239</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>0.885194</td>\n      <td>0.156615</td>\n      <td>0.795850</td>\n      <td>0.208292</td>\n      <td>0.658539</td>\n      <td>0.738021</td>\n      <td>0.750869</td>\n      <td>0.233611</td>\n      <td>0.890301</td>\n      <td>0.501873</td>\n      <td>...</td>\n      <td>0.149136</td>\n      <td>0.617435</td>\n      <td>0.659965</td>\n      <td>0.096339</td>\n      <td>0.922107</td>\n      <td>0.220996</td>\n      <td>0.989596</td>\n      <td>0.604612</td>\n      <td>0.461251</td>\n      <td>0.658624</td>\n    </tr>\n  </tbody>\n</table>\n<p>5 rows \u00d7 100 columns</p>\n</div>"
 76 |                     }, 
 77 |                     "execution_count": 10, 
 78 |                     "output_type": "execute_result"
 79 |                 }
 80 |             ], 
 81 |             "source": "#use case: you upload a file in the object storage and want to access it in Python.\n\ndef get_object_storage_file_with_credentials(credentials,container, filename):\n    \"\"\"This functions returns a StringIO object containing\n    the file content from Bluemix Object Storage.\"\"\"\n\n    url1 = ''.join(['https://identity.open.softlayer.com', '/v3/auth/tokens'])\n    data = {'auth': {'identity': {'methods': ['password'],\n            'password': {'user': {'name': credentials['username'],'domain': {'id': credentials['domainId']},\n            'password': credentials['password']}}}}}\n    headers1 = {'Content-Type': 'application/json'}\n    resp1 = requests.post(url=url1, data=json.dumps(data), headers=headers1)\n    resp1_body = resp1.json()\n    for e1 in resp1_body['token']['catalog']:\n        if(e1['type']=='object-store'):\n            for e2 in e1['endpoints']:\n                        if(e2['interface']=='public'and e2['region']=='dallas'):\n                            url2 = ''.join([e2['url'],'/', container, '/', filename])\n    s_subject_token = resp1.headers['x-subject-token']\n    headers2 = {'X-Auth-Token': s_subject_token, 'accept': 'application/json'}\n    resp2 = requests.get(url=url2, headers=headers2)\n    return StringIO(resp2.text)\n\n# step 1: get file from object storage\n#data_train.csv is an existing object in the Object Storage \nrandom_data = pd.read_csv(get_object_storage_file_with_credentials(credentials_1,'DSETraining101ObjectStorage', 'random_data.csv'))\nrandom_data.head()\n\n\n", 
 82 |             "execution_count": 10
 83 |         }
 84 |     ], 
 85 |     "nbformat": 4, 
 86 |     "metadata": {
 87 |         "kernelspec": {
 88 |             "display_name": "Python 2 with Spark 1.6", 
 89 |             "name": "python2", 
 90 |             "language": "python"
 91 |         }, 
 92 |         "language_info": {
 93 |             "version": "2.7.11", 
 94 |             "codemirror_mode": {
 95 |                 "version": 2, 
 96 |                 "name": "ipython"
 97 |             }, 
 98 |             "mimetype": "text/x-python", 
 99 |             "nbconvert_exporter": "python", 
100 |             "file_extension": ".py", 
101 |             "name": "python", 
102 |             "pygments_lexer": "ipython2"
103 |         }
104 |     }, 
105 |     "nbformat_minor": 0
106 | }


--------------------------------------------------------------------------------
/1. In and Export/1. import and export 1. Download and upload.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "nbformat": 4, 
  3 |     "nbformat_minor": 0, 
  4 |     "cells": [
  5 |         {
  6 |             "source": "# In-and export from url\n\nhttp://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python\n", 
  7 |             "cell_type": "markdown", 
  8 |             "metadata": {}
  9 |         }, 
 10 |         {
 11 |             "execution_count": 21, 
 12 |             "metadata": {
 13 |                 "collapsed": true
 14 |             }, 
 15 |             "outputs": [
 16 |                 {
 17 |                     "text": "Requirement already satisfied (use --upgrade to upgrade): tqdm in /gpfs/global_fs01/sym_shared/YPProdSpark/user/s16e-7918d85e6de098-7a7840b6cba3/.local/lib/python2.7/site-packages\nCollecting Image\n  Downloading image-1.5.5.tar.gz\nRequirement already satisfied (use --upgrade to upgrade): pillow in /usr/local/src/bluemix_jupyter_bundle.v33/notebook/lib/python2.7/site-packages (from Image)\nCollecting django (from Image)\n  Downloading Django-1.10.5-py2.py3-none-any.whl (6.8MB)\n\u001b[K    100% |\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 6.8MB 171kB/s \n\u001b[?25hInstalling collected packages: django, Image\n  Running setup.py install for Image ... \u001b[?25l-\b \b\\\b \bdone\n\u001b[?25hSuccessfully installed Image-1.5.5 django-1.10.5\n", 
 18 |                     "name": "stdout", 
 19 |                     "output_type": "stream"
 20 |                 }
 21 |             ], 
 22 |             "cell_type": "code", 
 23 |             "source": "!pip install tqdm"
 24 |         }, 
 25 |         {
 26 |             "execution_count": 1, 
 27 |             "metadata": {
 28 |                 "collapsed": true
 29 |             }, 
 30 |             "outputs": [], 
 31 |             "cell_type": "code", 
 32 |             "source": "import urllib\nfrom tqdm import tqdm\nimport requests\nfrom requests.auth import HTTPDigestAuth\nimport json\nimport os"
 33 |         }, 
 34 |         {
 35 |             "execution_count": 2, 
 36 |             "metadata": {
 37 |                 "collapsed": false
 38 |             }, 
 39 |             "outputs": [
 40 |                 {
 41 |                     "data": {
 42 |                         "text/plain": "('10MB.zip', <httplib.HTTPMessage instance at 0x7f302421dd40>)"
 43 |                     }, 
 44 |                     "execution_count": 2, 
 45 |                     "output_type": "execute_result", 
 46 |                     "metadata": {}
 47 |                 }
 48 |             ], 
 49 |             "cell_type": "code", 
 50 |             "source": "#simplest version\nurllib.urlretrieve (\"http://download.thinkbroadband.com/10MB.zip\", \"10MB.zip\")"
 51 |         }, 
 52 |         {
 53 |             "execution_count": null, 
 54 |             "metadata": {
 55 |                 "collapsed": true
 56 |             }, 
 57 |             "outputs": [], 
 58 |             "cell_type": "code", 
 59 |             "source": "# Python 3 variant:\n\nfrom requests import get  \ndef download(url, file_name):\n    # open in binary mode\n    with open(file_name, \"wb\") as file:\n        # get request\n        response = get(url)\n        # write to file\n        file.write(response.content)\n"
 60 |         }, 
 61 |         {
 62 |             "execution_count": null, 
 63 |             "metadata": {
 64 |                 "collapsed": true
 65 |             }, 
 66 |             "outputs": [], 
 67 |             "cell_type": "code", 
 68 |             "source": "url = \"http://download.thinkbroadband.com/10MB.zip\"\nresponse = requests.get(url, stream=True)\n\nwith open(\"10MB\", \"wb\") as handle:\n    for data in tqdm(response.iter_content()):\n        handle.write(data)"
 69 |         }, 
 70 |         {
 71 |             "execution_count": 13, 
 72 |             "metadata": {
 73 |                 "collapsed": false, 
 74 |                 "scrolled": true
 75 |             }, 
 76 |             "outputs": [
 77 |                 {
 78 |                     "text": "200\n{\"type\": \"success\", \"value\": {\"joke\": \"Chuck Norris doesn't read books. He stares them down until he gets the information he wants.\", \"id\": 3, \"categories\": []}}\n", 
 79 |                     "name": "stdout", 
 80 |                     "output_type": "stream"
 81 |                 }
 82 |             ], 
 83 |             "cell_type": "code", 
 84 |             "source": "# get data from an api call\n\n# visit http://www.icndb.com/api/ to see other options to quesry the Chuck norris jokes database\nurl = \"http://api.icndb.com/jokes/random\"\n\n# It is a good practice not to hardcode the credentials. So ask the user to enter credentials at runtime\nmyResponse = requests.get(url)\n#myResponse = requests.get(url,auth=HTTPDigestAuth(raw_input(\"username: \"), raw_input(\"Password: \")), verify=True)\nprint (myResponse.status_code)\n# For successful API call, response code will be 200 (OK)\n\njData = json.loads(myResponse.content)\nprint json.dumps(jData)\n"
 85 |         }, 
 86 |         {
 87 |             "execution_count": 18, 
 88 |             "metadata": {
 89 |                 "collapsed": false
 90 |             }, 
 91 |             "outputs": [
 92 |                 {
 93 |                     "text": "<Response [200]>\n", 
 94 |                     "name": "stdout", 
 95 |                     "output_type": "stream"
 96 |                 }
 97 |             ], 
 98 |             "cell_type": "code", 
 99 |             "source": "#uploading data\n\nwith open('output_file', 'wb') as fout:\n    fout.write(os.urandom(1024)) \n\nr = requests.post('http://httpbin.org/post', files={'output_file': open('output_file', 'rb')})\nprint r"
100 |         }
101 |     ], 
102 |     "metadata": {
103 |         "language_info": {
104 |             "nbconvert_exporter": "python", 
105 |             "file_extension": ".py", 
106 |             "codemirror_mode": {
107 |                 "name": "ipython", 
108 |                 "version": 2
109 |             }, 
110 |             "pygments_lexer": "ipython2", 
111 |             "version": "2.7.11", 
112 |             "mimetype": "text/x-python", 
113 |             "name": "python"
114 |         }, 
115 |         "kernelspec": {
116 |             "language": "python", 
117 |             "display_name": "Python 2 with Spark 1.6", 
118 |             "name": "python2"
119 |         }
120 |     }
121 | }


--------------------------------------------------------------------------------
/1. In and Export/1. import and export 2. DashDB.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "cells": [
  3 |         {
  4 |             "cell_type": "markdown", 
  5 |             "metadata": {
  6 |                 "collapsed": true
  7 |             }, 
  8 |             "source": "# In-and export from DashDB\n\nUseful doc: \nhttp://pythonhosted.org/ibmdbpy/start.html"
  9 |         }, 
 10 |         {
 11 |             "cell_type": "code", 
 12 |             "metadata": {
 13 |                 "collapsed": true
 14 |             }, 
 15 |             "outputs": [], 
 16 |             "source": "#imports\nimport ibmdbpy\nfrom ibmdbpy import IdaDataBase, IdaDataFrame\n\nimport pandas as pd\nimport numpy as np", 
 17 |             "execution_count": 3
 18 |         }, 
 19 |         {
 20 |             "cell_type": "code", 
 21 |             "metadata": {
 22 |                 "scrolled": true, 
 23 |                 "collapsed": false
 24 |             }, 
 25 |             "outputs": [
 26 |                 {
 27 |                     "metadata": {}, 
 28 |                     "data": {
 29 |                         "text/plain": "          a         b         c\n0  0.579127  0.691267  0.156212\n1  0.475068  0.030056  0.096346\n2  0.950316  0.898397  0.994278\n3  0.540834  0.900723  0.902864\n4  0.299987  0.733373  0.101006", 
 30 |                         "text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>a</th>\n      <th>b</th>\n      <th>c</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0.579127</td>\n      <td>0.691267</td>\n      <td>0.156212</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>0.475068</td>\n      <td>0.030056</td>\n      <td>0.096346</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>0.950316</td>\n      <td>0.898397</td>\n      <td>0.994278</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>0.540834</td>\n      <td>0.900723</td>\n      <td>0.902864</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>0.299987</td>\n      <td>0.733373</td>\n      <td>0.101006</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
 31 |                     }, 
 32 |                     "execution_count": 23, 
 33 |                     "output_type": "execute_result"
 34 |                 }
 35 |             ], 
 36 |             "source": "# create some data\nrandom_data=pd.DataFrame(np.random.random((1000, 3)),columns=['a','b','c'])\nrandom_data.head()", 
 37 |             "execution_count": 23
 38 |         }, 
 39 |         {
 40 |             "cell_type": "code", 
 41 |             "metadata": {
 42 |                 "collapsed": true
 43 |             }, 
 44 |             "outputs": [], 
 45 |             "source": "@hidden_cell \n\ncredentials_1 = {\n  'host':'dashdb-entry-yp-dal09-08.services.dal.bluemix.net',\n  'port':'50000',\n  'user':'dash8753',\n  'password':\"\"\"ddd6463d0ddc\"\"\",\n  'database':'BLUDB'\n}", 
 46 |             "execution_count": 24
 47 |         }, 
 48 |         {
 49 |             "cell_type": "code", 
 50 |             "metadata": {
 51 |                 "collapsed": false
 52 |             }, 
 53 |             "outputs": [
 54 |                 {
 55 |                     "text": "Uploading 1000 rows (maxnrow was set to 2666)\n", 
 56 |                     "output_type": "stream", 
 57 |                     "name": "stdout"
 58 |                 }, 
 59 |                 {
 60 |                     "text": "Exception AttributeError: \"Cursor instance has no attribute 'closed'\" in <bound method Cursor.__del__ of <pypyodbc.Cursor instance at 0x7f78e03f0dd0>> ignored\n", 
 61 |                     "output_type": "stream", 
 62 |                     "name": "stderr"
 63 |                 }
 64 |             ], 
 65 |             "source": "# use case: you have some data in Python (for example from a csv file) and you want to upload it to a database\n\nidadb = IdaDataBase(dsn=\"DASHDB;Database=BLUDB;Hostname=\" + \n                    credentials_1[\"host\"] + \";Port=50000;PROTOCOL=TCPIP;UID=\" + \n                    credentials_1[\"user\"] + \";PWD=\" + \n                    credentials_1[\"password\"])\n\nidadf = idadb.as_idadataframe(random_data, \"RANDOM_DATA\", clear_existing=True) #input: a Pandas dataframe", 
 66 |             "execution_count": 25
 67 |         }, 
 68 |         {
 69 |             "cell_type": "code", 
 70 |             "metadata": {
 71 |                 "collapsed": false
 72 |             }, 
 73 |             "outputs": [
 74 |                 {
 75 |                     "text": "Uploading 1000 rows (maxnrow was set to 2666)\n", 
 76 |                     "output_type": "stream", 
 77 |                     "name": "stdout"
 78 |                 }
 79 |             ], 
 80 |             "source": "# use case: you have an existing table and you want to append additional records\n\nRANDOM_DATA_DATAFRAME = IdaDataFrame(idadb, 'RANDOM_DATA')  #define the IdaDataFrame\n   \nidadb.append(RANDOM_DATA_DATAFRAME,random_data) # add the Pandas Dataframe 'random_data' once more to the database\nRANDOM_DATA_DATAFRAME.commit() #ensure a commit or else you won't see the addition", 
 81 |             "execution_count": 26
 82 |         }, 
 83 |         {
 84 |             "cell_type": "code", 
 85 |             "metadata": {
 86 |                 "collapsed": false
 87 |             }, 
 88 |             "outputs": [
 89 |                 {
 90 |                     "text": "Exception AttributeError: \"Cursor instance has no attribute 'closed'\" in <bound method Cursor.__del__ of <pypyodbc.Cursor instance at 0x7f78eaf55cf8>> ignored\n", 
 91 |                     "output_type": "stream", 
 92 |                     "name": "stderr"
 93 |                 }, 
 94 |                 {
 95 |                     "metadata": {}, 
 96 |                     "data": {
 97 |                         "text/plain": "(2000, 3)"
 98 |                     }, 
 99 |                     "execution_count": 28, 
100 |                     "output_type": "execute_result"
101 |                 }
102 |             ], 
103 |             "source": "# use case: you have data in a database and want to make it accessible to Pyton\n\nidadb = IdaDataBase(dsn=\"DASHDB;Database=BLUDB;Hostname=\" + \n                    credentials_1[\"host\"] + \";Port=50000;PROTOCOL=TCPIP;UID=\" + \n                    credentials_1[\"user\"] + \";PWD=\" + \n                    credentials_1[\"password\"])\n\nrandom_data = IdaDataFrame(idadb, 'RANDOM_DATA')\nrandom_data = random_data.as_dataframe()\nrandom_data.head()\n#random_data.shape #2000 due to the appending", 
104 |             "execution_count": 28
105 |         }, 
106 |         {
107 |             "cell_type": "code", 
108 |             "metadata": {
109 |                 "collapsed": true
110 |             }, 
111 |             "outputs": [], 
112 |             "source": "", 
113 |             "execution_count": null
114 |         }
115 |     ], 
116 |     "nbformat": 4, 
117 |     "metadata": {
118 |         "kernelspec": {
119 |             "display_name": "Python 2 with Spark 1.6", 
120 |             "name": "python2", 
121 |             "language": "python"
122 |         }, 
123 |         "language_info": {
124 |             "version": "2.7.11", 
125 |             "codemirror_mode": {
126 |                 "version": 2, 
127 |                 "name": "ipython"
128 |             }, 
129 |             "mimetype": "text/x-python", 
130 |             "nbconvert_exporter": "python", 
131 |             "file_extension": ".py", 
132 |             "name": "python", 
133 |             "pygments_lexer": "ipython2"
134 |         }
135 |     }, 
136 |     "nbformat_minor": 0
137 | }


--------------------------------------------------------------------------------
/1. In and Export/1. import and export 3. Cloudant.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "nbformat": 4, 
  3 |     "cells": [
  4 |         {
  5 |             "cell_type": "markdown", 
  6 |             "metadata": {}, 
  7 |             "source": "# In-and export from Cloudant\n\nhttp://python-cloudant.readthedocs.io/en/latest/getting_started.html\n\nCloudant is a NoSQL database as a service (DBaaS) built to scale globally, run nonstop, and handle a wide variety of data types like JSON, full-text, and geospatial. Cloudant NoSQL DB is an operational data store optimized to handle concurrent reads and writes and to provide high availability and data durability."
  8 |         }, 
  9 |         {
 10 |             "cell_type": "code", 
 11 |             "execution_count": 1, 
 12 |             "source": "!pip install --user cloudant", 
 13 |             "metadata": {
 14 |                 "collapsed": false
 15 |             }, 
 16 |             "outputs": [
 17 |                 {
 18 |                     "name": "stdout", 
 19 |                     "output_type": "stream", 
 20 |                     "text": "Collecting cloudant\n  Downloading cloudant-2.3.1-py2-none-any.whl (63kB)\n\u001b[K    100% |\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 71kB 5.2MB/s \n\u001b[?25hRequirement already satisfied (use --upgrade to upgrade): requests<3.0.0,>=2.7.0 in /usr/local/src/bluemix_jupyter_bundle.v33/notebook/lib/python2.7/site-packages (from cloudant)\nInstalling collected packages: cloudant\nSuccessfully installed cloudant-2.3.1\n"
 21 |                 }
 22 |             ]
 23 |         }, 
 24 |         {
 25 |             "cell_type": "code", 
 26 |             "execution_count": null, 
 27 |             "source": "from cloudant.client import Cloudant\nfrom cloudant.result import Result\nimport pandas as pd, json", 
 28 |             "metadata": {
 29 |                 "collapsed": true
 30 |             }, 
 31 |             "outputs": []
 32 |         }, 
 33 |         {
 34 |             "cell_type": "code", 
 35 |             "execution_count": 2, 
 36 |             "source": "@hidden_cell\n\ncredentials_1 = {\n  'username':'b02b1918-33fb-4c0b-a7a7-f5762138ee1d-bluemix',\n  'password':\"\"\"a5a4d2e9a0db63949c7c592b016f1d12b86880fa398476c00472360891df09b2\"\"\",\n  'host':'b02b1918-33fb-4c0b-a7a7-f5762138ee1d-bluemix.cloudant.com',\n  'port':'443',\n  'url':'https://b02b1918-33fb-4c0b-a7a7-f5762138ee1d-bluemix:a5a4d2e9a0db63949c7c592b016f1d12b86880fa398476c00472360891df09b2@b02b1918-33fb-4c0b-a7a7-f5762138ee1d-bluemix.cloudant.com'\n}", 
 37 |             "metadata": {
 38 |                 "collapsed": true
 39 |             }, 
 40 |             "outputs": []
 41 |         }, 
 42 |         {
 43 |             "cell_type": "code", 
 44 |             "execution_count": 3, 
 45 |             "source": "# connect to cloudant\nclient = Cloudant(credentials_1['username'], credentials_1['password'], url=credentials_1['url'])\nclient.connect()\n# Disconnect from the server\n# client.disconnect()", 
 46 |             "metadata": {
 47 |                 "collapsed": true
 48 |             }, 
 49 |             "outputs": []
 50 |         }, 
 51 |         {
 52 |             "cell_type": "code", 
 53 |             "execution_count": 4, 
 54 |             "source": "client.all_dbs()", 
 55 |             "metadata": {
 56 |                 "collapsed": false
 57 |             }, 
 58 |             "outputs": [
 59 |                 {
 60 |                     "execution_count": 4, 
 61 |                     "data": {
 62 |                         "text/plain": "[u'_replicator', u'_users']"
 63 |                     }, 
 64 |                     "metadata": {}, 
 65 |                     "output_type": "execute_result"
 66 |                 }
 67 |             ]
 68 |         }, 
 69 |         {
 70 |             "cell_type": "code", 
 71 |             "execution_count": 5, 
 72 |             "source": "# Create a database using an initialized client\n\nmy_database = client.create_database('my_database')\nmy_database.exists()", 
 73 |             "metadata": {
 74 |                 "collapsed": false
 75 |             }, 
 76 |             "outputs": [
 77 |                 {
 78 |                     "execution_count": 5, 
 79 |                     "data": {
 80 |                         "text/plain": "True"
 81 |                     }, 
 82 |                     "metadata": {}, 
 83 |                     "output_type": "execute_result"
 84 |                 }
 85 |             ]
 86 |         }, 
 87 |         {
 88 |             "cell_type": "code", 
 89 |             "execution_count": 6, 
 90 |             "source": "# Open an existing database\nmy_database = client['my_database']", 
 91 |             "metadata": {
 92 |                 "collapsed": true
 93 |             }, 
 94 |             "outputs": []
 95 |         }, 
 96 |         {
 97 |             "cell_type": "code", 
 98 |             "execution_count": null, 
 99 |             "source": "# Delete a database using an initialized client\n# client.delete_database('my_database')", 
100 |             "metadata": {
101 |                 "collapsed": true
102 |             }, 
103 |             "outputs": []
104 |         }, 
105 |         {
106 |             "cell_type": "code", 
107 |             "execution_count": 9, 
108 |             "source": "# Create document content data\ndata = {\n    '_id': 'julia30', # Setting _id is optional\n    'name': 'Julia',\n    'age': 30,\n    'pets': ['cat', 'dog', 'frog']\n    }\n\n# Create a document using the Database API\nmy_document = my_database.create_document(data)", 
109 |             "metadata": {
110 |                 "collapsed": false
111 |             }, 
112 |             "outputs": []
113 |         }, 
114 |         {
115 |             "cell_type": "code", 
116 |             "execution_count": 10, 
117 |             "source": "my_document = my_database['julia30']\n\n# Display the document\nprint my_document", 
118 |             "metadata": {
119 |                 "collapsed": false
120 |             }, 
121 |             "outputs": [
122 |                 {
123 |                     "name": "stdout", 
124 |                     "output_type": "stream", 
125 |                     "text": "{'_rev': u'3-8cf90dfd0627cb4e3f7284ebacb59a36', 'age': 30, '_id': u'julia30', 'name': 'Julia', 'pets': ['cat', 'dog', 'frog']}\n"
126 |                 }
127 |             ]
128 |         }, 
129 |         {
130 |             "cell_type": "code", 
131 |             "execution_count": 11, 
132 |             "source": "# Get all of the documents from my_database\nfor document in my_database:\n    print document", 
133 |             "metadata": {
134 |                 "collapsed": false
135 |             }, 
136 |             "outputs": [
137 |                 {
138 |                     "name": "stdout", 
139 |                     "output_type": "stream", 
140 |                     "text": "{u'name': u'Julia', u'pets': [u'cat', u'dog', u'frog'], u'_rev': u'3-8cf90dfd0627cb4e3f7284ebacb59a36', '_id': u'julia30', u'age': 30}\n"
141 |                 }
142 |             ]
143 |         }, 
144 |         {
145 |             "cell_type": "code", 
146 |             "execution_count": 12, 
147 |             "source": "# First retrieve the document\nmy_document = my_database['julia30']\n\n# Update the document content\n# This can be done as you would any other dictionary\nmy_document['name'] = 'Jules'\nmy_document['age'] = 6\n\n# You must save the document in order to update it on the database\nmy_document.save()", 
148 |             "metadata": {
149 |                 "collapsed": true
150 |             }, 
151 |             "outputs": []
152 |         }, 
153 |         {
154 |             "cell_type": "code", 
155 |             "execution_count": 13, 
156 |             "source": "# First retrieve the document\nmy_document = my_database['julia30']\n\n# Delete the document\n#my_document.delete()", 
157 |             "metadata": {
158 |                 "collapsed": true
159 |             }, 
160 |             "outputs": []
161 |         }, 
162 |         {
163 |             "cell_type": "code", 
164 |             "execution_count": 14, 
165 |             "source": "from cloudant.result import Result, ResultByKey\n\n# Retrieve Result wrapped document content.\n# Note: The include_docs parameter is optional and is used to illustrate that view query\n# parameters can be used to customize the result collection.\nresult_collection = Result(my_database.all_docs, include_docs=True)\n\n# Get the result at a given location in the result collection\n# Note: Valid result collection indexing starts at 0\nresult = result_collection[0]                   # result is the 1st in the collection\nresult = result_collection[9]                   # result is the 10th in the collection\n\n# Get the result for matching a key\nresult = result_collection['julia30']           # result is all that match key 'julia30'\n\n# If your key is an integer then use the ResultByKey class to differentiate your integer\n# key from an indexed location within the result collection which is also an integer.\nresult = result_collection[ResultByKey(9)]      # result is all that match key 9\n\n# Slice by key values\nresult = result_collection['julia30': 'ruby99'] # result is between and including keys\nresult = result_collection['julia30': ]         # result is after and including key\nresult = result_collection[: 'ruby99']          # result is up to and including key\n\n# Slice by index values\nresult = result_collection[100: 200]            # result is between 100 to 200, including 200th\nresult = result_collection[: 200]               # result is up to and including the 200th\nresult = result_collection[100: ]               # result is after the 100th\n\n# Iterate over the result collection\nfor result in result_collection:\n    print result", 
166 |             "metadata": {
167 |                 "collapsed": false
168 |             }, 
169 |             "outputs": [
170 |                 {
171 |                     "name": "stdout", 
172 |                     "output_type": "stream", 
173 |                     "text": "{u'value': {u'rev': u'4-e40a1ebb1dc53a84bd0ca9c3431b1436'}, u'id': u'julia30', u'key': u'julia30', u'doc': {u'_rev': u'4-e40a1ebb1dc53a84bd0ca9c3431b1436', u'_id': u'julia30', u'age': 6, u'pets': [u'cat', u'dog', u'frog'], u'name': u'Jules'}}\n"
174 |                 }
175 |             ]
176 |         }
177 |     ], 
178 |     "metadata": {
179 |         "kernelspec": {
180 |             "display_name": "Python 2 with Spark 1.6", 
181 |             "language": "python", 
182 |             "name": "python2"
183 |         }, 
184 |         "language_info": {
185 |             "codemirror_mode": {
186 |                 "version": 2, 
187 |                 "name": "ipython"
188 |             }, 
189 |             "version": "2.7.11", 
190 |             "name": "python", 
191 |             "pygments_lexer": "ipython2", 
192 |             "nbconvert_exporter": "python", 
193 |             "mimetype": "text/x-python", 
194 |             "file_extension": ".py"
195 |         }
196 |     }, 
197 |     "nbformat_minor": 0
198 | }


--------------------------------------------------------------------------------
/1. In and Export/1. import and export 4. Twitter.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |     "nbformat": 4, 
 3 |     "cells": [
 4 |         {
 5 |             "cell_type": "markdown", 
 6 |             "metadata": {
 7 |                 "collapsed": true
 8 |             }, 
 9 |             "source": "# Getting data from Twitter\n\nhttps://cdeservice.mybluemix.net/rest-api/\n\n#more complex version:\nhttps://github.com/ibm-cds-labs/Spark-Twitter-Watson-Dashboard"
10 |         }, 
11 |         {
12 |             "cell_type": "code", 
13 |             "execution_count": 7, 
14 |             "source": "import requests\nimport json", 
15 |             "metadata": {
16 |                 "collapsed": true
17 |             }, 
18 |             "outputs": []
19 |         }, 
20 |         {
21 |             "cell_type": "code", 
22 |             "execution_count": 53, 
23 |             "source": "@hidden_cell \n\ncredentials_1={\n  \"username\": \"2983643d-0d0f-460b-91a4-d2ff200e0605\",\n  \"password\": \"VEs96toZlP\",\n  \"host\": \"cdeservice.mybluemix.net\",\n  \"port\": 443,\n  \"url\": \"https://2983643d-0d0f-460b-91a4-d2ff200e0605:VEs96toZlP@cdeservice.mybluemix.net\"\n}", 
24 |             "metadata": {
25 |                 "collapsed": true
26 |             }, 
27 |             "outputs": []
28 |         }, 
29 |         {
30 |             "cell_type": "code", 
31 |             "execution_count": 88, 
32 |             "source": "# searcing for Tweets\n\n# Twitter query language\n# https://console.ng.bluemix.net/docs/services/Twitter/twitter_rest_apis.html#querylanguage\n\nquery=\"/api/v1/messages/search?q=IBM\"\nmyResponse = requests.get(credentials_1[\"url\"]+query)\n\njData = json.loads(myResponse.content)", 
33 |             "metadata": {
34 |                 "collapsed": true
35 |             }, 
36 |             "outputs": []
37 |         }, 
38 |         {
39 |             "cell_type": "code", 
40 |             "execution_count": 90, 
41 |             "source": "#extracting info from Tweets\n#http://support.gnip.com/sources/twitter/data_format.html\n\nfor line_object in jData['tweets']:\n    try:\n        actor_id_string = line_object[\"message\"][\"actor\"][\"id\"]\n        actor_id = int( actor_id_string.split(\":\")[2] )\n        language_code = line_object[\"message\"][\"twitter_lang\"]\n        print \"{0:12d}, {1:2s}\".format(actor_id,language_code)\n    except KeyError, e:\n        actor_id = -1\n        language_code = \"Null\"", 
42 |             "metadata": {
43 |                 "collapsed": true
44 |             }, 
45 |             "outputs": [
46 |                 {
47 |                     "name": "stdout", 
48 |                     "output_type": "stream", 
49 |                     "text": "  2421007620, und\n    17392046, en\n    18217950, en\n  1081239367, en\n  1396865473, da\n  2466483601, en\n  2195374282, en\n  2271229628, ja\n    11409612, en\n   256639900, en\n  1217548591, en\n    71131576, en\n   581308169, en\n  1910749248, en\n  2501250108, in\n  2913390709, ja\n   331855732, en\n  1096826832, en\n  1048291658, en\n  1067882804, ro\n  2493431107, en\n    59147680, pt\n  2704548373, en\n   721015415, uk\n   107337350, ja\n  2443345184, ja\n  2846486337, en\n   323808915, en\n   497156789, en\n   572681115, en\n  1838811798, ja\n     6822322, en\n   595479894, en\n    28426792, en\n  1664464753, en\n   270806514, en\n   293415739, en\n    55799972, en\n   772793874, en\n  2655479348, en\n     3819701, en\n  2402201132, en\n  1592589091, in\n  2916190101, en\n  1368782430, en\n   113142094, ja\n   304322350, ja\n   328741551, en\n  1662300529, en\n  2447547871, en\n   135262662, en\n  2148828366, en\n   381660534, en\n  2552766488, tr\n  2452816490, en\n    21155258, en\n  2341580630, en\n   110368039, en\n  2950974153, en\n  2483742385, en\n  2211680239, en\n   625263018, ja\n  2815420262, en\n    14347265, en\n  1711071769, en\n  2342275374, en\n    62294711, en\n  1220269800, en\n   623615343, en\n  2876570757, ko\n   300286188, en\n  1951171886, en\n   266953814, en\n    14590971, en\n    11407622, en\n  1389090468, en\n  1217548591, en\n  2704548373, en\n    20591919, en\n    14352195, en\n   600128032, de\n  1389090468, en\n  2452816490, en\n   425963964, en\n   317831666, en\n    24524448, en\n   305991028, en\n   271692852, ja\n   266715506, en\n  2644810332, es\n    17181265, en\n  1508595090, en\n  2359580791, en\n  1646558305, en\n  2493423925, en\n  1240538113, en\n  2433920005, de\n  2759252876, en\n    34571479, en\n  1418812723, en\n"
50 |                 }
51 |             ]
52 |         }, 
53 |         {
54 |             "cell_type": "code", 
55 |             "execution_count": 22, 
56 |             "source": "# The code was removed by DSX for sharing.", 
57 |             "metadata": {
58 |                 "collapsed": false
59 |             }, 
60 |             "outputs": []
61 |         }, 
62 |         {
63 |             "cell_type": "code", 
64 |             "execution_count": 91, 
65 |             "source": "#query from cloudant\n#result_collection = Result(my_database.all_docs, include_docs=True)\n\nfor line_object in result_collection:\n    try:\n        actor_id_string = line_object['doc']['message']['actor']['id']\n        actor_id = int( actor_id_string.split(\":\")[2] )\n        language_code = line_object['doc'][\"message\"][\"twitter_lang\"]\n        print \"{0:12d}, {1:2s}\".format(actor_id,language_code)\n    except KeyError, e:\n        actor_id = -1\n        language_code = \"Null\"\n\n\n", 
66 |             "metadata": {
67 |                 "collapsed": true
68 |             }, 
69 |             "outputs": [
70 |                 {
71 |                     "name": "stdout", 
72 |                     "output_type": "stream", 
73 |                     "text": "    18217950, en\n   721015415, uk\n     3819701, en\n   328741551, en\n   381660534, en\n  2452816490, en\n  2211680239, en\n  1711071769, en\n    14352195, en\n  1389090468, en\n  2195374282, en\n    11409612, en\n    71131576, en\n  2501250108, in\n  2341580630, en\n    14347265, en\n    34571479, en\n  1418812723, en\n   107337350, ja\n  2402201132, en\n   266953814, en\n    20591919, en\n   425963964, en\n    24524448, en\n  1508595090, en\n   256639900, en\n  1910749248, en\n   331855732, en\n   772793874, en\n  1368782430, en\n  2876570757, ko\n    17181265, en\n  1396865473, da\n    59147680, pt\n  1838811798, ja\n    21155258, en\n   110368039, en\n  2483742385, en\n   623615343, en\n    14590971, en\n  2452816490, en\n   317831666, en\n   305991028, en\n  2759252876, en\n  2271229628, ja\n  2493431107, en\n    28426792, en\n  1664464753, en\n    55799972, en\n  1592589091, in\n  2447547871, en\n  2704548373, en\n  2493423925, en\n    17392046, en\n  2913390709, ja\n  2443345184, ja\n  2846486337, en\n  1662300529, en\n  2342275374, en\n   300286188, en\n   266715506, en\n   323808915, en\n   572681115, en\n  2655479348, en\n  2552766488, tr\n  1220269800, en\n  1646558305, en\n  2421007620, und\n  2466483601, en\n  1067882804, ro\n   497156789, en\n     6822322, en\n  2916190101, en\n   304322350, ja\n  2644810332, es\n  1081239367, en\n  1048291658, en\n   270806514, en\n   293415739, en\n   625263018, ja\n  2433920005, de\n  1096826832, en\n  2704548373, en\n   135262662, en\n  1951171886, en\n    11407622, en\n  1217548591, en\n  1217548591, en\n   581308169, en\n   595479894, en\n  2148828366, en\n  2950974153, en\n  2815420262, en\n    62294711, en\n  1389090468, en\n   600128032, de\n   271692852, ja\n  2359580791, en\n  1240538113, en\n  2728916672, en\n"
74 |                 }
75 |             ]
76 |         }
77 |     ], 
78 |     "metadata": {
79 |         "kernelspec": {
80 |             "display_name": "Python 2 with Spark 1.6", 
81 |             "language": "python", 
82 |             "name": "python2"
83 |         }, 
84 |         "language_info": {
85 |             "codemirror_mode": {
86 |                 "version": 2, 
87 |                 "name": "ipython"
88 |             }, 
89 |             "version": "2.7.11", 
90 |             "name": "python", 
91 |             "pygments_lexer": "ipython2", 
92 |             "nbconvert_exporter": "python", 
93 |             "mimetype": "text/x-python", 
94 |             "file_extension": ".py"
95 |         }
96 |     }, 
97 |     "nbformat_minor": 0
98 | }


--------------------------------------------------------------------------------
/1. In and Export/1. import and export 5. BigInsights.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |     "nbformat": 4, 
 3 |     "nbformat_minor": 0, 
 4 |     "cells": [
 5 |         {
 6 |             "source": "# In-and export from BigInsights\n\nhttps://developer.ibm.com/hadoop/docs/getting-started/tutorials/big-sql-hadoop-tutorial/", 
 7 |             "cell_type": "markdown", 
 8 |             "metadata": {
 9 |                 "collapsed": true
10 |             }
11 |         }, 
12 |         {
13 |             "source": "!pip install --user ibm_db", 
14 |             "execution_count": 1, 
15 |             "cell_type": "code", 
16 |             "metadata": {
17 |                 "scrolled": true, 
18 |                 "collapsed": false
19 |             }, 
20 |             "outputs": [
21 |                 {
22 |                     "name": "stdout", 
23 |                     "text": "Collecting ibm-db\n  Downloading ibm_db-2.0.7.tar.gz (553kB)\n\u001b[K    100% |\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 563kB 1.9MB/s \n\u001b[?25hInstalling collected packages: ibm-db\n  Running setup.py install for ibm-db ... \u001b[?25l-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \bdone\n\u001b[?25hSuccessfully installed ibm-db-2.0.7\n", 
24 |                     "output_type": "stream"
25 |                 }
26 |             ]
27 |         }, 
28 |         {
29 |             "source": "import ibm_db", 
30 |             "execution_count": 2, 
31 |             "cell_type": "code", 
32 |             "metadata": {
33 |                 "collapsed": true
34 |             }, 
35 |             "outputs": []
36 |         }, 
37 |         {
38 |             "source": "Credentials", 
39 |             "cell_type": "markdown", 
40 |             "metadata": {}
41 |         }, 
42 |         {
43 |             "source": "@hidden_cell\n\ncredentials_1 = {\n  'user':'bootcamp',\n  'password':'bootcamp1bootcamp',\n  'database' : 'bigsql',\n  'hostname' : 'iop-bi-master.imdemocloud.com',\n  'port' : '32051'  \n}", 
44 |             "execution_count": null, 
45 |             "cell_type": "code", 
46 |             "metadata": {
47 |                 "collapsed": true
48 |             }, 
49 |             "outputs": []
50 |         }, 
51 |         {
52 |             "source": "conn_string = (\n      \"DRIVER={{IBM DB2 ODBC DRIVER}};\"\n      \"DATABASE={0};\"\n      \"HOSTNAME={1};\"\n      \"PORT={2};\"\n      \"PROTOCOL=TCPIP;\"\n      \"UID={3};\"\n      \"PWD={4};\").format(database, hostname, port, username, password);\n\nconn = ibm_db.connect(conn_string, \"\", \"\")", 
53 |             "execution_count": null, 
54 |             "cell_type": "code", 
55 |             "metadata": {
56 |                 "collapsed": true
57 |             }, 
58 |             "outputs": []
59 |         }, 
60 |         {
61 |             "source": "query = \"USE \"+username+\";\";\nibm_db.exec_immediate(conn, query);", 
62 |             "execution_count": null, 
63 |             "cell_type": "code", 
64 |             "metadata": {
65 |                 "collapsed": true
66 |             }, 
67 |             "outputs": []
68 |         }, 
69 |         {
70 |             "source": "# BigSQL not available in current cloud based BigInsights offering", 
71 |             "execution_count": null, 
72 |             "cell_type": "code", 
73 |             "metadata": {
74 |                 "collapsed": true
75 |             }, 
76 |             "outputs": []
77 |         }
78 |     ], 
79 |     "metadata": {
80 |         "language_info": {
81 |             "nbconvert_exporter": "python", 
82 |             "mimetype": "text/x-python", 
83 |             "pygments_lexer": "ipython2", 
84 |             "version": "2.7.11", 
85 |             "file_extension": ".py", 
86 |             "name": "python", 
87 |             "codemirror_mode": {
88 |                 "name": "ipython", 
89 |                 "version": 2
90 |             }
91 |         }, 
92 |         "kernelspec": {
93 |             "name": "python2", 
94 |             "display_name": "Python 2 with Spark 1.6", 
95 |             "language": "python"
96 |         }
97 |     }
98 | }


--------------------------------------------------------------------------------
/2. Watson APIs/2. Watson 3. Alchemy language.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "nbformat": 4, 
  3 |     "nbformat_minor": 0, 
  4 |     "cells": [
  5 |         {
  6 |             "source": "# Alchemy language", 
  7 |             "cell_type": "markdown", 
  8 |             "metadata": {
  9 |                 "collapsed": true
 10 |             }
 11 |         }, 
 12 |         {
 13 |             "execution_count": null, 
 14 |             "metadata": {
 15 |                 "collapsed": true
 16 |             }, 
 17 |             "outputs": [], 
 18 |             "cell_type": "code", 
 19 |             "source": "!pip install watson-developer-cloud"
 20 |         }, 
 21 |         {
 22 |             "execution_count": 3, 
 23 |             "metadata": {
 24 |                 "collapsed": true
 25 |             }, 
 26 |             "outputs": [], 
 27 |             "cell_type": "code", 
 28 |             "source": "# The code was removed by DSX for sharing."
 29 |         }, 
 30 |         {
 31 |             "execution_count": null, 
 32 |             "metadata": {
 33 |                 "collapsed": true
 34 |             }, 
 35 |             "outputs": [], 
 36 |             "cell_type": "code", 
 37 |             "source": "import json\nfrom os.path import join, dirname\nfrom watson_developer_cloud import AlchemyLanguageV1\n\nalchemy_language = AlchemyLanguageV1(api_key=credentials_1['apikey'])\n\nurl = 'https://developer.ibm.com/watson/blog/2015/11/03/price-reduction-for-watson-personality-insights/'"
 38 |         }, 
 39 |         {
 40 |             "execution_count": 6, 
 41 |             "metadata": {
 42 |                 "collapsed": true
 43 |             }, 
 44 |             "outputs": [
 45 |                 {
 46 |                     "evalue": "Error: daily-transaction-limit-exceeded", 
 47 |                     "traceback": [
 48 |                         "\u001b[1;31m\u001b[0m", 
 49 |                         "\u001b[1;31mWatsonException\u001b[0mTraceback (most recent call last)", 
 50 |                         "\u001b[1;32m<ipython-input-6-44ee7d7c3650>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mjson\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdumps\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0malchemy_language\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtargeted_sentiment\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'I love cats! Dogs are smelly.'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mtargets\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'cats'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'dogs'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mlanguage\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'english'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindent\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 
 51 |                         "\u001b[1;32m/gpfs/fs01/user/s2de-6c3e21af46e198-7a7840b6cba3/.local/lib/python2.7/site-packages/watson_developer_cloud/alchemy_language_v1.pyc\u001b[0m in \u001b[0;36mtargeted_sentiment\u001b[1;34m(self, targets, html, text, url, language, constraint_query, xpath_query, show_source_text, source_text_type)\u001b[0m\n\u001b[0;32m    320\u001b[0m                   'sourceText': source_text_type}\n\u001b[0;32m    321\u001b[0m         return self._alchemy_html_request('GetTargetedSentiment', html=html,\n\u001b[1;32m--> 322\u001b[1;33m                                           text=text, url=url, params=params)\n\u001b[0m", 
 52 |                         "\u001b[1;32m/gpfs/fs01/user/s2de-6c3e21af46e198-7a7840b6cba3/.local/lib/python2.7/site-packages/watson_developer_cloud/watson_developer_cloud_service.pyc\u001b[0m in \u001b[0;36m_alchemy_html_request\u001b[1;34m(self, method_name, url, html, text, params, method, method_url)\u001b[0m\n\u001b[0;32m    225\u001b[0m         return self.request(method=method, url=method_url, params=params,\n\u001b[0;32m    226\u001b[0m                             \u001b[0mdata\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0murl_encoded_params\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 227\u001b[1;33m                             accept_json=True)\n\u001b[0m\u001b[0;32m    228\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    229\u001b[0m     def _alchemy_image_request(self, method_name, image_file=None,\n", 
 53 |                         "\u001b[1;32m/gpfs/fs01/user/s2de-6c3e21af46e198-7a7840b6cba3/.local/lib/python2.7/site-packages/watson_developer_cloud/watson_developer_cloud_service.pyc\u001b[0m in \u001b[0;36mrequest\u001b[1;34m(self, method, url, accept_json, headers, params, json, data, files, **kwargs)\u001b[0m\n\u001b[0;32m    309\u001b[0m                     \u001b[1;32mif\u001b[0m \u001b[0merror_message\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'invalid-api-key'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    310\u001b[0m                         \u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstatus_code\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m401\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 311\u001b[1;33m                     \u001b[1;32mraise\u001b[0m \u001b[0mWatsonException\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Error: '\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0merror_message\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    312\u001b[0m                 \u001b[1;32mreturn\u001b[0m \u001b[0mresponse_json\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    313\u001b[0m             \u001b[1;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 
 54 |                         "\u001b[1;31mWatsonException\u001b[0m: Error: daily-transaction-limit-exceeded"
 55 |                     ], 
 56 |                     "ename": "WatsonException", 
 57 |                     "output_type": "error"
 58 |                 }
 59 |             ], 
 60 |             "cell_type": "code", 
 61 |             "source": "print(json.dumps(alchemy_language.targeted_sentiment(text='I love cats! Dogs are smelly.',targets=['cats', 'dogs'],language='english'), indent=2))"
 62 |         }, 
 63 |         {
 64 |             "execution_count": null, 
 65 |             "metadata": {
 66 |                 "collapsed": true
 67 |             }, 
 68 |             "outputs": [], 
 69 |             "cell_type": "code", 
 70 |             "source": "print(json.dumps(alchemy_language.targeted_emotion(text='I love apples. I hate bananas', targets=['apples','bananas'], language='english'), indent=2))"
 71 |         }, 
 72 |         {
 73 |             "execution_count": null, 
 74 |             "metadata": {
 75 |                 "collapsed": true
 76 |             }, 
 77 |             "outputs": [], 
 78 |             "cell_type": "code", 
 79 |             "source": "print(json.dumps(alchemy_language.author(url=url), indent=2))"
 80 |         }, 
 81 |         {
 82 |             "execution_count": null, 
 83 |             "metadata": {
 84 |                 "collapsed": true
 85 |             }, 
 86 |             "outputs": [], 
 87 |             "cell_type": "code", 
 88 |             "source": "print(json.dumps(alchemy_language.concepts(max_items=2, url=url), indent=2))"
 89 |         }, 
 90 |         {
 91 |             "execution_count": null, 
 92 |             "metadata": {
 93 |                 "collapsed": true
 94 |             }, 
 95 |             "outputs": [], 
 96 |             "cell_type": "code", 
 97 |             "source": "print(json.dumps(alchemy_language.dates(url=url, anchor_date='2016-03-22 00:00:00'), indent=2))"
 98 |         }, 
 99 |         {
100 |             "execution_count": null, 
101 |             "metadata": {
102 |                 "collapsed": true
103 |             }, 
104 |             "outputs": [], 
105 |             "cell_type": "code", 
106 |             "source": "print(json.dumps(alchemy_language.emotion(url=url), indent=2))"
107 |         }, 
108 |         {
109 |             "execution_count": null, 
110 |             "metadata": {
111 |                 "collapsed": true
112 |             }, 
113 |             "outputs": [], 
114 |             "cell_type": "code", 
115 |             "source": "print(json.dumps(alchemy_language.entities(url=url), indent=2))"
116 |         }, 
117 |         {
118 |             "execution_count": null, 
119 |             "metadata": {
120 |                 "collapsed": true
121 |             }, 
122 |             "outputs": [], 
123 |             "cell_type": "code", 
124 |             "source": "print(json.dumps(alchemy_language.entities(url=url), indent=2))"
125 |         }, 
126 |         {
127 |             "execution_count": null, 
128 |             "metadata": {
129 |                 "collapsed": true
130 |             }, 
131 |             "outputs": [], 
132 |             "cell_type": "code", 
133 |             "source": "print(json.dumps(alchemy_language.keywords(max_items=5, url=url), indent=2))"
134 |         }, 
135 |         {
136 |             "execution_count": null, 
137 |             "metadata": {
138 |                 "collapsed": true
139 |             }, 
140 |             "outputs": [], 
141 |             "cell_type": "code", 
142 |             "source": "print(json.dumps(alchemy_language.category(url=url), indent=2))"
143 |         }, 
144 |         {
145 |             "execution_count": null, 
146 |             "metadata": {
147 |                 "collapsed": true
148 |             }, 
149 |             "outputs": [], 
150 |             "cell_type": "code", 
151 |             "source": "print(json.dumps(alchemy_language.typed_relations(url=url), indent=2))"
152 |         }, 
153 |         {
154 |             "execution_count": null, 
155 |             "metadata": {
156 |                 "collapsed": true
157 |             }, 
158 |             "outputs": [], 
159 |             "cell_type": "code", 
160 |             "source": "print(json.dumps(alchemy_language.relations(url=url), indent=2))"
161 |         }, 
162 |         {
163 |             "execution_count": null, 
164 |             "metadata": {
165 |                 "collapsed": true
166 |             }, 
167 |             "outputs": [], 
168 |             "cell_type": "code", 
169 |             "source": "print(json.dumps(alchemy_language.language(url=url), indent=2))"
170 |         }, 
171 |         {
172 |             "execution_count": null, 
173 |             "metadata": {
174 |                 "collapsed": true
175 |             }, 
176 |             "outputs": [], 
177 |             "cell_type": "code", 
178 |             "source": "print(json.dumps(alchemy_language.text(url=url), indent=2))"
179 |         }, 
180 |         {
181 |             "execution_count": null, 
182 |             "metadata": {
183 |                 "collapsed": true
184 |             }, 
185 |             "outputs": [], 
186 |             "cell_type": "code", 
187 |             "source": "print(json.dumps(alchemy_language.raw_text(url=url), indent=2))"
188 |         }, 
189 |         {
190 |             "execution_count": null, 
191 |             "metadata": {
192 |                 "collapsed": true
193 |             }, 
194 |             "outputs": [], 
195 |             "cell_type": "code", 
196 |             "source": "print(json.dumps(alchemy_language.title(url=url), indent=2))"
197 |         }, 
198 |         {
199 |             "execution_count": null, 
200 |             "metadata": {
201 |                 "collapsed": true
202 |             }, 
203 |             "outputs": [], 
204 |             "cell_type": "code", 
205 |             "source": "print(json.dumps(alchemy_language.feeds(url=url), indent=2))"
206 |         }, 
207 |         {
208 |             "execution_count": null, 
209 |             "metadata": {
210 |                 "collapsed": true
211 |             }, 
212 |             "outputs": [], 
213 |             "cell_type": "code", 
214 |             "source": "print(json.dumps(alchemy_language.microformats(url='http://microformats.org/wiki/hcard-examples'), indent=2))"
215 |         }, 
216 |         {
217 |             "execution_count": null, 
218 |             "metadata": {
219 |                 "collapsed": true
220 |             }, 
221 |             "outputs": [], 
222 |             "cell_type": "code", 
223 |             "source": "print(json.dumps(alchemy_language.publication_date(url=url), indent=2))"
224 |         }, 
225 |         {
226 |             "execution_count": null, 
227 |             "metadata": {
228 |                 "collapsed": true
229 |             }, 
230 |             "outputs": [], 
231 |             "cell_type": "code", 
232 |             "source": "print(json.dumps(alchemy_language.taxonomy(url=url), indent=2))"
233 |         }, 
234 |         {
235 |             "execution_count": null, 
236 |             "metadata": {
237 |                 "collapsed": true
238 |             }, 
239 |             "outputs": [], 
240 |             "cell_type": "code", 
241 |             "source": "combined_operations = ['page-image', 'entity', 'keyword', 'title', 'author','taxonomy', 'concept', 'doc-emotion']\nprint(json.dumps(alchemy_language.combined(url=url, extract=combined_operations),indent=2))"
242 |         }, 
243 |         {
244 |             "execution_count": null, 
245 |             "metadata": {
246 |                 "collapsed": true
247 |             }, 
248 |             "outputs": [], 
249 |             "cell_type": "code", 
250 |             "source": "# Get sentiment and emotion information results for detected entities/keywords:\nprint(json.dumps(alchemy_language.entities(url=url, sentiment=True,emotion=True), indent=2))"
251 |         }, 
252 |         {
253 |             "execution_count": null, 
254 |             "metadata": {
255 |                 "collapsed": true
256 |             }, 
257 |             "outputs": [], 
258 |             "cell_type": "code", 
259 |             "source": "print(json.dumps(alchemy_language.keywords(max_items=5, url=url,sentiment=True, emotion=True), indent=2))"
260 |         }
261 |     ], 
262 |     "metadata": {
263 |         "language_info": {
264 |             "nbconvert_exporter": "python", 
265 |             "file_extension": ".py", 
266 |             "codemirror_mode": {
267 |                 "name": "ipython", 
268 |                 "version": 2
269 |             }, 
270 |             "pygments_lexer": "ipython2", 
271 |             "version": "2.7.11", 
272 |             "mimetype": "text/x-python", 
273 |             "name": "python"
274 |         }, 
275 |         "kernelspec": {
276 |             "language": "python", 
277 |             "display_name": "Python 2 with Spark 1.6", 
278 |             "name": "python2"
279 |         }
280 |     }
281 | }


--------------------------------------------------------------------------------
/2. Watson APIs/2. Watson 4. Tone analyzer.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |     "nbformat": 4, 
 3 |     "nbformat_minor": 0, 
 4 |     "cells": [
 5 |         {
 6 |             "source": "# Tone analyzer", 
 7 |             "cell_type": "markdown", 
 8 |             "metadata": {
 9 |                 "collapsed": true
10 |             }
11 |         }, 
12 |         {
13 |             "source": "# The code was removed by DSX for sharing.", 
14 |             "execution_count": 1, 
15 |             "cell_type": "code", 
16 |             "metadata": {
17 |                 "collapsed": true
18 |             }, 
19 |             "outputs": []
20 |         }, 
21 |         {
22 |             "source": "import json\nfrom watson_developer_cloud import ToneAnalyzerV3\n\n\ntone_analyzer = ToneAnalyzerV3(\n    username=credentials_1['username'],\n    password=credentials_1['password'],\n    version='2016-02-11')\n\nprint(json.dumps(tone_analyzer.tone(text='I am very happy'), indent=2))", 
23 |             "execution_count": 3, 
24 |             "cell_type": "code", 
25 |             "metadata": {
26 |                 "collapsed": false
27 |             }, 
28 |             "outputs": [
29 |                 {
30 |                     "name": "stdout", 
31 |                     "text": "{\n  \"document_tone\": {\n    \"tone_categories\": [\n      {\n        \"category_id\": \"emotion_tone\", \n        \"tones\": [\n          {\n            \"tone_name\": \"Anger\", \n            \"score\": 0.006227, \n            \"tone_id\": \"anger\"\n          }, \n          {\n            \"tone_name\": \"Disgust\", \n            \"score\": 0.008777, \n            \"tone_id\": \"disgust\"\n          }, \n          {\n            \"tone_name\": \"Fear\", \n            \"score\": 0.007074, \n            \"tone_id\": \"fear\"\n          }, \n          {\n            \"tone_name\": \"Joy\", \n            \"score\": 0.973498, \n            \"tone_id\": \"joy\"\n          }, \n          {\n            \"tone_name\": \"Sadness\", \n            \"score\": 0.017861, \n            \"tone_id\": \"sadness\"\n          }\n        ], \n        \"category_name\": \"Emotion Tone\"\n      }, \n      {\n        \"category_id\": \"writing_tone\", \n        \"tones\": [\n          {\n            \"tone_name\": \"Analytical\", \n            \"score\": 0.0, \n            \"tone_id\": \"analytical\"\n          }, \n          {\n            \"tone_name\": \"Confident\", \n            \"score\": 0.97759, \n            \"tone_id\": \"confident\"\n          }, \n          {\n            \"tone_name\": \"Tentative\", \n            \"score\": 0.0, \n            \"tone_id\": \"tentative\"\n          }\n        ], \n        \"category_name\": \"Writing Tone\"\n      }, \n      {\n        \"category_id\": \"social_tone\", \n        \"tones\": [\n          {\n            \"tone_name\": \"Openness\", \n            \"score\": 0.096859, \n            \"tone_id\": \"openness_big5\"\n          }, \n          {\n            \"tone_name\": \"Conscientiousness\", \n            \"score\": 0.264058, \n            \"tone_id\": \"conscientiousness_big5\"\n          }, \n          {\n            \"tone_name\": \"Extraversion\", \n            \"score\": 0.472657, \n            \"tone_id\": \"extraversion_big5\"\n          }, \n          {\n            \"tone_name\": \"Agreeableness\", \n            \"score\": 0.61522, \n            \"tone_id\": \"agreeableness_big5\"\n          }, \n          {\n            \"tone_name\": \"Emotional Range\", \n            \"score\": 0.104851, \n            \"tone_id\": \"neuroticism_big5\"\n          }\n        ], \n        \"category_name\": \"Social Tone\"\n      }\n    ]\n  }\n}\n", 
32 |                     "output_type": "stream"
33 |                 }
34 |             ]
35 |         }
36 |     ], 
37 |     "metadata": {
38 |         "language_info": {
39 |             "nbconvert_exporter": "python", 
40 |             "mimetype": "text/x-python", 
41 |             "pygments_lexer": "ipython2", 
42 |             "version": "2.7.11", 
43 |             "file_extension": ".py", 
44 |             "name": "python", 
45 |             "codemirror_mode": {
46 |                 "name": "ipython", 
47 |                 "version": 2
48 |             }
49 |         }, 
50 |         "kernelspec": {
51 |             "name": "python2", 
52 |             "display_name": "Python 2 with Spark 1.6", 
53 |             "language": "python"
54 |         }
55 |     }
56 | }


--------------------------------------------------------------------------------
/2. Watson APIs/2. Watson 5. Natural language classifier.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "nbformat": 4, 
  3 |     "nbformat_minor": 0, 
  4 |     "cells": [
  5 |         {
  6 |             "source": "# Natural language classifier", 
  7 |             "cell_type": "markdown", 
  8 |             "metadata": {
  9 |                 "collapsed": true
 10 |             }
 11 |         }, 
 12 |         {
 13 |             "source": "# The code was removed by DSX for sharing.", 
 14 |             "execution_count": 1, 
 15 |             "cell_type": "code", 
 16 |             "metadata": {
 17 |                 "collapsed": true
 18 |             }, 
 19 |             "outputs": []
 20 |         }, 
 21 |         {
 22 |             "source": "import json\nfrom watson_developer_cloud import NaturalLanguageClassifierV1\n\nnatural_language_classifier = NaturalLanguageClassifierV1(\n    username=credentials_1['username'],\n    password=credentials_1['password'])\n\nclassifiers = natural_language_classifier.list()\nprint(json.dumps(classifiers, indent=2))", 
 23 |             "execution_count": 3, 
 24 |             "cell_type": "code", 
 25 |             "metadata": {
 26 |                 "collapsed": false
 27 |             }, 
 28 |             "outputs": [
 29 |                 {
 30 |                     "name": "stdout", 
 31 |                     "text": "{\n  \"classifiers\": []\n}\n", 
 32 |                     "output_type": "stream"
 33 |                 }
 34 |             ]
 35 |         }, 
 36 |         {
 37 |             "source": "#create a classifier\nimport urllib\nurllib.urlretrieve (\"https://raw.githubusercontent.com/analytics-bootcamp/Training-material/master/7.%20Training%20-%20Misc/weather.txt\", \"weather.txt\")\n\nwith open('weather.txt', 'rb') as training_data:\n     print(json.dumps(natural_language_classifier.create(training_data=training_data, name='weather'), indent=2))\n", 
 38 |             "execution_count": 10, 
 39 |             "cell_type": "code", 
 40 |             "metadata": {
 41 |                 "scrolled": true, 
 42 |                 "collapsed": false
 43 |             }, 
 44 |             "outputs": [
 45 |                 {
 46 |                     "name": "stdout", 
 47 |                     "text": "{\n  \"status\": \"Training\", \n  \"name\": \"weather\", \n  \"language\": \"en\", \n  \"created\": \"2017-03-09T04:15:28.693Z\", \n  \"url\": \"https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/4d5c10x177-nlc-2873\", \n  \"status_description\": \"The classifier instance is in its training phase, not yet ready to accept classify requests\", \n  \"classifier_id\": \"4d5c10x177-nlc-2873\"\n}\n", 
 48 |                     "output_type": "stream"
 49 |                 }
 50 |             ]
 51 |         }, 
 52 |         {
 53 |             "source": "# replace 2374f9x68-nlc-2697 with your classifier id\nstatus = natural_language_classifier.status('4d5c10x177-nlc-2873')\nprint(json.dumps(status, indent=2))", 
 54 |             "execution_count": 13, 
 55 |             "cell_type": "code", 
 56 |             "metadata": {
 57 |                 "collapsed": false
 58 |             }, 
 59 |             "outputs": [
 60 |                 {
 61 |                     "name": "stdout", 
 62 |                     "text": "{\n  \"status\": \"Training\", \n  \"name\": \"weather\", \n  \"language\": \"en\", \n  \"created\": \"2017-03-09T04:15:28.693Z\", \n  \"url\": \"https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/4d5c10x177-nlc-2873\", \n  \"status_description\": \"The classifier instance is in its training phase, not yet ready to accept classify requests\", \n  \"classifier_id\": \"4d5c10x177-nlc-2873\"\n}\n", 
 63 |                     "output_type": "stream"
 64 |                 }
 65 |             ]
 66 |         }, 
 67 |         {
 68 |             "source": "status = natural_language_classifier.status('4d5c10x177-nlc-2873')\nprint(json.dumps(status, indent=2))\n\nif status['status'] == 'Available':\n    classes = natural_language_classifier.classify('4d5c10x177-nlc-2873','How hot will it be tomorrow?')\n    print(json.dumps(classes, indent=2))\n\n", 
 69 |             "execution_count": 14, 
 70 |             "cell_type": "code", 
 71 |             "metadata": {
 72 |                 "collapsed": false
 73 |             }, 
 74 |             "outputs": [
 75 |                 {
 76 |                     "name": "stdout", 
 77 |                     "text": "{\n  \"status\": \"Training\", \n  \"name\": \"weather\", \n  \"language\": \"en\", \n  \"created\": \"2017-03-09T04:15:28.693Z\", \n  \"url\": \"https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/4d5c10x177-nlc-2873\", \n  \"status_description\": \"The classifier instance is in its training phase, not yet ready to accept classify requests\", \n  \"classifier_id\": \"4d5c10x177-nlc-2873\"\n}\n", 
 78 |                     "output_type": "stream"
 79 |                 }
 80 |             ]
 81 |         }, 
 82 |         {
 83 |             "source": "delete = natural_language_classifier.remove('4d5c10x177-nlc-2873')\nprint(json.dumps(delete, indent=2))", 
 84 |             "execution_count": null, 
 85 |             "cell_type": "code", 
 86 |             "metadata": {
 87 |                 "collapsed": true
 88 |             }, 
 89 |             "outputs": []
 90 |         }
 91 |     ], 
 92 |     "metadata": {
 93 |         "language_info": {
 94 |             "nbconvert_exporter": "python", 
 95 |             "mimetype": "text/x-python", 
 96 |             "pygments_lexer": "ipython2", 
 97 |             "version": "2.7.11", 
 98 |             "file_extension": ".py", 
 99 |             "name": "python", 
100 |             "codemirror_mode": {
101 |                 "name": "ipython", 
102 |                 "version": 2
103 |             }
104 |         }, 
105 |         "kernelspec": {
106 |             "name": "python2", 
107 |             "display_name": "Python 2 with Spark 1.6", 
108 |             "language": "python"
109 |         }
110 |     }
111 | }


--------------------------------------------------------------------------------
/3. Visualization/Python_Bokeh_Cheat_Sheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/3. Visualization/Python_Bokeh_Cheat_Sheet.pdf


--------------------------------------------------------------------------------
/3. Visualization/Python_Matplotlib_Cheat_Sheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/3. Visualization/Python_Matplotlib_Cheat_Sheet.pdf


--------------------------------------------------------------------------------
/4. Spark/4. Spark 0. rdd-creation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": "",
  4 |   "signature": "sha256:89b31567699d26877d1a7406cc718f5609a31c4d05e95c8a8ec474b0f62daa56"
  5 |  },
  6 |  "nbformat": 3,
  7 |  "nbformat_minor": 0,
  8 |  "worksheets": [
  9 |   {
 10 |    "cells": [
 11 |     {
 12 |      "cell_type": "heading",
 13 |      "level": 1,
 14 |      "metadata": {},
 15 |      "source": [
 16 |       "RDD creation"
 17 |      ]
 18 |     },
 19 |     {
 20 |      "cell_type": "heading",
 21 |      "level": 4,
 22 |      "metadata": {},
 23 |      "source": [
 24 |       "[Introduction to Spark with Python, by Jose A. Dianes](https://github.com/jadianes/spark-py-notebooks)"
 25 |      ]
 26 |     },
 27 |     {
 28 |      "cell_type": "markdown",
 29 |      "metadata": {},
 30 |      "source": [
 31 |       "In this notebook we will introduce two different ways of getting data into the basic Spark data structure, the **Resilient Distributed Dataset** or **RDD**. An RDD is a distributed collection of elements. All work in Spark is expressed as either creating new RDDs, transforming existing RDDs, or calling actions on RDDs to compute a result. Spark automatically distributes the data contained in RDDs across your cluster and parallelizes the operations you perform on them."
 32 |      ]
 33 |     },
 34 |     {
 35 |      "cell_type": "heading",
 36 |      "level": 4,
 37 |      "metadata": {},
 38 |      "source": [
 39 |       "References"
 40 |      ]
 41 |     },
 42 |     {
 43 |      "cell_type": "markdown",
 44 |      "metadata": {},
 45 |      "source": [
 46 |       "The reference book for these and other Spark related topics is *Learning Spark* by Holden Karau, Andy Konwinski, Patrick Wendell, and Matei Zaharia.  "
 47 |      ]
 48 |     },
 49 |     {
 50 |      "cell_type": "markdown",
 51 |      "metadata": {},
 52 |      "source": [
 53 |       "The KDD Cup 1999 competition dataset is described in detail [here](http://kdd.ics.uci.edu/databases/kddcup99/kddcup99)."
 54 |      ]
 55 |     },
 56 |     {
 57 |      "cell_type": "heading",
 58 |      "level": 2,
 59 |      "metadata": {},
 60 |      "source": [
 61 |       "Getting the data files  "
 62 |      ]
 63 |     },
 64 |     {
 65 |      "cell_type": "markdown",
 66 |      "metadata": {},
 67 |      "source": [
 68 |       "In this notebook we will use the reduced dataset (10 percent) provided for the KDD Cup 1999, containing nearly half million network interactions. The file is provided as a *Gzip* file that we will download locally.  "
 69 |      ]
 70 |     },
 71 |     {
 72 |      "cell_type": "code",
 73 |      "collapsed": false,
 74 |      "input": [
 75 |       "import urllib\n",
 76 |       "f = urllib.urlretrieve (\"http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz\", \"kddcup.data_10_percent.gz\")"
 77 |      ],
 78 |      "language": "python",
 79 |      "metadata": {},
 80 |      "outputs": [],
 81 |      "prompt_number": 31
 82 |     },
 83 |     {
 84 |      "cell_type": "heading",
 85 |      "level": 2,
 86 |      "metadata": {},
 87 |      "source": [
 88 |       "Creating a RDD from a file  "
 89 |      ]
 90 |     },
 91 |     {
 92 |      "cell_type": "markdown",
 93 |      "metadata": {},
 94 |      "source": [
 95 |       "The most common way of creating an RDD is to load it from a file. Notice that Spark's `textFile` can handle compressed files directly.    "
 96 |      ]
 97 |     },
 98 |     {
 99 |      "cell_type": "code",
100 |      "collapsed": false,
101 |      "input": [
102 |       "data_file = \"./kddcup.data_10_percent.gz\"\n",
103 |       "raw_data = sc.textFile(data_file)"
104 |      ],
105 |      "language": "python",
106 |      "metadata": {},
107 |      "outputs": [],
108 |      "prompt_number": 32
109 |     },
110 |     {
111 |      "cell_type": "markdown",
112 |      "metadata": {},
113 |      "source": [
114 |       "Now we have our data file loaded into the `raw_data` RDD."
115 |      ]
116 |     },
117 |     {
118 |      "cell_type": "markdown",
119 |      "metadata": {},
120 |      "source": [
121 |       "Without getting into Spark *transformations* and *actions*, the most basic thing we can do to check that we got our RDD contents right is to `count()` the number of lines loaded from the file into the RDD.  "
122 |      ]
123 |     },
124 |     {
125 |      "cell_type": "code",
126 |      "collapsed": false,
127 |      "input": [
128 |       "raw_data.count()"
129 |      ],
130 |      "language": "python",
131 |      "metadata": {},
132 |      "outputs": [
133 |       {
134 |        "metadata": {},
135 |        "output_type": "pyout",
136 |        "prompt_number": 33,
137 |        "text": [
138 |         "494021"
139 |        ]
140 |       }
141 |      ],
142 |      "prompt_number": 33
143 |     },
144 |     {
145 |      "cell_type": "markdown",
146 |      "metadata": {},
147 |      "source": [
148 |       "We can also check the first few entries in our data.  "
149 |      ]
150 |     },
151 |     {
152 |      "cell_type": "code",
153 |      "collapsed": false,
154 |      "input": [
155 |       "raw_data.take(5)"
156 |      ],
157 |      "language": "python",
158 |      "metadata": {},
159 |      "outputs": [
160 |       {
161 |        "metadata": {},
162 |        "output_type": "pyout",
163 |        "prompt_number": 34,
164 |        "text": [
165 |         "[u'0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,9,9,1.00,0.00,0.11,0.00,0.00,0.00,0.00,0.00,normal.',\n",
166 |         " u'0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,19,19,1.00,0.00,0.05,0.00,0.00,0.00,0.00,0.00,normal.',\n",
167 |         " u'0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,29,29,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.',\n",
168 |         " u'0,tcp,http,SF,219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,0.00,0.00,39,39,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.',\n",
169 |         " u'0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,0.00,0.00,49,49,1.00,0.00,0.02,0.00,0.00,0.00,0.00,0.00,normal.']"
170 |        ]
171 |       }
172 |      ],
173 |      "prompt_number": 34
174 |     },
175 |     {
176 |      "cell_type": "markdown",
177 |      "metadata": {},
178 |      "source": [
179 |       "In the following notebooks, we will use this raw data to learn about the different Spark transformations and actions.  "
180 |      ]
181 |     },
182 |     {
183 |      "cell_type": "heading",
184 |      "level": 2,
185 |      "metadata": {},
186 |      "source": [
187 |       "Creating and RDD using `parallelize`"
188 |      ]
189 |     },
190 |     {
191 |      "cell_type": "markdown",
192 |      "metadata": {},
193 |      "source": [
194 |       "Another way of creating an RDD is to parallelize an already existing list.  "
195 |      ]
196 |     },
197 |     {
198 |      "cell_type": "code",
199 |      "collapsed": false,
200 |      "input": [
201 |       "a = range(100)\n",
202 |       "\n",
203 |       "data = sc.parallelize(a)"
204 |      ],
205 |      "language": "python",
206 |      "metadata": {},
207 |      "outputs": [],
208 |      "prompt_number": 35
209 |     },
210 |     {
211 |      "cell_type": "markdown",
212 |      "metadata": {},
213 |      "source": [
214 |       "As we did before, we can `count()` the number of elements in the RDD."
215 |      ]
216 |     },
217 |     {
218 |      "cell_type": "code",
219 |      "collapsed": false,
220 |      "input": [
221 |       "data.count()"
222 |      ],
223 |      "language": "python",
224 |      "metadata": {},
225 |      "outputs": [
226 |       {
227 |        "metadata": {},
228 |        "output_type": "pyout",
229 |        "prompt_number": 36,
230 |        "text": [
231 |         "100"
232 |        ]
233 |       }
234 |      ],
235 |      "prompt_number": 36
236 |     },
237 |     {
238 |      "cell_type": "markdown",
239 |      "metadata": {},
240 |      "source": [
241 |       "As before, we can access the first few elements on our RDD.  "
242 |      ]
243 |     },
244 |     {
245 |      "cell_type": "code",
246 |      "collapsed": false,
247 |      "input": [
248 |       "data.take(5)"
249 |      ],
250 |      "language": "python",
251 |      "metadata": {},
252 |      "outputs": [
253 |       {
254 |        "metadata": {},
255 |        "output_type": "pyout",
256 |        "prompt_number": 37,
257 |        "text": [
258 |         "[0, 1, 2, 3, 4]"
259 |        ]
260 |       }
261 |      ],
262 |      "prompt_number": 37
263 |     }
264 |    ],
265 |    "metadata": {}
266 |   }
267 |  ]
268 | }


--------------------------------------------------------------------------------
/4. Spark/4. Spark 1. rdd-basics.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": "",
  4 |   "signature": "sha256:7ce5292fff087bd3fe623675ed06dd472f9e0de945d9b383f83f9f151eb1eaad"
  5 |  },
  6 |  "nbformat": 3,
  7 |  "nbformat_minor": 0,
  8 |  "worksheets": [
  9 |   {
 10 |    "cells": [
 11 |     {
 12 |      "cell_type": "heading",
 13 |      "level": 1,
 14 |      "metadata": {},
 15 |      "source": [
 16 |       "RDD basics"
 17 |      ]
 18 |     },
 19 |     {
 20 |      "cell_type": "heading",
 21 |      "level": 4,
 22 |      "metadata": {},
 23 |      "source": [
 24 |       "[Introduction to Spark with Python, by Jose A. Dianes](https://github.com/jadianes/spark-py-notebooks)"
 25 |      ]
 26 |     },
 27 |     {
 28 |      "cell_type": "markdown",
 29 |      "metadata": {},
 30 |      "source": [
 31 |       "This notebook will introduce three basic but essential Spark operations. Two of them are the *transformations* `map` and `filter`. The other is the *action* `collect`. At the same time we will introduce the concept of *persistence* in Spark.    "
 32 |      ]
 33 |     },
 34 |     {
 35 |      "cell_type": "heading",
 36 |      "level": 2,
 37 |      "metadata": {},
 38 |      "source": [
 39 |       "Getting the data and creating the RDD"
 40 |      ]
 41 |     },
 42 |     {
 43 |      "cell_type": "markdown",
 44 |      "metadata": {},
 45 |      "source": [
 46 |       "As we did in our first notebook, we will use the reduced dataset (10 percent) provided for the KDD Cup 1999, containing nearly half million network interactions. The file is provided as a Gzip file that we will download locally."
 47 |      ]
 48 |     },
 49 |     {
 50 |      "cell_type": "code",
 51 |      "collapsed": false,
 52 |      "input": [
 53 |       "import urllib\n",
 54 |       "f = urllib.urlretrieve (\"http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz\", \"kddcup.data_10_percent.gz\")"
 55 |      ],
 56 |      "language": "python",
 57 |      "metadata": {},
 58 |      "outputs": []
 59 |     },
 60 |     {
 61 |      "cell_type": "markdown",
 62 |      "metadata": {},
 63 |      "source": [
 64 |       "Now we can use this file to create our RDD."
 65 |      ]
 66 |     },
 67 |     {
 68 |      "cell_type": "code",
 69 |      "collapsed": false,
 70 |      "input": [
 71 |       "data_file = \"./kddcup.data_10_percent.gz\"\n",
 72 |       "raw_data = sc.textFile(data_file)"
 73 |      ],
 74 |      "language": "python",
 75 |      "metadata": {},
 76 |      "outputs": [],
 77 |      "prompt_number": 1
 78 |     },
 79 |     {
 80 |      "cell_type": "heading",
 81 |      "level": 2,
 82 |      "metadata": {},
 83 |      "source": [
 84 |       "The `filter` transformation"
 85 |      ]
 86 |     },
 87 |     {
 88 |      "cell_type": "markdown",
 89 |      "metadata": {},
 90 |      "source": [
 91 |       "This transformation can be applied to RDDs in order to keep just elements that satisfy a certain condition. More concretely, a function is evaluated on every element in the original RDD. The new resulting RDD will contain just those elements that make the function return `True`."
 92 |      ]
 93 |     },
 94 |     {
 95 |      "cell_type": "markdown",
 96 |      "metadata": {},
 97 |      "source": [
 98 |       "For example, imagine we want to count how many `normal.` interactions we have in our dataset. We can filter our `raw_data` RDD as follows.  "
 99 |      ]
100 |     },
101 |     {
102 |      "cell_type": "code",
103 |      "collapsed": false,
104 |      "input": [
105 |       "normal_raw_data = raw_data.filter(lambda x: 'normal.' in x)"
106 |      ],
107 |      "language": "python",
108 |      "metadata": {},
109 |      "outputs": [],
110 |      "prompt_number": 2
111 |     },
112 |     {
113 |      "cell_type": "markdown",
114 |      "metadata": {},
115 |      "source": [
116 |       "Now we can count how many elements we have in the new RDD."
117 |      ]
118 |     },
119 |     {
120 |      "cell_type": "code",
121 |      "collapsed": false,
122 |      "input": [
123 |       "from time import time\n",
124 |       "t0 = time()\n",
125 |       "normal_count = normal_raw_data.count()\n",
126 |       "tt = time() - t0\n",
127 |       "print \"There are {} 'normal' interactions\".format(normal_count)\n",
128 |       "print \"Count completed in {} seconds\".format(round(tt,3))"
129 |      ],
130 |      "language": "python",
131 |      "metadata": {},
132 |      "outputs": [
133 |       {
134 |        "output_type": "stream",
135 |        "stream": "stdout",
136 |        "text": [
137 |         "There are 97278 'normal' interactions\n",
138 |         "Count completed in 5.951 seconds\n"
139 |        ]
140 |       }
141 |      ],
142 |      "prompt_number": 3
143 |     },
144 |     {
145 |      "cell_type": "markdown",
146 |      "metadata": {},
147 |      "source": [
148 |       "Remember from notebook 1 that we have a total of 494021 in our 10 percent dataset. Here we can see that 97278 contain the `normal.` tag word.  "
149 |      ]
150 |     },
151 |     {
152 |      "cell_type": "markdown",
153 |      "metadata": {},
154 |      "source": [
155 |       "Notice that we have measured the elapsed time for counting the elements in the RDD. We have done this because we wanted to point out that actual (distributed) computations in Spark take place when we execute *actions* and not *transformations*. In this case `count` is the action we execute on the RDD. We can apply as many transformations as we want on a our RDD and no computation will take place until we call the first action that, in this case takes a few seconds to complete."
156 |      ]
157 |     },
158 |     {
159 |      "cell_type": "heading",
160 |      "level": 2,
161 |      "metadata": {},
162 |      "source": [
163 |       "The `map` transformation"
164 |      ]
165 |     },
166 |     {
167 |      "cell_type": "markdown",
168 |      "metadata": {},
169 |      "source": [
170 |       "By using the `map` transformation in Spark, we can apply a function to every element in our RDD. Python's lambdas are specially expressive for this particular."
171 |      ]
172 |     },
173 |     {
174 |      "cell_type": "markdown",
175 |      "metadata": {},
176 |      "source": [
177 |       "In this case we want to read our data file as a CSV formatted one. We can do this by applying a lambda function to each element in the RDD as follows."
178 |      ]
179 |     },
180 |     {
181 |      "cell_type": "code",
182 |      "collapsed": false,
183 |      "input": [
184 |       "from pprint import pprint\n",
185 |       "csv_data = raw_data.map(lambda x: x.split(\",\"))\n",
186 |       "t0 = time()\n",
187 |       "head_rows = csv_data.take(5)\n",
188 |       "tt = time() - t0\n",
189 |       "print \"Parse completed in {} seconds\".format(round(tt,3))\n",
190 |       "pprint(head_rows[0])"
191 |      ],
192 |      "language": "python",
193 |      "metadata": {},
194 |      "outputs": [
195 |       {
196 |        "output_type": "stream",
197 |        "stream": "stdout",
198 |        "text": [
199 |         "Parse completed in 1.715 seconds\n",
200 |         "[u'0',\n",
201 |         " u'tcp',\n",
202 |         " u'http',\n",
203 |         " u'SF',\n",
204 |         " u'181',\n",
205 |         " u'5450',\n",
206 |         " u'0',\n",
207 |         " u'0',\n",
208 |         " u'0',\n",
209 |         " u'0',\n",
210 |         " u'0',\n",
211 |         " u'1',\n",
212 |         " u'0',\n",
213 |         " u'0',\n",
214 |         " u'0',\n",
215 |         " u'0',\n",
216 |         " u'0',\n",
217 |         " u'0',\n",
218 |         " u'0',\n",
219 |         " u'0',\n",
220 |         " u'0',\n",
221 |         " u'0',\n",
222 |         " u'8',\n",
223 |         " u'8',\n",
224 |         " u'0.00',\n",
225 |         " u'0.00',\n",
226 |         " u'0.00',\n",
227 |         " u'0.00',\n",
228 |         " u'1.00',\n",
229 |         " u'0.00',\n",
230 |         " u'0.00',\n",
231 |         " u'9',\n",
232 |         " u'9',\n",
233 |         " u'1.00',\n",
234 |         " u'0.00',\n",
235 |         " u'0.11',\n",
236 |         " u'0.00',\n",
237 |         " u'0.00',\n",
238 |         " u'0.00',\n",
239 |         " u'0.00',\n",
240 |         " u'0.00',\n",
241 |         " u'normal.']\n"
242 |        ]
243 |       }
244 |      ],
245 |      "prompt_number": 4
246 |     },
247 |     {
248 |      "cell_type": "markdown",
249 |      "metadata": {},
250 |      "source": [
251 |       "Again, all action happens once we call the first Spark *action* (i.e. *take* in this case). What if we take a lot of elements instead of just the first few?  "
252 |      ]
253 |     },
254 |     {
255 |      "cell_type": "code",
256 |      "collapsed": false,
257 |      "input": [
258 |       "t0 = time()\n",
259 |       "head_rows = csv_data.take(100000)\n",
260 |       "tt = time() - t0\n",
261 |       "print \"Parse completed in {} seconds\".format(round(tt,3))"
262 |      ],
263 |      "language": "python",
264 |      "metadata": {},
265 |      "outputs": [
266 |       {
267 |        "output_type": "stream",
268 |        "stream": "stdout",
269 |        "text": [
270 |         "Parse completed in 8.629 seconds\n"
271 |        ]
272 |       }
273 |      ],
274 |      "prompt_number": 5
275 |     },
276 |     {
277 |      "cell_type": "markdown",
278 |      "metadata": {},
279 |      "source": [
280 |       "We can see that it takes longer. The `map` function is applied now in a  distributed way to a lot of elements on the RDD, hence the longer execution time."
281 |      ]
282 |     },
283 |     {
284 |      "cell_type": "heading",
285 |      "level": 3,
286 |      "metadata": {},
287 |      "source": [
288 |       "Using `map` and predefined functions"
289 |      ]
290 |     },
291 |     {
292 |      "cell_type": "markdown",
293 |      "metadata": {},
294 |      "source": [
295 |       "Of course we can use predefined functions with `map`. Imagine we want to have each element in the RDD as a key-value pair where the key is the tag (e.g. *normal*) and the value is the whole list of elements that represents the row in the CSV formatted file. We could proceed as follows.    "
296 |      ]
297 |     },
298 |     {
299 |      "cell_type": "code",
300 |      "collapsed": false,
301 |      "input": [
302 |       "def parse_interaction(line):\n",
303 |       "    elems = line.split(\",\")\n",
304 |       "    tag = elems[41]\n",
305 |       "    return (tag, elems)\n",
306 |       "\n",
307 |       "key_csv_data = raw_data.map(parse_interaction)\n",
308 |       "head_rows = key_csv_data.take(5)\n",
309 |       "pprint(head_rows[0])"
310 |      ],
311 |      "language": "python",
312 |      "metadata": {},
313 |      "outputs": [
314 |       {
315 |        "output_type": "stream",
316 |        "stream": "stdout",
317 |        "text": [
318 |         "(u'normal.',\n",
319 |         " [u'0',\n",
320 |         "  u'tcp',\n",
321 |         "  u'http',\n",
322 |         "  u'SF',\n",
323 |         "  u'181',\n",
324 |         "  u'5450',\n",
325 |         "  u'0',\n",
326 |         "  u'0',\n",
327 |         "  u'0',\n",
328 |         "  u'0',\n",
329 |         "  u'0',\n",
330 |         "  u'1',\n",
331 |         "  u'0',\n",
332 |         "  u'0',\n",
333 |         "  u'0',\n",
334 |         "  u'0',\n",
335 |         "  u'0',\n",
336 |         "  u'0',\n",
337 |         "  u'0',\n",
338 |         "  u'0',\n",
339 |         "  u'0',\n",
340 |         "  u'0',\n",
341 |         "  u'8',\n",
342 |         "  u'8',\n",
343 |         "  u'0.00',\n",
344 |         "  u'0.00',\n",
345 |         "  u'0.00',\n",
346 |         "  u'0.00',\n",
347 |         "  u'1.00',\n",
348 |         "  u'0.00',\n",
349 |         "  u'0.00',\n",
350 |         "  u'9',\n",
351 |         "  u'9',\n",
352 |         "  u'1.00',\n",
353 |         "  u'0.00',\n",
354 |         "  u'0.11',\n",
355 |         "  u'0.00',\n",
356 |         "  u'0.00',\n",
357 |         "  u'0.00',\n",
358 |         "  u'0.00',\n",
359 |         "  u'0.00',\n",
360 |         "  u'normal.'])\n"
361 |        ]
362 |       }
363 |      ],
364 |      "prompt_number": 6
365 |     },
366 |     {
367 |      "cell_type": "markdown",
368 |      "metadata": {},
369 |      "source": [
370 |       "That was easy, wasn't it?"
371 |      ]
372 |     },
373 |     {
374 |      "cell_type": "markdown",
375 |      "metadata": {},
376 |      "source": [
377 |       "In our notebook about working with key-value pairs we will use this type of RDDs to do data aggregations (e.g. count by key)."
378 |      ]
379 |     },
380 |     {
381 |      "cell_type": "heading",
382 |      "level": 2,
383 |      "metadata": {},
384 |      "source": [
385 |       "The `collect` action"
386 |      ]
387 |     },
388 |     {
389 |      "cell_type": "markdown",
390 |      "metadata": {},
391 |      "source": [
392 |       "So far we have used the actions `count` and `take`. Another basic action we need to learn is `collect`. Basically it will get all the elements in the RDD into memory for us to work with them. For this reason it has to be used with care, specially when working with large RDDs.  "
393 |      ]
394 |     },
395 |     {
396 |      "cell_type": "markdown",
397 |      "metadata": {},
398 |      "source": [
399 |       "An example using our raw data.    "
400 |      ]
401 |     },
402 |     {
403 |      "cell_type": "code",
404 |      "collapsed": false,
405 |      "input": [
406 |       "t0 = time()\n",
407 |       "all_raw_data = raw_data.collect()\n",
408 |       "tt = time() - t0\n",
409 |       "print \"Data collected in {} seconds\".format(round(tt,3))"
410 |      ],
411 |      "language": "python",
412 |      "metadata": {},
413 |      "outputs": [
414 |       {
415 |        "output_type": "stream",
416 |        "stream": "stdout",
417 |        "text": [
418 |         "Data collected in 17.927 seconds\n"
419 |        ]
420 |       }
421 |      ],
422 |      "prompt_number": 9
423 |     },
424 |     {
425 |      "cell_type": "markdown",
426 |      "metadata": {},
427 |      "source": [
428 |       "That took longer as any other action we used before, of course. Every Spark worker node that has a fragment of the RDD has to be coordinated in order to retrieve its part, and then *reduce* everything together.    "
429 |      ]
430 |     },
431 |     {
432 |      "cell_type": "markdown",
433 |      "metadata": {},
434 |      "source": [
435 |       "As a last example combining all the previous, we want to collect all the `normal` interactions as key-value pairs.   "
436 |      ]
437 |     },
438 |     {
439 |      "cell_type": "code",
440 |      "collapsed": false,
441 |      "input": [
442 |       "# get data from file\n",
443 |       "data_file = \"./kddcup.data_10_percent.gz\"\n",
444 |       "raw_data = sc.textFile(data_file)\n",
445 |       "\n",
446 |       "# parse into key-value pairs\n",
447 |       "key_csv_data = raw_data.map(parse_interaction)\n",
448 |       "\n",
449 |       "# filter normal key interactions\n",
450 |       "normal_key_interactions = key_csv_data.filter(lambda x: x[0] == \"normal.\")\n",
451 |       "\n",
452 |       "# collect all\n",
453 |       "t0 = time()\n",
454 |       "all_normal = normal_key_interactions.collect()\n",
455 |       "tt = time() - t0\n",
456 |       "normal_count = len(all_normal)\n",
457 |       "print \"Data collected in {} seconds\".format(round(tt,3))\n",
458 |       "print \"There are {} 'normal' interactions\".format(normal_count)"
459 |      ],
460 |      "language": "python",
461 |      "metadata": {},
462 |      "outputs": [
463 |       {
464 |        "output_type": "stream",
465 |        "stream": "stdout",
466 |        "text": [
467 |         "Data collected in 12.485 seconds\n",
468 |         "There are 97278 normal interactions\n"
469 |        ]
470 |       }
471 |      ],
472 |      "prompt_number": 13
473 |     },
474 |     {
475 |      "cell_type": "markdown",
476 |      "metadata": {},
477 |      "source": [
478 |       "This count matches with the previous count for `normal` interactions. The new procedure is more time consuming. This is because we retrieve all the data with `collect` and then use Python's `len` on the resulting list. Before we were just counting the total number of elements in the RDD by using `count`.  "
479 |      ]
480 |     }
481 |    ],
482 |    "metadata": {}
483 |   }
484 |  ]
485 | }


--------------------------------------------------------------------------------
/4. Spark/4. Spark 2. rdd-sampling.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": "",
  4 |   "signature": "sha256:8581d2dfe951591985d0f9eb665f33044c479321d2a0b77699d2d79ad8ef0641"
  5 |  },
  6 |  "nbformat": 3,
  7 |  "nbformat_minor": 0,
  8 |  "worksheets": [
  9 |   {
 10 |    "cells": [
 11 |     {
 12 |      "cell_type": "heading",
 13 |      "level": 1,
 14 |      "metadata": {},
 15 |      "source": [
 16 |       "Sampling RDDs"
 17 |      ]
 18 |     },
 19 |     {
 20 |      "cell_type": "markdown",
 21 |      "metadata": {},
 22 |      "source": [
 23 |       "[Introduction to Spark with Python, by Jose A. Dianes](https://github.com/jadianes/spark-py-notebooks)"
 24 |      ]
 25 |     },
 26 |     {
 27 |      "cell_type": "markdown",
 28 |      "metadata": {},
 29 |      "source": [
 30 |       "So far we have introduced RDD creation together with some basic transformations such as `map` and `filter` and some actions such as `count`, `take`, and `collect`.  "
 31 |      ]
 32 |     },
 33 |     {
 34 |      "cell_type": "markdown",
 35 |      "metadata": {},
 36 |      "source": [
 37 |       "This notebook will show how to sample RDDs. Regarding transformations, `sample` will be introduced since it will be useful in many statistical learning scenarios. Then we will compare results with the `takeSample` action.      "
 38 |      ]
 39 |     },
 40 |     {
 41 |      "cell_type": "heading",
 42 |      "level": 2,
 43 |      "metadata": {},
 44 |      "source": [
 45 |       "Getting the data and creating the RDD"
 46 |      ]
 47 |     },
 48 |     {
 49 |      "cell_type": "markdown",
 50 |      "metadata": {},
 51 |      "source": [
 52 |       "In this case we will use the complete dataset provided for the KDD Cup 1999, containing nearly half million network interactions. The file is provided as a Gzip file that we will download locally."
 53 |      ]
 54 |     },
 55 |     {
 56 |      "cell_type": "code",
 57 |      "collapsed": false,
 58 |      "input": [
 59 |       "import urllib\n",
 60 |       "f = urllib.urlretrieve (\"http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz\", \"kddcup.data.gz\")"
 61 |      ],
 62 |      "language": "python",
 63 |      "metadata": {},
 64 |      "outputs": [],
 65 |      "prompt_number": 1
 66 |     },
 67 |     {
 68 |      "cell_type": "markdown",
 69 |      "metadata": {},
 70 |      "source": [
 71 |       "Now we can use this file to create our RDD."
 72 |      ]
 73 |     },
 74 |     {
 75 |      "cell_type": "code",
 76 |      "collapsed": false,
 77 |      "input": [
 78 |       "data_file = \"./kddcup.data.gz\"\n",
 79 |       "raw_data = sc.textFile(data_file)"
 80 |      ],
 81 |      "language": "python",
 82 |      "metadata": {},
 83 |      "outputs": [],
 84 |      "prompt_number": 2
 85 |     },
 86 |     {
 87 |      "cell_type": "heading",
 88 |      "level": 2,
 89 |      "metadata": {},
 90 |      "source": [
 91 |       "Sampling RDDs   "
 92 |      ]
 93 |     },
 94 |     {
 95 |      "cell_type": "markdown",
 96 |      "metadata": {},
 97 |      "source": [
 98 |       "In Spark, there are two sampling operations, the transformation `sample` and the action `takeSample`. By using a transformation we can tell Spark to apply successive transformation on a sample of a given RDD. By using an action we retrieve a given sample and we can have it in local memory to be used by any other standard library (e.g. Scikit-learn).  "
 99 |      ]
100 |     },
101 |     {
102 |      "cell_type": "heading",
103 |      "level": 3,
104 |      "metadata": {},
105 |      "source": [
106 |       "The `sample` transformation"
107 |      ]
108 |     },
109 |     {
110 |      "cell_type": "markdown",
111 |      "metadata": {},
112 |      "source": [
113 |       "The `sample` transformation takes up to three parameters. First is whether the sampling is done with replacement or not. Second is the sample size as a fraction. Finally we can optionally provide a *random seed*.  "
114 |      ]
115 |     },
116 |     {
117 |      "cell_type": "code",
118 |      "collapsed": false,
119 |      "input": [
120 |       "raw_data_sample = raw_data.sample(False, 0.1, 1234)\n",
121 |       "sample_size = raw_data_sample.count()\n",
122 |       "total_size = raw_data.count()\n",
123 |       "print \"Sample size is {} of {}\".format(sample_size, total_size)"
124 |      ],
125 |      "language": "python",
126 |      "metadata": {},
127 |      "outputs": [
128 |       {
129 |        "output_type": "stream",
130 |        "stream": "stdout",
131 |        "text": [
132 |         "Sample size is 489957 of 4898431\n"
133 |        ]
134 |       }
135 |      ],
136 |      "prompt_number": 3
137 |     },
138 |     {
139 |      "cell_type": "markdown",
140 |      "metadata": {},
141 |      "source": [
142 |       "But the power of sampling as a transformation comes from doing it as part of a sequence of additional transformations. This will show more powerful once we start doing aggregations and key-value pairs operations, and will be specially useful when using Spark's machine learning library MLlib.    "
143 |      ]
144 |     },
145 |     {
146 |      "cell_type": "markdown",
147 |      "metadata": {},
148 |      "source": [
149 |       "In the meantime, imagine we want to have an approximation of the proportion of `normal.` interactions in our dataset. We could do this by counting the total number of tags as we did in previous notebooks. However we want a quicker response and we don't need the exact answer but just an approximation. We can do it as follows.   "
150 |      ]
151 |     },
152 |     {
153 |      "cell_type": "code",
154 |      "collapsed": false,
155 |      "input": [
156 |       "from time import time\n",
157 |       "\n",
158 |       "# transformations to be applied\n",
159 |       "raw_data_sample_items = raw_data_sample.map(lambda x: x.split(\",\"))\n",
160 |       "sample_normal_tags = raw_data_sample_items.filter(lambda x: \"normal.\" in x)\n",
161 |       "\n",
162 |       "# actions + time\n",
163 |       "t0 = time()\n",
164 |       "sample_normal_tags_count = sample_normal_tags.count()\n",
165 |       "tt = time() - t0\n",
166 |       "\n",
167 |       "sample_normal_ratio = sample_normal_tags_count / float(sample_size)\n",
168 |       "print \"The ratio of 'normal' interactions is {}\".format(round(sample_normal_ratio,3)) \n",
169 |       "print \"Count done in {} seconds\".format(round(tt,3))"
170 |      ],
171 |      "language": "python",
172 |      "metadata": {},
173 |      "outputs": [
174 |       {
175 |        "output_type": "stream",
176 |        "stream": "stdout",
177 |        "text": [
178 |         "The ratio of 'normal' interactions is 0.199\n",
179 |         "Count done in 44.523 seconds\n"
180 |        ]
181 |       }
182 |      ],
183 |      "prompt_number": 4
184 |     },
185 |     {
186 |      "cell_type": "markdown",
187 |      "metadata": {},
188 |      "source": [
189 |       "Let's compare this with calculating the ratio without sampling.  "
190 |      ]
191 |     },
192 |     {
193 |      "cell_type": "code",
194 |      "collapsed": false,
195 |      "input": [
196 |       "# transformations to be applied\n",
197 |       "raw_data_items = raw_data.map(lambda x: x.split(\",\"))\n",
198 |       "normal_tags = raw_data_items.filter(lambda x: \"normal.\" in x)\n",
199 |       "\n",
200 |       "# actions + time\n",
201 |       "t0 = time()\n",
202 |       "normal_tags_count = normal_tags.count()\n",
203 |       "tt = time() - t0\n",
204 |       "\n",
205 |       "normal_ratio = normal_tags_count / float(total_size)\n",
206 |       "print \"The ratio of 'normal' interactions is {}\".format(round(normal_ratio,3)) \n",
207 |       "print \"Count done in {} seconds\".format(round(tt,3))"
208 |      ],
209 |      "language": "python",
210 |      "metadata": {},
211 |      "outputs": [
212 |       {
213 |        "output_type": "stream",
214 |        "stream": "stdout",
215 |        "text": [
216 |         "The ratio of 'normal' interactions is 0.199\n",
217 |         "Count done in 91.09 seconds\n"
218 |        ]
219 |       }
220 |      ],
221 |      "prompt_number": 5
222 |     },
223 |     {
224 |      "cell_type": "markdown",
225 |      "metadata": {},
226 |      "source": [
227 |       "We can see a gain in time. The more transformations we apply after the sampling the bigger this gain. This is because without sampling all the transformations are applied to the complete set of data.  "
228 |      ]
229 |     },
230 |     {
231 |      "cell_type": "heading",
232 |      "level": 3,
233 |      "metadata": {},
234 |      "source": [
235 |       "The `takeSample` action  "
236 |      ]
237 |     },
238 |     {
239 |      "cell_type": "markdown",
240 |      "metadata": {},
241 |      "source": [
242 |       "If what we need is to grab a sample of raw data from our RDD into local memory in order to be used by other non-Spark libraries, `takeSample` can be used.  "
243 |      ]
244 |     },
245 |     {
246 |      "cell_type": "markdown",
247 |      "metadata": {},
248 |      "source": [
249 |       "The syntax is very similar, but in this case we specify the number of items instead of the sample size as a fraction of the complete data size.  "
250 |      ]
251 |     },
252 |     {
253 |      "cell_type": "code",
254 |      "collapsed": false,
255 |      "input": [
256 |       "t0 = time()\n",
257 |       "raw_data_sample = raw_data.takeSample(False, 400000, 1234)\n",
258 |       "normal_data_sample = [x.split(\",\") for x in raw_data_sample if \"normal.\" in x]\n",
259 |       "tt = time() - t0\n",
260 |       "\n",
261 |       "normal_sample_size = len(normal_data_sample)\n",
262 |       "\n",
263 |       "normal_ratio = normal_sample_size / 400000.0\n",
264 |       "print \"The ratio of 'normal' interactions is {}\".format(normal_ratio)\n",
265 |       "print \"Count done in {} seconds\".format(round(tt,3))"
266 |      ],
267 |      "language": "python",
268 |      "metadata": {},
269 |      "outputs": [
270 |       {
271 |        "output_type": "stream",
272 |        "stream": "stdout",
273 |        "text": [
274 |         "The ratio of 'normal' interactions is 0.1988025\n",
275 |         "Count done in 76.166 seconds\n"
276 |        ]
277 |       }
278 |      ],
279 |      "prompt_number": 6
280 |     },
281 |     {
282 |      "cell_type": "markdown",
283 |      "metadata": {},
284 |      "source": [
285 |       "The process was very similar as before. We obtained a sample of about 10 percent of the data, and then filter and split.  \n",
286 |       "\n",
287 |       "However, it took longer, even with a slightly smaller sample. The reason is that Spark just distributed the execution of the sampling process. The filtering and splitting of the results were done locally in a single node.  "
288 |      ]
289 |     }
290 |    ],
291 |    "metadata": {}
292 |   }
293 |  ]
294 | }
295 | 


--------------------------------------------------------------------------------
/4. Spark/4. Spark 3. rdd-set.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": "",
  4 |   "signature": "sha256:123c139134363a65ac4461e7d98848e74ede7989fa222d57f2ff95d79405e114"
  5 |  },
  6 |  "nbformat": 3,
  7 |  "nbformat_minor": 0,
  8 |  "worksheets": [
  9 |   {
 10 |    "cells": [
 11 |     {
 12 |      "cell_type": "heading",
 13 |      "level": 1,
 14 |      "metadata": {},
 15 |      "source": [
 16 |       "Set operations on RDDs"
 17 |      ]
 18 |     },
 19 |     {
 20 |      "cell_type": "markdown",
 21 |      "metadata": {},
 22 |      "source": [
 23 |       "[Introduction to Spark with Python, by Jose A. Dianes](https://github.com/jadianes/spark-py-notebooks)"
 24 |      ]
 25 |     },
 26 |     {
 27 |      "cell_type": "markdown",
 28 |      "metadata": {},
 29 |      "source": [
 30 |       "Spark supports many of the operations we have in mathematical sets, such as union and intersection, even when the RDDs themselves are not properly sets. It is important to note that these operations require that the RDDs being operated on are of the same type.  "
 31 |      ]
 32 |     },
 33 |     {
 34 |      "cell_type": "markdown",
 35 |      "metadata": {},
 36 |      "source": [
 37 |       "Set operations are quite straightforward to understand as it work as expected. The only consideration comes from the fact that RDDs are not real sets, and therefore operations such as the union of RDDs doesn't remove duplicates. In this notebook we will have a brief look at `subtract`, `distinct`, and `cartesian`.       "
 38 |      ]
 39 |     },
 40 |     {
 41 |      "cell_type": "heading",
 42 |      "level": 2,
 43 |      "metadata": {},
 44 |      "source": [
 45 |       "Getting the data and creating the RDD"
 46 |      ]
 47 |     },
 48 |     {
 49 |      "cell_type": "markdown",
 50 |      "metadata": {},
 51 |      "source": [
 52 |       "As we did in our first notebook, we will use the reduced dataset (10 percent) provided for the KDD Cup 1999, containing nearly half million network interactions. The file is provided as a Gzip file that we will download locally."
 53 |      ]
 54 |     },
 55 |     {
 56 |      "cell_type": "code",
 57 |      "collapsed": false,
 58 |      "input": [
 59 |       "import urllib\n",
 60 |       "f = urllib.urlretrieve (\"http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz\", \"kddcup.data_10_percent.gz\")"
 61 |      ],
 62 |      "language": "python",
 63 |      "metadata": {},
 64 |      "outputs": [],
 65 |      "prompt_number": 1
 66 |     },
 67 |     {
 68 |      "cell_type": "code",
 69 |      "collapsed": false,
 70 |      "input": [
 71 |       "data_file = \"./kddcup.data_10_percent.gz\"\n",
 72 |       "raw_data = sc.textFile(data_file)"
 73 |      ],
 74 |      "language": "python",
 75 |      "metadata": {},
 76 |      "outputs": [],
 77 |      "prompt_number": 2
 78 |     },
 79 |     {
 80 |      "cell_type": "heading",
 81 |      "level": 2,
 82 |      "metadata": {},
 83 |      "source": [
 84 |       "Getting attack interactions using `subtract`"
 85 |      ]
 86 |     },
 87 |     {
 88 |      "cell_type": "markdown",
 89 |      "metadata": {},
 90 |      "source": [
 91 |       "For illustrative purposes, imagine we already have our RDD with non attack (normal) interactions from some previous analysis.   "
 92 |      ]
 93 |     },
 94 |     {
 95 |      "cell_type": "code",
 96 |      "collapsed": false,
 97 |      "input": [
 98 |       "normal_raw_data = raw_data.filter(lambda x: \"normal.\" in x)"
 99 |      ],
100 |      "language": "python",
101 |      "metadata": {},
102 |      "outputs": [],
103 |      "prompt_number": 3
104 |     },
105 |     {
106 |      "cell_type": "markdown",
107 |      "metadata": {},
108 |      "source": [
109 |       "We can obtain attack interactions by subtracting normal ones from the original unfiltered RDD as follows.  "
110 |      ]
111 |     },
112 |     {
113 |      "cell_type": "code",
114 |      "collapsed": false,
115 |      "input": [
116 |       "attack_raw_data = raw_data.subtract(normal_raw_data)"
117 |      ],
118 |      "language": "python",
119 |      "metadata": {},
120 |      "outputs": [],
121 |      "prompt_number": 4
122 |     },
123 |     {
124 |      "cell_type": "markdown",
125 |      "metadata": {},
126 |      "source": [
127 |       "Let's do some counts to check our results.  "
128 |      ]
129 |     },
130 |     {
131 |      "cell_type": "code",
132 |      "collapsed": false,
133 |      "input": [
134 |       "from time import time\n",
135 |       "\n",
136 |       "# count all\n",
137 |       "t0 = time()\n",
138 |       "raw_data_count = raw_data.count()\n",
139 |       "tt = time() - t0\n",
140 |       "print \"All count in {} secs\".format(round(tt,3))"
141 |      ],
142 |      "language": "python",
143 |      "metadata": {},
144 |      "outputs": [
145 |       {
146 |        "output_type": "stream",
147 |        "stream": "stdout",
148 |        "text": [
149 |         "All count in 5.261 secs\n"
150 |        ]
151 |       }
152 |      ],
153 |      "prompt_number": 5
154 |     },
155 |     {
156 |      "cell_type": "code",
157 |      "collapsed": false,
158 |      "input": [
159 |       "# count normal\n",
160 |       "t0 = time()\n",
161 |       "normal_raw_data_count = normal_raw_data.count()\n",
162 |       "tt = time() - t0\n",
163 |       "print \"Normal count in {} secs\".format(round(tt,3))"
164 |      ],
165 |      "language": "python",
166 |      "metadata": {},
167 |      "outputs": [
168 |       {
169 |        "output_type": "stream",
170 |        "stream": "stdout",
171 |        "text": [
172 |         "Normal count in 5.571 secs\n"
173 |        ]
174 |       }
175 |      ],
176 |      "prompt_number": 6
177 |     },
178 |     {
179 |      "cell_type": "code",
180 |      "collapsed": false,
181 |      "input": [
182 |       "# count attacks\n",
183 |       "t0 = time()\n",
184 |       "attack_raw_data_count = attack_raw_data.count()\n",
185 |       "tt = time() - t0\n",
186 |       "print \"Attack count in {} secs\".format(round(tt,3))"
187 |      ],
188 |      "language": "python",
189 |      "metadata": {},
190 |      "outputs": [
191 |       {
192 |        "output_type": "stream",
193 |        "stream": "stdout",
194 |        "text": [
195 |         "Attack count in 12.075 secs\n"
196 |        ]
197 |       }
198 |      ],
199 |      "prompt_number": 7
200 |     },
201 |     {
202 |      "cell_type": "code",
203 |      "collapsed": false,
204 |      "input": [
205 |       "print \"There are {} normal interactions and {} attacks, \\\n",
206 |       "from a total of {} interactions\".format(normal_raw_data_count,attack_raw_data_count,raw_data_count)"
207 |      ],
208 |      "language": "python",
209 |      "metadata": {},
210 |      "outputs": [
211 |       {
212 |        "output_type": "stream",
213 |        "stream": "stdout",
214 |        "text": [
215 |         "There are 97278 normal interactions and 396743 attacks, from a total of 494021 interactions\n"
216 |        ]
217 |       }
218 |      ],
219 |      "prompt_number": 8
220 |     },
221 |     {
222 |      "cell_type": "markdown",
223 |      "metadata": {},
224 |      "source": [
225 |       "So now we have two RDDs, one with normal interactions and another one with attacks.  "
226 |      ]
227 |     },
228 |     {
229 |      "cell_type": "heading",
230 |      "level": 2,
231 |      "metadata": {},
232 |      "source": [
233 |       "Protocol and service combinations using `cartesian`"
234 |      ]
235 |     },
236 |     {
237 |      "cell_type": "markdown",
238 |      "metadata": {},
239 |      "source": [
240 |       "We can compute the Cartesian product between two RDDs by using the `cartesian` transformation. It returns all possible pairs of elements between two RDDs. In our case we will use it to generate all the possible combinations between service and protocol in our network interactions.  \n",
241 |       "\n",
242 |       "First of all we need to isolate each collection of values in two separate RDDs. For that we will use `distinct` on the CSV-parsed dataset. From the [dataset description](http://kdd.ics.uci.edu/databases/kddcup99/kddcup.names) we know that protocol is the second column and service is the third (tag is the last one and not the first as appears in the page).   "
243 |      ]
244 |     },
245 |     {
246 |      "cell_type": "markdown",
247 |      "metadata": {},
248 |      "source": [
249 |       "So first, let's get the protocols.  "
250 |      ]
251 |     },
252 |     {
253 |      "cell_type": "code",
254 |      "collapsed": false,
255 |      "input": [
256 |       "csv_data = raw_data.map(lambda x: x.split(\",\"))\n",
257 |       "protocols = csv_data.map(lambda x: x[1]).distinct()\n",
258 |       "protocols.collect()"
259 |      ],
260 |      "language": "python",
261 |      "metadata": {},
262 |      "outputs": [
263 |       {
264 |        "metadata": {},
265 |        "output_type": "pyout",
266 |        "prompt_number": 9,
267 |        "text": [
268 |         "[u'udp', u'icmp', u'tcp']"
269 |        ]
270 |       }
271 |      ],
272 |      "prompt_number": 9
273 |     },
274 |     {
275 |      "cell_type": "markdown",
276 |      "metadata": {},
277 |      "source": [
278 |       "Now we do the same for services.  "
279 |      ]
280 |     },
281 |     {
282 |      "cell_type": "code",
283 |      "collapsed": false,
284 |      "input": [
285 |       "services = csv_data.map(lambda x: x[2]).distinct()\n",
286 |       "services.collect()"
287 |      ],
288 |      "language": "python",
289 |      "metadata": {},
290 |      "outputs": [
291 |       {
292 |        "metadata": {},
293 |        "output_type": "pyout",
294 |        "prompt_number": 10,
295 |        "text": [
296 |         "[u'domain',\n",
297 |         " u'http_443',\n",
298 |         " u'Z39_50',\n",
299 |         " u'smtp',\n",
300 |         " u'urp_i',\n",
301 |         " u'private',\n",
302 |         " u'echo',\n",
303 |         " u'shell',\n",
304 |         " u'red_i',\n",
305 |         " u'eco_i',\n",
306 |         " u'sunrpc',\n",
307 |         " u'ftp_data',\n",
308 |         " u'urh_i',\n",
309 |         " u'pm_dump',\n",
310 |         " u'pop_3',\n",
311 |         " u'pop_2',\n",
312 |         " u'systat',\n",
313 |         " u'ftp',\n",
314 |         " u'uucp',\n",
315 |         " u'whois',\n",
316 |         " u'netbios_dgm',\n",
317 |         " u'efs',\n",
318 |         " u'remote_job',\n",
319 |         " u'daytime',\n",
320 |         " u'ntp_u',\n",
321 |         " u'finger',\n",
322 |         " u'ldap',\n",
323 |         " u'netbios_ns',\n",
324 |         " u'kshell',\n",
325 |         " u'iso_tsap',\n",
326 |         " u'ecr_i',\n",
327 |         " u'nntp',\n",
328 |         " u'printer',\n",
329 |         " u'domain_u',\n",
330 |         " u'uucp_path',\n",
331 |         " u'courier',\n",
332 |         " u'exec',\n",
333 |         " u'time',\n",
334 |         " u'netstat',\n",
335 |         " u'telnet',\n",
336 |         " u'gopher',\n",
337 |         " u'rje',\n",
338 |         " u'sql_net',\n",
339 |         " u'link',\n",
340 |         " u'auth',\n",
341 |         " u'netbios_ssn',\n",
342 |         " u'csnet_ns',\n",
343 |         " u'X11',\n",
344 |         " u'IRC',\n",
345 |         " u'tftp_u',\n",
346 |         " u'login',\n",
347 |         " u'supdup',\n",
348 |         " u'name',\n",
349 |         " u'nnsp',\n",
350 |         " u'mtp',\n",
351 |         " u'http',\n",
352 |         " u'bgp',\n",
353 |         " u'ctf',\n",
354 |         " u'hostnames',\n",
355 |         " u'klogin',\n",
356 |         " u'vmnet',\n",
357 |         " u'tim_i',\n",
358 |         " u'discard',\n",
359 |         " u'imap4',\n",
360 |         " u'other',\n",
361 |         " u'ssh']"
362 |        ]
363 |       }
364 |      ],
365 |      "prompt_number": 10
366 |     },
367 |     {
368 |      "cell_type": "markdown",
369 |      "metadata": {},
370 |      "source": [
371 |       "A longer list in this case."
372 |      ]
373 |     },
374 |     {
375 |      "cell_type": "markdown",
376 |      "metadata": {},
377 |      "source": [
378 |       "Now we can do the cartesian product.  "
379 |      ]
380 |     },
381 |     {
382 |      "cell_type": "code",
383 |      "collapsed": false,
384 |      "input": [
385 |       "product = protocols.cartesian(services).collect()\n",
386 |       "print \"There are {} combinations of protocol X service\".format(len(product))"
387 |      ],
388 |      "language": "python",
389 |      "metadata": {},
390 |      "outputs": [
391 |       {
392 |        "output_type": "stream",
393 |        "stream": "stdout",
394 |        "text": [
395 |         "There are 198 combinations of protocol X service\n"
396 |        ]
397 |       }
398 |      ],
399 |      "prompt_number": 11
400 |     },
401 |     {
402 |      "cell_type": "markdown",
403 |      "metadata": {},
404 |      "source": [
405 |       "Obviously, for such small RDDs doesn't really make sense to use Spark cartesian product. We could have perfectly collected the values after using `distinct` and do the cartesian product locally. Moreover, `distinct` and `cartesian` are expensive operations so they must be used with care when the operating datasets are large.    "
406 |      ]
407 |     }
408 |    ],
409 |    "metadata": {}
410 |   }
411 |  ]
412 | }


--------------------------------------------------------------------------------
/4. Spark/4. Spark 4. rdd-aggregations.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": "",
  4 |   "signature": "sha256:11079f4265aa0d15e0bf53fe2dd27e64eb926948e4a2f0f43e8e08a276da43f4"
  5 |  },
  6 |  "nbformat": 3,
  7 |  "nbformat_minor": 0,
  8 |  "worksheets": [
  9 |   {
 10 |    "cells": [
 11 |     {
 12 |      "cell_type": "heading",
 13 |      "level": 1,
 14 |      "metadata": {},
 15 |      "source": [
 16 |       "Data aggregations on RDDs"
 17 |      ]
 18 |     },
 19 |     {
 20 |      "cell_type": "markdown",
 21 |      "metadata": {},
 22 |      "source": [
 23 |       "[Introduction to Spark with Python, by Jose A. Dianes](https://github.com/jadianes/spark-py-notebooks)"
 24 |      ]
 25 |     },
 26 |     {
 27 |      "cell_type": "markdown",
 28 |      "metadata": {},
 29 |      "source": [
 30 |       "We can aggregate RDD data in Spark by using three different actions: `reduce`, `fold`, and `aggregate`. The last one is the more general one and someway includes the first two.  "
 31 |      ]
 32 |     },
 33 |     {
 34 |      "cell_type": "heading",
 35 |      "level": 2,
 36 |      "metadata": {},
 37 |      "source": [
 38 |       "Getting the data and creating the RDD"
 39 |      ]
 40 |     },
 41 |     {
 42 |      "cell_type": "markdown",
 43 |      "metadata": {},
 44 |      "source": [
 45 |       "As we did in our first notebook, we will use the reduced dataset (10 percent) provided for the [KDD Cup 1999](http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html), containing nearly half million nework interactions. The file is provided as a Gzip file that we will download locally.  "
 46 |      ]
 47 |     },
 48 |     {
 49 |      "cell_type": "code",
 50 |      "collapsed": false,
 51 |      "input": [
 52 |       "import urllib\n",
 53 |       "f = urllib.urlretrieve (\"http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz\", \"kddcup.data_10_percent.gz\")"
 54 |      ],
 55 |      "language": "python",
 56 |      "metadata": {},
 57 |      "outputs": [],
 58 |      "prompt_number": 1
 59 |     },
 60 |     {
 61 |      "cell_type": "code",
 62 |      "collapsed": false,
 63 |      "input": [
 64 |       "data_file = \"./kddcup.data_10_percent.gz\"\n",
 65 |       "raw_data = sc.textFile(data_file)"
 66 |      ],
 67 |      "language": "python",
 68 |      "metadata": {},
 69 |      "outputs": [],
 70 |      "prompt_number": 2
 71 |     },
 72 |     {
 73 |      "cell_type": "heading",
 74 |      "level": 2,
 75 |      "metadata": {},
 76 |      "source": [
 77 |       "Inspecting interaction duration by tag"
 78 |      ]
 79 |     },
 80 |     {
 81 |      "cell_type": "markdown",
 82 |      "metadata": {},
 83 |      "source": [
 84 |       "Both `fold` and `reduce` take a function as an argument that is applied to two elements of the RDD. The `fold` action differs from `reduce` in that it gets and additional initial *zero value* to be used for the initial call. This value should be the identity element for the function provided.  "
 85 |      ]
 86 |     },
 87 |     {
 88 |      "cell_type": "markdown",
 89 |      "metadata": {},
 90 |      "source": [
 91 |       "As an example, imagine we want to know the total duration of our interactions for normal and attack interactions. We can use `reduce` as follows.    "
 92 |      ]
 93 |     },
 94 |     {
 95 |      "cell_type": "code",
 96 |      "collapsed": false,
 97 |      "input": [
 98 |       "# parse data\n",
 99 |       "csv_data = raw_data.map(lambda x: x.split(\",\"))\n",
100 |       "\n",
101 |       "# separate into different RDDs\n",
102 |       "normal_csv_data = csv_data.filter(lambda x: x[41]==\"normal.\")\n",
103 |       "attack_csv_data = csv_data.filter(lambda x: x[41]!=\"normal.\")"
104 |      ],
105 |      "language": "python",
106 |      "metadata": {},
107 |      "outputs": [],
108 |      "prompt_number": 3
109 |     },
110 |     {
111 |      "cell_type": "markdown",
112 |      "metadata": {},
113 |      "source": [
114 |       "The function that we pass to `reduce` gets and returns elements of the same type of the RDD. If we want to sum durations we need to extract that element into a new RDD.  "
115 |      ]
116 |     },
117 |     {
118 |      "cell_type": "code",
119 |      "collapsed": false,
120 |      "input": [
121 |       "normal_duration_data = normal_csv_data.map(lambda x: int(x[0]))\n",
122 |       "attack_duration_data = attack_csv_data.map(lambda x: int(x[0]))"
123 |      ],
124 |      "language": "python",
125 |      "metadata": {},
126 |      "outputs": [],
127 |      "prompt_number": 4
128 |     },
129 |     {
130 |      "cell_type": "markdown",
131 |      "metadata": {},
132 |      "source": [
133 |       "Now we can reduce these new RDDs.  "
134 |      ]
135 |     },
136 |     {
137 |      "cell_type": "code",
138 |      "collapsed": false,
139 |      "input": [
140 |       "total_normal_duration = normal_duration_data.reduce(lambda x, y: x + y)\n",
141 |       "total_attack_duration = attack_duration_data.reduce(lambda x, y: x + y)\n",
142 |       "\n",
143 |       "print \"Total duration for 'normal' interactions is {}\".\\\n",
144 |       "    format(total_normal_duration)\n",
145 |       "print \"Total duration for 'attack' interactions is {}\".\\\n",
146 |       "    format(total_attack_duration)"
147 |      ],
148 |      "language": "python",
149 |      "metadata": {},
150 |      "outputs": [
151 |       {
152 |        "output_type": "stream",
153 |        "stream": "stdout",
154 |        "text": [
155 |         "Total duration for 'normal' interactions is 21075991\n",
156 |         "Total duration for 'attack' interactions is 2626792\n"
157 |        ]
158 |       }
159 |      ],
160 |      "prompt_number": 5
161 |     },
162 |     {
163 |      "cell_type": "markdown",
164 |      "metadata": {},
165 |      "source": [
166 |       "We can go further and use counts to calculate duration means.  "
167 |      ]
168 |     },
169 |     {
170 |      "cell_type": "code",
171 |      "collapsed": false,
172 |      "input": [
173 |       "normal_count = normal_duration_data.count()\n",
174 |       "attack_count = attack_duration_data.count()\n",
175 |       "\n",
176 |       "print \"Mean duration for 'normal' interactions is {}\".\\\n",
177 |       "    format(round(total_normal_duration/float(normal_count),3))\n",
178 |       "print \"Mean duration for 'attack' interactions is {}\".\\\n",
179 |       "    format(round(total_attack_duration/float(attack_count),3))"
180 |      ],
181 |      "language": "python",
182 |      "metadata": {},
183 |      "outputs": [
184 |       {
185 |        "output_type": "stream",
186 |        "stream": "stdout",
187 |        "text": [
188 |         "Mean duration for 'normal' interactions is 216.657\n",
189 |         "Mean duration for 'attack' interactions is 6.621\n"
190 |        ]
191 |       }
192 |      ],
193 |      "prompt_number": 6
194 |     },
195 |     {
196 |      "cell_type": "markdown",
197 |      "metadata": {},
198 |      "source": [
199 |       "We have a first (and too simplistic) approach to identify attack interactions."
200 |      ]
201 |     },
202 |     {
203 |      "cell_type": "heading",
204 |      "level": 2,
205 |      "metadata": {},
206 |      "source": [
207 |       "A better way, using `aggregate`  "
208 |      ]
209 |     },
210 |     {
211 |      "cell_type": "markdown",
212 |      "metadata": {},
213 |      "source": [
214 |       "The `aggregate` action frees us from the constraint of having the return be the same type as the RDD we are working on. Like with `fold`, we supply an initial zero value of the type we want to return. Then we provide two functions. The first one is used to combine the elements from our RDD with the accumulator. The second function is needed to merge two accumulators. Let's see it in action calculating the mean we did before.  "
215 |      ]
216 |     },
217 |     {
218 |      "cell_type": "code",
219 |      "collapsed": false,
220 |      "input": [
221 |       "normal_sum_count = normal_duration_data.aggregate(\n",
222 |       "    (0,0), # the initial value\n",
223 |       "    (lambda acc, value: (acc[0] + value, acc[1] + 1)), # combine value with acc\n",
224 |       "    (lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1])) # combine accumulators\n",
225 |       ")\n",
226 |       "\n",
227 |       "print \"Mean duration for 'normal' interactions is {}\".\\\n",
228 |       "    format(round(normal_sum_count[0]/float(normal_sum_count[1]),3))"
229 |      ],
230 |      "language": "python",
231 |      "metadata": {},
232 |      "outputs": [
233 |       {
234 |        "output_type": "stream",
235 |        "stream": "stdout",
236 |        "text": [
237 |         "Mean duration for 'normal' interactions is 216.657\n"
238 |        ]
239 |       }
240 |      ],
241 |      "prompt_number": 7
242 |     },
243 |     {
244 |      "cell_type": "markdown",
245 |      "metadata": {},
246 |      "source": [
247 |       "In the previous aggregation, the accumulator first element keeps the total sum, while the second element keeps the count. Combining an accumulator with an RDD element consists in summing up the value and incrementing the count. Combining two accumulators requires just a pairwise sum.  "
248 |      ]
249 |     },
250 |     {
251 |      "cell_type": "markdown",
252 |      "metadata": {},
253 |      "source": [
254 |       "We can do the same with attack type interactions.  "
255 |      ]
256 |     },
257 |     {
258 |      "cell_type": "code",
259 |      "collapsed": false,
260 |      "input": [
261 |       "attack_sum_count = attack_duration_data.aggregate(\n",
262 |       "    (0,0), # the initial value\n",
263 |       "    (lambda acc, value: (acc[0] + value, acc[1] + 1)), # combine value with acc\n",
264 |       "    (lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1])) # combine accumulators\n",
265 |       ")\n",
266 |       "\n",
267 |       "print \"Mean duration for 'attack' interactions is {}\".\\\n",
268 |       "    format(round(attack_sum_count[0]/float(attack_sum_count[1]),3))"
269 |      ],
270 |      "language": "python",
271 |      "metadata": {},
272 |      "outputs": [
273 |       {
274 |        "output_type": "stream",
275 |        "stream": "stdout",
276 |        "text": [
277 |         "Mean duration for 'attack' interactions is 6.621\n"
278 |        ]
279 |       }
280 |      ],
281 |      "prompt_number": 8
282 |     }
283 |    ],
284 |    "metadata": {}
285 |   }
286 |  ]
287 | }


--------------------------------------------------------------------------------
/4. Spark/4. Spark 5. rdd-key-value.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": "",
  4 |   "signature": "sha256:01c23f9757cbbcc111ac851dc4f48c2a8732f1271f4cd54e1f91839a9d62a8dc"
  5 |  },
  6 |  "nbformat": 3,
  7 |  "nbformat_minor": 0,
  8 |  "worksheets": [
  9 |   {
 10 |    "cells": [
 11 |     {
 12 |      "cell_type": "heading",
 13 |      "level": 1,
 14 |      "metadata": {},
 15 |      "source": [
 16 |       "Working with key/value pair RDDs"
 17 |      ]
 18 |     },
 19 |     {
 20 |      "cell_type": "markdown",
 21 |      "metadata": {},
 22 |      "source": [
 23 |       "[Introduction to Spark with Python, by Jose A. Dianes](https://github.com/jadianes/spark-py-notebooks)"
 24 |      ]
 25 |     },
 26 |     {
 27 |      "cell_type": "markdown",
 28 |      "metadata": {},
 29 |      "source": [
 30 |       "Spark provides specific functions to deal with RDDs which elements are key/value pairs. They are usually used to perform aggregations and other processings by key.  "
 31 |      ]
 32 |     },
 33 |     {
 34 |      "cell_type": "markdown",
 35 |      "metadata": {},
 36 |      "source": [
 37 |       "In this notebook we will show how, by working with key/value pairs, we can process our network interactions dataset in a more practical and powerful way than that used in previous notebooks. Key/value pair aggregations will show to be particularly effective when trying to explore each type of tag in our network attacks, in an individual way.  "
 38 |      ]
 39 |     },
 40 |     {
 41 |      "cell_type": "heading",
 42 |      "level": 2,
 43 |      "metadata": {},
 44 |      "source": [
 45 |       "Getting the data and creating the RDD"
 46 |      ]
 47 |     },
 48 |     {
 49 |      "cell_type": "markdown",
 50 |      "metadata": {},
 51 |      "source": [
 52 |       "As we did in our first notebook, we will use the reduced dataset (10 percent) provided for the [KDD Cup 1999](http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html), containing nearly half million network interactions. The file is provided as a Gzip file that we will download locally.  "
 53 |      ]
 54 |     },
 55 |     {
 56 |      "cell_type": "code",
 57 |      "collapsed": false,
 58 |      "input": [
 59 |       "import urllib\n",
 60 |       "f = urllib.urlretrieve (\"http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz\", \"kddcup.data_10_percent.gz\")"
 61 |      ],
 62 |      "language": "python",
 63 |      "metadata": {},
 64 |      "outputs": [],
 65 |      "prompt_number": 1
 66 |     },
 67 |     {
 68 |      "cell_type": "code",
 69 |      "collapsed": false,
 70 |      "input": [
 71 |       "data_file = \"./kddcup.data_10_percent.gz\"\n",
 72 |       "raw_data = sc.textFile(data_file)"
 73 |      ],
 74 |      "language": "python",
 75 |      "metadata": {},
 76 |      "outputs": [],
 77 |      "prompt_number": 1
 78 |     },
 79 |     {
 80 |      "cell_type": "heading",
 81 |      "level": 2,
 82 |      "metadata": {},
 83 |      "source": [
 84 |       "Creating a pair RDD for interaction types"
 85 |      ]
 86 |     },
 87 |     {
 88 |      "cell_type": "markdown",
 89 |      "metadata": {},
 90 |      "source": [
 91 |       "In this notebook we want to do some exploratory data analysis on our network interactions dataset. More concretely we want to profile each network interaction type in terms of some of its variables such as duration. In order to do so, we first need to create the RDD suitable for that, where each interaction is parsed as a CSV row representing the value, and is put together with its corresponding tag as a key.  "
 92 |      ]
 93 |     },
 94 |     {
 95 |      "cell_type": "markdown",
 96 |      "metadata": {},
 97 |      "source": [
 98 |       "Normally we create key/value pair RDDs by applying a function using `map` to the original data. This function returns the corresponding pair for a given RDD element. We can proceed as follows.  "
 99 |      ]
100 |     },
101 |     {
102 |      "cell_type": "code",
103 |      "collapsed": false,
104 |      "input": [
105 |       "csv_data = raw_data.map(lambda x: x.split(\",\"))\n",
106 |       "key_value_data = csv_data.map(lambda x: (x[41], x)) # x[41] contains the network interaction tag"
107 |      ],
108 |      "language": "python",
109 |      "metadata": {},
110 |      "outputs": [],
111 |      "prompt_number": 2
112 |     },
113 |     {
114 |      "cell_type": "markdown",
115 |      "metadata": {},
116 |      "source": [
117 |       "We have now our key/value pair data ready to be used. Let's get the first element in order to see how it looks like.  "
118 |      ]
119 |     },
120 |     {
121 |      "cell_type": "code",
122 |      "collapsed": false,
123 |      "input": [
124 |       "key_value_data.take(1)"
125 |      ],
126 |      "language": "python",
127 |      "metadata": {},
128 |      "outputs": [
129 |       {
130 |        "metadata": {},
131 |        "output_type": "pyout",
132 |        "prompt_number": 3,
133 |        "text": [
134 |         "[(u'normal.',\n",
135 |         "  [u'0',\n",
136 |         "   u'tcp',\n",
137 |         "   u'http',\n",
138 |         "   u'SF',\n",
139 |         "   u'181',\n",
140 |         "   u'5450',\n",
141 |         "   u'0',\n",
142 |         "   u'0',\n",
143 |         "   u'0',\n",
144 |         "   u'0',\n",
145 |         "   u'0',\n",
146 |         "   u'1',\n",
147 |         "   u'0',\n",
148 |         "   u'0',\n",
149 |         "   u'0',\n",
150 |         "   u'0',\n",
151 |         "   u'0',\n",
152 |         "   u'0',\n",
153 |         "   u'0',\n",
154 |         "   u'0',\n",
155 |         "   u'0',\n",
156 |         "   u'0',\n",
157 |         "   u'8',\n",
158 |         "   u'8',\n",
159 |         "   u'0.00',\n",
160 |         "   u'0.00',\n",
161 |         "   u'0.00',\n",
162 |         "   u'0.00',\n",
163 |         "   u'1.00',\n",
164 |         "   u'0.00',\n",
165 |         "   u'0.00',\n",
166 |         "   u'9',\n",
167 |         "   u'9',\n",
168 |         "   u'1.00',\n",
169 |         "   u'0.00',\n",
170 |         "   u'0.11',\n",
171 |         "   u'0.00',\n",
172 |         "   u'0.00',\n",
173 |         "   u'0.00',\n",
174 |         "   u'0.00',\n",
175 |         "   u'0.00',\n",
176 |         "   u'normal.'])]"
177 |        ]
178 |       }
179 |      ],
180 |      "prompt_number": 3
181 |     },
182 |     {
183 |      "cell_type": "heading",
184 |      "level": 2,
185 |      "metadata": {},
186 |      "source": [
187 |       "Data aggregations with key/value pair RDDs"
188 |      ]
189 |     },
190 |     {
191 |      "cell_type": "markdown",
192 |      "metadata": {},
193 |      "source": [
194 |       "We can use all the transformations and actions available for normal RDDs with key/value pair RDDs. We just need to make the functions work with pair elements. Additionally, Spark provides specific functions to work with RDDs containing pair elements. They are very similar to those available for general RDDs.  "
195 |      ]
196 |     },
197 |     {
198 |      "cell_type": "markdown",
199 |      "metadata": {},
200 |      "source": [
201 |       "For example, we have a `reduceByKey` transformation that we can use as follows to calculate the total duration of each network interaction type.  "
202 |      ]
203 |     },
204 |     {
205 |      "cell_type": "code",
206 |      "collapsed": false,
207 |      "input": [
208 |       "key_value_duration = csv_data.map(lambda x: (x[41], float(x[0]))) \n",
209 |       "durations_by_key = key_value_duration.reduceByKey(lambda x, y: x + y)\n",
210 |       "\n",
211 |       "durations_by_key.collect()"
212 |      ],
213 |      "language": "python",
214 |      "metadata": {},
215 |      "outputs": [
216 |       {
217 |        "metadata": {},
218 |        "output_type": "pyout",
219 |        "prompt_number": 4,
220 |        "text": [
221 |         "[(u'guess_passwd.', 144.0),\n",
222 |         " (u'nmap.', 0.0),\n",
223 |         " (u'warezmaster.', 301.0),\n",
224 |         " (u'rootkit.', 1008.0),\n",
225 |         " (u'warezclient.', 627563.0),\n",
226 |         " (u'smurf.', 0.0),\n",
227 |         " (u'pod.', 0.0),\n",
228 |         " (u'neptune.', 0.0),\n",
229 |         " (u'normal.', 21075991.0),\n",
230 |         " (u'spy.', 636.0),\n",
231 |         " (u'ftp_write.', 259.0),\n",
232 |         " (u'phf.', 18.0),\n",
233 |         " (u'portsweep.', 1991911.0),\n",
234 |         " (u'teardrop.', 0.0),\n",
235 |         " (u'buffer_overflow.', 2751.0),\n",
236 |         " (u'land.', 0.0),\n",
237 |         " (u'imap.', 72.0),\n",
238 |         " (u'loadmodule.', 326.0),\n",
239 |         " (u'perl.', 124.0),\n",
240 |         " (u'multihop.', 1288.0),\n",
241 |         " (u'back.', 284.0),\n",
242 |         " (u'ipsweep.', 43.0),\n",
243 |         " (u'satan.', 64.0)]"
244 |        ]
245 |       }
246 |      ],
247 |      "prompt_number": 4
248 |     },
249 |     {
250 |      "cell_type": "markdown",
251 |      "metadata": {},
252 |      "source": [
253 |       "We have a specific counting action for key/value pairs.  "
254 |      ]
255 |     },
256 |     {
257 |      "cell_type": "code",
258 |      "collapsed": false,
259 |      "input": [
260 |       "counts_by_key = key_value_data.countByKey()\n",
261 |       "counts_by_key"
262 |      ],
263 |      "language": "python",
264 |      "metadata": {},
265 |      "outputs": [
266 |       {
267 |        "metadata": {},
268 |        "output_type": "pyout",
269 |        "prompt_number": 5,
270 |        "text": [
271 |         "defaultdict(<type 'int'>, {u'guess_passwd.': 53, u'nmap.': 231, u'warezmaster.': 20, u'rootkit.': 10, u'warezclient.': 1020, u'smurf.': 280790, u'pod.': 264, u'neptune.': 107201, u'normal.': 97278, u'spy.': 2, u'ftp_write.': 8, u'phf.': 4, u'portsweep.': 1040, u'teardrop.': 979, u'buffer_overflow.': 30, u'land.': 21, u'imap.': 12, u'loadmodule.': 9, u'perl.': 3, u'multihop.': 7, u'back.': 2203, u'ipsweep.': 1247, u'satan.': 1589})"
272 |        ]
273 |       }
274 |      ],
275 |      "prompt_number": 5
276 |     },
277 |     {
278 |      "cell_type": "heading",
279 |      "level": 3,
280 |      "metadata": {},
281 |      "source": [
282 |       "Using `combineByKey`"
283 |      ]
284 |     },
285 |     {
286 |      "cell_type": "markdown",
287 |      "metadata": {},
288 |      "source": [
289 |       "This is the most general of the per-key aggregation functions. Most of the other per-key combiners are implemented using it. We can think about it as the `aggregate` equivalent since it allows the user to return values that are not the same type as our input data."
290 |      ]
291 |     },
292 |     {
293 |      "cell_type": "markdown",
294 |      "metadata": {},
295 |      "source": [
296 |       "For example, we can use it to calculate per-type average durations as follows.  "
297 |      ]
298 |     },
299 |     {
300 |      "cell_type": "code",
301 |      "collapsed": false,
302 |      "input": [
303 |       "sum_counts = key_value_duration.combineByKey(\n",
304 |       "    (lambda x: (x, 1)), # the initial value, with value x and count 1\n",
305 |       "    (lambda acc, value: (acc[0]+value, acc[1]+1)), # how to combine a pair value with the accumulator: sum value, and increment count\n",
306 |       "    (lambda acc1, acc2: (acc1[0]+acc2[0], acc1[1]+acc2[1])) # combine accumulators\n",
307 |       ")\n",
308 |       "\n",
309 |       "sum_counts.collectAsMap()"
310 |      ],
311 |      "language": "python",
312 |      "metadata": {},
313 |      "outputs": [
314 |       {
315 |        "metadata": {},
316 |        "output_type": "pyout",
317 |        "prompt_number": 6,
318 |        "text": [
319 |         "{u'back.': (284.0, 2203),\n",
320 |         " u'buffer_overflow.': (2751.0, 30),\n",
321 |         " u'ftp_write.': (259.0, 8),\n",
322 |         " u'guess_passwd.': (144.0, 53),\n",
323 |         " u'imap.': (72.0, 12),\n",
324 |         " u'ipsweep.': (43.0, 1247),\n",
325 |         " u'land.': (0.0, 21),\n",
326 |         " u'loadmodule.': (326.0, 9),\n",
327 |         " u'multihop.': (1288.0, 7),\n",
328 |         " u'neptune.': (0.0, 107201),\n",
329 |         " u'nmap.': (0.0, 231),\n",
330 |         " u'normal.': (21075991.0, 97278),\n",
331 |         " u'perl.': (124.0, 3),\n",
332 |         " u'phf.': (18.0, 4),\n",
333 |         " u'pod.': (0.0, 264),\n",
334 |         " u'portsweep.': (1991911.0, 1040),\n",
335 |         " u'rootkit.': (1008.0, 10),\n",
336 |         " u'satan.': (64.0, 1589),\n",
337 |         " u'smurf.': (0.0, 280790),\n",
338 |         " u'spy.': (636.0, 2),\n",
339 |         " u'teardrop.': (0.0, 979),\n",
340 |         " u'warezclient.': (627563.0, 1020),\n",
341 |         " u'warezmaster.': (301.0, 20)}"
342 |        ]
343 |       }
344 |      ],
345 |      "prompt_number": 6
346 |     },
347 |     {
348 |      "cell_type": "markdown",
349 |      "metadata": {},
350 |      "source": [
351 |       "We can see that the arguments are pretty similar to those passed to `aggregate` in the previous notebook. The result associated to each type is in the form of a pair. If we want to actually get the averages, we need to do the division before collecting the results.  "
352 |      ]
353 |     },
354 |     {
355 |      "cell_type": "code",
356 |      "collapsed": false,
357 |      "input": [
358 |       "duration_means_by_type = sum_counts.map(lambda (key,value): (key, round(value[0]/value[1],3))).collectAsMap()\n",
359 |       "\n",
360 |       "# Print them sorted\n",
361 |       "for tag in sorted(duration_means_by_type, key=duration_means_by_type.get, reverse=True):\n",
362 |       "    print tag, duration_means_by_type[tag]"
363 |      ],
364 |      "language": "python",
365 |      "metadata": {},
366 |      "outputs": [
367 |       {
368 |        "output_type": "stream",
369 |        "stream": "stdout",
370 |        "text": [
371 |         "portsweep. 1915.299\n",
372 |         "warezclient. 615.258\n",
373 |         "spy. 318.0\n",
374 |         "normal. 216.657\n",
375 |         "multihop. 184.0\n",
376 |         "rootkit. 100.8\n",
377 |         "buffer_overflow. 91.7\n",
378 |         "perl. 41.333\n",
379 |         "loadmodule. 36.222\n",
380 |         "ftp_write. 32.375\n",
381 |         "warezmaster. 15.05\n",
382 |         "imap. 6.0\n",
383 |         "phf. 4.5\n",
384 |         "guess_passwd. 2.717\n",
385 |         "back. 0.129\n",
386 |         "satan. 0.04\n",
387 |         "ipsweep. 0.034\n",
388 |         "nmap. 0.0\n",
389 |         "smurf. 0.0\n",
390 |         "pod. 0.0\n",
391 |         "neptune. 0.0\n",
392 |         "teardrop. 0.0\n",
393 |         "land. 0.0\n"
394 |        ]
395 |       }
396 |      ],
397 |      "prompt_number": 7
398 |     },
399 |     {
400 |      "cell_type": "markdown",
401 |      "metadata": {},
402 |      "source": [
403 |       "A small step into understanding what makes a network interaction be considered an attack."
404 |      ]
405 |     }
406 |    ],
407 |    "metadata": {}
408 |   }
409 |  ]
410 | }


--------------------------------------------------------------------------------
/4. Spark/LICENSE:
--------------------------------------------------------------------------------
 1 | This repository contains a variety of content; some developed by Jose A. Dianes, and some from third-parties. The third-party content is distributed under the license provided by those parties.
 2 | 
 3 | The content developed by Jose A. Dianes is distributed under the following license:
 4 | 
 5 | Copyright 2016 Jose A Dianes
 6 | 
 7 | Licensed under the Apache License, Version 2.0 (the "License");
 8 | you may not use this file except in compliance with the License.
 9 | You may obtain a copy of the License at
10 | 
11 |   http://www.apache.org/licenses/LICENSE-2.0
12 | 
13 | Unless required by applicable law or agreed to in writing, software
14 | distributed under the License is distributed on an "AS IS" BASIS,
15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | See the License for the specific language governing permissions and
17 | limitations under the License.
18 | 


--------------------------------------------------------------------------------
/4. Spark/README.md:
--------------------------------------------------------------------------------
  1 | # Spark Python Notebooks  
  2 | 
  3 | [![Join the chat at https://gitter.im/jadianes/spark-py-notebooks](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/jadianes/spark-py-notebooks?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
  4 | 
  5 | This is a collection of [IPython notebook](http://ipython.org/notebook.html)/[Jupyter](https://jupyter.org/) 
  6 | notebooks intended to train the reader on different [Apache Spark](http://spark.apache.org/) concepts, from 
  7 | basic to advanced, by using the **Python** language.  
  8 | 
  9 | If Python is not your language, and it is R, you may want to have a look at our [R on Apache Spark (SparkR) notebooks](https://github.com/jadianes/spark-r-notebooks) instead. Additionally, if your are interested in being introduced to some basic Data Science
 10 | Engineering, you might find [these series of tutorials](https://github.com/jadianes/data-science-your-way)
 11 | interesting. There we explain different concepts and applications 
 12 | using Python and R.  
 13 | 
 14 | ## Instructions  
 15 | 
 16 | A good way of using these notebooks is by first cloning the repo, and then 
 17 | starting your own [IPython notebook](http://ipython.org/notebook.html)/[Jupyter](https://jupyter.org/) in 
 18 | **pySpark mode**. For example, if we have a *standalone* Spark installation
 19 | running in our `localhost` with a maximum of 6Gb per node assigned to IPython:  
 20 | 
 21 |     MASTER="spark://127.0.0.1:7077" SPARK_EXECUTOR_MEMORY="6G" IPYTHON_OPTS="notebook --pylab inline" ~/spark-1.5.0-bin-hadoop2.6/bin/pyspark
 22 | 
 23 | Notice that the path to the `pyspark` command will depend on your specific 
 24 | installation. So as requirement, you need to have
 25 | [Spark installed](https://spark.apache.org/docs/latest/index.html) in 
 26 | the same machine you are going to start the `IPython notebook` server.     
 27 | 
 28 | For more Spark options see [here](https://spark.apache.org/docs/latest/spark-standalone.html). In general it works the rule of passing options
 29 | described in the form `spark.executor.memory` as `SPARK_EXECUTOR_MEMORY` when
 30 | calling IPython/pySpark.   
 31 |  
 32 | ## Datasets  
 33 | 
 34 | We will be using datasets from the [KDD Cup 1999](http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html). The results 
 35 | of this competition can be found [here](http://cseweb.ucsd.edu/~elkan/clresults.html).  
 36 | 
 37 | ## References
 38 | 
 39 | The reference book for these and other Spark related topics is:  
 40 | 
 41 | - *Learning Spark* by Holden Karau, Andy Konwinski, Patrick Wendell, and Matei Zaharia.  
 42 | 
 43 | ## Notebooks  
 44 | 
 45 | The following notebooks can be examined individually, although there is a more
 46 | or less linear 'story' when followed in sequence. By using the same dataset
 47 | they try to solve a related set of tasks with it.  
 48 |  
 49 | ### [RDD creation](https://github.com/jadianes/spark-py-notebooks/blob/master/nb1-rdd-creation/nb1-rdd-creation.ipynb)  
 50 | 
 51 | About reading files and parallelize.  
 52 |   
 53 | ### [RDDs basics](https://github.com/jadianes/spark-py-notebooks/blob/master/nb2-rdd-basics/nb2-rdd-basics.ipynb)
 54 | 
 55 | A look at `map`, `filter`, and `collect`.  
 56 |   
 57 | ### [Sampling RDDs](https://github.com/jadianes/spark-py-notebooks/blob/master/nb3-rdd-sampling/nb3-rdd-sampling.ipynb)  
 58 | 
 59 | RDD sampling methods explained.    
 60 |   
 61 | ### [RDD set operations](https://github.com/jadianes/spark-py-notebooks/blob/master/nb4-rdd-set/nb4-rdd-set.ipynb)    
 62 | 
 63 | Brief introduction to some of the RDD pseudo-set operations.  
 64 | 
 65 | ### [Data aggregations on RDDs](https://github.com/jadianes/spark-py-notebooks/blob/master/nb5-rdd-aggregations/nb5-rdd-aggregations.ipynb)  
 66 | 
 67 | RDD actions `reduce`, `fold`, and `aggregate`.   
 68 | 
 69 | ### [Working with key/value pair RDDs](https://github.com/jadianes/spark-py-notebooks/blob/master/nb6-rdd-key-value/nb6-rdd-key-value.ipynb)    
 70 | 
 71 | How to deal with key/value pairs in order to aggregate and explore data.  
 72 |   
 73 | ### [MLlib: Basic Statistics and Exploratory Data Analysis](https://github.com/jadianes/spark-py-notebooks/blob/master/nb7-mllib-statistics/nb7-mllib-statistics.ipynb)    
 74 | 
 75 | A notebook introducing Local Vector types, basic statistics 
 76 | in MLlib for Exploratory Data Analysis and model selection.  
 77 |   
 78 | ### [MLlib: Logistic Regression](https://github.com/jadianes/spark-py-notebooks/blob/master/nb8-mllib-logit/nb8-mllib-logit.ipynb)     
 79 | 
 80 | Labeled points and Logistic Regression classification of network attacks in MLlib.
 81 | Application of model selection techniques using correlation matrix and Hypothesis Testing.    
 82 | 
 83 | ### [MLlib: Decision Trees](https://github.com/jadianes/spark-py-notebooks/blob/master/nb9-mllib-trees/nb9-mllib-trees.ipynb)  
 84 | 
 85 | Use of tree-based methods and how they help explaining models and
 86 |  feature selection.  
 87 | 
 88 | ### [Spark SQL: structured processing for Data Analysis](https://github.com/jadianes/spark-py-notebooks/blob/master/nb10-sql-dataframes/nb10-sql-dataframes.ipynb)  
 89 | 
 90 | In this notebook a schema is inferred for our network interactions dataset. Based on that, we use
 91 | Spark's SQL `DataFrame` abstraction to perform a more structured exploratory data analysis.  
 92 | 
 93 | 
 94 | ## Applications  
 95 | 
 96 | Beyond the basics. Close to real-world applications using Spark and other technologies.  
 97 | 
 98 | ### [Olssen: On-line Spectral Search ENgine for proteomics](https://github.com/jadianes/olssen)  
 99 | 
100 | Same tech stack this time with an AngularJS client app.  
101 | 
102 | ### [An on-line movie recommendation web service](https://github.com/jadianes/spark-movie-lens)  
103 | 
104 | This tutorial can be used independently to build a movie recommender model based on the MovieLens dataset. Most of the code in the first part, about how to use ALS with the public MovieLens dataset, comes from my solution to one of the exercises proposed in the [CS100.1x Introduction to Big Data with Apache Spark by Anthony D. Joseph on edX](https://www.edx.org/course/introduction-big-data-apache-spark-uc-berkeleyx-cs100-1x), that is also [**publicly available since 2014 at Spark Summit**](https://databricks-training.s3.amazonaws.com/movie-recommendation-with-mllib.html). 
105 | 
106 | There I've added with minor modifications to use a larger dataset and also code about how to store and reload the model for later use. On top of that we build a Flask web service so the recommender can be use to provide movie recommendations on-line.  
107 | 
108 | ### [KDD Cup 1999](https://github.com/jadianes/kdd-cup-99-spark)  
109 | 
110 | My try using Spark with this classic dataset and Knowledge Discovery competition.  
111 | 
112 | ## Contributing
113 | 
114 | Contributions are welcome!  For bug reports or requests please [submit an issue](https://github.com/jadianes/spark-py-notebooks/issues).
115 | 
116 | ## Contact  
117 | 
118 | Feel free to contact me to discuss any issues, questions, or comments.
119 | 
120 | * Twitter: [@ja_dianes](https://twitter.com/ja_dianes)
121 | * GitHub: [jadianes](https://github.com/jadianes)
122 | * LinkedIn: [jadianes](https://www.linkedin.com/in/jadianes)
123 | * Website: [jadianes.me](http://jadianes.me)
124 | 
125 | ## License
126 | 
127 | This repository contains a variety of content; some developed by Jose A. Dianes, and some from third-parties.  The third-party content is distributed under the license provided by those parties.
128 | 
129 | The content developed by Jose A. Dianes is distributed under the following license:
130 | 
131 |     Copyright 2016 Jose A Dianes
132 | 
133 |     Licensed under the Apache License, Version 2.0 (the "License");
134 |     you may not use this file except in compliance with the License.
135 |     You may obtain a copy of the License at
136 | 
137 |        http://www.apache.org/licenses/LICENSE-2.0
138 | 
139 |     Unless required by applicable law or agreed to in writing, software
140 |     distributed under the License is distributed on an "AS IS" BASIS,
141 |     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
142 |     See the License for the specific language governing permissions and
143 |     limitations under the License.
144 | 


--------------------------------------------------------------------------------
/5. Machine Learning/5. ML 0. Install requirements.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |     "nbformat": 4, 
 3 |     "nbformat_minor": 0, 
 4 |     "cells": [
 5 |         {
 6 |             "source": "%%bash\nrm -r -f addutils\ngit clone https://github.com/analytics-bootcamp/addutils.git\ncd ./addutils\n\npython setup.py install  --user", 
 7 |             "execution_count": null, 
 8 |             "cell_type": "code", 
 9 |             "metadata": {
10 |                 "collapsed": true
11 |             }, 
12 |             "outputs": []
13 |         }, 
14 |         {
15 |             "source": "!pip install seaborn", 
16 |             "execution_count": null, 
17 |             "cell_type": "code", 
18 |             "metadata": {
19 |                 "collapsed": true
20 |             }, 
21 |             "outputs": []
22 |         }
23 |     ], 
24 |     "metadata": {
25 |         "language_info": {
26 |             "nbconvert_exporter": "python", 
27 |             "mimetype": "text/x-python", 
28 |             "pygments_lexer": "ipython2", 
29 |             "version": "2.7.11", 
30 |             "file_extension": ".py", 
31 |             "name": "python", 
32 |             "codemirror_mode": {
33 |                 "name": "ipython", 
34 |                 "version": 2
35 |             }
36 |         }, 
37 |         "kernelspec": {
38 |             "name": "python2", 
39 |             "display_name": "Python 2 with Spark 1.6", 
40 |             "language": "python"
41 |         }
42 |     }
43 | }


--------------------------------------------------------------------------------
/5. Machine Learning/5. ML 1. Introduction.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "nbformat_minor": 0, 
  3 |     "metadata": {
  4 |         "language_info": {
  5 |             "pygments_lexer": "ipython3", 
  6 |             "name": "python", 
  7 |             "file_extension": ".py", 
  8 |             "codemirror_mode": {
  9 |                 "name": "ipython", 
 10 |                 "version": 3
 11 |             }, 
 12 |             "mimetype": "text/x-python", 
 13 |             "nbconvert_exporter": "python", 
 14 |             "version": "3.5.2"
 15 |         }, 
 16 |         "kernelspec": {
 17 |             "language": "python", 
 18 |             "name": "python3", 
 19 |             "display_name": "Python 3.5 (Experimental) with Spark 1.6"
 20 |         }
 21 |     }, 
 22 |     "nbformat": 4, 
 23 |     "cells": [
 24 |         {
 25 |             "source": "# Definitions and Advices\n\nAdopted from https://github.com/addfor/tutorials", 
 26 |             "metadata": {}, 
 27 |             "cell_type": "markdown"
 28 |         }, 
 29 |         {
 30 |             "source": "## 1 What is Machine Learning ?", 
 31 |             "metadata": {}, 
 32 |             "cell_type": "markdown"
 33 |         }, 
 34 |         {
 35 |             "source": "Today we see an explosion of applications that are wide and connected with an emphasis on storage and processing. *Most companies are storing a lot of data but not solving the problem of what to do with it*. Yet most of the information is stored in raw form: There a huge amound of information locked-up in databases: information that is potentially important but has not yet been discovered. The objective of these tutorials is to show the foundamental techniques to **Discover Meaningful Information in Data**.\n\n**Data Mining** is the extraction of implicit, previously unknown and potentially useful information from data.\n\n**Machine Learning (ML)** and **Deep Learning (DL)** provide the technical basis of Data Mining. **ML** is about building programs with **tunable parameters** (typically an array of floating point values) that are adjusted automatically so as to improve their behavior by **adapting to previously seen data.**\n\n**DL** is about modeling high-level abstractions in data by using model architectures composed of **multiple non-linear transformations.**\n\n**ML and DL** can be considered a subfield of **Artificial Intelligence (AI)** since those algorithms can be seen as building blocks to make computers learn to behave more intelligently by somehow **generalizing** rather that just storing and retrieving data items like a database system would do.\n\nMost of the examples of these tutorials are taken from the [scikit-learn documentation](http://scikit-learn.org/stable/index.html): check the original documentation for further information.", 
 36 |             "metadata": {}, 
 37 |             "cell_type": "markdown"
 38 |         }, 
 39 |         {
 40 |             "source": "### 1.1 Documentation and reference:", 
 41 |             "metadata": {}, 
 42 |             "cell_type": "markdown"
 43 |         }, 
 44 |         {
 45 |             "source": "* [Numpy Reference guide](http://docs.scipy.org/doc/numpy/reference/)\n* [SciPy Reference](http://docs.scipy.org/doc/scipy/reference/)\n* [scikit-learn User Guide](http://scikit-learn.org/stable/user_guide.html)\n* [scikit-learn External Resources](http://scikit-learn.org/stable/presentations.html)", 
 46 |             "metadata": {}, 
 47 |             "cell_type": "markdown"
 48 |         }, 
 49 |         {
 50 |             "source": "## 2 Supervised and Unsupervised Learning", 
 51 |             "metadata": {}, 
 52 |             "cell_type": "markdown"
 53 |         }, 
 54 |         {
 55 |             "source": "In general, a learning problem uses a set of n data samples to predict properties of unknown data. Usually data are organized in tables where rows (first axis) represent the **samples** (or **instances**) and colums represent **attributes** (or **features**), for Supervised Learning, another array of **classes** or **target variables** is provided.\n\nWe can separate learning problems in a few large categories:\n\nIn **<font color=\"Green\">SUPERVISED LEARNING</font>**, we have a dataset consisting of both features and labels. The task is to construct an estimator which is able to predict the label of an object given the set of features.\n\n* We have a **CLASSIFICATION** task when the **target variable is nominal (discrete)**  - examples:\n\n  * predicting the species of iris given a set of measurements of its flower\n  * given a multicolor image of an object through a telescope, determine whether that object is a star, a quasar, or a galaxy.\n\n* We have a **REGRESSION** task when the **target variable is continuous** - examples:\n\n  * given a set of attributes, determine the selling price of an house\n\n\nIn **<font color=\"Green\">UNSUPERVISED LEARNING</font>** the data has no labels, and we are interested in finding similarities between the samples.\n\nUnsupervised learning comprises tasks such as *dimensionality reduction*, *clustering*, and *density estimation*.\nSome unsupervised learning problems are:\n\n* **CLUSTERING** is the task that **group similar items together**  - examples:\n\n  * given observations of distant galaxies, determine which features are most important in distinguishing between them.\n\n* **DENSITY ESTIMATION** is a task were we want to **find statistical values that describe the data**\n\n* **DIMENSIONALITY REDUCTION** is for **reduce the number of the features while keeping most of the information**\n\n**<font color=\"Green\">UNSUPERVISED / SUPERVISED LEARNING</font>** in DL usally the two approach are combined, in fact the DL layers (Restricted Boltzmann Machines, Autoencoders, Convolutional Neural Networks) are used to learn the most significative features of the data. Those features are then used with standard ML regressors or classificators.", 
 56 |             "metadata": {}, 
 57 |             "cell_type": "markdown"
 58 |         }, 
 59 |         {
 60 |             "source": "## 3 Cheat Sheet - scikit-learn Algorithm Selection", 
 61 |             "metadata": {}, 
 62 |             "cell_type": "markdown"
 63 |         }, 
 64 |         {
 65 |             "source": "ML is a general concept involving a huge number of algorithms. This is a tentative Cheat Sheet to help finding the correct approach.\n\nBasically, the principle is to start simple first. If this doesn't work out, try something more complicated.\n\n<font color=\"Red\">Red Links</font> point to algorithms NOT included in scikit-learn.\n\nTo make any of the algorithms actually work, you need to do the right preprocessing.\n\nGenerally every ML algorithm needs a minimum number of samples. All the methods listed below are applicable to datasets with at least 50 samples. For tasks involving less than 50 samples most of the following methods are not suitable:", 
 66 |             "metadata": {}, 
 67 |             "cell_type": "markdown"
 68 |         }, 
 69 |         {
 70 |             "source": "* **<font color=\"Green\">To predict a QUANTITY:</font>**\n\n    * **Regression:** these methods give back a numerical outcome.\n\n        * **LESS than 100k samples with all features important (dense data):**\n\n            * TRY: [Ridge Regression](http://scikit-learn.org/stable/modules/linear_model.html#ridge-regression) *(see Generalized Linear Models)*\n\n                * If Ridge Regression doesn't work, TRY: [Support Vector Regression (svm.SVR)](http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html) with *linear kernel* *(see Support Vector Machines)*\n\n                    * If SVR with *linear kernel* doesn't work, TRY: [Support Vector Regression (svm.SVR)](http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html) with *rbf kernel* *(see Support Vector Machines)*\n                        * If none of the above methods work, USE: [Ensemble Regressors](http://scikit-learn.org/stable/modules/ensemble.html) *(RF, Extremely Randomized Trees, GBRT)* *(see Support Ensemble Methods)*\n\n        * **LESS than 100k samples with few features important (sparse data):**\n\n            * USE: [Elastic Net Lasso](http://scikit-learn.org/stable/modules/linear_model.html#elastic-net) *(see Generalized Linear Models)*\n\n        * **MORE than 100k samples:**\n            * USE: [SGD Regressor](http://scikit-learn.org/stable/modules/sgd.html#regression) *(see Stochastic Gradient Descent)*\n\n        * **Alternatively, for every problem size:**\n\n            * CALL US<br><br>\n\n    * **Dimensionality Reduction (NOT for predicting the structure of the data):** these methods are suitable for data visualization and human interpretation.\n\n        * TRY: [RandomizedPCA](http://scikit-learn.org/stable/modules/decomposition.html#principal-component-analysis-pca)\n\n            * If RandomizedPCA dont work, and you have *LESS than 10k samples*, USE: [t-distributed Stochastic Neighbor Embedding (t-SNE)](http://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE)\n\n            * If RandomizedPCA dont work, and you have *MORE than 10k samples*, CALL US: most probably you need a more efficient version of t-SNE\n\n    * **For Prediction of multivariate or structured outputs:**\n\n        * TRY: [<font color=\"Red\">SVM struct</font>](http://www.cs.cornell.edu/people/tj/svm_light/svm_struct.html). This algorithm is free for non-commercial use\n\n        * TRY: [<font color=\"Red\">pystruct</font>](https://github.com/amueller/pystruct). (Under development)", 
 71 |             "metadata": {}, 
 72 |             "cell_type": "markdown"
 73 |         }, 
 74 |         {
 75 |             "source": "* **<font color=\"Green\">Predict a CATEGORY for LABELED Data (Classification):</font>**\n\n    * **LESS than 100k samples**, TRY: [Linear SVC](http://scikit-learn.org/stable/modules/svm.html#svc)\n\n        * If Linear SVC dont work, and you have *NUMERICAL DATA*, TRY: [KNeighborsClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)\n\n            * If KNeighborsClassifier doesn't work, USE: [SVC](http://scikit-learn.org/stable/modules/svm.html#svc)\n\n                * If SVC doesn't work, USE: [Ensemble Classifiers](http://scikit-learn.org/stable/modules/ensemble.html) *(RF, Extremely Randomized Trees, GBRT)*\n\n        * If Linear SVC dont work, and you have *TEXTUAL DATA*, USE: [Naive Bayes](http://scikit-learn.org/stable/modules/naive_bayes.html)\n\n    * **MORE than 100k samples**, TRY: [SGD Classifier](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html)\n\n        * If SGD Classifier dont work, , USE: [Kernel Approximation](http://scikit-learn.org/stable/modules/kernel_approximation.html)", 
 76 |             "metadata": {}, 
 77 |             "cell_type": "markdown"
 78 |         }, 
 79 |         {
 80 |             "source": "* **<font color=\"Green\">Predict a CATEGORY for UNLABELED Data (Clustering):</font>**\n\n    * **LESS than 10k samples and KNOWN number of categories**\n        * USE: [Mini Batch K-Means](http://scikit-learn.org/stable/modules/clustering.html#mini-batch-k-means)\n\n    * **MORE than 10k samples and KNOWN number of categories**\n        * TRY: [K-Means Clustering](http://scikit-learn.org/stable/tutorial/statistical_inference/unsupervised_learning.html#k-means-clustering)\n\n            * If K-Means Clustering doesn't work, TRY: [Gaussian Mixture Models](http://scikit-learn.org/stable/modules/mixture.html#gmm-classifier)\n\n                * If GMM doesn't work, USE: [Spectral Clustering](http://scikit-learn.org/stable/modules/clustering.html#spectral-clustering)\n\n    * **LESS than 10k samples and UNKNOWN number of categories**\n        * TRY: [Mean Shift](http://scikit-learn.org/stable/modules/clustering.html#mean-shift)\n\n            * If Mean Shift doesn't work, USE: [Variational Gaussian Mixtures](http://scikit-learn.org/stable/modules/mixture.html#vbgmm-classifier-variational-gaussian-mixtures)", 
 81 |             "metadata": {}, 
 82 |             "cell_type": "markdown"
 83 |         }, 
 84 |         {
 85 |             "source": "## 4 Machine Learning Wisdom", 
 86 |             "metadata": {}, 
 87 |             "cell_type": "markdown"
 88 |         }, 
 89 |         {
 90 |             "source": "Here is a list of things to take in great consideration while developing ML systems:\n\n1. **No Free Lunch:** A wide variety of techniques exist for modeling. An important theorem in statistical machine learning essentially states that no one technique will outperform all other techniques on all problems *(Wolpert & MacReady, 1997)*. This theorem is sometimes referred to as *No Free Lunch*. Often, a modeling group will specialize in one particular technique, and will tout that technique as the being intrinsically superior to others. Such a claim should be regarded with extreme suspicion. Furthermore, the field of statistical machine learning is evolving rapidly, and new algorithms are developed at a regular pace, this determines a very fast aging for ML approaches. This is the reason why in Addfor we rely on Open Source, Lean and Data-Driven Development and **Combinatorial Innovation**.\n\n2. **Beware of False Predictors:** In selecting input variables for a model, one must be careful not to include false predictors. A false predictor is a variable that is strongly correlated with the output class, but that is not available in a realistic prediction scenario. This step is stricktly data-dependent and can be accomplished by paying attention to the choice of the validation dataset. **Correlation does not imply causation:** ice-cream sales is a strong predictor for drowning deaths.\n\n3. **Mind Data Balancing:** Always check if your algorithm is suitable to handle Data Asymmetricity.\n\n4. **Correctly Define Output Classes:**  If the model's task is to predict a system failure, it seems natural for the output classes to be \"fail\" and \"not fail\". However, characterizing the exact conditions under which failure occurs is not straightforward. For example two failures for different reasons could represent very different classes.\n\n5. **Data Preparation:** this is maybe the most important task in any predictive algorithm, we dedicate a whole notebook to it! \n\n6. **Model Selection:** Any modeling technique can be used to construct of a continuum of models, from simple to complex. One of the key issues in modeling is model selection, which involves picking the appropriate level of complexity for a model given a data set. Although model selection methods can be automated to some degree, model selection cannot be avoided. If someone claims otherwise, or does not emphasize their expertise in model selection, one should be suspicious of his abilities.\n\n7. **Segmentation:** Often, a data set can be broken into several smaller, more homogenous data sets, which is referred to as segmentation. For example, a customer data base might be split into business and residential customers. Although domain experts can readily propose segmentations, enforcing a segmentation suggested by domain experts is generally not the most prudent approach to modeling, because the data itself provides clues to how the segmentation should be performed. Consequently, one should be concerned if a modeler claims to utilize a priori segmentation.\n\n8. **Model Evaluation:** Once a model has been built, the natural question to ask is how accurate it is. Here we describe common sorts of deception that can occur in assessing and evaluating a model:\n\n    a) *Failing to use an independent test set:* To obtain a fair estimate of performance, the model must be evaluated on examples that were not contained in the training set. The available data must be split into nonoverlapping subsets, with the test set reserved only for evaluation.\n\n    b) *Assuming stationarity of the test environment:* For many difficult problems, a model built based on historical data will become a poorer and poorer predictor as time goes on, because the environment is nonstationary--the rules and behaviors of individuals change over time. Consequently, the best measure of a model's true performance will be obtained if it is tested on data from a different point in time relative to the training data.\n\n    c) *Incomplete reports of results:* An accurate model will correctly discriminate examples of one output class from examples of another output class. Discrimination performance is best reported with an ROC curve, a lift curve, or a precision-recall curve. Any report of accuracy using only a single number is suspect.\n\n    d) *Filtering data to bias results:* In a large data set, one segment of the population may be easier to predict than another. If a model is trained and tested just on this segment of the population, it will be more accurate than a model that must handle the entire population. Selective filtering can turn a hard problem into an easier problem.\n\n    e) *Selective sampling of test cases:* A fair evaluation of a model will utilize a test set that is drawn from the same population as the model will eventually encounter in actual usage.\n\n    f) *Failing to assess statistical reliability:* When comparing the accuracy of two models, it is not sufficient to report that one model performed better than the other, because the difference might not be statistically reliable. \"Statistical reliability\" means, among other things, that if the comparison were repeated using a different sample of the population, the same result would be achieved.", 
 91 |             "metadata": {}, 
 92 |             "cell_type": "markdown"
 93 |         }, 
 94 |         {
 95 |             "source": "\nThis work is licensed under a <a rel=\"license\" href=\"http://creativecommons.org/licenses/by-sa/4.0/\">Creative Commons Attribution-ShareAlike 4.0 International License</a>.", 
 96 |             "metadata": {}, 
 97 |             "cell_type": "markdown"
 98 |         }
 99 |     ]
100 | }


--------------------------------------------------------------------------------
/5. Machine Learning/Scikit_Learn_Cheat_Sheet_Python.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/5. Machine Learning/Scikit_Learn_Cheat_Sheet_Python.pdf


--------------------------------------------------------------------------------
/6. Deep Learning/Keras.js Demos.url:
--------------------------------------------------------------------------------
1 | [InternetShortcut]
2 | URL=https://transcranial.github.io/keras-js/#/mnist-cnn
3 | 


--------------------------------------------------------------------------------
/6. Deep Learning/Keras_Cheat_Sheet_Python.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/6. Deep Learning/Keras_Cheat_Sheet_Python.pdf


--------------------------------------------------------------------------------
/7. Misc/bias and variance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/7. Misc/bias and variance.png


--------------------------------------------------------------------------------
/7. Misc/biasvariance.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import bokeh.plotting as bk
 3 | 
 4 | def test_func(x, err=0.5):
 5 |     return np.random.normal(10 - 1. / (x + 0.1), err)
 6 | 
 7 | def compute_error(x, y, p):
 8 |     yfit = np.polyval(p, x)
 9 |     return np.sqrt(np.mean((y - yfit) ** 2))
10 | 
11 | def plot_bias_variance(N=8, random_seed=42, err=0.5):
12 |     np.random.seed(random_seed)
13 |     x = 10 ** np.linspace(-2, 0, N)
14 |     y = test_func(x)
15 |     xfit = np.linspace(-0.2, 1.2, 1000)
16 |     titles = ['d = 1 (under-fit; high bias)',
17 |               'd = 2',
18 |               'd = 6 (over-fit; high variance)']
19 |     degrees = [1, 2, 6]
20 |     
21 |     row = []
22 |     for i, d in enumerate(degrees):
23 |         fig = bk.figure(plot_width=240, plot_height=240,
24 |                         title=titles[i], x_range=(-0.2, 1.2), y_range=(0, 12))
25 |         fig.title.text_font_size = '11pt'
26 |         fig.xaxis.axis_label_text_font_size = '9pt'
27 |         fig.yaxis.axis_label_text_font_size = '9pt'
28 |         fig.x(x, y, color='black', size=12)
29 |         
30 |         p = np.polyfit(x, y, d)
31 |         yfit = np.polyval(p, xfit)
32 |         fig.line(xfit, yfit, line_color='blue')
33 |         
34 |         fig.xaxis.axis_label = 'house size'
35 |         fig.xaxis.axis_label_text_font_size = '9pt'
36 |         if i == 0:
37 |             fig.yaxis.axis_label = 'price'
38 |         row.append(fig)
39 | 
40 |     gp = bk.gridplot([row], border_space=0)
41 |     bk.show(gp)
42 | 
43 | 


--------------------------------------------------------------------------------
/7. Misc/ensemble_explore_hastie.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/7. Misc/ensemble_explore_hastie.png


--------------------------------------------------------------------------------
/7. Misc/international-airline-passengers.csv:
--------------------------------------------------------------------------------
  1 | Month,Passengers
  2 | 1949-01,112
  3 | 1949-02,118
  4 | 1949-03,132
  5 | 1949-04,129
  6 | 1949-05,121
  7 | 1949-06,135
  8 | 1949-07,148
  9 | 1949-08,148
 10 | 1949-09,136
 11 | 1949-10,119
 12 | 1949-11,104
 13 | 1949-12,118
 14 | 1950-01,115
 15 | 1950-02,126
 16 | 1950-03,141
 17 | 1950-04,135
 18 | 1950-05,125
 19 | 1950-06,149
 20 | 1950-07,170
 21 | 1950-08,170
 22 | 1950-09,158
 23 | 1950-10,133
 24 | 1950-11,114
 25 | 1950-12,140
 26 | 1951-01,145
 27 | 1951-02,150
 28 | 1951-03,178
 29 | 1951-04,163
 30 | 1951-05,172
 31 | 1951-06,178
 32 | 1951-07,199
 33 | 1951-08,199
 34 | 1951-09,184
 35 | 1951-10,162
 36 | 1951-11,146
 37 | 1951-12,166
 38 | 1952-01,171
 39 | 1952-02,180
 40 | 1952-03,193
 41 | 1952-04,181
 42 | 1952-05,183
 43 | 1952-06,218
 44 | 1952-07,230
 45 | 1952-08,242
 46 | 1952-09,209
 47 | 1952-10,191
 48 | 1952-11,172
 49 | 1952-12,194
 50 | 1953-01,196
 51 | 1953-02,196
 52 | 1953-03,236
 53 | 1953-04,235
 54 | 1953-05,229
 55 | 1953-06,243
 56 | 1953-07,264
 57 | 1953-08,272
 58 | 1953-09,237
 59 | 1953-10,211
 60 | 1953-11,180
 61 | 1953-12,201
 62 | 1954-01,204
 63 | 1954-02,188
 64 | 1954-03,235
 65 | 1954-04,227
 66 | 1954-05,234
 67 | 1954-06,264
 68 | 1954-07,302
 69 | 1954-08,293
 70 | 1954-09,259
 71 | 1954-10,229
 72 | 1954-11,203
 73 | 1954-12,229
 74 | 1955-01,242
 75 | 1955-02,233
 76 | 1955-03,267
 77 | 1955-04,269
 78 | 1955-05,270
 79 | 1955-06,315
 80 | 1955-07,364
 81 | 1955-08,347
 82 | 1955-09,312
 83 | 1955-10,274
 84 | 1955-11,237
 85 | 1955-12,278
 86 | 1956-01,284
 87 | 1956-02,277
 88 | 1956-03,317
 89 | 1956-04,313
 90 | 1956-05,318
 91 | 1956-06,374
 92 | 1956-07,413
 93 | 1956-08,405
 94 | 1956-09,355
 95 | 1956-10,306
 96 | 1956-11,271
 97 | 1956-12,306
 98 | 1957-01,315
 99 | 1957-02,301
100 | 1957-03,356
101 | 1957-04,348
102 | 1957-05,355
103 | 1957-06,422
104 | 1957-07,465
105 | 1957-08,467
106 | 1957-09,404
107 | 1957-10,347
108 | 1957-11,305
109 | 1957-12,336
110 | 1958-01,340
111 | 1958-02,318
112 | 1958-03,362
113 | 1958-04,348
114 | 1958-05,363
115 | 1958-06,435
116 | 1958-07,491
117 | 1958-08,505
118 | 1958-09,404
119 | 1958-10,359
120 | 1958-11,310
121 | 1958-12,337
122 | 1959-01,360
123 | 1959-02,342
124 | 1959-03,406
125 | 1959-04,396
126 | 1959-05,420
127 | 1959-06,472
128 | 1959-07,548
129 | 1959-08,559
130 | 1959-09,463
131 | 1959-10,407
132 | 1959-11,362
133 | 1959-12,405
134 | 1960-01,417
135 | 1960-02,391
136 | 1960-03,419
137 | 1960-04,461
138 | 1960-05,472
139 | 1960-06,535
140 | 1960-07,622
141 | 1960-08,606
142 | 1960-09,508
143 | 1960-10,461
144 | 1960-11,390
145 | 1960-12,432


--------------------------------------------------------------------------------
/7. Misc/learning_curves.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/7. Misc/learning_curves.png


--------------------------------------------------------------------------------
/7. Misc/matlab_test_data_01.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/7. Misc/matlab_test_data_01.mat


--------------------------------------------------------------------------------
/7. Misc/moon phases.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/7. Misc/moon phases.xlsx


--------------------------------------------------------------------------------
/7. Misc/tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/7. Misc/tree.png


--------------------------------------------------------------------------------
/7. Misc/weather.txt:
--------------------------------------------------------------------------------
 1 | How hot is it today?,temperature
 2 | Is it hot outside?,temperature
 3 | Will it be uncomfortably hot?,temperature
 4 | Will it be sweltering?,temperature
 5 | How cold is it today?,temperature
 6 | Is it cold outside?,temperature
 7 | Will it be uncomfortably cold?,temperature
 8 | Will it be frigid?,temperature
 9 | What is the expected high for today?,temperature
10 | What is the expected temperature?,temperature
11 | Will high temperatures be dangerous?,temperature
12 | Is it dangerously cold?,temperature
13 | When will the heat subside?,temperature
14 | Is it hot?,temperature
15 | Is it cold?,temperature
16 | How cold is it now?,temperature
17 | Will we have a cold day today?,temperature
18 | When will the cold subside?,temperature
19 | What highs are we expecting?,temperature
20 | What lows are we expecting?,temperature
21 | Is it warm?,temperature
22 | Is it chilly?,temperature
23 | What's the current temp in Celsius?,temperature
24 | What is the temperature in Fahrenheit?,temperature
25 | Is it windy?,conditions
26 | Will it rain today?,conditions
27 | What are the chances for rain?,conditions
28 | Will we get snow?,conditions
29 | Are we expecting sunny conditions?,conditions
30 | Is it overcast?,conditions
31 | Will it be cloudy?,conditions
32 | How much rain will fall today?,conditions
33 | How much snow are we expecting?,conditions
34 | Is it windy outside?,conditions
35 | How much snow do we expect?,conditions
36 | Is the forecast calling for snow today?,conditions
37 | Will we see some sun?,conditions
38 | When will the rain subside?,conditions
39 | Is it cloudy?,conditions
40 | Is it sunny now?,conditions
41 | Will it rain?,conditions
42 | Will we have much snow?,conditions
43 | Are the winds dangerous?,conditions
44 | What is the expected snowfall today?,conditions
45 | Will it be dry?,conditions
46 | Will it be breezy?,conditions
47 | Will it be humid?,conditions
48 | What is today's expected humidity?,conditions
49 | Will the blizzard hit us?,conditions
50 | Is it drizzling?,conditions


--------------------------------------------------------------------------------
/Class intro.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/Class intro.pptx


--------------------------------------------------------------------------------
/Putting data to work.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/Putting data to work.pptx


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Training-material
2 | Bootcamp material for IBM DSX training April 3-5 in Singapore
3 | 
4 | Start with Class intro.pptx
5 | 


--------------------------------------------------------------------------------
/agenda voting.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 14,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import requests\n",
 12 |     "\n",
 13 |     "score=requests.get(\"https://codeshare.io/2pAwm4\").content\n",
 14 |     "res=score.split(\"xxx\")[1].split(\"\\\\n\")"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 71,
 20 |    "metadata": {
 21 |     "collapsed": false,
 22 |     "scrolled": true
 23 |    },
 24 |    "outputs": [
 25 |     {
 26 |      "data": {
 27 |       "text/plain": [
 28 |        "[('0. Basics', 1.0),\n",
 29 |        " ('1. In and Export', 1.1),\n",
 30 |        " ('2. Watson APIs', 2.0),\n",
 31 |        " ('3. Visualization', 2.0),\n",
 32 |        " ('4. Spark', 2.4),\n",
 33 |        " ('5. Machine Learning', 2.3),\n",
 34 |        " ('6. Training - Deep Learning', 2.2)]"
 35 |       ]
 36 |      },
 37 |      "execution_count": 71,
 38 |      "metadata": {},
 39 |      "output_type": "execute_result"
 40 |     }
 41 |    ],
 42 |    "source": [
 43 |     "main=[i.split(\"#\") for i in res[3:10]]\n",
 44 |     "topics=[i[0].strip() for i in main]\n",
 45 |     "hours=[i[1] for i in main]\n",
 46 |     "votes = [len(i) for i in hours]\n",
 47 |     "tot_score = [sum(int(i) for i in  j) for j in hours ]\n",
 48 |     "avg_scores=[round(float(a)/float(b),1) for a,b in zip(tot_score,votes)]\n",
 49 |     "\n",
 50 |     "zip(topics,avg_scores)"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 121,
 56 |    "metadata": {
 57 |     "collapsed": false
 58 |    },
 59 |    "outputs": [
 60 |     {
 61 |      "name": "stdout",
 62 |      "output_type": "stream",
 63 |      "text": [
 64 |       "\n",
 65 |       "('0', 'Basics\\\\\\\\0. Basics 1. Python', 1.0)\n",
 66 |       "('0', 'Basics\\\\\\\\0. Basics 0. Jupyter notebook', 1.2)\n",
 67 |       "('0', 'Basics\\\\\\\\0. Basics 4. Jupyter notebook magics, shell and R', 1.4)\n",
 68 |       "('0', 'Basics\\\\\\\\0. Basics 3. Pandas', 2.5)\n",
 69 |       "('0', 'Basics\\\\\\\\0. Basics 2. Numpy', 2.6)\n",
 70 |       "\n",
 71 |       "('1', 'In and Export\\\\\\\\1. import and export 0. Object Storage', 1.3)\n",
 72 |       "('1', 'In and Export\\\\\\\\1. import and export 1. Download and upload', 1.3)\n",
 73 |       "('1', 'In and Export\\\\\\\\1. import and export 2. DashDB', 1.4)\n",
 74 |       "('1', 'In and Export\\\\\\\\1. import and export 3. Cloudant', 2.2)\n",
 75 |       "('1', 'In and Export\\\\\\\\1. import and export 4. Twitter', 2.2)\n",
 76 |       "('1', 'In and Export\\\\\\\\1. import and export 5. BigInsights', 2.3)\n",
 77 |       "\n",
 78 |       "('2', 'Watson APIs\\\\\\\\2. Watson 0. Weather API', 1.0)\n",
 79 |       "('2', 'Watson APIs\\\\\\\\2. Watson 2. Alchemy News', 1.5)\n",
 80 |       "('2', 'Watson APIs\\\\\\\\2. Watson 1. Personality Insights', 1.8)\n",
 81 |       "('2', 'Watson APIs\\\\\\\\2. Watson 5. Natural language classifier', 2.1)\n",
 82 |       "('2', 'Watson APIs\\\\\\\\2. Watson 3. Alchemy language', 2.3)\n",
 83 |       "('2', 'Watson APIs\\\\\\\\2. Watson 4. Tone analyzer', 2.3)\n",
 84 |       "\n",
 85 |       "('3', 'Visualization\\\\\\\\3. Visualization 0. Matplotlib', 1.8)\n",
 86 |       "('3', 'Visualization\\\\\\\\3. Visualization 1. Machine learning techniques', 1.8)\n",
 87 |       "('3', 'Visualization\\\\\\\\3. Visualization 2. Pixiedust', 2.0)\n",
 88 |       "('3', 'Visualization\\\\\\\\3. Visualization 3. Bokeh', 2.7)\n",
 89 |       "\n",
 90 |       "('4', 'Spark\\\\\\\\4. Spark 0. rdd-creation', 1.4)\n",
 91 |       "('4', 'Spark\\\\\\\\4. Spark 1. rdd-basics', 1.4)\n",
 92 |       "('4', 'Spark\\\\\\\\4. Spark 2. rdd-sampling', 2.5)\n",
 93 |       "('4', 'Spark\\\\\\\\4. Spark 3. rdd-set', 2.5)\n",
 94 |       "('4', 'Spark\\\\\\\\4. Spark 4. rdd-aggregations', 2.5)\n",
 95 |       "('4', 'Spark\\\\\\\\4. Spark 5. rdd-key-value', 2.5)\n",
 96 |       "('4', 'Spark\\\\\\\\4. Spark 7. mllib-logit', 2.5)\n",
 97 |       "('4', 'Spark\\\\\\\\4. Spark 8. mllib-trees', 2.5)\n",
 98 |       "('4', 'Spark\\\\\\\\4. Spark 9. sql-dataframes', 2.7)\n",
 99 |       "('4', 'Spark\\\\\\\\4. Spark 6. mllib-statistics', 2.8)\n",
100 |       "\n",
101 |       "('5', 'Machine Learning\\\\\\\\5. ML 1. Introduction', 1.3)\n",
102 |       "('5', 'Machine Learning\\\\\\\\5. ML 3. Scikit Learn interface', 2.0)\n",
103 |       "('5', 'Machine Learning\\\\\\\\5. ML 2. Data preparation', 2.1)\n",
104 |       "('5', 'Machine Learning\\\\\\\\5. ML 0. Install requirements', 2.3)\n",
105 |       "('5', 'Machine Learning\\\\\\\\5. ML 5. Model evaluation', 2.3)\n",
106 |       "('5', 'Machine Learning\\\\\\\\5. ML 4. Bias and variance', 2.4)\n",
107 |       "('5', 'Machine Learning\\\\\\\\5. ML 7. Ensemble methods advanced', 2.5)\n",
108 |       "('5', 'Machine Learning\\\\\\\\5. ML 9. Time series', 2.6)\n",
109 |       "('5', 'Machine Learning\\\\\\\\5. ML 6. Ensemble methods', 2.7)\n",
110 |       "('5', 'Machine Learning\\\\\\\\5. ML 8. Multi Model Ensembles', 2.8)\n",
111 |       "\n",
112 |       "('6', 'Deep Learning\\\\\\\\6. DL 0. Keras starter kit', 1.4)\n",
113 |       "('6', 'Deep Learning\\\\\\\\6. DL 1. Fun with activation functions', 1.8)\n",
114 |       "('6', 'Deep Learning\\\\\\\\6. DL 3. Embedding', 2.3)\n",
115 |       "('6', 'Deep Learning\\\\\\\\6. DL 5. Auto encoder', 2.3)\n",
116 |       "('6', 'Deep Learning\\\\\\\\6. DL 6. Recurrent networks', 2.5)\n",
117 |       "('6', 'Deep Learning\\\\\\\\6. DL 2. Convolutional networks', 2.6)\n",
118 |       "('6', 'Deep Learning\\\\\\\\6. DL 4. Multi-input models', 2.9)\n"
119 |      ]
120 |     }
121 |    ],
122 |    "source": [
123 |     "main=[i.strip() for i in res[14:]]\n",
124 |     "main=filter(None, main)\n",
125 |     "main=[i.split(\"#\") for i in main]\n",
126 |     "topics=[i[0].strip() for i in main]\n",
127 |     "rank=[i[1] for i in main]\n",
128 |     "votes = [len(i) for i in rank]\n",
129 |     "tot_score = [sum(int(i) for i in  j) for j in rank ]\n",
130 |     "chapter = [i[0] for i in topics ]\n",
131 |     "topics = [i[3:] for i in topics ]\n",
132 |     "avg_scores=[round(float(a)/float(b),1) for a,b in zip(tot_score,votes)]\n",
133 |     "\n",
134 |     "score=zip(chapter,topics,avg_scores)\n",
135 |     "score =sorted(score, key=lambda score: (score[0],score[2]))\n",
136 |     "\n",
137 |     "chapter=0\n",
138 |     "for i in score:\n",
139 |     "    if i[0]!=chapter:\n",
140 |     "        print \"\"\n",
141 |     "    print i\n",
142 |     "    chapter=i[0]"
143 |    ]
144 |   }
145 |  ],
146 |  "metadata": {
147 |   "anaconda-cloud": {},
148 |   "kernelspec": {
149 |    "display_name": "Python [conda root]",
150 |    "language": "python",
151 |    "name": "conda-root-py"
152 |   },
153 |   "language_info": {
154 |    "codemirror_mode": {
155 |     "name": "ipython",
156 |     "version": 2
157 |    },
158 |    "file_extension": ".py",
159 |    "mimetype": "text/x-python",
160 |    "name": "python",
161 |    "nbconvert_exporter": "python",
162 |    "pygments_lexer": "ipython2",
163 |    "version": "2.7.12"
164 |   }
165 |  },
166 |  "nbformat": 4,
167 |  "nbformat_minor": 1
168 | }
169 | 


--------------------------------------------------------------------------------
/agenda.txt:
--------------------------------------------------------------------------------
 1 | Spread 10 hours of training over the main topics:
 2 | 
 3 | 0. Basics                                                              #
 4 | 1. In and Export                                                       #
 5 | 2. Watson APIs                                                         #
 6 | 3. Visualization                                                       #
 7 | 4. Spark                                                               #
 8 | 5. Machine Learning                                                    #
 9 | 6. Training - Deep Learning                                            #
10 |                     
11 |    
12 | Per topic, mark your top 3 interest as 1,2,3:   
13 |                     
14 | 0. Basics\0. Basics 0. Jupyter notebook                                #
15 | 0. Basics\0. Basics 1. Python                                          #
16 | 0. Basics\0. Basics 2. Numpy                                           #
17 | 0. Basics\0. Basics 3. Pandas                                          #
18 | 0. Basics\0. Basics 4. Jupyter notebook magics, shell and R            #
19 |                                               
20 | 1. In and Export\1. import and export 0. Object Storage                #
21 | 1. In and Export\1. import and export 1. Download and upload           #
22 | 1. In and Export\1. import and export 2. DashDB                        #
23 | 1. In and Export\1. import and export 3. Cloudant                      #
24 | 1. In and Export\1. import and export 4. Twitter                       #
25 | 1. In and Export\1. import and export 5. BigInsights                   #
26 |                                               
27 | 2. Watson APIs\2. Watson 0. Weather API                                #
28 | 2. Watson APIs\2. Watson 1. Personality Insights                       #
29 | 2. Watson APIs\2. Watson 2. Alchemy News                               #
30 | 2. Watson APIs\2. Watson 3. Alchemy language                           #
31 | 2. Watson APIs\2. Watson 4. Tone analyzer                              #
32 | 2. Watson APIs\2. Watson 5. Natural language classifier                #
33 |                                          
34 | 3. Visualization\3. Visualization 0. Matplotlib                        #
35 | 3. Visualization\3. Visualization 1. Machine learning techniques       #
36 | 3. Visualization\3. Visualization 2. Pixiedust                         #
37 | 3. Visualization\3. Visualization 3. Bokeh                             #
38 |                                        
39 | 4. Spark\4. Spark 0. rdd-creation                                      #
40 | 4. Spark\4. Spark 1. rdd-basics                                        #
41 | 4. Spark\4. Spark 2. rdd-sampling                                      #
42 | 4. Spark\4. Spark 3. rdd-set                                           #
43 | 4. Spark\4. Spark 4. rdd-aggregations                                  #
44 | 4. Spark\4. Spark 5. rdd-key-value                                     #
45 | 4. Spark\4. Spark 6. mllib-statistics                                  #
46 | 4. Spark\4. Spark 7. mllib-logit                                       #
47 | 4. Spark\4. Spark 8. mllib-trees                                       #
48 | 4. Spark\4. Spark 9. sql-dataframes                                    #
49 |                                             
50 | 5. Machine Learning\5. ML 0. Install requirements                      #
51 | 5. Machine Learning\5. ML 1. Introduction                              #
52 | 5. Machine Learning\5. ML 2. Data preparation                          #
53 | 5. Machine Learning\5. ML 3. Scikit Learn interface                    #
54 | 5. Machine Learning\5. ML 4. Bias and variance                         #
55 | 5. Machine Learning\5. ML 5. Model evaluation                          #
56 | 5. Machine Learning\5. ML 6. Ensemble methods                          #
57 | 5. Machine Learning\5. ML 7. Ensemble methods advanced                 #
58 | 5. Machine Learning\5. ML 8. Multi Model Ensembles                     #
59 | 5. Machine Learning\5. ML 9. Time series                               #
60 |                                             
61 | 6. Deep Learning\6. DL 0. Keras starter kit                            #
62 | 6. Deep Learning\6. DL 1. Fun with activation functions                #
63 | 6. Deep Learning\6. DL 2. Convolutional networks                       #
64 | 6. Deep Learning\6. DL 3. Embedding                                    #
65 | 6. Deep Learning\6. DL 4. Multi-input models                           #
66 | 6. Deep Learning\6. DL 5. Auto encoder                                 #
67 | 6. Deep Learning\6. DL 6. Recurrent networks                           #
68 | 
69 | 
70 | 


--------------------------------------------------------------------------------
/other/Data Generation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "nbformat_minor": 0, 
  3 |     "metadata": {
  4 |         "language_info": {
  5 |             "pygments_lexer": "ipython2", 
  6 |             "name": "python", 
  7 |             "file_extension": ".py", 
  8 |             "codemirror_mode": {
  9 |                 "name": "ipython", 
 10 |                 "version": 2
 11 |             }, 
 12 |             "mimetype": "text/x-python", 
 13 |             "nbconvert_exporter": "python", 
 14 |             "version": "2.7.11"
 15 |         }, 
 16 |         "kernelspec": {
 17 |             "language": "python", 
 18 |             "name": "python2", 
 19 |             "display_name": "Python 2 with Spark 1.6"
 20 |         }
 21 |     }, 
 22 |     "nbformat": 4, 
 23 |     "cells": [
 24 |         {
 25 |             "execution_count": 1, 
 26 |             "metadata": {
 27 |                 "collapsed": true
 28 |             }, 
 29 |             "source": "import numpy as np\nimport pandas as pd", 
 30 |             "outputs": [], 
 31 |             "cell_type": "code"
 32 |         }, 
 33 |         {
 34 |             "execution_count": 28, 
 35 |             "metadata": {
 36 |                 "collapsed": false
 37 |             }, 
 38 |             "source": "def gen_data(n=100):\n    data = { \\\n    'v01': np.random.normal(0,1,n), \n    'v02': np.random.beta(3,1,n),\n    'v03': np.random.chisquare(3,n),       \n    'v04': np.random.exponential(3,n),\n    'v05': np.random.f(10,3,n),      \n    'v06': np.random.gamma(2,n),         \n    'v07': np.random.logistic(10,1,n),  \n    'v08': np.random.choice(2, n, p=[0.1, 0.9]),\n    'v09': np.random.choice(2, n, p=[0.2, 0.8]),\n    'v10': np.random.choice(2, n, p=[0.4, 0.6]),                \n    'v11': np.random.choice(3, n, p=[0.1, 0.2, 0.7]),\n    'v12': np.random.choice(3, n, p=[0.3, 0.3, 0.4]),\n    'v13': np.random.choice(5, n, p=[0.1, 0.2, 0.3, 0.2, 0.2 ]),\n    'v14': np.random.choice(5, n, p=[0.2, 0.2, 0.2, 0.2, 0.2]),\n    'v15': np.random.choice(10, n, p=[0.1, 0.1, 0.1, 0.1, 0.1,0.1, 0.1, 0.1, 0.1, 0.1]) \\\n         }\n\n    df = pd.DataFrame(data) \n\n\n    target =             \\\n    0.1     * df.v01 +      \\\n    -0.1    * df.v03 +      \\\n    0.015   * df.v04 +      \\\n    -0.00015* df.v06 +      \\\n    0.12    * df.v07 +      \\\n    0.11    * df.v08 +      \\\n    -0.13   * df.v10 +      \\\n    -0.05   * (df.v11==1) + \\\n    0.13    * (df.v13==1) + \\\n    -0.12   * (df.v13==3) + \\\n    0.16    * (df.v14==1) + \\\n    -0.02   * (df.v14==2) + \\\n    0.01    * (df.v15==2) + \\\n    -0.02   * (df.v15==4) + \\\n    0.04    * (df.v15==6) + \\\n    -0.04   * (df.v15==8) + \\\n    0.1     * df.v01 * df.v02 +        \\\n    -0.1    * df.v02 * df.v03 +        \\\n    0.000005* df.v04 * df.v06 +        \\\n    -0.15   * df.v03 * df.v08 +        \\\n    0.12    * df.v07 * df.v09 +        \\\n    0.11    * df.v10 * df.v11 +        \\\n    -0.13   * df.v12 * df.v14 +        \\\n    -0.05   * (df.v11==2) * df.v01 +   \\\n    0.13    * (df.v13==1) * df.v02 +   \\\n    -0.12   * (df.v13==2) * df.v03 +   \\\n    0.16    * (df.v14==2) * df.v04 +   \\\n    -0.02   * (df.v14==3) * df.v05 +   \\\n    0.00001 * (df.v15==1) * df.v06 +   \\\n    -0.02   * (df.v15==5) * df.v07 +   \\\n    0.04    * (df.v15==7) * df.v08 +   \\\n    -0.04   * (df.v15==5) * df.v09 +   \\\n    0.04    * (df.v15==4) * (df.v13==8) +  \\\n    -0.04   * (df.v15==3) * (df.v13==8) +  \\\n    0.04    * (df.v15==2) * (df.v14==8) +  \\\n    -0.04   * (df.v15==1) * (df.v14==8) +  \\\n    0.04    * (df.v15==2) * (df.v12==8) +  \\\n    -0.04   * (df.v15==3) * (df.v13==8)\n\n    #df['target']=target\n    df['target']=(np.exp(target)/(1+np.exp(target))>0.5).astype(int)\n    \n    return df\n", 
 39 |             "outputs": [], 
 40 |             "cell_type": "code"
 41 |         }, 
 42 |         {
 43 |             "execution_count": 47, 
 44 |             "metadata": {
 45 |                 "collapsed": false, 
 46 |                 "scrolled": true
 47 |             }, 
 48 |             "source": "data_train = gen_data(n=20000)\ndata_test  = gen_data(n=100000)\n\n\ndata_train.head()\n", 
 49 |             "outputs": [
 50 |                 {
 51 |                     "execution_count": 47, 
 52 |                     "metadata": {}, 
 53 |                     "data": {
 54 |                         "text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>v01</th>\n      <th>v02</th>\n      <th>v03</th>\n      <th>v04</th>\n      <th>v05</th>\n      <th>v06</th>\n      <th>v07</th>\n      <th>v08</th>\n      <th>v09</th>\n      <th>v10</th>\n      <th>v11</th>\n      <th>v12</th>\n      <th>v13</th>\n      <th>v14</th>\n      <th>v15</th>\n      <th>target</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>-0.581274</td>\n      <td>0.299583</td>\n      <td>0.672571</td>\n      <td>0.879624</td>\n      <td>1.417787</td>\n      <td>12273.412584</td>\n      <td>11.577250</td>\n      <td>1</td>\n      <td>0</td>\n      <td>1</td>\n      <td>0</td>\n      <td>2</td>\n      <td>4</td>\n      <td>1</td>\n      <td>3</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>0.102360</td>\n      <td>0.942390</td>\n      <td>1.358432</td>\n      <td>1.289167</td>\n      <td>3.519207</td>\n      <td>12273.412584</td>\n      <td>8.978558</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>2</td>\n      <td>1</td>\n      <td>1</td>\n      <td>2</td>\n      <td>6</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>-0.131057</td>\n      <td>0.796321</td>\n      <td>1.218057</td>\n      <td>1.704464</td>\n      <td>0.582041</td>\n      <td>12273.412584</td>\n      <td>10.950337</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>2</td>\n      <td>0</td>\n      <td>3</td>\n      <td>3</td>\n      <td>0</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>-1.664699</td>\n      <td>0.972988</td>\n      <td>2.604471</td>\n      <td>1.165724</td>\n      <td>4.582620</td>\n      <td>12273.412584</td>\n      <td>12.149130</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>0</td>\n      <td>2</td>\n      <td>3</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>1.018567</td>\n      <td>0.796813</td>\n      <td>1.549261</td>\n      <td>5.730283</td>\n      <td>0.854525</td>\n      <td>12273.412584</td>\n      <td>8.379034</td>\n      <td>1</td>\n      <td>1</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>3</td>\n      <td>0</td>\n      <td>9</td>\n      <td>1</td>\n    </tr>\n  </tbody>\n</table>\n</div>", 
 55 |                         "text/plain": "        v01       v02       v03       v04       v05           v06        v07  \\\n0 -0.581274  0.299583  0.672571  0.879624  1.417787  12273.412584  11.577250   \n1  0.102360  0.942390  1.358432  1.289167  3.519207  12273.412584   8.978558   \n2 -0.131057  0.796321  1.218057  1.704464  0.582041  12273.412584  10.950337   \n3 -1.664699  0.972988  2.604471  1.165724  4.582620  12273.412584  12.149130   \n4  1.018567  0.796813  1.549261  5.730283  0.854525  12273.412584   8.379034   \n\n   v08  v09  v10  v11  v12  v13  v14  v15  target  \n0    1    0    1    0    2    4    1    3       0  \n1    1    1    1    2    1    1    2    6       1  \n2    1    1    1    2    0    3    3    0       1  \n3    1    1    1    1    0    2    3    0       0  \n4    1    1    0    0    0    3    0    9       1  "
 56 |                     }, 
 57 |                     "output_type": "execute_result"
 58 |                 }
 59 |             ], 
 60 |             "cell_type": "code"
 61 |         }, 
 62 |         {
 63 |             "execution_count": null, 
 64 |             "metadata": {
 65 |                 "collapsed": true
 66 |             }, 
 67 |             "source": "@hidden_cell\n\ncredentials_1 = {\n  'host':'awh-yp-small03.services.dal.bluemix.net',\n  'port':'50000',\n  'user':'dash110459',\n  'password':\"\"\"cc7fcfe60374\"\"\",\n  'database':'BLUDB'\n}", 
 68 |             "outputs": [], 
 69 |             "cell_type": "code"
 70 |         }, 
 71 |         {
 72 |             "execution_count": 48, 
 73 |             "metadata": {
 74 |                 "collapsed": false, 
 75 |                 "scrolled": true
 76 |             }, 
 77 |             "source": "import ibmdbpy\nfrom ibmdbpy import IdaDataBase\n\n\n\nidadb = IdaDataBase(dsn=\"DASHDB;Database=BLUDB;Hostname=\" + credentials_1[\"host\"] + \";Port=50000;PROTOCOL=TCPIP;UID=\" + credentials_1[\"user\"] + \";PWD=\" + credentials_1[\"password\"])\nidadf = idadb.as_idadataframe(data_train, \"DATA_TRAIN\", clear_existing=True)\nidadf = idadb.as_idadataframe(data_test, \"DATA_TEST\", clear_existing=True)", 
 78 |             "outputs": [
 79 |                 {
 80 |                     "name": "stderr", 
 81 |                     "output_type": "stream", 
 82 |                     "text": "Exception AttributeError: \"Cursor instance has no attribute 'closed'\" in <bound method Cursor.__del__ of <pypyodbc.Cursor instance at 0x7f79963ed7e8>> ignored\n"
 83 |                 }, 
 84 |                 {
 85 |                     "name": "stdout", 
 86 |                     "output_type": "stream", 
 87 |                     "text": "DataFrame will be splitted into 40 chunks. (500 rows per chunk)\nUploaded: 40/40... [DONE]\n"
 88 |                 }, 
 89 |                 {
 90 |                     "name": "stderr", 
 91 |                     "output_type": "stream", 
 92 |                     "text": "Exception AttributeError: \"Cursor instance has no attribute 'closed'\" in <bound method Cursor.__del__ of <pypyodbc.Cursor instance at 0x7f79899cc758>> ignored\n"
 93 |                 }, 
 94 |                 {
 95 |                     "name": "stdout", 
 96 |                     "output_type": "stream", 
 97 |                     "text": "DataFrame will be splitted into 200 chunks. (500 rows per chunk)\nUploaded: 200/200... [DONE]\n"
 98 |                 }
 99 |             ], 
100 |             "cell_type": "code"
101 |         }
102 |     ]
103 | }


--------------------------------------------------------------------------------
/other/SF Usage 1B records.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "nbformat_minor": 0, 
  3 |     "metadata": {
  4 |         "language_info": {
  5 |             "pygments_lexer": "ipython2", 
  6 |             "name": "python", 
  7 |             "file_extension": ".py", 
  8 |             "codemirror_mode": {
  9 |                 "name": "ipython", 
 10 |                 "version": 2
 11 |             }, 
 12 |             "mimetype": "text/x-python", 
 13 |             "nbconvert_exporter": "python", 
 14 |             "version": "2.7.11"
 15 |         }, 
 16 |         "kernelspec": {
 17 |             "language": "python", 
 18 |             "name": "python2", 
 19 |             "display_name": "Python 2 with Spark 1.6"
 20 |         }
 21 |     }, 
 22 |     "nbformat": 4, 
 23 |     "cells": [
 24 |         {
 25 |             "execution_count": 7, 
 26 |             "metadata": {
 27 |                 "collapsed": true
 28 |             }, 
 29 |             "source": "import pandas as pd\nimport numpy as np", 
 30 |             "outputs": [], 
 31 |             "cell_type": "code"
 32 |         }, 
 33 |         {
 34 |             "execution_count": 8, 
 35 |             "metadata": {
 36 |                 "collapsed": false
 37 |             }, 
 38 |             "source": "import ibmdbpy\nfrom ibmdbpy import IdaDataBase,IdaDataFrame\n\ncredentials_1 = {\n  'host':'dashdb-entry-yp-dal09-08.services.dal.bluemix.net',\n  'port':'50000',\n  'user':'dash8753',\n  'password':\"\"\"ddd6463d0ddc\"\"\",\n  'database':'BLUDB'\n}\n\nidadb = IdaDataBase(dsn=\"DASHDB;Database=BLUDB;Hostname=\" + credentials_1[\"host\"] + \";Port=50000;PROTOCOL=TCPIP;UID=\" + credentials_1[\"user\"] + \";PWD=\" + credentials_1[\"password\"])\n\n", 
 39 |             "outputs": [], 
 40 |             "cell_type": "code"
 41 |         }, 
 42 |         {
 43 |             "execution_count": 9, 
 44 |             "metadata": {
 45 |                 "collapsed": false, 
 46 |                 "scrolled": true
 47 |             }, 
 48 |             "source": "date_range=np.hstack((\n        np.arange(20160701,20160731),\n        np.arange(20160801,20160831),\n        np.arange(20160901,20160930),        \n        np.arange(20161001,20161031),\n        np.arange(20161101,20161130),\n        np.arange(20161201,20161231)))\n\npivot=['VOICE,INCOMING','VOICE,OUTGOING','SMS,INCOMING','SMS,OUTGOING','DATA,INCOMING','DATA,OUTGOING']\n      \nbase=np.transpose([np.tile(pivot, len(date_range)), np.repeat(date_range, len(pivot))])\nbase[:,[0, 1]] = base[:,[1, 0]]\nbase = np.hstack((base[:,:1], map(lambda x: x.split(','), base[:,1])))\n\nID = 11111111\nrecs = np.random.random_integers(500,base.shape[0])\none_ID=base[np.sort(np.random.choice(range(base.shape[0]), recs, replace=False)),:]\n\nnums=np.reshape(np.maximum(0,np.random.normal(1,10,recs*4)),(-1,4))\n\n\n#from timeit import default_timer as timer\n\n#start = timer()\n#res=pd.DataFrame(np.c_[ (np.repeat(ID,recs), one_ID, nums)],columns=['ID','date','cdr_type_name','cdr_type_direction','tot_num_times','tot_duration','total_up_down','tot_costs'])\n#res=res.apply(lambda x: pd.to_numeric(x, errors='ignore'))\n#end = timer()\n#print(end - start)    \n\n#start = timer()\nres=pd.concat([pd.DataFrame(np.repeat(ID,recs),columns=['subs_id']),\n               pd.DataFrame(one_ID,columns=['prd_id','cdr_type_name','cdr_direction']),\n               pd.DataFrame(nums,columns=['tot_num_times','tot_duration','total_up_down','tot_costs'])], axis=1)\nres['prd_id']=pd.to_datetime(res['prd_id'], format='%Y%m%d', errors='ignore')\n#end = timer()\n#print(end - start)   \n\n#idadb.as_idadataframe(res, \"SF_USAGE\", clear_existing=True)", 
 49 |             "outputs": [
 50 |                 {
 51 |                     "name": "stderr", 
 52 |                     "output_type": "stream", 
 53 |                     "text": "Exception AttributeError: \"Cursor instance has no attribute 'closed'\" in <bound method Cursor.__del__ of <pypyodbc.Cursor instance at 0x7f759ca99d88>> ignored\n"
 54 |                 }, 
 55 |                 {
 56 |                     "name": "stdout", 
 57 |                     "output_type": "stream", 
 58 |                     "text": "Uploading 987 rows (maxnrow was set to 1000)\n"
 59 |                 }, 
 60 |                 {
 61 |                     "execution_count": 9, 
 62 |                     "metadata": {}, 
 63 |                     "data": {
 64 |                         "text/plain": "<ibmdbpy.frame.IdaDataFrame at 0x7f75b0d2f350>"
 65 |                     }, 
 66 |                     "output_type": "execute_result"
 67 |                 }
 68 |             ], 
 69 |             "cell_type": "code"
 70 |         }, 
 71 |         {
 72 |             "execution_count": 11, 
 73 |             "metadata": {
 74 |                 "collapsed": false
 75 |             }, 
 76 |             "source": "%%capture\n\nID = 11235879\n\n\nSF_USAGE = IdaDataFrame(idadb, 'SF_USAGE')\nfor i in range(15232):\n    ID+=1\n    recs = np.random.random_integers(500,base.shape[0])\n    one_ID=base[np.sort(np.random.choice(range(base.shape[0]), recs, replace=False)),:]\n\n    nums=np.reshape(np.maximum(0,np.random.normal(1,10,recs*4)),(-1,4))\n    res=pd.concat([pd.DataFrame(np.repeat(ID,recs),columns=['subs_id']),\n               pd.DataFrame(one_ID,columns=['prd_id','cdr_type_name','cdr_direction']),\n               pd.DataFrame(nums,columns=['tot_num_times','tot_duration','total_up_down','tot_costs'])], axis=1)\n    res['prd_id']=pd.to_datetime(res['prd_id'], format='%Y%m%d', errors='ignore')\n    \n    idadb.append(SF_USAGE,res);\n    SF_USAGE.commit()", 
 77 |             "outputs": [
 78 |                 {
 79 |                     "name": "stdout", 
 80 |                     "output_type": "stream", 
 81 |                     "text": "Uploading 836 rows (maxnrow was set to 1000)\nUploading 903 rows (maxnrow was set to 1000)\nUploading 740 rows (maxnrow was set to 1000)\nUploading 937 rows (maxnrow was set to 1000)\nUploading 883 rows (maxnrow was set to 1000)\nUploading 732 rows (maxnrow was set to 1000)\nUploading 1046 rows (maxnrow was set to 1000)\nUploading 663 rows (maxnrow was set to 1000)\nUploading 812 rows (maxnrow was set to 1000)\nUploading 901 rows (maxnrow was set to 1000)\nUploading 596 rows (maxnrow was set to 1000)\nUploading 525 rows (maxnrow was set to 1000)\nUploading 553 rows (maxnrow was set to 1000)\nUploading 982 rows (maxnrow was set to 1000)\nUploading 808 rows (maxnrow was set to 1000)\nUploading 759 rows (maxnrow was set to 1000)\nUploading 769 rows (maxnrow was set to 1000)\nUploading 954 rows (maxnrow was set to 1000)\nUploading 667 rows (maxnrow was set to 1000)\nUploading 675 rows (maxnrow was set to 1000)\nUploading 912 rows (maxnrow was set to 1000)\n"
 82 |                 }, 
 83 |                 {
 84 |                     "ename": "KeyboardInterrupt", 
 85 |                     "evalue": "", 
 86 |                     "output_type": "error", 
 87 |                     "traceback": [
 88 |                         "\u001b[0;31m\u001b[0m", 
 89 |                         "\u001b[0;31mKeyboardInterrupt\u001b[0mTraceback (most recent call last)", 
 90 |                         "\u001b[0;32m<ipython-input-11-28e64b35a2aa>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     15\u001b[0m     \u001b[0mres\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'prd_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_datetime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mres\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'prd_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'%Y%m%d'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'ignore'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m     \u001b[0midadb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSF_USAGE\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mres\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m;\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     18\u001b[0m     \u001b[0mSF_USAGE\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 
 91 |                         "\u001b[0;32m/usr/local/src/bluemix_jupyter_bundle.v33/notebook/lib/python2.7/site-packages/ibmdbpy/base.pyc\u001b[0m in \u001b[0;36mappend\u001b[0;34m(self, idadf, df, maxnrow)\u001b[0m\n\u001b[1;32m   1382\u001b[0m             \u001b[0;32mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Uploading %s rows (maxnrow was set to %s)\"\u001b[0m\u001b[0;34m%\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmaxnrow\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1383\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1384\u001b[0;31m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_insert_into_database\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midadf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtablename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msilent\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1385\u001b[0m             \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1386\u001b[0m                 \u001b[0;32mraise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 
 92 |                         "\u001b[0;32m/usr/local/src/bluemix_jupyter_bundle.v33/notebook/lib/python2.7/site-packages/ibmdbpy/base.pyc\u001b[0m in \u001b[0;36m_insert_into_database\u001b[0;34m(self, dataframe, tablename, silent)\u001b[0m\n\u001b[1;32m   1926\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1927\u001b[0m                     \u001b[0mvalue_string\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;34m'%s,'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1928\u001b[0;31m             \u001b[0;32mif\u001b[0m \u001b[0mvalue_string\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m','\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1929\u001b[0m                 \u001b[0mvalue_string\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalue_string\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1930\u001b[0m             \u001b[0mrow_string\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;34m\"(%s),\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mvalue_string\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 
 93 |                         "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
 94 |                     ]
 95 |                 }
 96 |             ], 
 97 |             "cell_type": "code"
 98 |         }
 99 |     ]
100 | }


--------------------------------------------------------------------------------