├── 0. Basics
├── .ipynb_checkpoints
│   └── 0. Basics 4. Jupyter notebook magics, shell and R-checkpoint.ipynb
├── 0. Basics 0. Jupyter notebook.ipynb
├── 0. Basics 1. Python.ipynb
├── 0. Basics 2. Numpy.ipynb
├── 0. Basics 3. Pandas.ipynb
├── 0. Basics 4. Jupyter notebook magics, shell and R.ipynb
├── 0. Basics 5. Exercise 0.ipynb
├── 0. Basics 5. Exercise help.ipynb
├── 0. Basics 6. Some data.zip
├── Numpy_Python_Cheat_Sheet.pdf
└── PandasPythonForDataScience.pdf
├── 1. In and Export
├── 1. import and export 0. Object Storage.ipynb
├── 1. import and export 1. Download and upload.ipynb
├── 1. import and export 2. DashDB.ipynb
├── 1. import and export 3. Cloudant.ipynb
├── 1. import and export 4. Twitter.ipynb
└── 1. import and export 5. BigInsights.ipynb
├── 2. Watson APIs
├── 2. Watson 0. Weather API.ipynb
├── 2. Watson 1. Personality Insights.ipynb
├── 2. Watson 2. Alchemy News.ipynb
├── 2. Watson 3. Alchemy language.ipynb
├── 2. Watson 4. Tone analyzer.ipynb
└── 2. Watson 5. Natural language classifier.ipynb
├── 3. Visualization
├── 3. Visualization 0. Matplotlib.ipynb
├── 3. Visualization 1. Machine learning techniques.ipynb
├── 3. Visualization 2. Pixiedust.ipynb
├── 3. Visualization 3. Bokeh.ipynb
├── Python_Bokeh_Cheat_Sheet.pdf
└── Python_Matplotlib_Cheat_Sheet.pdf
├── 4. Spark
├── 4. Spark 0. rdd-creation.ipynb
├── 4. Spark 1. rdd-basics.ipynb
├── 4. Spark 2. rdd-sampling.ipynb
├── 4. Spark 3. rdd-set.ipynb
├── 4. Spark 4. rdd-aggregations.ipynb
├── 4. Spark 5. rdd-key-value.ipynb
├── 4. Spark 6. mllib-statistics.ipynb
├── 4. Spark 7. mllib-logit.ipynb
├── 4. Spark 8. mllib-trees.ipynb
├── 4. Spark 9. sql-dataframes.ipynb
├── LICENSE
└── README.md
├── 5. Machine Learning
├── 5. ML 0. Install requirements.ipynb
├── 5. ML 1. Introduction.ipynb
├── 5. ML 2. Data preparation.ipynb
├── 5. ML 3. Scikit Learn interface.ipynb
├── 5. ML 4. Bias and variance.ipynb
├── 5. ML 5. Model evaluation.ipynb
├── 5. ML 6. Ensemble methods.ipynb
├── 5. ML 7. Ensemble methods advanced.ipynb
├── 5. ML 8. Multi Model Ensembles.ipynb
├── 5. ML 9. Time series.ipynb
└── Scikit_Learn_Cheat_Sheet_Python.pdf
├── 6. Deep Learning
├── .ipynb_checkpoints
│   └── 6. DL 2. Convolutional networks-checkpoint.ipynb
├── 6. DL 0. Keras starter kit.ipynb
├── 6. DL 1. Fun with activation functions.ipynb
├── 6. DL 2. Convolutional networks.ipynb
├── 6. DL 3. Embedding.ipynb
├── 6. DL 4. Multi-input models.ipynb
├── 6. DL 5. Auto encoder.ipynb
├── 6. DL 6. Recurrent networks.ipynb
├── Keras.js Demos.url
└── Keras_Cheat_Sheet_Python.pdf
├── 7. Misc
├── Training Dataset.arff
├── bias and variance.png
├── biasvariance.py
├── ensemble_explore_hastie.png
├── international-airline-passengers.csv
├── learning_curves.png
├── matlab_test_data_01.mat
├── moon phases.xlsx
├── test.csv
├── train.csv
├── tree.png
└── weather.txt
├── Class intro.pptx
├── Putting data to work.pptx
├── README.md
├── agenda voting.ipynb
├── agenda.txt
└── other
├── Data Generation.ipynb
├── HELPDESK - DATA IMPORT.ipynb
├── Lecture-4-Matplotlib.ipynb
├── Lime.ipynb
├── PandasCheatSheet.ipynb
├── SF Usage 1B records.ipynb
├── Scikit-learn models.ipynb
├── recharge model.ipynb
├── things_in_pandas.ipynb
└── tmp.ipynb
/0. Basics/0. Basics 0. 
Jupyter notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "cells": [ 5 | { 6 | "source": "# Jupyter notebook basics", 7 | "cell_type": "markdown", 8 | "metadata": { 9 | "collapsed": true 10 | } 11 | }, 12 | { 13 | "source": "Notebook documents (or \u201cnotebooks\u201d, all lower case) are documents produced by the Jupyter Notebook App, which contain both computer code (e.g. python) and rich text elements (paragraph, equations, figures, links, etc...). \n\nNotebook documents are both human-readable documents containing the analysis description and the results (figures, tables, etc..) as well as executable documents which can be run to perform data analysis.", 14 | "cell_type": "markdown", 15 | "metadata": {} 16 | }, 17 | { 18 | "source": "The Jupyter Notebook App is a server-client application that allows editing and running notebook documents via a web browser. The IBM version of Jupyter Notebook App is installed on a remote server and accessed through the internet.", 19 | "cell_type": "markdown", 20 | "metadata": {} 21 | }, 22 | { 23 | "source": "A notebook kernel is a \u201ccomputational engine\u201d that executes the code contained in a Notebook document. The ipython kernel, referenced in this guide, executes python code. Kernels for many other languages exist (check the Kernels menu above).\n\nWhen you open a Notebook document, the associated kernel is automatically launched. When the notebook is executed (either cell-by-cell or with menu Cell -> Run All), the kernel performs the computation and produces the results. Depending on the type of computations, the kernel may consume significant CPU and RAM. Note that the RAM is not released until the kernel is shut-down", 24 | "cell_type": "markdown", 25 | "metadata": {} 26 | }, 27 | { 28 | "source": "# Useful shortcuts in Jupter:\n\nctrl-enter: execute the active cell and stay in that cell\nshift-enter: execute the active cell and move to the next cell\n\nTry it out in the next cell:", 29 | "cell_type": "markdown", 30 | "metadata": {} 31 | }, 32 | { 33 | "execution_count": 1, 34 | "metadata": { 35 | "collapsed": false, 36 | "scrolled": true 37 | }, 38 | "outputs": [ 39 | { 40 | "text": "Hello world\n", 41 | "name": "stdout", 42 | "output_type": "stream" 43 | } 44 | ], 45 | "cell_type": "code", 46 | "source": "print (\"Hello world\")" 47 | }, 48 | { 49 | "execution_count": 3, 50 | "metadata": { 51 | "collapsed": false, 52 | "scrolled": true 53 | }, 54 | "outputs": [ 55 | { 56 | "text": "Second line\n", 57 | "name": "stdout", 58 | "output_type": "stream" 59 | } 60 | ], 61 | "cell_type": "code", 62 | "source": "print (\"Second line\")" 63 | }, 64 | { 65 | "source": "# Inserting new cells\n\nwhen a cell is selected in blue (click in the margin to the left of the cell), it shows a blue surrounding box.\n\nType: \na (above) to create a new empty cell above the currently active cell\nb (below) to create a new empty cell below the currently active cell\n\nTry it out.", 66 | "cell_type": "markdown", 67 | "metadata": {} 68 | }, 69 | { 70 | "source": "# markup code\n\nThe shorcut m (with the blue selection) changes the cell from computation to markdown. 
This allows to create rich text elements to document the code.", 71 | "cell_type": "markdown", 72 | "metadata": {} 73 | }, 74 | { 75 | "execution_count": null, 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "outputs": [], 80 | "cell_type": "code", 81 | "source": "Try it out: make this text part of the markdown. " 82 | }, 83 | { 84 | "execution_count": null, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "cell_type": "code", 90 | "source": "# Heading 1\n## Heading 2\n### Heading 3\n#### Heading 4\n\n**bold**\n\n*italics*\n\nempty line is a paragraph\n\nthis is a new parapgraph" 91 | }, 92 | { 93 | "source": "# Heading 1\n## Heading 2\n### Heading 3\n#### Heading 4\n\n**bold**\n\n*italics*\n\nempty line is a paragraph\n\nthis is a new parapgraph", 94 | "cell_type": "markdown", 95 | "metadata": {} 96 | }, 97 | { 98 | "source": "# other shotcuts\n\nThe shorcut h (with the blue selection) shows other shortcuts\nThe shortcut o (with the blue selection) will hide long output.", 99 | "cell_type": "markdown", 100 | "metadata": {} 101 | }, 102 | { 103 | "execution_count": 5, 104 | "metadata": { 105 | "collapsed": false, 106 | "scrolled": true 107 | }, 108 | "outputs": [ 109 | { 110 | "text": "Hello world\nHello world\nHello world\nHello world\nHello world\nHello world\nHello world\nHello world\nHello world\nHello world\n", 111 | "name": "stdout", 112 | "output_type": "stream" 113 | } 114 | ], 115 | "cell_type": "code", 116 | "source": "# Try toggling the output\nfor i in range(10):\n print (\"Hello world\")" 117 | }, 118 | { 119 | "source": "Help on objects:\nobj?, obj?? : Get help, or more help for object (also works as\n ?obj, ??obj).\n?foo.*abc* : List names in 'foo' containing 'abc' in them.", 120 | "cell_type": "markdown", 121 | "metadata": {} 122 | }, 123 | { 124 | "execution_count": 15, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [], 129 | "cell_type": "code", 130 | "source": "#try it out\n?dict" 131 | } 132 | ], 133 | "metadata": { 134 | "language_info": { 135 | "nbconvert_exporter": "python", 136 | "file_extension": ".py", 137 | "codemirror_mode": { 138 | "name": "ipython", 139 | "version": 2 140 | }, 141 | "pygments_lexer": "ipython2", 142 | "version": "2.7.11", 143 | "mimetype": "text/x-python", 144 | "name": "python" 145 | }, 146 | "kernelspec": { 147 | "language": "python", 148 | "display_name": "Python 2 with Spark 1.6", 149 | "name": "python2" 150 | } 151 | } 152 | } -------------------------------------------------------------------------------- /0. Basics/0. Basics 5. 
Exercise 0.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Moon phases" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "collapsed": false, 14 | "scrolled": false 15 | }, 16 | "source": [ 17 | "# This site contains the moon phases \n", 18 | "### http://aa.usno.navy.mil/data/docs/MoonFraction.php\n", 19 | " \n", 20 | "# Download this data and convert in the appropriate format (think about what appropriate means) \n", 21 | "### http://aa.usno.navy.mil/cgi-bin/aa_moonill2.pl?form=1&year=2017&task=00&tz=-05" 22 | ] 23 | } 24 | ], 25 | "metadata": { 26 | "anaconda-cloud": {}, 27 | "kernelspec": { 28 | "display_name": "Python [conda root]", 29 | "language": "python", 30 | "name": "conda-root-py" 31 | }, 32 | "language_info": { 33 | "codemirror_mode": { 34 | "name": "ipython", 35 | "version": 2 36 | }, 37 | "file_extension": ".py", 38 | "mimetype": "text/x-python", 39 | "name": "python", 40 | "nbconvert_exporter": "python", 41 | "pygments_lexer": "ipython2", 42 | "version": "2.7.12" 43 | } 44 | }, 45 | "nbformat": 4, 46 | "nbformat_minor": 1 47 | } 48 | -------------------------------------------------------------------------------- /0. Basics/0. Basics 6. Some data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/0. Basics/0. Basics 6. Some data.zip -------------------------------------------------------------------------------- /0. Basics/Numpy_Python_Cheat_Sheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/0. Basics/Numpy_Python_Cheat_Sheet.pdf -------------------------------------------------------------------------------- /0. Basics/PandasPythonForDataScience.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/0. Basics/PandasPythonForDataScience.pdf -------------------------------------------------------------------------------- /1. In and Export/1. import and export 0. 
Object Storage.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": "# In-and export from Object storage" 7 | }, 8 | { 9 | "cell_type": "code", 10 | "metadata": { 11 | "collapsed": true 12 | }, 13 | "outputs": [], 14 | "source": "#imports\nfrom io import StringIO\nimport requests\nimport json\n\nimport pandas as pd\nimport numpy as np", 15 | "execution_count": 2 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": "Credentials" 21 | }, 22 | { 23 | "cell_type": "code", 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": "@hidden_cell\n\ncredentials_1= {\n \"auth_url\": \"https://identity.open.softlayer.com\",\n \"project\": \"object_storage_effacaaa_ad08_4ee4_bd59_b2e105bc9639\",\n \"projectId\": \"e96165fa44c44a7d956507ebf4026cfb\",\n \"region\": \"dallas\",\n \"userId\": \"5002ddd00432452d8dd086e3f74ed3f1\",\n \"username\": \"admin_bdffbeee0797e72fe86a4270ec23774a835cbd4b\",\n \"password\": \"TAu!-k36VR[&xp9V\",\n \"domainId\": \"193a321481be4f73b08e76a87e7d585a\",\n \"container\":\"DSETraining101ObjectStorage\",\n \"domainName\": \"1123181\",\n \"role\": \"admin\"\n}\n", 29 | "execution_count": 3 30 | }, 31 | { 32 | "cell_type": "code", 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [ 37 | { 38 | "metadata": {}, 39 | "data": { 40 | "text/plain": " 0 1 2 3 4 5 6 \\\n0 0.218387 0.681619 0.127457 0.176184 0.208149 0.431590 0.436492 \n1 0.438189 0.477055 0.292263 0.737119 0.986888 0.571864 0.952132 \n2 0.246476 0.425691 0.344394 0.302752 0.592942 0.350859 0.620692 \n3 0.303911 0.326860 0.147415 0.208054 0.371225 0.910451 0.544563 \n4 0.885194 0.156615 0.795850 0.208292 0.658539 0.738021 0.750869 \n\n 7 8 9 ... 90 91 92 \\\n0 0.346246 0.632588 0.829314 ... 0.432520 0.974076 0.894935 \n1 0.589837 0.010117 0.174665 ... 0.301537 0.552984 0.879647 \n2 0.516486 0.033306 0.304789 ... 0.131419 0.591061 0.840573 \n3 0.394877 0.424432 0.418725 ... 0.556135 0.013987 0.149688 \n4 0.233611 0.890301 0.501873 ... 0.149136 0.617435 0.659965 \n\n 93 94 95 96 97 98 99 \n0 0.410939 0.487212 0.428583 0.598402 0.706644 0.385664 0.493396 \n1 0.380063 0.300516 0.398224 0.739755 0.462363 0.046500 0.510257 \n2 0.664932 0.472257 0.998087 0.073214 0.984443 0.957306 0.745294 \n3 0.604044 0.574204 0.930515 0.797487 0.785470 0.447085 0.587239 \n4 0.096339 0.922107 0.220996 0.989596 0.604612 0.461251 0.658624 \n\n[5 rows x 100 columns]", 41 | "text/html": "
" 42 | }, 43 | "execution_count": 8, 44 | "output_type": "execute_result" 45 | } 46 | ], 47 | "source": "# create some data\nrandom_data=pd.DataFrame(np.random.random((1000, 100)))\nrandom_data.head()", 48 | "execution_count": 8 49 | }, 50 | { 51 | "cell_type": "code", 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [ 56 | { 57 | "text": "\n", 58 | "output_type": "stream", 59 | "name": "stdout" 60 | } 61 | ], 62 | "source": "#usecase you have a file in Python (for example from a database) and you want to create a file in local object storage\n\ndef put_file(credentials, local_file_name): \n \"\"\"This functions returns a StringIO object containing\n the file content from Bluemix Object Storage V3.\"\"\"\n f = open(local_file_name,'r')\n my_data = f.read()\n url1 = ''.join(['https://identity.open.softlayer.com', '/v3/auth/tokens'])\n data = {'auth': {'identity': {'methods': ['password'],\n 'password': {'user': {'name': credentials['username'],'domain': {'id': credentials['domainId']},\n 'password': credentials['password']}}}}}\n headers1 = {'Content-Type': 'application/json'}\n resp1 = requests.post(url=url1, data=json.dumps(data), headers=headers1)\n resp1_body = resp1.json()\n for e1 in resp1_body['token']['catalog']:\n if(e1['type']=='object-store'):\n for e2 in e1['endpoints']:\n if(e2['interface']=='public'and e2['region']=='dallas'):\n url2 = ''.join([e2['url'],'/', credentials['container'], '/', local_file_name])\n s_subject_token = resp1.headers['x-subject-token']\n headers2 = {'X-Auth-Token': s_subject_token, 'accept': 'application/json'}\n resp2 = requests.put(url=url2, headers=headers2, data = my_data )\n print resp2\n \n \n# step 1: store object in local file.\n# data_train is an earlier defined pandas dataframe containing data\nrandom_data.to_csv('random_data.csv',index=False)\n#step 2: move to object storage \nput_file(credentials_1,\"random_data.csv\") ", 63 | "execution_count": 9 64 | }, 65 | { 66 | "cell_type": "code", 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [ 71 | { 72 | "metadata": {}, 73 | "data": { 74 | "text/plain": " 0 1 2 3 4 5 6 \\\n0 0.218387 0.681619 0.127457 0.176184 0.208149 0.431590 0.436492 \n1 0.438189 0.477055 0.292263 0.737119 0.986888 0.571864 0.952132 \n2 0.246476 0.425691 0.344394 0.302752 0.592942 0.350859 0.620692 \n3 0.303911 0.326860 0.147415 0.208054 0.371225 0.910451 0.544563 \n4 0.885194 0.156615 0.795850 0.208292 0.658539 0.738021 0.750869 \n\n 7 8 9 ... 90 91 92 \\\n0 0.346246 0.632588 0.829314 ... 0.432520 0.974076 0.894935 \n1 0.589837 0.010117 0.174665 ... 0.301537 0.552984 0.879647 \n2 0.516486 0.033306 0.304789 ... 0.131419 0.591061 0.840573 \n3 0.394877 0.424432 0.418725 ... 0.556135 0.013987 0.149688 \n4 0.233611 0.890301 0.501873 ... 0.149136 0.617435 0.659965 \n\n 93 94 95 96 97 98 99 \n0 0.410939 0.487212 0.428583 0.598402 0.706644 0.385664 0.493396 \n1 0.380063 0.300516 0.398224 0.739755 0.462363 0.046500 0.510257 \n2 0.664932 0.472257 0.998087 0.073214 0.984443 0.957306 0.745294 \n3 0.604044 0.574204 0.930515 0.797487 0.785470 0.447085 0.587239 \n4 0.096339 0.922107 0.220996 0.989596 0.604612 0.461251 0.658624 \n\n[5 rows x 100 columns]", 75 | "text/html": "
" 76 | }, 77 | "execution_count": 10, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": "#use case: you upload a file in the object storage and want to access it in Python.\n\ndef get_object_storage_file_with_credentials(credentials,container, filename):\n \"\"\"This functions returns a StringIO object containing\n the file content from Bluemix Object Storage.\"\"\"\n\n url1 = ''.join(['https://identity.open.softlayer.com', '/v3/auth/tokens'])\n data = {'auth': {'identity': {'methods': ['password'],\n 'password': {'user': {'name': credentials['username'],'domain': {'id': credentials['domainId']},\n 'password': credentials['password']}}}}}\n headers1 = {'Content-Type': 'application/json'}\n resp1 = requests.post(url=url1, data=json.dumps(data), headers=headers1)\n resp1_body = resp1.json()\n for e1 in resp1_body['token']['catalog']:\n if(e1['type']=='object-store'):\n for e2 in e1['endpoints']:\n if(e2['interface']=='public'and e2['region']=='dallas'):\n url2 = ''.join([e2['url'],'/', container, '/', filename])\n s_subject_token = resp1.headers['x-subject-token']\n headers2 = {'X-Auth-Token': s_subject_token, 'accept': 'application/json'}\n resp2 = requests.get(url=url2, headers=headers2)\n return StringIO(resp2.text)\n\n# step 1: get file from object storage\n#data_train.csv is an existing object in the Object Storage \nrandom_data = pd.read_csv(get_object_storage_file_with_credentials(credentials_1,'DSETraining101ObjectStorage', 'random_data.csv'))\nrandom_data.head()\n\n\n", 82 | "execution_count": 10 83 | } 84 | ], 85 | "nbformat": 4, 86 | "metadata": { 87 | "kernelspec": { 88 | "display_name": "Python 2 with Spark 1.6", 89 | "name": "python2", 90 | "language": "python" 91 | }, 92 | "language_info": { 93 | "version": "2.7.11", 94 | "codemirror_mode": { 95 | "version": 2, 96 | "name": "ipython" 97 | }, 98 | "mimetype": "text/x-python", 99 | "nbconvert_exporter": "python", 100 | "file_extension": ".py", 101 | "name": "python", 102 | "pygments_lexer": "ipython2" 103 | } 104 | }, 105 | "nbformat_minor": 0 106 | } -------------------------------------------------------------------------------- /1. In and Export/1. import and export 1. Download and upload.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "cells": [ 5 | { 6 | "source": "# In-and export from url\n\nhttp://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python\n", 7 | "cell_type": "markdown", 8 | "metadata": {} 9 | }, 10 | { 11 | "execution_count": 21, 12 | "metadata": { 13 | "collapsed": true 14 | }, 15 | "outputs": [ 16 | { 17 | "text": "Requirement already satisfied (use --upgrade to upgrade): tqdm in /gpfs/global_fs01/sym_shared/YPProdSpark/user/s16e-7918d85e6de098-7a7840b6cba3/.local/lib/python2.7/site-packages\nCollecting Image\n Downloading image-1.5.5.tar.gz\nRequirement already satisfied (use --upgrade to upgrade): pillow in /usr/local/src/bluemix_jupyter_bundle.v33/notebook/lib/python2.7/site-packages (from Image)\nCollecting django (from Image)\n Downloading Django-1.10.5-py2.py3-none-any.whl (6.8MB)\n\u001b[K 100% |\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 6.8MB 171kB/s \n\u001b[?25hInstalling collected packages: django, Image\n Running setup.py install for Image ... 
\u001b[?25l-\b \b\\\b \bdone\n\u001b[?25hSuccessfully installed Image-1.5.5 django-1.10.5\n", 18 | "name": "stdout", 19 | "output_type": "stream" 20 | } 21 | ], 22 | "cell_type": "code", 23 | "source": "!pip install tqdm" 24 | }, 25 | { 26 | "execution_count": 1, 27 | "metadata": { 28 | "collapsed": true 29 | }, 30 | "outputs": [], 31 | "cell_type": "code", 32 | "source": "import urllib\nfrom tqdm import tqdm\nimport requests\nfrom requests.auth import HTTPDigestAuth\nimport json\nimport os" 33 | }, 34 | { 35 | "execution_count": 2, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [ 40 | { 41 | "data": { 42 | "text/plain": "('10MB.zip', )" 43 | }, 44 | "execution_count": 2, 45 | "output_type": "execute_result", 46 | "metadata": {} 47 | } 48 | ], 49 | "cell_type": "code", 50 | "source": "#simplest version\nurllib.urlretrieve (\"http://download.thinkbroadband.com/10MB.zip\", \"10MB.zip\")" 51 | }, 52 | { 53 | "execution_count": null, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "cell_type": "code", 59 | "source": "# Python 3 variant:\n\nfrom requests import get \ndef download(url, file_name):\n # open in binary mode\n with open(file_name, \"wb\") as file:\n # get request\n response = get(url)\n # write to file\n file.write(response.content)\n" 60 | }, 61 | { 62 | "execution_count": null, 63 | "metadata": { 64 | "collapsed": true 65 | }, 66 | "outputs": [], 67 | "cell_type": "code", 68 | "source": "url = \"http://download.thinkbroadband.com/10MB.zip\"\nresponse = requests.get(url, stream=True)\n\nwith open(\"10MB\", \"wb\") as handle:\n for data in tqdm(response.iter_content()):\n handle.write(data)" 69 | }, 70 | { 71 | "execution_count": 13, 72 | "metadata": { 73 | "collapsed": false, 74 | "scrolled": true 75 | }, 76 | "outputs": [ 77 | { 78 | "text": "200\n{\"type\": \"success\", \"value\": {\"joke\": \"Chuck Norris doesn't read books. He stares them down until he gets the information he wants.\", \"id\": 3, \"categories\": []}}\n", 79 | "name": "stdout", 80 | "output_type": "stream" 81 | } 82 | ], 83 | "cell_type": "code", 84 | "source": "# get data from an api call\n\n# visit http://www.icndb.com/api/ to see other options to quesry the Chuck norris jokes database\nurl = \"http://api.icndb.com/jokes/random\"\n\n# It is a good practice not to hardcode the credentials. 
So ask the user to enter credentials at runtime\nmyResponse = requests.get(url)\n#myResponse = requests.get(url,auth=HTTPDigestAuth(raw_input(\"username: \"), raw_input(\"Password: \")), verify=True)\nprint (myResponse.status_code)\n# For successful API call, response code will be 200 (OK)\n\njData = json.loads(myResponse.content)\nprint json.dumps(jData)\n" 85 | }, 86 | { 87 | "execution_count": 18, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [ 92 | { 93 | "text": "\n", 94 | "name": "stdout", 95 | "output_type": "stream" 96 | } 97 | ], 98 | "cell_type": "code", 99 | "source": "#uploading data\n\nwith open('output_file', 'wb') as fout:\n fout.write(os.urandom(1024)) \n\nr = requests.post('http://httpbin.org/post', files={'output_file': open('output_file', 'rb')})\nprint r" 100 | } 101 | ], 102 | "metadata": { 103 | "language_info": { 104 | "nbconvert_exporter": "python", 105 | "file_extension": ".py", 106 | "codemirror_mode": { 107 | "name": "ipython", 108 | "version": 2 109 | }, 110 | "pygments_lexer": "ipython2", 111 | "version": "2.7.11", 112 | "mimetype": "text/x-python", 113 | "name": "python" 114 | }, 115 | "kernelspec": { 116 | "language": "python", 117 | "display_name": "Python 2 with Spark 1.6", 118 | "name": "python2" 119 | } 120 | } 121 | } -------------------------------------------------------------------------------- /1. In and Export/1. import and export 2. DashDB.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": "# In-and export from DashDB\n\nUseful doc: \nhttp://pythonhosted.org/ibmdbpy/start.html" 9 | }, 10 | { 11 | "cell_type": "code", 12 | "metadata": { 13 | "collapsed": true 14 | }, 15 | "outputs": [], 16 | "source": "#imports\nimport ibmdbpy\nfrom ibmdbpy import IdaDataBase, IdaDataFrame\n\nimport pandas as pd\nimport numpy as np", 17 | "execution_count": 3 18 | }, 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "scrolled": true, 23 | "collapsed": false 24 | }, 25 | "outputs": [ 26 | { 27 | "metadata": {}, 28 | "data": { 29 | "text/plain": " a b c\n0 0.579127 0.691267 0.156212\n1 0.475068 0.030056 0.096346\n2 0.950316 0.898397 0.994278\n3 0.540834 0.900723 0.902864\n4 0.299987 0.733373 0.101006", 30 | "text/html": "
" 31 | }, 32 | "execution_count": 23, 33 | "output_type": "execute_result" 34 | } 35 | ], 36 | "source": "# create some data\nrandom_data=pd.DataFrame(np.random.random((1000, 3)),columns=['a','b','c'])\nrandom_data.head()", 37 | "execution_count": 23 38 | }, 39 | { 40 | "cell_type": "code", 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": "@hidden_cell \n\ncredentials_1 = {\n 'host':'dashdb-entry-yp-dal09-08.services.dal.bluemix.net',\n 'port':'50000',\n 'user':'dash8753',\n 'password':\"\"\"ddd6463d0ddc\"\"\",\n 'database':'BLUDB'\n}", 46 | "execution_count": 24 47 | }, 48 | { 49 | "cell_type": "code", 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [ 54 | { 55 | "text": "Uploading 1000 rows (maxnrow was set to 2666)\n", 56 | "output_type": "stream", 57 | "name": "stdout" 58 | }, 59 | { 60 | "text": "Exception AttributeError: \"Cursor instance has no attribute 'closed'\" in > ignored\n", 61 | "output_type": "stream", 62 | "name": "stderr" 63 | } 64 | ], 65 | "source": "# use case: you have some data in Python (for example from a csv file) and you want to upload it to a database\n\nidadb = IdaDataBase(dsn=\"DASHDB;Database=BLUDB;Hostname=\" + \n credentials_1[\"host\"] + \";Port=50000;PROTOCOL=TCPIP;UID=\" + \n credentials_1[\"user\"] + \";PWD=\" + \n credentials_1[\"password\"])\n\nidadf = idadb.as_idadataframe(random_data, \"RANDOM_DATA\", clear_existing=True) #input: a Pandas dataframe", 66 | "execution_count": 25 67 | }, 68 | { 69 | "cell_type": "code", 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [ 74 | { 75 | "text": "Uploading 1000 rows (maxnrow was set to 2666)\n", 76 | "output_type": "stream", 77 | "name": "stdout" 78 | } 79 | ], 80 | "source": "# use case: you have an existing table and you want to append additional records\n\nRANDOM_DATA_DATAFRAME = IdaDataFrame(idadb, 'RANDOM_DATA') #define the IdaDataFrame\n \nidadb.append(RANDOM_DATA_DATAFRAME,random_data) # add the Pandas Dataframe 'random_data' once more to the database\nRANDOM_DATA_DATAFRAME.commit() #ensure a commit or else you won't see the addition", 81 | "execution_count": 26 82 | }, 83 | { 84 | "cell_type": "code", 85 | "metadata": { 86 | "collapsed": false 87 | }, 88 | "outputs": [ 89 | { 90 | "text": "Exception AttributeError: \"Cursor instance has no attribute 'closed'\" in > ignored\n", 91 | "output_type": "stream", 92 | "name": "stderr" 93 | }, 94 | { 95 | "metadata": {}, 96 | "data": { 97 | "text/plain": "(2000, 3)" 98 | }, 99 | "execution_count": 28, 100 | "output_type": "execute_result" 101 | } 102 | ], 103 | "source": "# use case: you have data in a database and want to make it accessible to Pyton\n\nidadb = IdaDataBase(dsn=\"DASHDB;Database=BLUDB;Hostname=\" + \n credentials_1[\"host\"] + \";Port=50000;PROTOCOL=TCPIP;UID=\" + \n credentials_1[\"user\"] + \";PWD=\" + \n credentials_1[\"password\"])\n\nrandom_data = IdaDataFrame(idadb, 'RANDOM_DATA')\nrandom_data = random_data.as_dataframe()\nrandom_data.head()\n#random_data.shape #2000 due to the appending", 104 | "execution_count": 28 105 | }, 106 | { 107 | "cell_type": "code", 108 | "metadata": { 109 | "collapsed": true 110 | }, 111 | "outputs": [], 112 | "source": "", 113 | "execution_count": null 114 | } 115 | ], 116 | "nbformat": 4, 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python 2 with Spark 1.6", 120 | "name": "python2", 121 | "language": "python" 122 | }, 123 | "language_info": { 124 | "version": "2.7.11", 125 | "codemirror_mode": { 126 | "version": 2, 
127 | "name": "ipython" 128 | }, 129 | "mimetype": "text/x-python", 130 | "nbconvert_exporter": "python", 131 | "file_extension": ".py", 132 | "name": "python", 133 | "pygments_lexer": "ipython2" 134 | } 135 | }, 136 | "nbformat_minor": 0 137 | } -------------------------------------------------------------------------------- /1. In and Export/1. import and export 3. Cloudant.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "cells": [ 4 | { 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": "# In-and export from Cloudant\n\nhttp://python-cloudant.readthedocs.io/en/latest/getting_started.html\n\nCloudant is a NoSQL database as a service (DBaaS) built to scale globally, run nonstop, and handle a wide variety of data types like JSON, full-text, and geospatial. Cloudant NoSQL DB is an operational data store optimized to handle concurrent reads and writes and to provide high availability and data durability." 8 | }, 9 | { 10 | "cell_type": "code", 11 | "execution_count": 1, 12 | "source": "!pip install --user cloudant", 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": "Collecting cloudant\n Downloading cloudant-2.3.1-py2-none-any.whl (63kB)\n\u001b[K 100% |\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 71kB 5.2MB/s \n\u001b[?25hRequirement already satisfied (use --upgrade to upgrade): requests<3.0.0,>=2.7.0 in /usr/local/src/bluemix_jupyter_bundle.v33/notebook/lib/python2.7/site-packages (from cloudant)\nInstalling collected packages: cloudant\nSuccessfully installed cloudant-2.3.1\n" 21 | } 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "source": "from cloudant.client import Cloudant\nfrom cloudant.result import Result\nimport pandas as pd, json", 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "source": "@hidden_cell\n\ncredentials_1 = {\n 'username':'b02b1918-33fb-4c0b-a7a7-f5762138ee1d-bluemix',\n 'password':\"\"\"a5a4d2e9a0db63949c7c592b016f1d12b86880fa398476c00472360891df09b2\"\"\",\n 'host':'b02b1918-33fb-4c0b-a7a7-f5762138ee1d-bluemix.cloudant.com',\n 'port':'443',\n 'url':'https://b02b1918-33fb-4c0b-a7a7-f5762138ee1d-bluemix:a5a4d2e9a0db63949c7c592b016f1d12b86880fa398476c00472360891df09b2@b02b1918-33fb-4c0b-a7a7-f5762138ee1d-bluemix.cloudant.com'\n}", 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "source": "# connect to cloudant\nclient = Cloudant(credentials_1['username'], credentials_1['password'], url=credentials_1['url'])\nclient.connect()\n# Disconnect from the server\n# client.disconnect()", 46 | "metadata": { 47 | "collapsed": true 48 | }, 49 | "outputs": [] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "source": "client.all_dbs()", 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [ 59 | { 60 | "execution_count": 4, 61 | "data": { 62 | "text/plain": "[u'_replicator', u'_users']" 63 | }, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 5, 72 | "source": "# Create a database using an initialized 
client\n\nmy_database = client.create_database('my_database')\nmy_database.exists()", 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [ 77 | { 78 | "execution_count": 5, 79 | "data": { 80 | "text/plain": "True" 81 | }, 82 | "metadata": {}, 83 | "output_type": "execute_result" 84 | } 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 6, 90 | "source": "# Open an existing database\nmy_database = client['my_database']", 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "source": "# Delete a database using an initialized client\n# client.delete_database('my_database')", 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 9, 108 | "source": "# Create document content data\ndata = {\n '_id': 'julia30', # Setting _id is optional\n 'name': 'Julia',\n 'age': 30,\n 'pets': ['cat', 'dog', 'frog']\n }\n\n# Create a document using the Database API\nmy_document = my_database.create_document(data)", 109 | "metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 10, 117 | "source": "my_document = my_database['julia30']\n\n# Display the document\nprint my_document", 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": "{'_rev': u'3-8cf90dfd0627cb4e3f7284ebacb59a36', 'age': 30, '_id': u'julia30', 'name': 'Julia', 'pets': ['cat', 'dog', 'frog']}\n" 126 | } 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 11, 132 | "source": "# Get all of the documents from my_database\nfor document in my_database:\n print document", 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [ 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": "{u'name': u'Julia', u'pets': [u'cat', u'dog', u'frog'], u'_rev': u'3-8cf90dfd0627cb4e3f7284ebacb59a36', '_id': u'julia30', u'age': 30}\n" 141 | } 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 12, 147 | "source": "# First retrieve the document\nmy_document = my_database['julia30']\n\n# Update the document content\n# This can be done as you would any other dictionary\nmy_document['name'] = 'Jules'\nmy_document['age'] = 6\n\n# You must save the document in order to update it on the database\nmy_document.save()", 148 | "metadata": { 149 | "collapsed": true 150 | }, 151 | "outputs": [] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 13, 156 | "source": "# First retrieve the document\nmy_document = my_database['julia30']\n\n# Delete the document\n#my_document.delete()", 157 | "metadata": { 158 | "collapsed": true 159 | }, 160 | "outputs": [] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 14, 165 | "source": "from cloudant.result import Result, ResultByKey\n\n# Retrieve Result wrapped document content.\n# Note: The include_docs parameter is optional and is used to illustrate that view query\n# parameters can be used to customize the result collection.\nresult_collection = Result(my_database.all_docs, include_docs=True)\n\n# Get the result at a given location in the result collection\n# Note: Valid result collection indexing starts at 0\nresult = result_collection[0] # result is the 1st in the collection\nresult = result_collection[9] # result is the 10th in the 
collection\n\n# Get the result for matching a key\nresult = result_collection['julia30'] # result is all that match key 'julia30'\n\n# If your key is an integer then use the ResultByKey class to differentiate your integer\n# key from an indexed location within the result collection which is also an integer.\nresult = result_collection[ResultByKey(9)] # result is all that match key 9\n\n# Slice by key values\nresult = result_collection['julia30': 'ruby99'] # result is between and including keys\nresult = result_collection['julia30': ] # result is after and including key\nresult = result_collection[: 'ruby99'] # result is up to and including key\n\n# Slice by index values\nresult = result_collection[100: 200] # result is between 100 to 200, including 200th\nresult = result_collection[: 200] # result is up to and including the 200th\nresult = result_collection[100: ] # result is after the 100th\n\n# Iterate over the result collection\nfor result in result_collection:\n print result", 166 | "metadata": { 167 | "collapsed": false 168 | }, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": "{u'value': {u'rev': u'4-e40a1ebb1dc53a84bd0ca9c3431b1436'}, u'id': u'julia30', u'key': u'julia30', u'doc': {u'_rev': u'4-e40a1ebb1dc53a84bd0ca9c3431b1436', u'_id': u'julia30', u'age': 6, u'pets': [u'cat', u'dog', u'frog'], u'name': u'Jules'}}\n" 174 | } 175 | ] 176 | } 177 | ], 178 | "metadata": { 179 | "kernelspec": { 180 | "display_name": "Python 2 with Spark 1.6", 181 | "language": "python", 182 | "name": "python2" 183 | }, 184 | "language_info": { 185 | "codemirror_mode": { 186 | "version": 2, 187 | "name": "ipython" 188 | }, 189 | "version": "2.7.11", 190 | "name": "python", 191 | "pygments_lexer": "ipython2", 192 | "nbconvert_exporter": "python", 193 | "mimetype": "text/x-python", 194 | "file_extension": ".py" 195 | } 196 | }, 197 | "nbformat_minor": 0 198 | } -------------------------------------------------------------------------------- /1. In and Export/1. import and export 4. 
Twitter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "cells": [ 4 | { 5 | "cell_type": "markdown", 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "source": "# Getting data from Twitter\n\nhttps://cdeservice.mybluemix.net/rest-api/\n\n#more complex version:\nhttps://github.com/ibm-cds-labs/Spark-Twitter-Watson-Dashboard" 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 7, 14 | "source": "import requests\nimport json", 15 | "metadata": { 16 | "collapsed": true 17 | }, 18 | "outputs": [] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 53, 23 | "source": "@hidden_cell \n\ncredentials_1={\n \"username\": \"2983643d-0d0f-460b-91a4-d2ff200e0605\",\n \"password\": \"VEs96toZlP\",\n \"host\": \"cdeservice.mybluemix.net\",\n \"port\": 443,\n \"url\": \"https://2983643d-0d0f-460b-91a4-d2ff200e0605:VEs96toZlP@cdeservice.mybluemix.net\"\n}", 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 88, 32 | "source": "# searcing for Tweets\n\n# Twitter query language\n# https://console.ng.bluemix.net/docs/services/Twitter/twitter_rest_apis.html#querylanguage\n\nquery=\"/api/v1/messages/search?q=IBM\"\nmyResponse = requests.get(credentials_1[\"url\"]+query)\n\njData = json.loads(myResponse.content)", 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 90, 41 | "source": "#extracting info from Tweets\n#http://support.gnip.com/sources/twitter/data_format.html\n\nfor line_object in jData['tweets']:\n try:\n actor_id_string = line_object[\"message\"][\"actor\"][\"id\"]\n actor_id = int( actor_id_string.split(\":\")[2] )\n language_code = line_object[\"message\"][\"twitter_lang\"]\n print \"{0:12d}, {1:2s}\".format(actor_id,language_code)\n except KeyError, e:\n actor_id = -1\n language_code = \"Null\"", 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": " 2421007620, und\n 17392046, en\n 18217950, en\n 1081239367, en\n 1396865473, da\n 2466483601, en\n 2195374282, en\n 2271229628, ja\n 11409612, en\n 256639900, en\n 1217548591, en\n 71131576, en\n 581308169, en\n 1910749248, en\n 2501250108, in\n 2913390709, ja\n 331855732, en\n 1096826832, en\n 1048291658, en\n 1067882804, ro\n 2493431107, en\n 59147680, pt\n 2704548373, en\n 721015415, uk\n 107337350, ja\n 2443345184, ja\n 2846486337, en\n 323808915, en\n 497156789, en\n 572681115, en\n 1838811798, ja\n 6822322, en\n 595479894, en\n 28426792, en\n 1664464753, en\n 270806514, en\n 293415739, en\n 55799972, en\n 772793874, en\n 2655479348, en\n 3819701, en\n 2402201132, en\n 1592589091, in\n 2916190101, en\n 1368782430, en\n 113142094, ja\n 304322350, ja\n 328741551, en\n 1662300529, en\n 2447547871, en\n 135262662, en\n 2148828366, en\n 381660534, en\n 2552766488, tr\n 2452816490, en\n 21155258, en\n 2341580630, en\n 110368039, en\n 2950974153, en\n 2483742385, en\n 2211680239, en\n 625263018, ja\n 2815420262, en\n 14347265, en\n 1711071769, en\n 2342275374, en\n 62294711, en\n 1220269800, en\n 623615343, en\n 2876570757, ko\n 300286188, en\n 1951171886, en\n 266953814, en\n 14590971, en\n 11407622, en\n 1389090468, en\n 1217548591, en\n 2704548373, en\n 20591919, en\n 14352195, en\n 600128032, de\n 1389090468, en\n 2452816490, en\n 425963964, en\n 317831666, en\n 24524448, en\n 305991028, en\n 271692852, 
ja\n 266715506, en\n 2644810332, es\n 17181265, en\n 1508595090, en\n 2359580791, en\n 1646558305, en\n 2493423925, en\n 1240538113, en\n 2433920005, de\n 2759252876, en\n 34571479, en\n 1418812723, en\n" 50 | } 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 22, 56 | "source": "# The code was removed by DSX for sharing.", 57 | "metadata": { 58 | "collapsed": false 59 | }, 60 | "outputs": [] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 91, 65 | "source": "#query from cloudant\n#result_collection = Result(my_database.all_docs, include_docs=True)\n\nfor line_object in result_collection:\n try:\n actor_id_string = line_object['doc']['message']['actor']['id']\n actor_id = int( actor_id_string.split(\":\")[2] )\n language_code = line_object['doc'][\"message\"][\"twitter_lang\"]\n print \"{0:12d}, {1:2s}\".format(actor_id,language_code)\n except KeyError, e:\n actor_id = -1\n language_code = \"Null\"\n\n\n", 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "outputs": [ 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": " 18217950, en\n 721015415, uk\n 3819701, en\n 328741551, en\n 381660534, en\n 2452816490, en\n 2211680239, en\n 1711071769, en\n 14352195, en\n 1389090468, en\n 2195374282, en\n 11409612, en\n 71131576, en\n 2501250108, in\n 2341580630, en\n 14347265, en\n 34571479, en\n 1418812723, en\n 107337350, ja\n 2402201132, en\n 266953814, en\n 20591919, en\n 425963964, en\n 24524448, en\n 1508595090, en\n 256639900, en\n 1910749248, en\n 331855732, en\n 772793874, en\n 1368782430, en\n 2876570757, ko\n 17181265, en\n 1396865473, da\n 59147680, pt\n 1838811798, ja\n 21155258, en\n 110368039, en\n 2483742385, en\n 623615343, en\n 14590971, en\n 2452816490, en\n 317831666, en\n 305991028, en\n 2759252876, en\n 2271229628, ja\n 2493431107, en\n 28426792, en\n 1664464753, en\n 55799972, en\n 1592589091, in\n 2447547871, en\n 2704548373, en\n 2493423925, en\n 17392046, en\n 2913390709, ja\n 2443345184, ja\n 2846486337, en\n 1662300529, en\n 2342275374, en\n 300286188, en\n 266715506, en\n 323808915, en\n 572681115, en\n 2655479348, en\n 2552766488, tr\n 1220269800, en\n 1646558305, en\n 2421007620, und\n 2466483601, en\n 1067882804, ro\n 497156789, en\n 6822322, en\n 2916190101, en\n 304322350, ja\n 2644810332, es\n 1081239367, en\n 1048291658, en\n 270806514, en\n 293415739, en\n 625263018, ja\n 2433920005, de\n 1096826832, en\n 2704548373, en\n 135262662, en\n 1951171886, en\n 11407622, en\n 1217548591, en\n 1217548591, en\n 581308169, en\n 595479894, en\n 2148828366, en\n 2950974153, en\n 2815420262, en\n 62294711, en\n 1389090468, en\n 600128032, de\n 271692852, ja\n 2359580791, en\n 1240538113, en\n 2728916672, en\n" 74 | } 75 | ] 76 | } 77 | ], 78 | "metadata": { 79 | "kernelspec": { 80 | "display_name": "Python 2 with Spark 1.6", 81 | "language": "python", 82 | "name": "python2" 83 | }, 84 | "language_info": { 85 | "codemirror_mode": { 86 | "version": 2, 87 | "name": "ipython" 88 | }, 89 | "version": "2.7.11", 90 | "name": "python", 91 | "pygments_lexer": "ipython2", 92 | "nbconvert_exporter": "python", 93 | "mimetype": "text/x-python", 94 | "file_extension": ".py" 95 | } 96 | }, 97 | "nbformat_minor": 0 98 | } -------------------------------------------------------------------------------- /1. In and Export/1. import and export 5. 
BigInsights.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "cells": [ 5 | { 6 | "source": "# In-and export from BigInsights\n\nhttps://developer.ibm.com/hadoop/docs/getting-started/tutorials/big-sql-hadoop-tutorial/", 7 | "cell_type": "markdown", 8 | "metadata": { 9 | "collapsed": true 10 | } 11 | }, 12 | { 13 | "source": "!pip install --user ibm_db", 14 | "execution_count": 1, 15 | "cell_type": "code", 16 | "metadata": { 17 | "scrolled": true, 18 | "collapsed": false 19 | }, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "text": "Collecting ibm-db\n Downloading ibm_db-2.0.7.tar.gz (553kB)\n\u001b[K 100% |\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 563kB 1.9MB/s \n\u001b[?25hInstalling collected packages: ibm-db\n Running setup.py install for ibm-db ... \u001b[?25l-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \bdone\n\u001b[?25hSuccessfully installed ibm-db-2.0.7\n", 24 | "output_type": "stream" 25 | } 26 | ] 27 | }, 28 | { 29 | "source": "import ibm_db", 30 | "execution_count": 2, 31 | "cell_type": "code", 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [] 36 | }, 37 | { 38 | "source": "Credentials", 39 | "cell_type": "markdown", 40 | "metadata": {} 41 | }, 42 | { 43 | "source": "@hidden_cell\n\ncredentials_1 = {\n 'user':'bootcamp',\n 'password':'bootcamp1bootcamp',\n 'database' : 'bigsql',\n 'hostname' : 'iop-bi-master.imdemocloud.com',\n 'port' : '32051' \n}", 44 | "execution_count": null, 45 | "cell_type": "code", 46 | "metadata": { 47 | "collapsed": true 48 | }, 49 | "outputs": [] 50 | }, 51 | { 52 | "source": "conn_string = (\n \"DRIVER={{IBM DB2 ODBC DRIVER}};\"\n \"DATABASE={0};\"\n \"HOSTNAME={1};\"\n \"PORT={2};\"\n \"PROTOCOL=TCPIP;\"\n \"UID={3};\"\n \"PWD={4};\").format(database, hostname, port, username, password);\n\nconn = ibm_db.connect(conn_string, \"\", \"\")", 53 | "execution_count": null, 54 | "cell_type": "code", 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "outputs": [] 59 | }, 60 | { 61 | "source": "query = \"USE \"+username+\";\";\nibm_db.exec_immediate(conn, query);", 62 | "execution_count": null, 63 | "cell_type": "code", 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "outputs": [] 68 | }, 69 | { 70 | "source": "# BigSQL not available in current cloud based BigInsights offering", 71 | "execution_count": null, 72 | "cell_type": "code", 73 | "metadata": { 74 | "collapsed": true 75 | }, 76 | "outputs": [] 77 | } 78 | ], 79 | "metadata": { 80 | "language_info": { 81 | "nbconvert_exporter": "python", 82 | "mimetype": "text/x-python", 83 | "pygments_lexer": "ipython2", 84 | "version": "2.7.11", 85 | "file_extension": ".py", 86 | "name": "python", 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 2 90 | } 91 | }, 92 | "kernelspec": { 93 | "name": "python2", 94 | "display_name": "Python 2 with Spark 1.6", 95 | "language": "python" 96 | } 97 | } 98 | } -------------------------------------------------------------------------------- /2. Watson APIs/2. Watson 3. 
Alchemy language.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "cells": [ 5 | { 6 | "source": "# Alchemy language", 7 | "cell_type": "markdown", 8 | "metadata": { 9 | "collapsed": true 10 | } 11 | }, 12 | { 13 | "execution_count": null, 14 | "metadata": { 15 | "collapsed": true 16 | }, 17 | "outputs": [], 18 | "cell_type": "code", 19 | "source": "!pip install watson-developer-cloud" 20 | }, 21 | { 22 | "execution_count": 3, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "cell_type": "code", 28 | "source": "# The code was removed by DSX for sharing." 29 | }, 30 | { 31 | "execution_count": null, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "cell_type": "code", 37 | "source": "import json\nfrom os.path import join, dirname\nfrom watson_developer_cloud import AlchemyLanguageV1\n\nalchemy_language = AlchemyLanguageV1(api_key=credentials_1['apikey'])\n\nurl = 'https://developer.ibm.com/watson/blog/2015/11/03/price-reduction-for-watson-personality-insights/'" 38 | }, 39 | { 40 | "execution_count": 6, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [ 45 | { 46 | "evalue": "Error: daily-transaction-limit-exceeded", 47 | "traceback": [ 48 | "\u001b[1;31m\u001b[0m", 49 | "\u001b[1;31mWatsonException\u001b[0mTraceback (most recent call last)", 50 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mjson\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdumps\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0malchemy_language\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtargeted_sentiment\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'I love cats! 
Dogs are smelly.'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mtargets\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'cats'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'dogs'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mlanguage\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'english'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindent\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 51 | "\u001b[1;32m/gpfs/fs01/user/s2de-6c3e21af46e198-7a7840b6cba3/.local/lib/python2.7/site-packages/watson_developer_cloud/alchemy_language_v1.pyc\u001b[0m in \u001b[0;36mtargeted_sentiment\u001b[1;34m(self, targets, html, text, url, language, constraint_query, xpath_query, show_source_text, source_text_type)\u001b[0m\n\u001b[0;32m 320\u001b[0m 'sourceText': source_text_type}\n\u001b[0;32m 321\u001b[0m return self._alchemy_html_request('GetTargetedSentiment', html=html,\n\u001b[1;32m--> 322\u001b[1;33m text=text, url=url, params=params)\n\u001b[0m", 52 | "\u001b[1;32m/gpfs/fs01/user/s2de-6c3e21af46e198-7a7840b6cba3/.local/lib/python2.7/site-packages/watson_developer_cloud/watson_developer_cloud_service.pyc\u001b[0m in \u001b[0;36m_alchemy_html_request\u001b[1;34m(self, method_name, url, html, text, params, method, method_url)\u001b[0m\n\u001b[0;32m 225\u001b[0m return self.request(method=method, url=method_url, params=params,\n\u001b[0;32m 226\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0murl_encoded_params\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 227\u001b[1;33m accept_json=True)\n\u001b[0m\u001b[0;32m 228\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 229\u001b[0m def _alchemy_image_request(self, method_name, image_file=None,\n", 53 | "\u001b[1;32m/gpfs/fs01/user/s2de-6c3e21af46e198-7a7840b6cba3/.local/lib/python2.7/site-packages/watson_developer_cloud/watson_developer_cloud_service.pyc\u001b[0m in \u001b[0;36mrequest\u001b[1;34m(self, method, url, accept_json, headers, params, json, data, files, **kwargs)\u001b[0m\n\u001b[0;32m 309\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0merror_message\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'invalid-api-key'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 310\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstatus_code\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m401\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 311\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mWatsonException\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Error: '\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0merror_message\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 312\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresponse_json\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 313\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 54 | "\u001b[1;31mWatsonException\u001b[0m: Error: daily-transaction-limit-exceeded" 55 | ], 56 | "ename": "WatsonException", 57 | "output_type": "error" 58 | } 59 | ], 60 | "cell_type": "code", 61 | "source": "print(json.dumps(alchemy_language.targeted_sentiment(text='I love cats! 
Dogs are smelly.',targets=['cats', 'dogs'],language='english'), indent=2))" 62 | }, 63 | { 64 | "execution_count": null, 65 | "metadata": { 66 | "collapsed": true 67 | }, 68 | "outputs": [], 69 | "cell_type": "code", 70 | "source": "print(json.dumps(alchemy_language.targeted_emotion(text='I love apples. I hate bananas', targets=['apples','bananas'], language='english'), indent=2))" 71 | }, 72 | { 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "cell_type": "code", 79 | "source": "print(json.dumps(alchemy_language.author(url=url), indent=2))" 80 | }, 81 | { 82 | "execution_count": null, 83 | "metadata": { 84 | "collapsed": true 85 | }, 86 | "outputs": [], 87 | "cell_type": "code", 88 | "source": "print(json.dumps(alchemy_language.concepts(max_items=2, url=url), indent=2))" 89 | }, 90 | { 91 | "execution_count": null, 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "outputs": [], 96 | "cell_type": "code", 97 | "source": "print(json.dumps(alchemy_language.dates(url=url, anchor_date='2016-03-22 00:00:00'), indent=2))" 98 | }, 99 | { 100 | "execution_count": null, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "cell_type": "code", 106 | "source": "print(json.dumps(alchemy_language.emotion(url=url), indent=2))" 107 | }, 108 | { 109 | "execution_count": null, 110 | "metadata": { 111 | "collapsed": true 112 | }, 113 | "outputs": [], 114 | "cell_type": "code", 115 | "source": "print(json.dumps(alchemy_language.entities(url=url), indent=2))" 116 | }, 117 | { 118 | "execution_count": null, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "cell_type": "code", 124 | "source": "print(json.dumps(alchemy_language.entities(url=url), indent=2))" 125 | }, 126 | { 127 | "execution_count": null, 128 | "metadata": { 129 | "collapsed": true 130 | }, 131 | "outputs": [], 132 | "cell_type": "code", 133 | "source": "print(json.dumps(alchemy_language.keywords(max_items=5, url=url), indent=2))" 134 | }, 135 | { 136 | "execution_count": null, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "cell_type": "code", 142 | "source": "print(json.dumps(alchemy_language.category(url=url), indent=2))" 143 | }, 144 | { 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": true 148 | }, 149 | "outputs": [], 150 | "cell_type": "code", 151 | "source": "print(json.dumps(alchemy_language.typed_relations(url=url), indent=2))" 152 | }, 153 | { 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": true 157 | }, 158 | "outputs": [], 159 | "cell_type": "code", 160 | "source": "print(json.dumps(alchemy_language.relations(url=url), indent=2))" 161 | }, 162 | { 163 | "execution_count": null, 164 | "metadata": { 165 | "collapsed": true 166 | }, 167 | "outputs": [], 168 | "cell_type": "code", 169 | "source": "print(json.dumps(alchemy_language.language(url=url), indent=2))" 170 | }, 171 | { 172 | "execution_count": null, 173 | "metadata": { 174 | "collapsed": true 175 | }, 176 | "outputs": [], 177 | "cell_type": "code", 178 | "source": "print(json.dumps(alchemy_language.text(url=url), indent=2))" 179 | }, 180 | { 181 | "execution_count": null, 182 | "metadata": { 183 | "collapsed": true 184 | }, 185 | "outputs": [], 186 | "cell_type": "code", 187 | "source": "print(json.dumps(alchemy_language.raw_text(url=url), indent=2))" 188 | }, 189 | { 190 | "execution_count": null, 191 | "metadata": { 192 | "collapsed": true 193 | }, 194 | "outputs": [], 195 | 
"cell_type": "code", 196 | "source": "print(json.dumps(alchemy_language.title(url=url), indent=2))" 197 | }, 198 | { 199 | "execution_count": null, 200 | "metadata": { 201 | "collapsed": true 202 | }, 203 | "outputs": [], 204 | "cell_type": "code", 205 | "source": "print(json.dumps(alchemy_language.feeds(url=url), indent=2))" 206 | }, 207 | { 208 | "execution_count": null, 209 | "metadata": { 210 | "collapsed": true 211 | }, 212 | "outputs": [], 213 | "cell_type": "code", 214 | "source": "print(json.dumps(alchemy_language.microformats(url='http://microformats.org/wiki/hcard-examples'), indent=2))" 215 | }, 216 | { 217 | "execution_count": null, 218 | "metadata": { 219 | "collapsed": true 220 | }, 221 | "outputs": [], 222 | "cell_type": "code", 223 | "source": "print(json.dumps(alchemy_language.publication_date(url=url), indent=2))" 224 | }, 225 | { 226 | "execution_count": null, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "cell_type": "code", 232 | "source": "print(json.dumps(alchemy_language.taxonomy(url=url), indent=2))" 233 | }, 234 | { 235 | "execution_count": null, 236 | "metadata": { 237 | "collapsed": true 238 | }, 239 | "outputs": [], 240 | "cell_type": "code", 241 | "source": "combined_operations = ['page-image', 'entity', 'keyword', 'title', 'author','taxonomy', 'concept', 'doc-emotion']\nprint(json.dumps(alchemy_language.combined(url=url, extract=combined_operations),indent=2))" 242 | }, 243 | { 244 | "execution_count": null, 245 | "metadata": { 246 | "collapsed": true 247 | }, 248 | "outputs": [], 249 | "cell_type": "code", 250 | "source": "# Get sentiment and emotion information results for detected entities/keywords:\nprint(json.dumps(alchemy_language.entities(url=url, sentiment=True,emotion=True), indent=2))" 251 | }, 252 | { 253 | "execution_count": null, 254 | "metadata": { 255 | "collapsed": true 256 | }, 257 | "outputs": [], 258 | "cell_type": "code", 259 | "source": "print(json.dumps(alchemy_language.keywords(max_items=5, url=url,sentiment=True, emotion=True), indent=2))" 260 | } 261 | ], 262 | "metadata": { 263 | "language_info": { 264 | "nbconvert_exporter": "python", 265 | "file_extension": ".py", 266 | "codemirror_mode": { 267 | "name": "ipython", 268 | "version": 2 269 | }, 270 | "pygments_lexer": "ipython2", 271 | "version": "2.7.11", 272 | "mimetype": "text/x-python", 273 | "name": "python" 274 | }, 275 | "kernelspec": { 276 | "language": "python", 277 | "display_name": "Python 2 with Spark 1.6", 278 | "name": "python2" 279 | } 280 | } 281 | } -------------------------------------------------------------------------------- /2. Watson APIs/2. Watson 4. 
Tone analyzer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "cells": [ 5 | { 6 | "source": "# Tone analyzer", 7 | "cell_type": "markdown", 8 | "metadata": { 9 | "collapsed": true 10 | } 11 | }, 12 | { 13 | "source": "# The code was removed by DSX for sharing.", 14 | "execution_count": 1, 15 | "cell_type": "code", 16 | "metadata": { 17 | "collapsed": true 18 | }, 19 | "outputs": [] 20 | }, 21 | { 22 | "source": "import json\nfrom watson_developer_cloud import ToneAnalyzerV3\n\n\ntone_analyzer = ToneAnalyzerV3(\n username=credentials_1['username'],\n password=credentials_1['password'],\n version='2016-02-11')\n\nprint(json.dumps(tone_analyzer.tone(text='I am very happy'), indent=2))", 23 | "execution_count": 3, 24 | "cell_type": "code", 25 | "metadata": { 26 | "collapsed": false 27 | }, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "text": "{\n \"document_tone\": {\n \"tone_categories\": [\n {\n \"category_id\": \"emotion_tone\", \n \"tones\": [\n {\n \"tone_name\": \"Anger\", \n \"score\": 0.006227, \n \"tone_id\": \"anger\"\n }, \n {\n \"tone_name\": \"Disgust\", \n \"score\": 0.008777, \n \"tone_id\": \"disgust\"\n }, \n {\n \"tone_name\": \"Fear\", \n \"score\": 0.007074, \n \"tone_id\": \"fear\"\n }, \n {\n \"tone_name\": \"Joy\", \n \"score\": 0.973498, \n \"tone_id\": \"joy\"\n }, \n {\n \"tone_name\": \"Sadness\", \n \"score\": 0.017861, \n \"tone_id\": \"sadness\"\n }\n ], \n \"category_name\": \"Emotion Tone\"\n }, \n {\n \"category_id\": \"writing_tone\", \n \"tones\": [\n {\n \"tone_name\": \"Analytical\", \n \"score\": 0.0, \n \"tone_id\": \"analytical\"\n }, \n {\n \"tone_name\": \"Confident\", \n \"score\": 0.97759, \n \"tone_id\": \"confident\"\n }, \n {\n \"tone_name\": \"Tentative\", \n \"score\": 0.0, \n \"tone_id\": \"tentative\"\n }\n ], \n \"category_name\": \"Writing Tone\"\n }, \n {\n \"category_id\": \"social_tone\", \n \"tones\": [\n {\n \"tone_name\": \"Openness\", \n \"score\": 0.096859, \n \"tone_id\": \"openness_big5\"\n }, \n {\n \"tone_name\": \"Conscientiousness\", \n \"score\": 0.264058, \n \"tone_id\": \"conscientiousness_big5\"\n }, \n {\n \"tone_name\": \"Extraversion\", \n \"score\": 0.472657, \n \"tone_id\": \"extraversion_big5\"\n }, \n {\n \"tone_name\": \"Agreeableness\", \n \"score\": 0.61522, \n \"tone_id\": \"agreeableness_big5\"\n }, \n {\n \"tone_name\": \"Emotional Range\", \n \"score\": 0.104851, \n \"tone_id\": \"neuroticism_big5\"\n }\n ], \n \"category_name\": \"Social Tone\"\n }\n ]\n }\n}\n", 32 | "output_type": "stream" 33 | } 34 | ] 35 | } 36 | ], 37 | "metadata": { 38 | "language_info": { 39 | "nbconvert_exporter": "python", 40 | "mimetype": "text/x-python", 41 | "pygments_lexer": "ipython2", 42 | "version": "2.7.11", 43 | "file_extension": ".py", 44 | "name": "python", 45 | "codemirror_mode": { 46 | "name": "ipython", 47 | "version": 2 48 | } 49 | }, 50 | "kernelspec": { 51 | "name": "python2", 52 | "display_name": "Python 2 with Spark 1.6", 53 | "language": "python" 54 | } 55 | } 56 | } -------------------------------------------------------------------------------- /2. Watson APIs/2. Watson 5. 
Natural language classifier.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "cells": [ 5 | { 6 | "source": "# Natural language classifier", 7 | "cell_type": "markdown", 8 | "metadata": { 9 | "collapsed": true 10 | } 11 | }, 12 | { 13 | "source": "# The code was removed by DSX for sharing.", 14 | "execution_count": 1, 15 | "cell_type": "code", 16 | "metadata": { 17 | "collapsed": true 18 | }, 19 | "outputs": [] 20 | }, 21 | { 22 | "source": "import json\nfrom watson_developer_cloud import NaturalLanguageClassifierV1\n\nnatural_language_classifier = NaturalLanguageClassifierV1(\n username=credentials_1['username'],\n password=credentials_1['password'])\n\nclassifiers = natural_language_classifier.list()\nprint(json.dumps(classifiers, indent=2))", 23 | "execution_count": 3, 24 | "cell_type": "code", 25 | "metadata": { 26 | "collapsed": false 27 | }, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "text": "{\n \"classifiers\": []\n}\n", 32 | "output_type": "stream" 33 | } 34 | ] 35 | }, 36 | { 37 | "source": "#create a classifier\nimport urllib\nurllib.urlretrieve (\"https://raw.githubusercontent.com/analytics-bootcamp/Training-material/master/7.%20Training%20-%20Misc/weather.txt\", \"weather.txt\")\n\nwith open('weather.txt', 'rb') as training_data:\n print(json.dumps(natural_language_classifier.create(training_data=training_data, name='weather'), indent=2))\n", 38 | "execution_count": 10, 39 | "cell_type": "code", 40 | "metadata": { 41 | "scrolled": true, 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "text": "{\n \"status\": \"Training\", \n \"name\": \"weather\", \n \"language\": \"en\", \n \"created\": \"2017-03-09T04:15:28.693Z\", \n \"url\": \"https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/4d5c10x177-nlc-2873\", \n \"status_description\": \"The classifier instance is in its training phase, not yet ready to accept classify requests\", \n \"classifier_id\": \"4d5c10x177-nlc-2873\"\n}\n", 48 | "output_type": "stream" 49 | } 50 | ] 51 | }, 52 | { 53 | "source": "# replace 2374f9x68-nlc-2697 with your classifier id\nstatus = natural_language_classifier.status('4d5c10x177-nlc-2873')\nprint(json.dumps(status, indent=2))", 54 | "execution_count": 13, 55 | "cell_type": "code", 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "text": "{\n \"status\": \"Training\", \n \"name\": \"weather\", \n \"language\": \"en\", \n \"created\": \"2017-03-09T04:15:28.693Z\", \n \"url\": \"https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/4d5c10x177-nlc-2873\", \n \"status_description\": \"The classifier instance is in its training phase, not yet ready to accept classify requests\", \n \"classifier_id\": \"4d5c10x177-nlc-2873\"\n}\n", 63 | "output_type": "stream" 64 | } 65 | ] 66 | }, 67 | { 68 | "source": "status = natural_language_classifier.status('4d5c10x177-nlc-2873')\nprint(json.dumps(status, indent=2))\n\nif status['status'] == 'Available':\n classes = natural_language_classifier.classify('4d5c10x177-nlc-2873','How hot will it be tomorrow?')\n print(json.dumps(classes, indent=2))\n\n", 69 | "execution_count": 14, 70 | "cell_type": "code", 71 | "metadata": { 72 | "collapsed": false 73 | }, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "text": "{\n \"status\": \"Training\", \n \"name\": \"weather\", \n \"language\": \"en\", \n \"created\": 
\"2017-03-09T04:15:28.693Z\", \n \"url\": \"https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/4d5c10x177-nlc-2873\", \n \"status_description\": \"The classifier instance is in its training phase, not yet ready to accept classify requests\", \n \"classifier_id\": \"4d5c10x177-nlc-2873\"\n}\n", 78 | "output_type": "stream" 79 | } 80 | ] 81 | }, 82 | { 83 | "source": "delete = natural_language_classifier.remove('4d5c10x177-nlc-2873')\nprint(json.dumps(delete, indent=2))", 84 | "execution_count": null, 85 | "cell_type": "code", 86 | "metadata": { 87 | "collapsed": true 88 | }, 89 | "outputs": [] 90 | } 91 | ], 92 | "metadata": { 93 | "language_info": { 94 | "nbconvert_exporter": "python", 95 | "mimetype": "text/x-python", 96 | "pygments_lexer": "ipython2", 97 | "version": "2.7.11", 98 | "file_extension": ".py", 99 | "name": "python", 100 | "codemirror_mode": { 101 | "name": "ipython", 102 | "version": 2 103 | } 104 | }, 105 | "kernelspec": { 106 | "name": "python2", 107 | "display_name": "Python 2 with Spark 1.6", 108 | "language": "python" 109 | } 110 | } 111 | } -------------------------------------------------------------------------------- /3. Visualization/Python_Bokeh_Cheat_Sheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/3. Visualization/Python_Bokeh_Cheat_Sheet.pdf -------------------------------------------------------------------------------- /3. Visualization/Python_Matplotlib_Cheat_Sheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/3. Visualization/Python_Matplotlib_Cheat_Sheet.pdf -------------------------------------------------------------------------------- /4. Spark/4. Spark 0. rdd-creation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:89b31567699d26877d1a7406cc718f5609a31c4d05e95c8a8ec474b0f62daa56" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 1, 14 | "metadata": {}, 15 | "source": [ 16 | "RDD creation" 17 | ] 18 | }, 19 | { 20 | "cell_type": "heading", 21 | "level": 4, 22 | "metadata": {}, 23 | "source": [ 24 | "[Introduction to Spark with Python, by Jose A. Dianes](https://github.com/jadianes/spark-py-notebooks)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "In this notebook we will introduce two different ways of getting data into the basic Spark data structure, the **Resilient Distributed Dataset** or **RDD**. An RDD is a distributed collection of elements. All work in Spark is expressed as either creating new RDDs, transforming existing RDDs, or calling actions on RDDs to compute a result. Spark automatically distributes the data contained in RDDs across your cluster and parallelizes the operations you perform on them." 32 | ] 33 | }, 34 | { 35 | "cell_type": "heading", 36 | "level": 4, 37 | "metadata": {}, 38 | "source": [ 39 | "References" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "The reference book for these and other Spark related topics is *Learning Spark* by Holden Karau, Andy Konwinski, Patrick Wendell, and Matei Zaharia. 
" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "The KDD Cup 1999 competition dataset is described in detail [here](http://kdd.ics.uci.edu/databases/kddcup99/kddcup99)." 54 | ] 55 | }, 56 | { 57 | "cell_type": "heading", 58 | "level": 2, 59 | "metadata": {}, 60 | "source": [ 61 | "Getting the data files " 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "In this notebook we will use the reduced dataset (10 percent) provided for the KDD Cup 1999, containing nearly half million network interactions. The file is provided as a *Gzip* file that we will download locally. " 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "collapsed": false, 74 | "input": [ 75 | "import urllib\n", 76 | "f = urllib.urlretrieve (\"http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz\", \"kddcup.data_10_percent.gz\")" 77 | ], 78 | "language": "python", 79 | "metadata": {}, 80 | "outputs": [], 81 | "prompt_number": 31 82 | }, 83 | { 84 | "cell_type": "heading", 85 | "level": 2, 86 | "metadata": {}, 87 | "source": [ 88 | "Creating a RDD from a file " 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "The most common way of creating an RDD is to load it from a file. Notice that Spark's `textFile` can handle compressed files directly. " 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "collapsed": false, 101 | "input": [ 102 | "data_file = \"./kddcup.data_10_percent.gz\"\n", 103 | "raw_data = sc.textFile(data_file)" 104 | ], 105 | "language": "python", 106 | "metadata": {}, 107 | "outputs": [], 108 | "prompt_number": 32 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "Now we have our data file loaded into the `raw_data` RDD." 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "Without getting into Spark *transformations* and *actions*, the most basic thing we can do to check that we got our RDD contents right is to `count()` the number of lines loaded from the file into the RDD. " 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "collapsed": false, 127 | "input": [ 128 | "raw_data.count()" 129 | ], 130 | "language": "python", 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "metadata": {}, 135 | "output_type": "pyout", 136 | "prompt_number": 33, 137 | "text": [ 138 | "494021" 139 | ] 140 | } 141 | ], 142 | "prompt_number": 33 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "We can also check the first few entries in our data. 
" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "collapsed": false, 154 | "input": [ 155 | "raw_data.take(5)" 156 | ], 157 | "language": "python", 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "metadata": {}, 162 | "output_type": "pyout", 163 | "prompt_number": 34, 164 | "text": [ 165 | "[u'0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,9,9,1.00,0.00,0.11,0.00,0.00,0.00,0.00,0.00,normal.',\n", 166 | " u'0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,19,19,1.00,0.00,0.05,0.00,0.00,0.00,0.00,0.00,normal.',\n", 167 | " u'0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,29,29,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.',\n", 168 | " u'0,tcp,http,SF,219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,0.00,0.00,39,39,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.',\n", 169 | " u'0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,0.00,0.00,49,49,1.00,0.00,0.02,0.00,0.00,0.00,0.00,0.00,normal.']" 170 | ] 171 | } 172 | ], 173 | "prompt_number": 34 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "In the following notebooks, we will use this raw data to learn about the different Spark transformations and actions. " 180 | ] 181 | }, 182 | { 183 | "cell_type": "heading", 184 | "level": 2, 185 | "metadata": {}, 186 | "source": [ 187 | "Creating and RDD using `parallelize`" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "Another way of creating an RDD is to parallelize an already existing list. " 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "collapsed": false, 200 | "input": [ 201 | "a = range(100)\n", 202 | "\n", 203 | "data = sc.parallelize(a)" 204 | ], 205 | "language": "python", 206 | "metadata": {}, 207 | "outputs": [], 208 | "prompt_number": 35 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "As we did before, we can `count()` the number of elements in the RDD." 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "collapsed": false, 220 | "input": [ 221 | "data.count()" 222 | ], 223 | "language": "python", 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "metadata": {}, 228 | "output_type": "pyout", 229 | "prompt_number": 36, 230 | "text": [ 231 | "100" 232 | ] 233 | } 234 | ], 235 | "prompt_number": 36 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "As before, we can access the first few elements on our RDD. " 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "collapsed": false, 247 | "input": [ 248 | "data.take(5)" 249 | ], 250 | "language": "python", 251 | "metadata": {}, 252 | "outputs": [ 253 | { 254 | "metadata": {}, 255 | "output_type": "pyout", 256 | "prompt_number": 37, 257 | "text": [ 258 | "[0, 1, 2, 3, 4]" 259 | ] 260 | } 261 | ], 262 | "prompt_number": 37 263 | } 264 | ], 265 | "metadata": {} 266 | } 267 | ] 268 | } -------------------------------------------------------------------------------- /4. Spark/4. Spark 1. 
rdd-basics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:7ce5292fff087bd3fe623675ed06dd472f9e0de945d9b383f83f9f151eb1eaad" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 1, 14 | "metadata": {}, 15 | "source": [ 16 | "RDD basics" 17 | ] 18 | }, 19 | { 20 | "cell_type": "heading", 21 | "level": 4, 22 | "metadata": {}, 23 | "source": [ 24 | "[Introduction to Spark with Python, by Jose A. Dianes](https://github.com/jadianes/spark-py-notebooks)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "This notebook will introduce three basic but essential Spark operations. Two of them are the *transformations* `map` and `filter`. The other is the *action* `collect`. At the same time we will introduce the concept of *persistence* in Spark. " 32 | ] 33 | }, 34 | { 35 | "cell_type": "heading", 36 | "level": 2, 37 | "metadata": {}, 38 | "source": [ 39 | "Getting the data and creating the RDD" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "As we did in our first notebook, we will use the reduced dataset (10 percent) provided for the KDD Cup 1999, containing nearly half million network interactions. The file is provided as a Gzip file that we will download locally." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "collapsed": false, 52 | "input": [ 53 | "import urllib\n", 54 | "f = urllib.urlretrieve (\"http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz\", \"kddcup.data_10_percent.gz\")" 55 | ], 56 | "language": "python", 57 | "metadata": {}, 58 | "outputs": [] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "Now we can use this file to create our RDD." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "collapsed": false, 70 | "input": [ 71 | "data_file = \"./kddcup.data_10_percent.gz\"\n", 72 | "raw_data = sc.textFile(data_file)" 73 | ], 74 | "language": "python", 75 | "metadata": {}, 76 | "outputs": [], 77 | "prompt_number": 1 78 | }, 79 | { 80 | "cell_type": "heading", 81 | "level": 2, 82 | "metadata": {}, 83 | "source": [ 84 | "The `filter` transformation" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "This transformation can be applied to RDDs in order to keep just elements that satisfy a certain condition. More concretely, a function is evaluated on every element in the original RDD. The new resulting RDD will contain just those elements that make the function return `True`." 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "For example, imagine we want to count how many `normal.` interactions we have in our dataset. We can filter our `raw_data` RDD as follows. " 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "collapsed": false, 104 | "input": [ 105 | "normal_raw_data = raw_data.filter(lambda x: 'normal.' in x)" 106 | ], 107 | "language": "python", 108 | "metadata": {}, 109 | "outputs": [], 110 | "prompt_number": 2 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "Now we can count how many elements we have in the new RDD." 
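The next cell counts the filtered rows. As a complementary sketch (not part of the notebook), negating the same predicate keeps the attack interactions instead; `filter` is still a transformation, so nothing runs until an action such as `count()` is called:

```python
# Keep only the rows that are NOT tagged as normal, i.e. the attacks.
attack_raw_data = raw_data.filter(lambda x: 'normal.' not in x)

# The action below triggers the actual computation.
print(attack_raw_data.count())  # expected: 494021 - 97278 = 396743
```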
117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "collapsed": false, 122 | "input": [ 123 | "from time import time\n", 124 | "t0 = time()\n", 125 | "normal_count = normal_raw_data.count()\n", 126 | "tt = time() - t0\n", 127 | "print \"There are {} 'normal' interactions\".format(normal_count)\n", 128 | "print \"Count completed in {} seconds\".format(round(tt,3))" 129 | ], 130 | "language": "python", 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "output_type": "stream", 135 | "stream": "stdout", 136 | "text": [ 137 | "There are 97278 'normal' interactions\n", 138 | "Count completed in 5.951 seconds\n" 139 | ] 140 | } 141 | ], 142 | "prompt_number": 3 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "Remember from notebook 1 that we have a total of 494021 in our 10 percent dataset. Here we can see that 97278 contain the `normal.` tag word. " 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "Notice that we have measured the elapsed time for counting the elements in the RDD. We have done this because we wanted to point out that actual (distributed) computations in Spark take place when we execute *actions* and not *transformations*. In this case `count` is the action we execute on the RDD. We can apply as many transformations as we want on a our RDD and no computation will take place until we call the first action that, in this case takes a few seconds to complete." 156 | ] 157 | }, 158 | { 159 | "cell_type": "heading", 160 | "level": 2, 161 | "metadata": {}, 162 | "source": [ 163 | "The `map` transformation" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "By using the `map` transformation in Spark, we can apply a function to every element in our RDD. Python's lambdas are specially expressive for this particular." 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "In this case we want to read our data file as a CSV formatted one. We can do this by applying a lambda function to each element in the RDD as follows." 
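(The CSV-parsing cell comes right after this aside.) Related to the timing discussion above: because transformations are lazy, every action re-reads and re-filters the file unless the RDD is persisted — the persistence this notebook's introduction mentions. A brief sketch using `cache()`, reusing the `raw_data` RDD from above:

```python
# Mark the filtered RDD to be kept in memory once it has been computed.
normal_cached = raw_data.filter(lambda x: 'normal.' in x).cache()

print(normal_cached.count())  # first action: reads and filters the file
print(normal_cached.count())  # second action: served from the cached partitions
```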
178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "collapsed": false, 183 | "input": [ 184 | "from pprint import pprint\n", 185 | "csv_data = raw_data.map(lambda x: x.split(\",\"))\n", 186 | "t0 = time()\n", 187 | "head_rows = csv_data.take(5)\n", 188 | "tt = time() - t0\n", 189 | "print \"Parse completed in {} seconds\".format(round(tt,3))\n", 190 | "pprint(head_rows[0])" 191 | ], 192 | "language": "python", 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "output_type": "stream", 197 | "stream": "stdout", 198 | "text": [ 199 | "Parse completed in 1.715 seconds\n", 200 | "[u'0',\n", 201 | " u'tcp',\n", 202 | " u'http',\n", 203 | " u'SF',\n", 204 | " u'181',\n", 205 | " u'5450',\n", 206 | " u'0',\n", 207 | " u'0',\n", 208 | " u'0',\n", 209 | " u'0',\n", 210 | " u'0',\n", 211 | " u'1',\n", 212 | " u'0',\n", 213 | " u'0',\n", 214 | " u'0',\n", 215 | " u'0',\n", 216 | " u'0',\n", 217 | " u'0',\n", 218 | " u'0',\n", 219 | " u'0',\n", 220 | " u'0',\n", 221 | " u'0',\n", 222 | " u'8',\n", 223 | " u'8',\n", 224 | " u'0.00',\n", 225 | " u'0.00',\n", 226 | " u'0.00',\n", 227 | " u'0.00',\n", 228 | " u'1.00',\n", 229 | " u'0.00',\n", 230 | " u'0.00',\n", 231 | " u'9',\n", 232 | " u'9',\n", 233 | " u'1.00',\n", 234 | " u'0.00',\n", 235 | " u'0.11',\n", 236 | " u'0.00',\n", 237 | " u'0.00',\n", 238 | " u'0.00',\n", 239 | " u'0.00',\n", 240 | " u'0.00',\n", 241 | " u'normal.']\n" 242 | ] 243 | } 244 | ], 245 | "prompt_number": 4 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "Again, all action happens once we call the first Spark *action* (i.e. *take* in this case). What if we take a lot of elements instead of just the first few? " 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "collapsed": false, 257 | "input": [ 258 | "t0 = time()\n", 259 | "head_rows = csv_data.take(100000)\n", 260 | "tt = time() - t0\n", 261 | "print \"Parse completed in {} seconds\".format(round(tt,3))" 262 | ], 263 | "language": "python", 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "output_type": "stream", 268 | "stream": "stdout", 269 | "text": [ 270 | "Parse completed in 8.629 seconds\n" 271 | ] 272 | } 273 | ], 274 | "prompt_number": 5 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "We can see that it takes longer. The `map` function is applied now in a distributed way to a lot of elements on the RDD, hence the longer execution time." 281 | ] 282 | }, 283 | { 284 | "cell_type": "heading", 285 | "level": 3, 286 | "metadata": {}, 287 | "source": [ 288 | "Using `map` and predefined functions" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "Of course we can use predefined functions with `map`. Imagine we want to have each element in the RDD as a key-value pair where the key is the tag (e.g. *normal*) and the value is the whole list of elements that represents the row in the CSV formatted file. We could proceed as follows. 
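(The notebook defines such a predefined function, `parse_interaction`, in the next cell.) One short aside on `map` first: it emits exactly one output element per input element, while `flatMap` flattens the results into a single RDD of values. A toy sketch, not from the notebook:

```python
# One list of 42 fields per input line.
fields_per_line = raw_data.map(lambda x: x.split(','))

# All fields from all lines, flattened into one RDD of strings.
all_fields = raw_data.flatMap(lambda x: x.split(','))

print(fields_per_line.first())  # a single 42-element list
print(all_fields.take(3))       # the first three individual field values
```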
" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "collapsed": false, 301 | "input": [ 302 | "def parse_interaction(line):\n", 303 | " elems = line.split(\",\")\n", 304 | " tag = elems[41]\n", 305 | " return (tag, elems)\n", 306 | "\n", 307 | "key_csv_data = raw_data.map(parse_interaction)\n", 308 | "head_rows = key_csv_data.take(5)\n", 309 | "pprint(head_rows[0])" 310 | ], 311 | "language": "python", 312 | "metadata": {}, 313 | "outputs": [ 314 | { 315 | "output_type": "stream", 316 | "stream": "stdout", 317 | "text": [ 318 | "(u'normal.',\n", 319 | " [u'0',\n", 320 | " u'tcp',\n", 321 | " u'http',\n", 322 | " u'SF',\n", 323 | " u'181',\n", 324 | " u'5450',\n", 325 | " u'0',\n", 326 | " u'0',\n", 327 | " u'0',\n", 328 | " u'0',\n", 329 | " u'0',\n", 330 | " u'1',\n", 331 | " u'0',\n", 332 | " u'0',\n", 333 | " u'0',\n", 334 | " u'0',\n", 335 | " u'0',\n", 336 | " u'0',\n", 337 | " u'0',\n", 338 | " u'0',\n", 339 | " u'0',\n", 340 | " u'0',\n", 341 | " u'8',\n", 342 | " u'8',\n", 343 | " u'0.00',\n", 344 | " u'0.00',\n", 345 | " u'0.00',\n", 346 | " u'0.00',\n", 347 | " u'1.00',\n", 348 | " u'0.00',\n", 349 | " u'0.00',\n", 350 | " u'9',\n", 351 | " u'9',\n", 352 | " u'1.00',\n", 353 | " u'0.00',\n", 354 | " u'0.11',\n", 355 | " u'0.00',\n", 356 | " u'0.00',\n", 357 | " u'0.00',\n", 358 | " u'0.00',\n", 359 | " u'0.00',\n", 360 | " u'normal.'])\n" 361 | ] 362 | } 363 | ], 364 | "prompt_number": 6 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "That was easy, wasn't it?" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "In our notebook about working with key-value pairs we will use this type of RDDs to do data aggregations (e.g. count by key)." 378 | ] 379 | }, 380 | { 381 | "cell_type": "heading", 382 | "level": 2, 383 | "metadata": {}, 384 | "source": [ 385 | "The `collect` action" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "So far we have used the actions `count` and `take`. Another basic action we need to learn is `collect`. Basically it will get all the elements in the RDD into memory for us to work with them. For this reason it has to be used with care, specially when working with large RDDs. " 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "An example using our raw data. " 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "collapsed": false, 405 | "input": [ 406 | "t0 = time()\n", 407 | "all_raw_data = raw_data.collect()\n", 408 | "tt = time() - t0\n", 409 | "print \"Data collected in {} seconds\".format(round(tt,3))" 410 | ], 411 | "language": "python", 412 | "metadata": {}, 413 | "outputs": [ 414 | { 415 | "output_type": "stream", 416 | "stream": "stdout", 417 | "text": [ 418 | "Data collected in 17.927 seconds\n" 419 | ] 420 | } 421 | ], 422 | "prompt_number": 9 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": {}, 427 | "source": [ 428 | "That took longer as any other action we used before, of course. Every Spark worker node that has a fragment of the RDD has to be coordinated in order to retrieve its part, and then *reduce* everything together. " 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "As a last example combining all the previous, we want to collect all the `normal` interactions as key-value pairs. 
" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "collapsed": false, 441 | "input": [ 442 | "# get data from file\n", 443 | "data_file = \"./kddcup.data_10_percent.gz\"\n", 444 | "raw_data = sc.textFile(data_file)\n", 445 | "\n", 446 | "# parse into key-value pairs\n", 447 | "key_csv_data = raw_data.map(parse_interaction)\n", 448 | "\n", 449 | "# filter normal key interactions\n", 450 | "normal_key_interactions = key_csv_data.filter(lambda x: x[0] == \"normal.\")\n", 451 | "\n", 452 | "# collect all\n", 453 | "t0 = time()\n", 454 | "all_normal = normal_key_interactions.collect()\n", 455 | "tt = time() - t0\n", 456 | "normal_count = len(all_normal)\n", 457 | "print \"Data collected in {} seconds\".format(round(tt,3))\n", 458 | "print \"There are {} 'normal' interactions\".format(normal_count)" 459 | ], 460 | "language": "python", 461 | "metadata": {}, 462 | "outputs": [ 463 | { 464 | "output_type": "stream", 465 | "stream": "stdout", 466 | "text": [ 467 | "Data collected in 12.485 seconds\n", 468 | "There are 97278 normal interactions\n" 469 | ] 470 | } 471 | ], 472 | "prompt_number": 13 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "This count matches with the previous count for `normal` interactions. The new procedure is more time consuming. This is because we retrieve all the data with `collect` and then use Python's `len` on the resulting list. Before we were just counting the total number of elements in the RDD by using `count`. " 479 | ] 480 | } 481 | ], 482 | "metadata": {} 483 | } 484 | ] 485 | } -------------------------------------------------------------------------------- /4. Spark/4. Spark 2. rdd-sampling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:8581d2dfe951591985d0f9eb665f33044c479321d2a0b77699d2d79ad8ef0641" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 1, 14 | "metadata": {}, 15 | "source": [ 16 | "Sampling RDDs" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "[Introduction to Spark with Python, by Jose A. Dianes](https://github.com/jadianes/spark-py-notebooks)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "So far we have introduced RDD creation together with some basic transformations such as `map` and `filter` and some actions such as `count`, `take`, and `collect`. " 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "This notebook will show how to sample RDDs. Regarding transformations, `sample` will be introduced since it will be useful in many statistical learning scenarios. Then we will compare results with the `takeSample` action. " 38 | ] 39 | }, 40 | { 41 | "cell_type": "heading", 42 | "level": 2, 43 | "metadata": {}, 44 | "source": [ 45 | "Getting the data and creating the RDD" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "In this case we will use the complete dataset provided for the KDD Cup 1999, containing nearly half million network interactions. The file is provided as a Gzip file that we will download locally." 
53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "collapsed": false, 58 | "input": [ 59 | "import urllib\n", 60 | "f = urllib.urlretrieve (\"http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz\", \"kddcup.data.gz\")" 61 | ], 62 | "language": "python", 63 | "metadata": {}, 64 | "outputs": [], 65 | "prompt_number": 1 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Now we can use this file to create our RDD." 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "collapsed": false, 77 | "input": [ 78 | "data_file = \"./kddcup.data.gz\"\n", 79 | "raw_data = sc.textFile(data_file)" 80 | ], 81 | "language": "python", 82 | "metadata": {}, 83 | "outputs": [], 84 | "prompt_number": 2 85 | }, 86 | { 87 | "cell_type": "heading", 88 | "level": 2, 89 | "metadata": {}, 90 | "source": [ 91 | "Sampling RDDs " 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "In Spark, there are two sampling operations, the transformation `sample` and the action `takeSample`. By using a transformation we can tell Spark to apply successive transformation on a sample of a given RDD. By using an action we retrieve a given sample and we can have it in local memory to be used by any other standard library (e.g. Scikit-learn). " 99 | ] 100 | }, 101 | { 102 | "cell_type": "heading", 103 | "level": 3, 104 | "metadata": {}, 105 | "source": [ 106 | "The `sample` transformation" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "The `sample` transformation takes up to three parameters. First is whether the sampling is done with replacement or not. Second is the sample size as a fraction. Finally we can optionally provide a *random seed*. " 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "collapsed": false, 119 | "input": [ 120 | "raw_data_sample = raw_data.sample(False, 0.1, 1234)\n", 121 | "sample_size = raw_data_sample.count()\n", 122 | "total_size = raw_data.count()\n", 123 | "print \"Sample size is {} of {}\".format(sample_size, total_size)" 124 | ], 125 | "language": "python", 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "output_type": "stream", 130 | "stream": "stdout", 131 | "text": [ 132 | "Sample size is 489957 of 4898431\n" 133 | ] 134 | } 135 | ], 136 | "prompt_number": 3 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "But the power of sampling as a transformation comes from doing it as part of a sequence of additional transformations. This will show more powerful once we start doing aggregations and key-value pairs operations, and will be specially useful when using Spark's machine learning library MLlib. " 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "In the meantime, imagine we want to have an approximation of the proportion of `normal.` interactions in our dataset. We could do this by counting the total number of tags as we did in previous notebooks. However we want a quicker response and we don't need the exact answer but just an approximation. We can do it as follows. 
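(The approximation itself is computed in the next cell.) Since the text above points at MLlib as a main consumer of sampling, one closely related operation worth knowing is `randomSplit`, which produces disjoint random subsets — typically train and test sets. A sketch, not part of the notebook:

```python
# Split the raw data 80/20 with a fixed seed for reproducibility.
train_data, test_data = raw_data.randomSplit([0.8, 0.2], seed=1234)

print(train_data.count())  # roughly 80 percent of the rows
print(test_data.count())   # roughly 20 percent
```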
" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "collapsed": false, 155 | "input": [ 156 | "from time import time\n", 157 | "\n", 158 | "# transformations to be applied\n", 159 | "raw_data_sample_items = raw_data_sample.map(lambda x: x.split(\",\"))\n", 160 | "sample_normal_tags = raw_data_sample_items.filter(lambda x: \"normal.\" in x)\n", 161 | "\n", 162 | "# actions + time\n", 163 | "t0 = time()\n", 164 | "sample_normal_tags_count = sample_normal_tags.count()\n", 165 | "tt = time() - t0\n", 166 | "\n", 167 | "sample_normal_ratio = sample_normal_tags_count / float(sample_size)\n", 168 | "print \"The ratio of 'normal' interactions is {}\".format(round(sample_normal_ratio,3)) \n", 169 | "print \"Count done in {} seconds\".format(round(tt,3))" 170 | ], 171 | "language": "python", 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "output_type": "stream", 176 | "stream": "stdout", 177 | "text": [ 178 | "The ratio of 'normal' interactions is 0.199\n", 179 | "Count done in 44.523 seconds\n" 180 | ] 181 | } 182 | ], 183 | "prompt_number": 4 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "Let's compare this with calculating the ratio without sampling. " 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "collapsed": false, 195 | "input": [ 196 | "# transformations to be applied\n", 197 | "raw_data_items = raw_data.map(lambda x: x.split(\",\"))\n", 198 | "normal_tags = raw_data_items.filter(lambda x: \"normal.\" in x)\n", 199 | "\n", 200 | "# actions + time\n", 201 | "t0 = time()\n", 202 | "normal_tags_count = normal_tags.count()\n", 203 | "tt = time() - t0\n", 204 | "\n", 205 | "normal_ratio = normal_tags_count / float(total_size)\n", 206 | "print \"The ratio of 'normal' interactions is {}\".format(round(normal_ratio,3)) \n", 207 | "print \"Count done in {} seconds\".format(round(tt,3))" 208 | ], 209 | "language": "python", 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "output_type": "stream", 214 | "stream": "stdout", 215 | "text": [ 216 | "The ratio of 'normal' interactions is 0.199\n", 217 | "Count done in 91.09 seconds\n" 218 | ] 219 | } 220 | ], 221 | "prompt_number": 5 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "We can see a gain in time. The more transformations we apply after the sampling the bigger this gain. This is because without sampling all the transformations are applied to the complete set of data. " 228 | ] 229 | }, 230 | { 231 | "cell_type": "heading", 232 | "level": 3, 233 | "metadata": {}, 234 | "source": [ 235 | "The `takeSample` action " 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "If what we need is to grab a sample of raw data from our RDD into local memory in order to be used by other non-Spark libraries, `takeSample` can be used. " 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "The syntax is very similar, but in this case we specify the number of items instead of the sample size as a fraction of the complete data size. 
" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "collapsed": false, 255 | "input": [ 256 | "t0 = time()\n", 257 | "raw_data_sample = raw_data.takeSample(False, 400000, 1234)\n", 258 | "normal_data_sample = [x.split(\",\") for x in raw_data_sample if \"normal.\" in x]\n", 259 | "tt = time() - t0\n", 260 | "\n", 261 | "normal_sample_size = len(normal_data_sample)\n", 262 | "\n", 263 | "normal_ratio = normal_sample_size / 400000.0\n", 264 | "print \"The ratio of 'normal' interactions is {}\".format(normal_ratio)\n", 265 | "print \"Count done in {} seconds\".format(round(tt,3))" 266 | ], 267 | "language": "python", 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "output_type": "stream", 272 | "stream": "stdout", 273 | "text": [ 274 | "The ratio of 'normal' interactions is 0.1988025\n", 275 | "Count done in 76.166 seconds\n" 276 | ] 277 | } 278 | ], 279 | "prompt_number": 6 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "The process was very similar as before. We obtained a sample of about 10 percent of the data, and then filter and split. \n", 286 | "\n", 287 | "However, it took longer, even with a slightly smaller sample. The reason is that Spark just distributed the execution of the sampling process. The filtering and splitting of the results were done locally in a single node. " 288 | ] 289 | } 290 | ], 291 | "metadata": {} 292 | } 293 | ] 294 | } 295 | -------------------------------------------------------------------------------- /4. Spark/4. Spark 3. rdd-set.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:123c139134363a65ac4461e7d98848e74ede7989fa222d57f2ff95d79405e114" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 1, 14 | "metadata": {}, 15 | "source": [ 16 | "Set operations on RDDs" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "[Introduction to Spark with Python, by Jose A. Dianes](https://github.com/jadianes/spark-py-notebooks)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "Spark supports many of the operations we have in mathematical sets, such as union and intersection, even when the RDDs themselves are not properly sets. It is important to note that these operations require that the RDDs being operated on are of the same type. " 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "Set operations are quite straightforward to understand as it work as expected. The only consideration comes from the fact that RDDs are not real sets, and therefore operations such as the union of RDDs doesn't remove duplicates. In this notebook we will have a brief look at `subtract`, `distinct`, and `cartesian`. " 38 | ] 39 | }, 40 | { 41 | "cell_type": "heading", 42 | "level": 2, 43 | "metadata": {}, 44 | "source": [ 45 | "Getting the data and creating the RDD" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "As we did in our first notebook, we will use the reduced dataset (10 percent) provided for the KDD Cup 1999, containing nearly half million network interactions. The file is provided as a Gzip file that we will download locally." 
53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "collapsed": false, 58 | "input": [ 59 | "import urllib\n", 60 | "f = urllib.urlretrieve (\"http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz\", \"kddcup.data_10_percent.gz\")" 61 | ], 62 | "language": "python", 63 | "metadata": {}, 64 | "outputs": [], 65 | "prompt_number": 1 66 | }, 67 | { 68 | "cell_type": "code", 69 | "collapsed": false, 70 | "input": [ 71 | "data_file = \"./kddcup.data_10_percent.gz\"\n", 72 | "raw_data = sc.textFile(data_file)" 73 | ], 74 | "language": "python", 75 | "metadata": {}, 76 | "outputs": [], 77 | "prompt_number": 2 78 | }, 79 | { 80 | "cell_type": "heading", 81 | "level": 2, 82 | "metadata": {}, 83 | "source": [ 84 | "Getting attack interactions using `subtract`" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "For illustrative purposes, imagine we already have our RDD with non attack (normal) interactions from some previous analysis. " 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "collapsed": false, 97 | "input": [ 98 | "normal_raw_data = raw_data.filter(lambda x: \"normal.\" in x)" 99 | ], 100 | "language": "python", 101 | "metadata": {}, 102 | "outputs": [], 103 | "prompt_number": 3 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "We can obtain attack interactions by subtracting normal ones from the original unfiltered RDD as follows. " 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "collapsed": false, 115 | "input": [ 116 | "attack_raw_data = raw_data.subtract(normal_raw_data)" 117 | ], 118 | "language": "python", 119 | "metadata": {}, 120 | "outputs": [], 121 | "prompt_number": 4 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Let's do some counts to check our results. 
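(The counting cells follow.) The introduction mentions `union` and `intersection` without demonstrating them; a toy sketch on small parallelized RDDs shows the behaviour, including the fact that `union` does not remove duplicates:

```python
a = sc.parallelize([1, 2, 2, 3])
b = sc.parallelize([3, 4])

print(a.union(b).collect())         # duplicates kept, e.g. [1, 2, 2, 3, 3, 4]
print(a.intersection(b).collect())  # [3]
print(a.distinct().collect())       # [1, 2, 3]
```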
" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "collapsed": false, 133 | "input": [ 134 | "from time import time\n", 135 | "\n", 136 | "# count all\n", 137 | "t0 = time()\n", 138 | "raw_data_count = raw_data.count()\n", 139 | "tt = time() - t0\n", 140 | "print \"All count in {} secs\".format(round(tt,3))" 141 | ], 142 | "language": "python", 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "output_type": "stream", 147 | "stream": "stdout", 148 | "text": [ 149 | "All count in 5.261 secs\n" 150 | ] 151 | } 152 | ], 153 | "prompt_number": 5 154 | }, 155 | { 156 | "cell_type": "code", 157 | "collapsed": false, 158 | "input": [ 159 | "# count normal\n", 160 | "t0 = time()\n", 161 | "normal_raw_data_count = normal_raw_data.count()\n", 162 | "tt = time() - t0\n", 163 | "print \"Normal count in {} secs\".format(round(tt,3))" 164 | ], 165 | "language": "python", 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "output_type": "stream", 170 | "stream": "stdout", 171 | "text": [ 172 | "Normal count in 5.571 secs\n" 173 | ] 174 | } 175 | ], 176 | "prompt_number": 6 177 | }, 178 | { 179 | "cell_type": "code", 180 | "collapsed": false, 181 | "input": [ 182 | "# count attacks\n", 183 | "t0 = time()\n", 184 | "attack_raw_data_count = attack_raw_data.count()\n", 185 | "tt = time() - t0\n", 186 | "print \"Attack count in {} secs\".format(round(tt,3))" 187 | ], 188 | "language": "python", 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "output_type": "stream", 193 | "stream": "stdout", 194 | "text": [ 195 | "Attack count in 12.075 secs\n" 196 | ] 197 | } 198 | ], 199 | "prompt_number": 7 200 | }, 201 | { 202 | "cell_type": "code", 203 | "collapsed": false, 204 | "input": [ 205 | "print \"There are {} normal interactions and {} attacks, \\\n", 206 | "from a total of {} interactions\".format(normal_raw_data_count,attack_raw_data_count,raw_data_count)" 207 | ], 208 | "language": "python", 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "output_type": "stream", 213 | "stream": "stdout", 214 | "text": [ 215 | "There are 97278 normal interactions and 396743 attacks, from a total of 494021 interactions\n" 216 | ] 217 | } 218 | ], 219 | "prompt_number": 8 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "So now we have two RDDs, one with normal interactions and another one with attacks. " 226 | ] 227 | }, 228 | { 229 | "cell_type": "heading", 230 | "level": 2, 231 | "metadata": {}, 232 | "source": [ 233 | "Protocol and service combinations using `cartesian`" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "We can compute the Cartesian product between two RDDs by using the `cartesian` transformation. It returns all possible pairs of elements between two RDDs. In our case we will use it to generate all the possible combinations between service and protocol in our network interactions. \n", 241 | "\n", 242 | "First of all we need to isolate each collection of values in two separate RDDs. For that we will use `distinct` on the CSV-parsed dataset. From the [dataset description](http://kdd.ics.uci.edu/databases/kddcup99/kddcup.names) we know that protocol is the second column and service is the third (tag is the last one and not the first as appears in the page). " 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "So first, let's get the protocols. 
" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "collapsed": false, 255 | "input": [ 256 | "csv_data = raw_data.map(lambda x: x.split(\",\"))\n", 257 | "protocols = csv_data.map(lambda x: x[1]).distinct()\n", 258 | "protocols.collect()" 259 | ], 260 | "language": "python", 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "metadata": {}, 265 | "output_type": "pyout", 266 | "prompt_number": 9, 267 | "text": [ 268 | "[u'udp', u'icmp', u'tcp']" 269 | ] 270 | } 271 | ], 272 | "prompt_number": 9 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "Now we do the same for services. " 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "collapsed": false, 284 | "input": [ 285 | "services = csv_data.map(lambda x: x[2]).distinct()\n", 286 | "services.collect()" 287 | ], 288 | "language": "python", 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "metadata": {}, 293 | "output_type": "pyout", 294 | "prompt_number": 10, 295 | "text": [ 296 | "[u'domain',\n", 297 | " u'http_443',\n", 298 | " u'Z39_50',\n", 299 | " u'smtp',\n", 300 | " u'urp_i',\n", 301 | " u'private',\n", 302 | " u'echo',\n", 303 | " u'shell',\n", 304 | " u'red_i',\n", 305 | " u'eco_i',\n", 306 | " u'sunrpc',\n", 307 | " u'ftp_data',\n", 308 | " u'urh_i',\n", 309 | " u'pm_dump',\n", 310 | " u'pop_3',\n", 311 | " u'pop_2',\n", 312 | " u'systat',\n", 313 | " u'ftp',\n", 314 | " u'uucp',\n", 315 | " u'whois',\n", 316 | " u'netbios_dgm',\n", 317 | " u'efs',\n", 318 | " u'remote_job',\n", 319 | " u'daytime',\n", 320 | " u'ntp_u',\n", 321 | " u'finger',\n", 322 | " u'ldap',\n", 323 | " u'netbios_ns',\n", 324 | " u'kshell',\n", 325 | " u'iso_tsap',\n", 326 | " u'ecr_i',\n", 327 | " u'nntp',\n", 328 | " u'printer',\n", 329 | " u'domain_u',\n", 330 | " u'uucp_path',\n", 331 | " u'courier',\n", 332 | " u'exec',\n", 333 | " u'time',\n", 334 | " u'netstat',\n", 335 | " u'telnet',\n", 336 | " u'gopher',\n", 337 | " u'rje',\n", 338 | " u'sql_net',\n", 339 | " u'link',\n", 340 | " u'auth',\n", 341 | " u'netbios_ssn',\n", 342 | " u'csnet_ns',\n", 343 | " u'X11',\n", 344 | " u'IRC',\n", 345 | " u'tftp_u',\n", 346 | " u'login',\n", 347 | " u'supdup',\n", 348 | " u'name',\n", 349 | " u'nnsp',\n", 350 | " u'mtp',\n", 351 | " u'http',\n", 352 | " u'bgp',\n", 353 | " u'ctf',\n", 354 | " u'hostnames',\n", 355 | " u'klogin',\n", 356 | " u'vmnet',\n", 357 | " u'tim_i',\n", 358 | " u'discard',\n", 359 | " u'imap4',\n", 360 | " u'other',\n", 361 | " u'ssh']" 362 | ] 363 | } 364 | ], 365 | "prompt_number": 10 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": {}, 370 | "source": [ 371 | "A longer list in this case." 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "Now we can do the cartesian product. " 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "collapsed": false, 384 | "input": [ 385 | "product = protocols.cartesian(services).collect()\n", 386 | "print \"There are {} combinations of protocol X service\".format(len(product))" 387 | ], 388 | "language": "python", 389 | "metadata": {}, 390 | "outputs": [ 391 | { 392 | "output_type": "stream", 393 | "stream": "stdout", 394 | "text": [ 395 | "There are 198 combinations of protocol X service\n" 396 | ] 397 | } 398 | ], 399 | "prompt_number": 11 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": {}, 404 | "source": [ 405 | "Obviously, for such small RDDs doesn't really make sense to use Spark cartesian product. 
We could have perfectly collected the values after using `distinct` and do the cartesian product locally. Moreover, `distinct` and `cartesian` are expensive operations so they must be used with care when the operating datasets are large. " 406 | ] 407 | } 408 | ], 409 | "metadata": {} 410 | } 411 | ] 412 | } -------------------------------------------------------------------------------- /4. Spark/4. Spark 4. rdd-aggregations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:11079f4265aa0d15e0bf53fe2dd27e64eb926948e4a2f0f43e8e08a276da43f4" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 1, 14 | "metadata": {}, 15 | "source": [ 16 | "Data aggregations on RDDs" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "[Introduction to Spark with Python, by Jose A. Dianes](https://github.com/jadianes/spark-py-notebooks)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "We can aggregate RDD data in Spark by using three different actions: `reduce`, `fold`, and `aggregate`. The last one is the more general one and someway includes the first two. " 31 | ] 32 | }, 33 | { 34 | "cell_type": "heading", 35 | "level": 2, 36 | "metadata": {}, 37 | "source": [ 38 | "Getting the data and creating the RDD" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "As we did in our first notebook, we will use the reduced dataset (10 percent) provided for the [KDD Cup 1999](http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html), containing nearly half million nework interactions. The file is provided as a Gzip file that we will download locally. " 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "collapsed": false, 51 | "input": [ 52 | "import urllib\n", 53 | "f = urllib.urlretrieve (\"http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz\", \"kddcup.data_10_percent.gz\")" 54 | ], 55 | "language": "python", 56 | "metadata": {}, 57 | "outputs": [], 58 | "prompt_number": 1 59 | }, 60 | { 61 | "cell_type": "code", 62 | "collapsed": false, 63 | "input": [ 64 | "data_file = \"./kddcup.data_10_percent.gz\"\n", 65 | "raw_data = sc.textFile(data_file)" 66 | ], 67 | "language": "python", 68 | "metadata": {}, 69 | "outputs": [], 70 | "prompt_number": 2 71 | }, 72 | { 73 | "cell_type": "heading", 74 | "level": 2, 75 | "metadata": {}, 76 | "source": [ 77 | "Inspecting interaction duration by tag" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "Both `fold` and `reduce` take a function as an argument that is applied to two elements of the RDD. The `fold` action differs from `reduce` in that it gets and additional initial *zero value* to be used for the initial call. This value should be the identity element for the function provided. " 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "As an example, imagine we want to know the total duration of our interactions for normal and attack interactions. We can use `reduce` as follows. 
" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "collapsed": false, 97 | "input": [ 98 | "# parse data\n", 99 | "csv_data = raw_data.map(lambda x: x.split(\",\"))\n", 100 | "\n", 101 | "# separate into different RDDs\n", 102 | "normal_csv_data = csv_data.filter(lambda x: x[41]==\"normal.\")\n", 103 | "attack_csv_data = csv_data.filter(lambda x: x[41]!=\"normal.\")" 104 | ], 105 | "language": "python", 106 | "metadata": {}, 107 | "outputs": [], 108 | "prompt_number": 3 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "The function that we pass to `reduce` gets and returns elements of the same type of the RDD. If we want to sum durations we need to extract that element into a new RDD. " 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "collapsed": false, 120 | "input": [ 121 | "normal_duration_data = normal_csv_data.map(lambda x: int(x[0]))\n", 122 | "attack_duration_data = attack_csv_data.map(lambda x: int(x[0]))" 123 | ], 124 | "language": "python", 125 | "metadata": {}, 126 | "outputs": [], 127 | "prompt_number": 4 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "Now we can reduce these new RDDs. " 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "collapsed": false, 139 | "input": [ 140 | "total_normal_duration = normal_duration_data.reduce(lambda x, y: x + y)\n", 141 | "total_attack_duration = attack_duration_data.reduce(lambda x, y: x + y)\n", 142 | "\n", 143 | "print \"Total duration for 'normal' interactions is {}\".\\\n", 144 | " format(total_normal_duration)\n", 145 | "print \"Total duration for 'attack' interactions is {}\".\\\n", 146 | " format(total_attack_duration)" 147 | ], 148 | "language": "python", 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "output_type": "stream", 153 | "stream": "stdout", 154 | "text": [ 155 | "Total duration for 'normal' interactions is 21075991\n", 156 | "Total duration for 'attack' interactions is 2626792\n" 157 | ] 158 | } 159 | ], 160 | "prompt_number": 5 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "We can go further and use counts to calculate duration means. " 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "collapsed": false, 172 | "input": [ 173 | "normal_count = normal_duration_data.count()\n", 174 | "attack_count = attack_duration_data.count()\n", 175 | "\n", 176 | "print \"Mean duration for 'normal' interactions is {}\".\\\n", 177 | " format(round(total_normal_duration/float(normal_count),3))\n", 178 | "print \"Mean duration for 'attack' interactions is {}\".\\\n", 179 | " format(round(total_attack_duration/float(attack_count),3))" 180 | ], 181 | "language": "python", 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "output_type": "stream", 186 | "stream": "stdout", 187 | "text": [ 188 | "Mean duration for 'normal' interactions is 216.657\n", 189 | "Mean duration for 'attack' interactions is 6.621\n" 190 | ] 191 | } 192 | ], 193 | "prompt_number": 6 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "We have a first (and too simplistic) approach to identify attack interactions." 
200 | ] 201 | }, 202 | { 203 | "cell_type": "heading", 204 | "level": 2, 205 | "metadata": {}, 206 | "source": [ 207 | "A better way, using `aggregate` " 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "The `aggregate` action frees us from the constraint of having the return be the same type as the RDD we are working on. Like with `fold`, we supply an initial zero value of the type we want to return. Then we provide two functions. The first one is used to combine the elements from our RDD with the accumulator. The second function is needed to merge two accumulators. Let's see it in action calculating the mean we did before. " 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "collapsed": false, 220 | "input": [ 221 | "normal_sum_count = normal_duration_data.aggregate(\n", 222 | " (0,0), # the initial value\n", 223 | " (lambda acc, value: (acc[0] + value, acc[1] + 1)), # combine value with acc\n", 224 | " (lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1])) # combine accumulators\n", 225 | ")\n", 226 | "\n", 227 | "print \"Mean duration for 'normal' interactions is {}\".\\\n", 228 | " format(round(normal_sum_count[0]/float(normal_sum_count[1]),3))" 229 | ], 230 | "language": "python", 231 | "metadata": {}, 232 | "outputs": [ 233 | { 234 | "output_type": "stream", 235 | "stream": "stdout", 236 | "text": [ 237 | "Mean duration for 'normal' interactions is 216.657\n" 238 | ] 239 | } 240 | ], 241 | "prompt_number": 7 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "In the previous aggregation, the accumulator first element keeps the total sum, while the second element keeps the count. Combining an accumulator with an RDD element consists in summing up the value and incrementing the count. Combining two accumulators requires just a pairwise sum. " 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "We can do the same with attack type interactions. " 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "collapsed": false, 260 | "input": [ 261 | "attack_sum_count = attack_duration_data.aggregate(\n", 262 | " (0,0), # the initial value\n", 263 | " (lambda acc, value: (acc[0] + value, acc[1] + 1)), # combine value with acc\n", 264 | " (lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1])) # combine accumulators\n", 265 | ")\n", 266 | "\n", 267 | "print \"Mean duration for 'attack' interactions is {}\".\\\n", 268 | " format(round(attack_sum_count[0]/float(attack_sum_count[1]),3))" 269 | ], 270 | "language": "python", 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "output_type": "stream", 275 | "stream": "stdout", 276 | "text": [ 277 | "Mean duration for 'attack' interactions is 6.621\n" 278 | ] 279 | } 280 | ], 281 | "prompt_number": 8 282 | } 283 | ], 284 | "metadata": {} 285 | } 286 | ] 287 | } -------------------------------------------------------------------------------- /4. Spark/4. Spark 5. 
rdd-key-value.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:01c23f9757cbbcc111ac851dc4f48c2a8732f1271f4cd54e1f91839a9d62a8dc" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 1, 14 | "metadata": {}, 15 | "source": [ 16 | "Working with key/value pair RDDs" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "[Introduction to Spark with Python, by Jose A. Dianes](https://github.com/jadianes/spark-py-notebooks)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "Spark provides specific functions to deal with RDDs which elements are key/value pairs. They are usually used to perform aggregations and other processings by key. " 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "In this notebook we will show how, by working with key/value pairs, we can process our network interactions dataset in a more practical and powerful way than that used in previous notebooks. Key/value pair aggregations will show to be particularly effective when trying to explore each type of tag in our network attacks, in an individual way. " 38 | ] 39 | }, 40 | { 41 | "cell_type": "heading", 42 | "level": 2, 43 | "metadata": {}, 44 | "source": [ 45 | "Getting the data and creating the RDD" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "As we did in our first notebook, we will use the reduced dataset (10 percent) provided for the [KDD Cup 1999](http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html), containing nearly half million network interactions. The file is provided as a Gzip file that we will download locally. " 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "collapsed": false, 58 | "input": [ 59 | "import urllib\n", 60 | "f = urllib.urlretrieve (\"http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz\", \"kddcup.data_10_percent.gz\")" 61 | ], 62 | "language": "python", 63 | "metadata": {}, 64 | "outputs": [], 65 | "prompt_number": 1 66 | }, 67 | { 68 | "cell_type": "code", 69 | "collapsed": false, 70 | "input": [ 71 | "data_file = \"./kddcup.data_10_percent.gz\"\n", 72 | "raw_data = sc.textFile(data_file)" 73 | ], 74 | "language": "python", 75 | "metadata": {}, 76 | "outputs": [], 77 | "prompt_number": 1 78 | }, 79 | { 80 | "cell_type": "heading", 81 | "level": 2, 82 | "metadata": {}, 83 | "source": [ 84 | "Creating a pair RDD for interaction types" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "In this notebook we want to do some exploratory data analysis on our network interactions dataset. More concretely we want to profile each network interaction type in terms of some of its variables such as duration. In order to do so, we first need to create the RDD suitable for that, where each interaction is parsed as a CSV row representing the value, and is put together with its corresponding tag as a key. " 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "Normally we create key/value pair RDDs by applying a function using `map` to the original data. This function returns the corresponding pair for a given RDD element. We can proceed as follows. 
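As a complement to the cell that follows, Spark also offers `keyBy`, which builds the same kind of pair RDD by computing only the key and keeping the whole element as the value. The sketch below uses made-up toy records (not the KDD data) and assumes the notebook's `sc` context.

```python
# Illustrative toy sketch: two equivalent ways to build a key/value pair RDD
toy_records = sc.parallelize([["normal.", "10"], ["attack.", "3"], ["normal.", "7"]])

# explicit map returning a (key, value) tuple, as the next cell does with x[41]
pairs_with_map = toy_records.map(lambda rec: (rec[0], rec))

# keyBy computes the key and keeps the whole record as the value
pairs_with_keyby = toy_records.keyBy(lambda rec: rec[0])

print(pairs_with_map.take(1))    # [('normal.', ['normal.', '10'])]
print(pairs_with_keyby.take(1))  # [('normal.', ['normal.', '10'])]
```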
" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "collapsed": false, 104 | "input": [ 105 | "csv_data = raw_data.map(lambda x: x.split(\",\"))\n", 106 | "key_value_data = csv_data.map(lambda x: (x[41], x)) # x[41] contains the network interaction tag" 107 | ], 108 | "language": "python", 109 | "metadata": {}, 110 | "outputs": [], 111 | "prompt_number": 2 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "We have now our key/value pair data ready to be used. Let's get the first element in order to see how it looks like. " 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "collapsed": false, 123 | "input": [ 124 | "key_value_data.take(1)" 125 | ], 126 | "language": "python", 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "metadata": {}, 131 | "output_type": "pyout", 132 | "prompt_number": 3, 133 | "text": [ 134 | "[(u'normal.',\n", 135 | " [u'0',\n", 136 | " u'tcp',\n", 137 | " u'http',\n", 138 | " u'SF',\n", 139 | " u'181',\n", 140 | " u'5450',\n", 141 | " u'0',\n", 142 | " u'0',\n", 143 | " u'0',\n", 144 | " u'0',\n", 145 | " u'0',\n", 146 | " u'1',\n", 147 | " u'0',\n", 148 | " u'0',\n", 149 | " u'0',\n", 150 | " u'0',\n", 151 | " u'0',\n", 152 | " u'0',\n", 153 | " u'0',\n", 154 | " u'0',\n", 155 | " u'0',\n", 156 | " u'0',\n", 157 | " u'8',\n", 158 | " u'8',\n", 159 | " u'0.00',\n", 160 | " u'0.00',\n", 161 | " u'0.00',\n", 162 | " u'0.00',\n", 163 | " u'1.00',\n", 164 | " u'0.00',\n", 165 | " u'0.00',\n", 166 | " u'9',\n", 167 | " u'9',\n", 168 | " u'1.00',\n", 169 | " u'0.00',\n", 170 | " u'0.11',\n", 171 | " u'0.00',\n", 172 | " u'0.00',\n", 173 | " u'0.00',\n", 174 | " u'0.00',\n", 175 | " u'0.00',\n", 176 | " u'normal.'])]" 177 | ] 178 | } 179 | ], 180 | "prompt_number": 3 181 | }, 182 | { 183 | "cell_type": "heading", 184 | "level": 2, 185 | "metadata": {}, 186 | "source": [ 187 | "Data aggregations with key/value pair RDDs" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "We can use all the transformations and actions available for normal RDDs with key/value pair RDDs. We just need to make the functions work with pair elements. Additionally, Spark provides specific functions to work with RDDs containing pair elements. They are very similar to those available for general RDDs. " 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "For example, we have a `reduceByKey` transformation that we can use as follows to calculate the total duration of each network interaction type. 
" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "collapsed": false, 207 | "input": [ 208 | "key_value_duration = csv_data.map(lambda x: (x[41], float(x[0]))) \n", 209 | "durations_by_key = key_value_duration.reduceByKey(lambda x, y: x + y)\n", 210 | "\n", 211 | "durations_by_key.collect()" 212 | ], 213 | "language": "python", 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "metadata": {}, 218 | "output_type": "pyout", 219 | "prompt_number": 4, 220 | "text": [ 221 | "[(u'guess_passwd.', 144.0),\n", 222 | " (u'nmap.', 0.0),\n", 223 | " (u'warezmaster.', 301.0),\n", 224 | " (u'rootkit.', 1008.0),\n", 225 | " (u'warezclient.', 627563.0),\n", 226 | " (u'smurf.', 0.0),\n", 227 | " (u'pod.', 0.0),\n", 228 | " (u'neptune.', 0.0),\n", 229 | " (u'normal.', 21075991.0),\n", 230 | " (u'spy.', 636.0),\n", 231 | " (u'ftp_write.', 259.0),\n", 232 | " (u'phf.', 18.0),\n", 233 | " (u'portsweep.', 1991911.0),\n", 234 | " (u'teardrop.', 0.0),\n", 235 | " (u'buffer_overflow.', 2751.0),\n", 236 | " (u'land.', 0.0),\n", 237 | " (u'imap.', 72.0),\n", 238 | " (u'loadmodule.', 326.0),\n", 239 | " (u'perl.', 124.0),\n", 240 | " (u'multihop.', 1288.0),\n", 241 | " (u'back.', 284.0),\n", 242 | " (u'ipsweep.', 43.0),\n", 243 | " (u'satan.', 64.0)]" 244 | ] 245 | } 246 | ], 247 | "prompt_number": 4 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "We have a specific counting action for key/value pairs. " 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "collapsed": false, 259 | "input": [ 260 | "counts_by_key = key_value_data.countByKey()\n", 261 | "counts_by_key" 262 | ], 263 | "language": "python", 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "metadata": {}, 268 | "output_type": "pyout", 269 | "prompt_number": 5, 270 | "text": [ 271 | "defaultdict(, {u'guess_passwd.': 53, u'nmap.': 231, u'warezmaster.': 20, u'rootkit.': 10, u'warezclient.': 1020, u'smurf.': 280790, u'pod.': 264, u'neptune.': 107201, u'normal.': 97278, u'spy.': 2, u'ftp_write.': 8, u'phf.': 4, u'portsweep.': 1040, u'teardrop.': 979, u'buffer_overflow.': 30, u'land.': 21, u'imap.': 12, u'loadmodule.': 9, u'perl.': 3, u'multihop.': 7, u'back.': 2203, u'ipsweep.': 1247, u'satan.': 1589})" 272 | ] 273 | } 274 | ], 275 | "prompt_number": 5 276 | }, 277 | { 278 | "cell_type": "heading", 279 | "level": 3, 280 | "metadata": {}, 281 | "source": [ 282 | "Using `combineByKey`" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "This is the most general of the per-key aggregation functions. Most of the other per-key combiners are implemented using it. We can think about it as the `aggregate` equivalent since it allows the user to return values that are not the same type as our input data." 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "For example, we can use it to calculate per-type average durations as follows. 
" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "collapsed": false, 302 | "input": [ 303 | "sum_counts = key_value_duration.combineByKey(\n", 304 | " (lambda x: (x, 1)), # the initial value, with value x and count 1\n", 305 | " (lambda acc, value: (acc[0]+value, acc[1]+1)), # how to combine a pair value with the accumulator: sum value, and increment count\n", 306 | " (lambda acc1, acc2: (acc1[0]+acc2[0], acc1[1]+acc2[1])) # combine accumulators\n", 307 | ")\n", 308 | "\n", 309 | "sum_counts.collectAsMap()" 310 | ], 311 | "language": "python", 312 | "metadata": {}, 313 | "outputs": [ 314 | { 315 | "metadata": {}, 316 | "output_type": "pyout", 317 | "prompt_number": 6, 318 | "text": [ 319 | "{u'back.': (284.0, 2203),\n", 320 | " u'buffer_overflow.': (2751.0, 30),\n", 321 | " u'ftp_write.': (259.0, 8),\n", 322 | " u'guess_passwd.': (144.0, 53),\n", 323 | " u'imap.': (72.0, 12),\n", 324 | " u'ipsweep.': (43.0, 1247),\n", 325 | " u'land.': (0.0, 21),\n", 326 | " u'loadmodule.': (326.0, 9),\n", 327 | " u'multihop.': (1288.0, 7),\n", 328 | " u'neptune.': (0.0, 107201),\n", 329 | " u'nmap.': (0.0, 231),\n", 330 | " u'normal.': (21075991.0, 97278),\n", 331 | " u'perl.': (124.0, 3),\n", 332 | " u'phf.': (18.0, 4),\n", 333 | " u'pod.': (0.0, 264),\n", 334 | " u'portsweep.': (1991911.0, 1040),\n", 335 | " u'rootkit.': (1008.0, 10),\n", 336 | " u'satan.': (64.0, 1589),\n", 337 | " u'smurf.': (0.0, 280790),\n", 338 | " u'spy.': (636.0, 2),\n", 339 | " u'teardrop.': (0.0, 979),\n", 340 | " u'warezclient.': (627563.0, 1020),\n", 341 | " u'warezmaster.': (301.0, 20)}" 342 | ] 343 | } 344 | ], 345 | "prompt_number": 6 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "We can see that the arguments are pretty similar to those passed to `aggregate` in the previous notebook. The result associated to each type is in the form of a pair. If we want to actually get the averages, we need to do the division before collecting the results. " 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "collapsed": false, 357 | "input": [ 358 | "duration_means_by_type = sum_counts.map(lambda (key,value): (key, round(value[0]/value[1],3))).collectAsMap()\n", 359 | "\n", 360 | "# Print them sorted\n", 361 | "for tag in sorted(duration_means_by_type, key=duration_means_by_type.get, reverse=True):\n", 362 | " print tag, duration_means_by_type[tag]" 363 | ], 364 | "language": "python", 365 | "metadata": {}, 366 | "outputs": [ 367 | { 368 | "output_type": "stream", 369 | "stream": "stdout", 370 | "text": [ 371 | "portsweep. 1915.299\n", 372 | "warezclient. 615.258\n", 373 | "spy. 318.0\n", 374 | "normal. 216.657\n", 375 | "multihop. 184.0\n", 376 | "rootkit. 100.8\n", 377 | "buffer_overflow. 91.7\n", 378 | "perl. 41.333\n", 379 | "loadmodule. 36.222\n", 380 | "ftp_write. 32.375\n", 381 | "warezmaster. 15.05\n", 382 | "imap. 6.0\n", 383 | "phf. 4.5\n", 384 | "guess_passwd. 2.717\n", 385 | "back. 0.129\n", 386 | "satan. 0.04\n", 387 | "ipsweep. 0.034\n", 388 | "nmap. 0.0\n", 389 | "smurf. 0.0\n", 390 | "pod. 0.0\n", 391 | "neptune. 0.0\n", 392 | "teardrop. 0.0\n", 393 | "land. 0.0\n" 394 | ] 395 | } 396 | ], 397 | "prompt_number": 7 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": {}, 402 | "source": [ 403 | "A small step into understanding what makes a network interaction be considered an attack." 404 | ] 405 | } 406 | ], 407 | "metadata": {} 408 | } 409 | ] 410 | } -------------------------------------------------------------------------------- /4. 
Spark/LICENSE: -------------------------------------------------------------------------------- 1 | This repository contains a variety of content; some developed by Jose A. Dianes, and some from third-parties. The third-party content is distributed under the license provided by those parties. 2 | 3 | The content developed by Jose A. Dianes is distributed under the following license: 4 | 5 | Copyright 2016 Jose A Dianes 6 | 7 | Licensed under the Apache License, Version 2.0 (the "License"); 8 | you may not use this file except in compliance with the License. 9 | You may obtain a copy of the License at 10 | 11 | http://www.apache.org/licenses/LICENSE-2.0 12 | 13 | Unless required by applicable law or agreed to in writing, software 14 | distributed under the License is distributed on an "AS IS" BASIS, 15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | See the License for the specific language governing permissions and 17 | limitations under the License. 18 | -------------------------------------------------------------------------------- /4. Spark/README.md: -------------------------------------------------------------------------------- 1 | # Spark Python Notebooks 2 | 3 | [![Join the chat at https://gitter.im/jadianes/spark-py-notebooks](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/jadianes/spark-py-notebooks?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 4 | 5 | This is a collection of [IPython notebook](http://ipython.org/notebook.html)/[Jupyter](https://jupyter.org/) 6 | notebooks intended to train the reader on different [Apache Spark](http://spark.apache.org/) concepts, from 7 | basic to advanced, by using the **Python** language. 8 | 9 | If Python is not your language, and it is R, you may want to have a look at our [R on Apache Spark (SparkR) notebooks](https://github.com/jadianes/spark-r-notebooks) instead. Additionally, if your are interested in being introduced to some basic Data Science 10 | Engineering, you might find [these series of tutorials](https://github.com/jadianes/data-science-your-way) 11 | interesting. There we explain different concepts and applications 12 | using Python and R. 13 | 14 | ## Instructions 15 | 16 | A good way of using these notebooks is by first cloning the repo, and then 17 | starting your own [IPython notebook](http://ipython.org/notebook.html)/[Jupyter](https://jupyter.org/) in 18 | **pySpark mode**. For example, if we have a *standalone* Spark installation 19 | running in our `localhost` with a maximum of 6Gb per node assigned to IPython: 20 | 21 | MASTER="spark://127.0.0.1:7077" SPARK_EXECUTOR_MEMORY="6G" IPYTHON_OPTS="notebook --pylab inline" ~/spark-1.5.0-bin-hadoop2.6/bin/pyspark 22 | 23 | Notice that the path to the `pyspark` command will depend on your specific 24 | installation. So as requirement, you need to have 25 | [Spark installed](https://spark.apache.org/docs/latest/index.html) in 26 | the same machine you are going to start the `IPython notebook` server. 27 | 28 | For more Spark options see [here](https://spark.apache.org/docs/latest/spark-standalone.html). In general it works the rule of passing options 29 | described in the form `spark.executor.memory` as `SPARK_EXECUTOR_MEMORY` when 30 | calling IPython/pySpark. 31 | 32 | ## Datasets 33 | 34 | We will be using datasets from the [KDD Cup 1999](http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html). The results 35 | of this competition can be found [here](http://cseweb.ucsd.edu/~elkan/clresults.html). 
36 | 37 | ## References 38 | 39 | The reference book for these and other Spark related topics is: 40 | 41 | - *Learning Spark* by Holden Karau, Andy Konwinski, Patrick Wendell, and Matei Zaharia. 42 | 43 | ## Notebooks 44 | 45 | The following notebooks can be examined individually, although there is a more 46 | or less linear 'story' when followed in sequence. By using the same dataset 47 | they try to solve a related set of tasks with it. 48 | 49 | ### [RDD creation](https://github.com/jadianes/spark-py-notebooks/blob/master/nb1-rdd-creation/nb1-rdd-creation.ipynb) 50 | 51 | About reading files and parallelize. 52 | 53 | ### [RDDs basics](https://github.com/jadianes/spark-py-notebooks/blob/master/nb2-rdd-basics/nb2-rdd-basics.ipynb) 54 | 55 | A look at `map`, `filter`, and `collect`. 56 | 57 | ### [Sampling RDDs](https://github.com/jadianes/spark-py-notebooks/blob/master/nb3-rdd-sampling/nb3-rdd-sampling.ipynb) 58 | 59 | RDD sampling methods explained. 60 | 61 | ### [RDD set operations](https://github.com/jadianes/spark-py-notebooks/blob/master/nb4-rdd-set/nb4-rdd-set.ipynb) 62 | 63 | Brief introduction to some of the RDD pseudo-set operations. 64 | 65 | ### [Data aggregations on RDDs](https://github.com/jadianes/spark-py-notebooks/blob/master/nb5-rdd-aggregations/nb5-rdd-aggregations.ipynb) 66 | 67 | RDD actions `reduce`, `fold`, and `aggregate`. 68 | 69 | ### [Working with key/value pair RDDs](https://github.com/jadianes/spark-py-notebooks/blob/master/nb6-rdd-key-value/nb6-rdd-key-value.ipynb) 70 | 71 | How to deal with key/value pairs in order to aggregate and explore data. 72 | 73 | ### [MLlib: Basic Statistics and Exploratory Data Analysis](https://github.com/jadianes/spark-py-notebooks/blob/master/nb7-mllib-statistics/nb7-mllib-statistics.ipynb) 74 | 75 | A notebook introducing Local Vector types, basic statistics 76 | in MLlib for Exploratory Data Analysis and model selection. 77 | 78 | ### [MLlib: Logistic Regression](https://github.com/jadianes/spark-py-notebooks/blob/master/nb8-mllib-logit/nb8-mllib-logit.ipynb) 79 | 80 | Labeled points and Logistic Regression classification of network attacks in MLlib. 81 | Application of model selection techniques using correlation matrix and Hypothesis Testing. 82 | 83 | ### [MLlib: Decision Trees](https://github.com/jadianes/spark-py-notebooks/blob/master/nb9-mllib-trees/nb9-mllib-trees.ipynb) 84 | 85 | Use of tree-based methods and how they help explaining models and 86 | feature selection. 87 | 88 | ### [Spark SQL: structured processing for Data Analysis](https://github.com/jadianes/spark-py-notebooks/blob/master/nb10-sql-dataframes/nb10-sql-dataframes.ipynb) 89 | 90 | In this notebook a schema is inferred for our network interactions dataset. Based on that, we use 91 | Spark's SQL `DataFrame` abstraction to perform a more structured exploratory data analysis. 92 | 93 | 94 | ## Applications 95 | 96 | Beyond the basics. Close to real-world applications using Spark and other technologies. 97 | 98 | ### [Olssen: On-line Spectral Search ENgine for proteomics](https://github.com/jadianes/olssen) 99 | 100 | Same tech stack this time with an AngularJS client app. 101 | 102 | ### [An on-line movie recommendation web service](https://github.com/jadianes/spark-movie-lens) 103 | 104 | This tutorial can be used independently to build a movie recommender model based on the MovieLens dataset. 
Most of the code in the first part, about how to use ALS with the public MovieLens dataset, comes from my solution to one of the exercises proposed in the [CS100.1x Introduction to Big Data with Apache Spark by Anthony D. Joseph on edX](https://www.edx.org/course/introduction-big-data-apache-spark-uc-berkeleyx-cs100-1x), that is also [**publicly available since 2014 at Spark Summit**](https://databricks-training.s3.amazonaws.com/movie-recommendation-with-mllib.html). 105 | 106 | There I've added with minor modifications to use a larger dataset and also code about how to store and reload the model for later use. On top of that we build a Flask web service so the recommender can be use to provide movie recommendations on-line. 107 | 108 | ### [KDD Cup 1999](https://github.com/jadianes/kdd-cup-99-spark) 109 | 110 | My try using Spark with this classic dataset and Knowledge Discovery competition. 111 | 112 | ## Contributing 113 | 114 | Contributions are welcome! For bug reports or requests please [submit an issue](https://github.com/jadianes/spark-py-notebooks/issues). 115 | 116 | ## Contact 117 | 118 | Feel free to contact me to discuss any issues, questions, or comments. 119 | 120 | * Twitter: [@ja_dianes](https://twitter.com/ja_dianes) 121 | * GitHub: [jadianes](https://github.com/jadianes) 122 | * LinkedIn: [jadianes](https://www.linkedin.com/in/jadianes) 123 | * Website: [jadianes.me](http://jadianes.me) 124 | 125 | ## License 126 | 127 | This repository contains a variety of content; some developed by Jose A. Dianes, and some from third-parties. The third-party content is distributed under the license provided by those parties. 128 | 129 | The content developed by Jose A. Dianes is distributed under the following license: 130 | 131 | Copyright 2016 Jose A Dianes 132 | 133 | Licensed under the Apache License, Version 2.0 (the "License"); 134 | you may not use this file except in compliance with the License. 135 | You may obtain a copy of the License at 136 | 137 | http://www.apache.org/licenses/LICENSE-2.0 138 | 139 | Unless required by applicable law or agreed to in writing, software 140 | distributed under the License is distributed on an "AS IS" BASIS, 141 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 142 | See the License for the specific language governing permissions and 143 | limitations under the License. 144 | -------------------------------------------------------------------------------- /5. Machine Learning/5. ML 0. 
Install requirements.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "cells": [ 5 | { 6 | "source": "%%bash\nrm -r -f addutils\ngit clone https://github.com/analytics-bootcamp/addutils.git\ncd ./addutils\n\npython setup.py install --user", 7 | "execution_count": null, 8 | "cell_type": "code", 9 | "metadata": { 10 | "collapsed": true 11 | }, 12 | "outputs": [] 13 | }, 14 | { 15 | "source": "!pip install seaborn", 16 | "execution_count": null, 17 | "cell_type": "code", 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [] 22 | } 23 | ], 24 | "metadata": { 25 | "language_info": { 26 | "nbconvert_exporter": "python", 27 | "mimetype": "text/x-python", 28 | "pygments_lexer": "ipython2", 29 | "version": "2.7.11", 30 | "file_extension": ".py", 31 | "name": "python", 32 | "codemirror_mode": { 33 | "name": "ipython", 34 | "version": 2 35 | } 36 | }, 37 | "kernelspec": { 38 | "name": "python2", 39 | "display_name": "Python 2 with Spark 1.6", 40 | "language": "python" 41 | } 42 | } 43 | } -------------------------------------------------------------------------------- /5. Machine Learning/5. ML 1. Introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat_minor": 0, 3 | "metadata": { 4 | "language_info": { 5 | "pygments_lexer": "ipython3", 6 | "name": "python", 7 | "file_extension": ".py", 8 | "codemirror_mode": { 9 | "name": "ipython", 10 | "version": 3 11 | }, 12 | "mimetype": "text/x-python", 13 | "nbconvert_exporter": "python", 14 | "version": "3.5.2" 15 | }, 16 | "kernelspec": { 17 | "language": "python", 18 | "name": "python3", 19 | "display_name": "Python 3.5 (Experimental) with Spark 1.6" 20 | } 21 | }, 22 | "nbformat": 4, 23 | "cells": [ 24 | { 25 | "source": "# Definitions and Advices\n\nAdopted from https://github.com/addfor/tutorials", 26 | "metadata": {}, 27 | "cell_type": "markdown" 28 | }, 29 | { 30 | "source": "## 1 What is Machine Learning ?", 31 | "metadata": {}, 32 | "cell_type": "markdown" 33 | }, 34 | { 35 | "source": "Today we see an explosion of applications that are wide and connected with an emphasis on storage and processing. *Most companies are storing a lot of data but not solving the problem of what to do with it*. Yet most of the information is stored in raw form: There a huge amound of information locked-up in databases: information that is potentially important but has not yet been discovered. The objective of these tutorials is to show the foundamental techniques to **Discover Meaningful Information in Data**.\n\n**Data Mining** is the extraction of implicit, previously unknown and potentially useful information from data.\n\n**Machine Learning (ML)** and **Deep Learning (DL)** provide the technical basis of Data Mining. 
**ML** is about building programs with **tunable parameters** (typically an array of floating point values) that are adjusted automatically so as to improve their behavior by **adapting to previously seen data.**\n\n**DL** is about modeling high-level abstractions in data by using model architectures composed of **multiple non-linear transformations.**\n\n**ML and DL** can be considered a subfield of **Artificial Intelligence (AI)** since those algorithms can be seen as building blocks to make computers learn to behave more intelligently by somehow **generalizing** rather that just storing and retrieving data items like a database system would do.\n\nMost of the examples of these tutorials are taken from the [scikit-learn documentation](http://scikit-learn.org/stable/index.html): check the original documentation for further information.", 36 | "metadata": {}, 37 | "cell_type": "markdown" 38 | }, 39 | { 40 | "source": "### 1.1 Documentation and reference:", 41 | "metadata": {}, 42 | "cell_type": "markdown" 43 | }, 44 | { 45 | "source": "* [Numpy Reference guide](http://docs.scipy.org/doc/numpy/reference/)\n* [SciPy Reference](http://docs.scipy.org/doc/scipy/reference/)\n* [scikit-learn User Guide](http://scikit-learn.org/stable/user_guide.html)\n* [scikit-learn External Resources](http://scikit-learn.org/stable/presentations.html)", 46 | "metadata": {}, 47 | "cell_type": "markdown" 48 | }, 49 | { 50 | "source": "## 2 Supervised and Unsupervised Learning", 51 | "metadata": {}, 52 | "cell_type": "markdown" 53 | }, 54 | { 55 | "source": "In general, a learning problem uses a set of n data samples to predict properties of unknown data. Usually data are organized in tables where rows (first axis) represent the **samples** (or **instances**) and colums represent **attributes** (or **features**), for Supervised Learning, another array of **classes** or **target variables** is provided.\n\nWe can separate learning problems in a few large categories:\n\nIn **SUPERVISED LEARNING**, we have a dataset consisting of both features and labels. 
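For instance, a minimal illustrative sketch of this setting with scikit-learn (covered in detail in later notebooks of this series) fits a classifier on the iris measurements and their species labels, the same example used in the classification bullet below:

```python
# Minimal illustration of supervised learning (assumes scikit-learn is installed)
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()                        # rows = samples, columns = features, plus target labels
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(iris.data, iris.target)           # learn from the labeled examples
print(clf.predict(iris.data[:5]))         # predict species labels for the first five samples
```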
The task is to construct an estimator which is able to predict the label of an object given the set of features.\n\n* We have a **CLASSIFICATION** task when the **target variable is nominal (discrete)** - examples:\n\n * predicting the species of iris given a set of measurements of its flower\n * given a multicolor image of an object through a telescope, determine whether that object is a star, a quasar, or a galaxy.\n\n* We have a **REGRESSION** task when the **target variable is continuous** - examples:\n\n * given a set of attributes, determine the selling price of an house\n\n\nIn **UNSUPERVISED LEARNING** the data has no labels, and we are interested in finding similarities between the samples.\n\nUnsupervised learning comprises tasks such as *dimensionality reduction*, *clustering*, and *density estimation*.\nSome unsupervised learning problems are:\n\n* **CLUSTERING** is the task that **group similar items together** - examples:\n\n * given observations of distant galaxies, determine which features are most important in distinguishing between them.\n\n* **DENSITY ESTIMATION** is a task were we want to **find statistical values that describe the data**\n\n* **DIMENSIONALITY REDUCTION** is for **reduce the number of the features while keeping most of the information**\n\n**UNSUPERVISED / SUPERVISED LEARNING** in DL usally the two approach are combined, in fact the DL layers (Restricted Boltzmann Machines, Autoencoders, Convolutional Neural Networks) are used to learn the most significative features of the data. Those features are then used with standard ML regressors or classificators.", 56 | "metadata": {}, 57 | "cell_type": "markdown" 58 | }, 59 | { 60 | "source": "## 3 Cheat Sheet - scikit-learn Algorithm Selection", 61 | "metadata": {}, 62 | "cell_type": "markdown" 63 | }, 64 | { 65 | "source": "ML is a general concept involving a huge number of algorithms. This is a tentative Cheat Sheet to help finding the correct approach.\n\nBasically, the principle is to start simple first. If this doesn't work out, try something more complicated.\n\nRed Links point to algorithms NOT included in scikit-learn.\n\nTo make any of the algorithms actually work, you need to do the right preprocessing.\n\nGenerally every ML algorithm needs a minimum number of samples. All the methods listed below are applicable to datasets with at least 50 samples. 
For tasks involving less than 50 samples most of the following methods are not suitable:", 66 | "metadata": {}, 67 | "cell_type": "markdown" 68 | }, 69 | { 70 | "source": "* **To predict a QUANTITY:**\n\n * **Regression:** these methods give back a numerical outcome.\n\n * **LESS than 100k samples with all features important (dense data):**\n\n * TRY: [Ridge Regression](http://scikit-learn.org/stable/modules/linear_model.html#ridge-regression) *(see Generalized Linear Models)*\n\n * If Ridge Regression doesn't work, TRY: [Support Vector Regression (svm.SVR)](http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html) with *linear kernel* *(see Support Vector Machines)*\n\n * If SVR with *linear kernel* doesn't work, TRY: [Support Vector Regression (svm.SVR)](http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html) with *rbf kernel* *(see Support Vector Machines)*\n * If none of the above methods work, USE: [Ensemble Regressors](http://scikit-learn.org/stable/modules/ensemble.html) *(RF, Extremely Randomized Trees, GBRT)* *(see Support Ensemble Methods)*\n\n * **LESS than 100k samples with few features important (sparse data):**\n\n * USE: [Elastic Net Lasso](http://scikit-learn.org/stable/modules/linear_model.html#elastic-net) *(see Generalized Linear Models)*\n\n * **MORE than 100k samples:**\n * USE: [SGD Regressor](http://scikit-learn.org/stable/modules/sgd.html#regression) *(see Stochastic Gradient Descent)*\n\n * **Alternatively, for every problem size:**\n\n * CALL US

\n\n * **Dimensionality Reduction (NOT for predicting the structure of the data):** these methods are suitable for data visualization and human interpretation.\n\n * TRY: [RandomizedPCA](http://scikit-learn.org/stable/modules/decomposition.html#principal-component-analysis-pca)\n\n * If RandomizedPCA dont work, and you have *LESS than 10k samples*, USE: [t-distributed Stochastic Neighbor Embedding (t-SNE)](http://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE)\n\n * If RandomizedPCA dont work, and you have *MORE than 10k samples*, CALL US: most probably you need a more efficient version of t-SNE\n\n * **For Prediction of multivariate or structured outputs:**\n\n * TRY: [SVM struct](http://www.cs.cornell.edu/people/tj/svm_light/svm_struct.html). This algorithm is free for non-commercial use\n\n * TRY: [pystruct](https://github.com/amueller/pystruct). (Under development)", 71 | "metadata": {}, 72 | "cell_type": "markdown" 73 | }, 74 | { 75 | "source": "* **Predict a CATEGORY for LABELED Data (Classification):**\n\n * **LESS than 100k samples**, TRY: [Linear SVC](http://scikit-learn.org/stable/modules/svm.html#svc)\n\n * If Linear SVC dont work, and you have *NUMERICAL DATA*, TRY: [KNeighborsClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)\n\n * If KNeighborsClassifier doesn't work, USE: [SVC](http://scikit-learn.org/stable/modules/svm.html#svc)\n\n * If SVC doesn't work, USE: [Ensemble Classifiers](http://scikit-learn.org/stable/modules/ensemble.html) *(RF, Extremely Randomized Trees, GBRT)*\n\n * If Linear SVC dont work, and you have *TEXTUAL DATA*, USE: [Naive Bayes](http://scikit-learn.org/stable/modules/naive_bayes.html)\n\n * **MORE than 100k samples**, TRY: [SGD Classifier](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html)\n\n * If SGD Classifier dont work, , USE: [Kernel Approximation](http://scikit-learn.org/stable/modules/kernel_approximation.html)", 76 | "metadata": {}, 77 | "cell_type": "markdown" 78 | }, 79 | { 80 | "source": "* **Predict a CATEGORY for UNLABELED Data (Clustering):**\n\n * **LESS than 10k samples and KNOWN number of categories**\n * USE: [Mini Batch K-Means](http://scikit-learn.org/stable/modules/clustering.html#mini-batch-k-means)\n\n * **MORE than 10k samples and KNOWN number of categories**\n * TRY: [K-Means Clustering](http://scikit-learn.org/stable/tutorial/statistical_inference/unsupervised_learning.html#k-means-clustering)\n\n * If K-Means Clustering doesn't work, TRY: [Gaussian Mixture Models](http://scikit-learn.org/stable/modules/mixture.html#gmm-classifier)\n\n * If GMM doesn't work, USE: [Spectral Clustering](http://scikit-learn.org/stable/modules/clustering.html#spectral-clustering)\n\n * **LESS than 10k samples and UNKNOWN number of categories**\n * TRY: [Mean Shift](http://scikit-learn.org/stable/modules/clustering.html#mean-shift)\n\n * If Mean Shift doesn't work, USE: [Variational Gaussian Mixtures](http://scikit-learn.org/stable/modules/mixture.html#vbgmm-classifier-variational-gaussian-mixtures)", 81 | "metadata": {}, 82 | "cell_type": "markdown" 83 | }, 84 | { 85 | "source": "## 4 Machine Learning Wisdom", 86 | "metadata": {}, 87 | "cell_type": "markdown" 88 | }, 89 | { 90 | "source": "Here is a list of things to take in great consideration while developing ML systems:\n\n1. **No Free Lunch:** A wide variety of techniques exist for modeling. 
An important theorem in statistical machine learning essentially states that no one technique will outperform all other techniques on all problems *(Wolpert & MacReady, 1997)*. This theorem is sometimes referred to as *No Free Lunch*. Often, a modeling group will specialize in one particular technique, and will tout that technique as the being intrinsically superior to others. Such a claim should be regarded with extreme suspicion. Furthermore, the field of statistical machine learning is evolving rapidly, and new algorithms are developed at a regular pace, this determines a very fast aging for ML approaches. This is the reason why in Addfor we rely on Open Source, Lean and Data-Driven Development and **Combinatorial Innovation**.\n\n2. **Beware of False Predictors:** In selecting input variables for a model, one must be careful not to include false predictors. A false predictor is a variable that is strongly correlated with the output class, but that is not available in a realistic prediction scenario. This step is stricktly data-dependent and can be accomplished by paying attention to the choice of the validation dataset. **Correlation does not imply causation:** ice-cream sales is a strong predictor for drowning deaths.\n\n3. **Mind Data Balancing:** Always check if your algorithm is suitable to handle Data Asymmetricity.\n\n4. **Correctly Define Output Classes:** If the model's task is to predict a system failure, it seems natural for the output classes to be \"fail\" and \"not fail\". However, characterizing the exact conditions under which failure occurs is not straightforward. For example two failures for different reasons could represent very different classes.\n\n5. **Data Preparation:** this is maybe the most important task in any predictive algorithm, we dedicate a whole notebook to it! \n\n6. **Model Selection:** Any modeling technique can be used to construct of a continuum of models, from simple to complex. One of the key issues in modeling is model selection, which involves picking the appropriate level of complexity for a model given a data set. Although model selection methods can be automated to some degree, model selection cannot be avoided. If someone claims otherwise, or does not emphasize their expertise in model selection, one should be suspicious of his abilities.\n\n7. **Segmentation:** Often, a data set can be broken into several smaller, more homogenous data sets, which is referred to as segmentation. For example, a customer data base might be split into business and residential customers. Although domain experts can readily propose segmentations, enforcing a segmentation suggested by domain experts is generally not the most prudent approach to modeling, because the data itself provides clues to how the segmentation should be performed. Consequently, one should be concerned if a modeler claims to utilize a priori segmentation.\n\n8. **Model Evaluation:** Once a model has been built, the natural question to ask is how accurate it is. Here we describe common sorts of deception that can occur in assessing and evaluating a model:\n\n a) *Failing to use an independent test set:* To obtain a fair estimate of performance, the model must be evaluated on examples that were not contained in the training set. 
The available data must be split into nonoverlapping subsets, with the test set reserved only for evaluation.\n\n b) *Assuming stationarity of the test environment:* For many difficult problems, a model built based on historical data will become a poorer and poorer predictor as time goes on, because the environment is nonstationary--the rules and behaviors of individuals change over time. Consequently, the best measure of a model's true performance will be obtained if it is tested on data from a different point in time relative to the training data.\n\n c) *Incomplete reports of results:* An accurate model will correctly discriminate examples of one output class from examples of another output class. Discrimination performance is best reported with an ROC curve, a lift curve, or a precision-recall curve. Any report of accuracy using only a single number is suspect.\n\n d) *Filtering data to bias results:* In a large data set, one segment of the population may be easier to predict than another. If a model is trained and tested just on this segment of the population, it will be more accurate than a model that must handle the entire population. Selective filtering can turn a hard problem into an easier problem.\n\n e) *Selective sampling of test cases:* A fair evaluation of a model will utilize a test set that is drawn from the same population as the model will eventually encounter in actual usage.\n\n f) *Failing to assess statistical reliability:* When comparing the accuracy of two models, it is not sufficient to report that one model performed better than the other, because the difference might not be statistically reliable. \"Statistical reliability\" means, among other things, that if the comparison were repeated using a different sample of the population, the same result would be achieved.", 91 | "metadata": {}, 92 | "cell_type": "markdown" 93 | }, 94 | { 95 | "source": "\nThis work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.", 96 | "metadata": {}, 97 | "cell_type": "markdown" 98 | } 99 | ] 100 | } -------------------------------------------------------------------------------- /5. Machine Learning/Scikit_Learn_Cheat_Sheet_Python.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/5. Machine Learning/Scikit_Learn_Cheat_Sheet_Python.pdf -------------------------------------------------------------------------------- /6. Deep Learning/Keras.js Demos.url: -------------------------------------------------------------------------------- 1 | [InternetShortcut] 2 | URL=https://transcranial.github.io/keras-js/#/mnist-cnn 3 | -------------------------------------------------------------------------------- /6. Deep Learning/Keras_Cheat_Sheet_Python.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/6. Deep Learning/Keras_Cheat_Sheet_Python.pdf -------------------------------------------------------------------------------- /7. Misc/bias and variance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/7. Misc/bias and variance.png -------------------------------------------------------------------------------- /7. 
Misc/biasvariance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import bokeh.plotting as bk 3 | 4 | def test_func(x, err=0.5): 5 | return np.random.normal(10 - 1. / (x + 0.1), err) 6 | 7 | def compute_error(x, y, p): 8 | yfit = np.polyval(p, x) 9 | return np.sqrt(np.mean((y - yfit) ** 2)) 10 | 11 | def plot_bias_variance(N=8, random_seed=42, err=0.5): 12 | np.random.seed(random_seed) 13 | x = 10 ** np.linspace(-2, 0, N) 14 | y = test_func(x) 15 | xfit = np.linspace(-0.2, 1.2, 1000) 16 | titles = ['d = 1 (under-fit; high bias)', 17 | 'd = 2', 18 | 'd = 6 (over-fit; high variance)'] 19 | degrees = [1, 2, 6] 20 | 21 | row = [] 22 | for i, d in enumerate(degrees): 23 | fig = bk.figure(plot_width=240, plot_height=240, 24 | title=titles[i], x_range=(-0.2, 1.2), y_range=(0, 12)) 25 | fig.title.text_font_size = '11pt' 26 | fig.xaxis.axis_label_text_font_size = '9pt' 27 | fig.yaxis.axis_label_text_font_size = '9pt' 28 | fig.x(x, y, color='black', size=12) 29 | 30 | p = np.polyfit(x, y, d) 31 | yfit = np.polyval(p, xfit) 32 | fig.line(xfit, yfit, line_color='blue') 33 | 34 | fig.xaxis.axis_label = 'house size' 35 | fig.xaxis.axis_label_text_font_size = '9pt' 36 | if i == 0: 37 | fig.yaxis.axis_label = 'price' 38 | row.append(fig) 39 | 40 | gp = bk.gridplot([row], border_space=0) 41 | bk.show(gp) 42 | 43 | -------------------------------------------------------------------------------- /7. Misc/ensemble_explore_hastie.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/7. Misc/ensemble_explore_hastie.png -------------------------------------------------------------------------------- /7. 
Misc/international-airline-passengers.csv: -------------------------------------------------------------------------------- 1 | Month,Passengers 2 | 1949-01,112 3 | 1949-02,118 4 | 1949-03,132 5 | 1949-04,129 6 | 1949-05,121 7 | 1949-06,135 8 | 1949-07,148 9 | 1949-08,148 10 | 1949-09,136 11 | 1949-10,119 12 | 1949-11,104 13 | 1949-12,118 14 | 1950-01,115 15 | 1950-02,126 16 | 1950-03,141 17 | 1950-04,135 18 | 1950-05,125 19 | 1950-06,149 20 | 1950-07,170 21 | 1950-08,170 22 | 1950-09,158 23 | 1950-10,133 24 | 1950-11,114 25 | 1950-12,140 26 | 1951-01,145 27 | 1951-02,150 28 | 1951-03,178 29 | 1951-04,163 30 | 1951-05,172 31 | 1951-06,178 32 | 1951-07,199 33 | 1951-08,199 34 | 1951-09,184 35 | 1951-10,162 36 | 1951-11,146 37 | 1951-12,166 38 | 1952-01,171 39 | 1952-02,180 40 | 1952-03,193 41 | 1952-04,181 42 | 1952-05,183 43 | 1952-06,218 44 | 1952-07,230 45 | 1952-08,242 46 | 1952-09,209 47 | 1952-10,191 48 | 1952-11,172 49 | 1952-12,194 50 | 1953-01,196 51 | 1953-02,196 52 | 1953-03,236 53 | 1953-04,235 54 | 1953-05,229 55 | 1953-06,243 56 | 1953-07,264 57 | 1953-08,272 58 | 1953-09,237 59 | 1953-10,211 60 | 1953-11,180 61 | 1953-12,201 62 | 1954-01,204 63 | 1954-02,188 64 | 1954-03,235 65 | 1954-04,227 66 | 1954-05,234 67 | 1954-06,264 68 | 1954-07,302 69 | 1954-08,293 70 | 1954-09,259 71 | 1954-10,229 72 | 1954-11,203 73 | 1954-12,229 74 | 1955-01,242 75 | 1955-02,233 76 | 1955-03,267 77 | 1955-04,269 78 | 1955-05,270 79 | 1955-06,315 80 | 1955-07,364 81 | 1955-08,347 82 | 1955-09,312 83 | 1955-10,274 84 | 1955-11,237 85 | 1955-12,278 86 | 1956-01,284 87 | 1956-02,277 88 | 1956-03,317 89 | 1956-04,313 90 | 1956-05,318 91 | 1956-06,374 92 | 1956-07,413 93 | 1956-08,405 94 | 1956-09,355 95 | 1956-10,306 96 | 1956-11,271 97 | 1956-12,306 98 | 1957-01,315 99 | 1957-02,301 100 | 1957-03,356 101 | 1957-04,348 102 | 1957-05,355 103 | 1957-06,422 104 | 1957-07,465 105 | 1957-08,467 106 | 1957-09,404 107 | 1957-10,347 108 | 1957-11,305 109 | 1957-12,336 110 | 1958-01,340 111 | 1958-02,318 112 | 1958-03,362 113 | 1958-04,348 114 | 1958-05,363 115 | 1958-06,435 116 | 1958-07,491 117 | 1958-08,505 118 | 1958-09,404 119 | 1958-10,359 120 | 1958-11,310 121 | 1958-12,337 122 | 1959-01,360 123 | 1959-02,342 124 | 1959-03,406 125 | 1959-04,396 126 | 1959-05,420 127 | 1959-06,472 128 | 1959-07,548 129 | 1959-08,559 130 | 1959-09,463 131 | 1959-10,407 132 | 1959-11,362 133 | 1959-12,405 134 | 1960-01,417 135 | 1960-02,391 136 | 1960-03,419 137 | 1960-04,461 138 | 1960-05,472 139 | 1960-06,535 140 | 1960-07,622 141 | 1960-08,606 142 | 1960-09,508 143 | 1960-10,461 144 | 1960-11,390 145 | 1960-12,432 -------------------------------------------------------------------------------- /7. Misc/learning_curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/7. Misc/learning_curves.png -------------------------------------------------------------------------------- /7. Misc/matlab_test_data_01.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/7. Misc/matlab_test_data_01.mat -------------------------------------------------------------------------------- /7. 
Misc/moon phases.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/7. Misc/moon phases.xlsx -------------------------------------------------------------------------------- /7. Misc/tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/7. Misc/tree.png -------------------------------------------------------------------------------- /7. Misc/weather.txt: -------------------------------------------------------------------------------- 1 | How hot is it today?,temperature 2 | Is it hot outside?,temperature 3 | Will it be uncomfortably hot?,temperature 4 | Will it be sweltering?,temperature 5 | How cold is it today?,temperature 6 | Is it cold outside?,temperature 7 | Will it be uncomfortably cold?,temperature 8 | Will it be frigid?,temperature 9 | What is the expected high for today?,temperature 10 | What is the expected temperature?,temperature 11 | Will high temperatures be dangerous?,temperature 12 | Is it dangerously cold?,temperature 13 | When will the heat subside?,temperature 14 | Is it hot?,temperature 15 | Is it cold?,temperature 16 | How cold is it now?,temperature 17 | Will we have a cold day today?,temperature 18 | When will the cold subside?,temperature 19 | What highs are we expecting?,temperature 20 | What lows are we expecting?,temperature 21 | Is it warm?,temperature 22 | Is it chilly?,temperature 23 | What's the current temp in Celsius?,temperature 24 | What is the temperature in Fahrenheit?,temperature 25 | Is it windy?,conditions 26 | Will it rain today?,conditions 27 | What are the chances for rain?,conditions 28 | Will we get snow?,conditions 29 | Are we expecting sunny conditions?,conditions 30 | Is it overcast?,conditions 31 | Will it be cloudy?,conditions 32 | How much rain will fall today?,conditions 33 | How much snow are we expecting?,conditions 34 | Is it windy outside?,conditions 35 | How much snow do we expect?,conditions 36 | Is the forecast calling for snow today?,conditions 37 | Will we see some sun?,conditions 38 | When will the rain subside?,conditions 39 | Is it cloudy?,conditions 40 | Is it sunny now?,conditions 41 | Will it rain?,conditions 42 | Will we have much snow?,conditions 43 | Are the winds dangerous?,conditions 44 | What is the expected snowfall today?,conditions 45 | Will it be dry?,conditions 46 | Will it be breezy?,conditions 47 | Will it be humid?,conditions 48 | What is today's expected humidity?,conditions 49 | Will the blizzard hit us?,conditions 50 | Is it drizzling?,conditions -------------------------------------------------------------------------------- /Class intro.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/Class intro.pptx -------------------------------------------------------------------------------- /Putting data to work.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bootcampanalytics/Training-material/0d6573b07593fe3888a9efcb1280342c4bed4d07/Putting data to work.pptx -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # Training-material 2 | Bootcamp material for IBM DSX training April 3-5 in Singapore 3 | 4 | Start with Class intro.pptx 5 | -------------------------------------------------------------------------------- /agenda voting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 14, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import requests\n", 12 | "\n", 13 | "score=requests.get(\"https://codeshare.io/2pAwm4\").content\n", 14 | "res=score.split(\"xxx\")[1].split(\"\\\\n\")" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 71, 20 | "metadata": { 21 | "collapsed": false, 22 | "scrolled": true 23 | }, 24 | "outputs": [ 25 | { 26 | "data": { 27 | "text/plain": [ 28 | "[('0. Basics', 1.0),\n", 29 | " ('1. In and Export', 1.1),\n", 30 | " ('2. Watson APIs', 2.0),\n", 31 | " ('3. Visualization', 2.0),\n", 32 | " ('4. Spark', 2.4),\n", 33 | " ('5. Machine Learning', 2.3),\n", 34 | " ('6. Training - Deep Learning', 2.2)]" 35 | ] 36 | }, 37 | "execution_count": 71, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "main=[i.split(\"#\") for i in res[3:10]]\n", 44 | "topics=[i[0].strip() for i in main]\n", 45 | "hours=[i[1] for i in main]\n", 46 | "votes = [len(i) for i in hours]\n", 47 | "tot_score = [sum(int(i) for i in j) for j in hours ]\n", 48 | "avg_scores=[round(float(a)/float(b),1) for a,b in zip(tot_score,votes)]\n", 49 | "\n", 50 | "zip(topics,avg_scores)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 121, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "\n", 65 | "('0', 'Basics\\\\\\\\0. Basics 1. Python', 1.0)\n", 66 | "('0', 'Basics\\\\\\\\0. Basics 0. Jupyter notebook', 1.2)\n", 67 | "('0', 'Basics\\\\\\\\0. Basics 4. Jupyter notebook magics, shell and R', 1.4)\n", 68 | "('0', 'Basics\\\\\\\\0. Basics 3. Pandas', 2.5)\n", 69 | "('0', 'Basics\\\\\\\\0. Basics 2. Numpy', 2.6)\n", 70 | "\n", 71 | "('1', 'In and Export\\\\\\\\1. import and export 0. Object Storage', 1.3)\n", 72 | "('1', 'In and Export\\\\\\\\1. import and export 1. Download and upload', 1.3)\n", 73 | "('1', 'In and Export\\\\\\\\1. import and export 2. DashDB', 1.4)\n", 74 | "('1', 'In and Export\\\\\\\\1. import and export 3. Cloudant', 2.2)\n", 75 | "('1', 'In and Export\\\\\\\\1. import and export 4. Twitter', 2.2)\n", 76 | "('1', 'In and Export\\\\\\\\1. import and export 5. BigInsights', 2.3)\n", 77 | "\n", 78 | "('2', 'Watson APIs\\\\\\\\2. Watson 0. Weather API', 1.0)\n", 79 | "('2', 'Watson APIs\\\\\\\\2. Watson 2. Alchemy News', 1.5)\n", 80 | "('2', 'Watson APIs\\\\\\\\2. Watson 1. Personality Insights', 1.8)\n", 81 | "('2', 'Watson APIs\\\\\\\\2. Watson 5. Natural language classifier', 2.1)\n", 82 | "('2', 'Watson APIs\\\\\\\\2. Watson 3. Alchemy language', 2.3)\n", 83 | "('2', 'Watson APIs\\\\\\\\2. Watson 4. Tone analyzer', 2.3)\n", 84 | "\n", 85 | "('3', 'Visualization\\\\\\\\3. Visualization 0. Matplotlib', 1.8)\n", 86 | "('3', 'Visualization\\\\\\\\3. Visualization 1. Machine learning techniques', 1.8)\n", 87 | "('3', 'Visualization\\\\\\\\3. Visualization 2. Pixiedust', 2.0)\n", 88 | "('3', 'Visualization\\\\\\\\3. Visualization 3. 
Bokeh', 2.7)\n", 89 | "\n", 90 | "('4', 'Spark\\\\\\\\4. Spark 0. rdd-creation', 1.4)\n", 91 | "('4', 'Spark\\\\\\\\4. Spark 1. rdd-basics', 1.4)\n", 92 | "('4', 'Spark\\\\\\\\4. Spark 2. rdd-sampling', 2.5)\n", 93 | "('4', 'Spark\\\\\\\\4. Spark 3. rdd-set', 2.5)\n", 94 | "('4', 'Spark\\\\\\\\4. Spark 4. rdd-aggregations', 2.5)\n", 95 | "('4', 'Spark\\\\\\\\4. Spark 5. rdd-key-value', 2.5)\n", 96 | "('4', 'Spark\\\\\\\\4. Spark 7. mllib-logit', 2.5)\n", 97 | "('4', 'Spark\\\\\\\\4. Spark 8. mllib-trees', 2.5)\n", 98 | "('4', 'Spark\\\\\\\\4. Spark 9. sql-dataframes', 2.7)\n", 99 | "('4', 'Spark\\\\\\\\4. Spark 6. mllib-statistics', 2.8)\n", 100 | "\n", 101 | "('5', 'Machine Learning\\\\\\\\5. ML 1. Introduction', 1.3)\n", 102 | "('5', 'Machine Learning\\\\\\\\5. ML 3. Scikit Learn interface', 2.0)\n", 103 | "('5', 'Machine Learning\\\\\\\\5. ML 2. Data preparation', 2.1)\n", 104 | "('5', 'Machine Learning\\\\\\\\5. ML 0. Install requirements', 2.3)\n", 105 | "('5', 'Machine Learning\\\\\\\\5. ML 5. Model evaluation', 2.3)\n", 106 | "('5', 'Machine Learning\\\\\\\\5. ML 4. Bias and variance', 2.4)\n", 107 | "('5', 'Machine Learning\\\\\\\\5. ML 7. Ensemble methods advanced', 2.5)\n", 108 | "('5', 'Machine Learning\\\\\\\\5. ML 9. Time series', 2.6)\n", 109 | "('5', 'Machine Learning\\\\\\\\5. ML 6. Ensemble methods', 2.7)\n", 110 | "('5', 'Machine Learning\\\\\\\\5. ML 8. Multi Model Ensembles', 2.8)\n", 111 | "\n", 112 | "('6', 'Deep Learning\\\\\\\\6. DL 0. Keras starter kit', 1.4)\n", 113 | "('6', 'Deep Learning\\\\\\\\6. DL 1. Fun with activation functions', 1.8)\n", 114 | "('6', 'Deep Learning\\\\\\\\6. DL 3. Embedding', 2.3)\n", 115 | "('6', 'Deep Learning\\\\\\\\6. DL 5. Auto encoder', 2.3)\n", 116 | "('6', 'Deep Learning\\\\\\\\6. DL 6. Recurrent networks', 2.5)\n", 117 | "('6', 'Deep Learning\\\\\\\\6. DL 2. Convolutional networks', 2.6)\n", 118 | "('6', 'Deep Learning\\\\\\\\6. DL 4. 
Multi-input models', 2.9)\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "main=[i.strip() for i in res[14:]]\n", 124 | "main=filter(None, main)\n", 125 | "main=[i.split(\"#\") for i in main]\n", 126 | "topics=[i[0].strip() for i in main]\n", 127 | "rank=[i[1] for i in main]\n", 128 | "votes = [len(i) for i in rank]\n", 129 | "tot_score = [sum(int(i) for i in j) for j in rank ]\n", 130 | "chapter = [i[0] for i in topics ]\n", 131 | "topics = [i[3:] for i in topics ]\n", 132 | "avg_scores=[round(float(a)/float(b),1) for a,b in zip(tot_score,votes)]\n", 133 | "\n", 134 | "score=zip(chapter,topics,avg_scores)\n", 135 | "score =sorted(score, key=lambda score: (score[0],score[2]))\n", 136 | "\n", 137 | "chapter=0\n", 138 | "for i in score:\n", 139 | " if i[0]!=chapter:\n", 140 | " print \"\"\n", 141 | " print i\n", 142 | " chapter=i[0]" 143 | ] 144 | } 145 | ], 146 | "metadata": { 147 | "anaconda-cloud": {}, 148 | "kernelspec": { 149 | "display_name": "Python [conda root]", 150 | "language": "python", 151 | "name": "conda-root-py" 152 | }, 153 | "language_info": { 154 | "codemirror_mode": { 155 | "name": "ipython", 156 | "version": 2 157 | }, 158 | "file_extension": ".py", 159 | "mimetype": "text/x-python", 160 | "name": "python", 161 | "nbconvert_exporter": "python", 162 | "pygments_lexer": "ipython2", 163 | "version": "2.7.12" 164 | } 165 | }, 166 | "nbformat": 4, 167 | "nbformat_minor": 1 168 | } 169 | -------------------------------------------------------------------------------- /agenda.txt: -------------------------------------------------------------------------------- 1 | Spread 10 hours of training over the main topics: 2 | 3 | 0. Basics # 4 | 1. In and Export # 5 | 2. Watson APIs # 6 | 3. Visualization # 7 | 4. Spark # 8 | 5. Machine Learning # 9 | 6. Training - Deep Learning # 10 | 11 | 12 | Per topic, mark your top 3 interest as 1,2,3: 13 | 14 | 0. Basics\0. Basics 0. Jupyter notebook # 15 | 0. Basics\0. Basics 1. Python # 16 | 0. Basics\0. Basics 2. Numpy # 17 | 0. Basics\0. Basics 3. Pandas # 18 | 0. Basics\0. Basics 4. Jupyter notebook magics, shell and R # 19 | 20 | 1. In and Export\1. import and export 0. Object Storage # 21 | 1. In and Export\1. import and export 1. Download and upload # 22 | 1. In and Export\1. import and export 2. DashDB # 23 | 1. In and Export\1. import and export 3. Cloudant # 24 | 1. In and Export\1. import and export 4. Twitter # 25 | 1. In and Export\1. import and export 5. BigInsights # 26 | 27 | 2. Watson APIs\2. Watson 0. Weather API # 28 | 2. Watson APIs\2. Watson 1. Personality Insights # 29 | 2. Watson APIs\2. Watson 2. Alchemy News # 30 | 2. Watson APIs\2. Watson 3. Alchemy language # 31 | 2. Watson APIs\2. Watson 4. Tone analyzer # 32 | 2. Watson APIs\2. Watson 5. Natural language classifier # 33 | 34 | 3. Visualization\3. Visualization 0. Matplotlib # 35 | 3. Visualization\3. Visualization 1. Machine learning techniques # 36 | 3. Visualization\3. Visualization 2. Pixiedust # 37 | 3. Visualization\3. Visualization 3. Bokeh # 38 | 39 | 4. Spark\4. Spark 0. rdd-creation # 40 | 4. Spark\4. Spark 1. rdd-basics # 41 | 4. Spark\4. Spark 2. rdd-sampling # 42 | 4. Spark\4. Spark 3. rdd-set # 43 | 4. Spark\4. Spark 4. rdd-aggregations # 44 | 4. Spark\4. Spark 5. rdd-key-value # 45 | 4. Spark\4. Spark 6. mllib-statistics # 46 | 4. Spark\4. Spark 7. mllib-logit # 47 | 4. Spark\4. Spark 8. mllib-trees # 48 | 4. Spark\4. Spark 9. sql-dataframes # 49 | 50 | 5. Machine Learning\5. ML 0. Install requirements # 51 | 5. Machine Learning\5. ML 1. 
Introduction # 52 | 5. Machine Learning\5. ML 2. Data preparation # 53 | 5. Machine Learning\5. ML 3. Scikit Learn interface # 54 | 5. Machine Learning\5. ML 4. Bias and variance                       # 55 | 5. Machine Learning\5. ML 5. Model evaluation # 56 | 5. Machine Learning\5. ML 6. Ensemble methods # 57 | 5. Machine Learning\5. ML 7. Ensemble methods advanced # 58 | 5. Machine Learning\5. ML 8. Multi Model Ensembles # 59 | 5. Machine Learning\5. ML 9. Time series # 60 | 61 | 6. Deep Learning\6. DL 0. Keras starter kit                         # 62 | 6. Deep Learning\6. DL 1. Fun with activation functions               # 63 | 6. Deep Learning\6. DL 2. Convolutional networks                     # 64 | 6. Deep Learning\6. DL 3. Embedding                                 # 65 | 6. Deep Learning\6. DL 4. Multi-input models                         # 66 | 6. Deep Learning\6. DL 5. Auto encoder                               # 67 | 6. Deep Learning\6. DL 6. Recurrent networks                         # 68 | 69 | 70 | -------------------------------------------------------------------------------- /other/Data Generation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat_minor": 0, 3 | "metadata": { 4 | "language_info": { 5 | "pygments_lexer": "ipython2", 6 | "name": "python", 7 | "file_extension": ".py", 8 | "codemirror_mode": { 9 | "name": "ipython", 10 | "version": 2 11 | }, 12 | "mimetype": "text/x-python", 13 | "nbconvert_exporter": "python", 14 | "version": "2.7.11" 15 | }, 16 | "kernelspec": { 17 | "language": "python", 18 | "name": "python2", 19 | "display_name": "Python 2 with Spark 1.6" 20 | } 21 | }, 22 | "nbformat": 4, 23 | "cells": [ 24 | { 25 | "execution_count": 1, 26 | "metadata": { 27 | "collapsed": true 28 | }, 29 | "source": "import numpy as np\nimport pandas as pd", 30 | "outputs": [], 31 | "cell_type": "code" 32 | }, 33 | { 34 | "execution_count": 28, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "source": "def gen_data(n=100):\n data = { \\\n 'v01': np.random.normal(0,1,n), \n 'v02': np.random.beta(3,1,n),\n 'v03': np.random.chisquare(3,n), \n 'v04': np.random.exponential(3,n),\n 'v05': np.random.f(10,3,n), \n 'v06': np.random.gamma(2,n), \n 'v07': np.random.logistic(10,1,n), \n 'v08': np.random.choice(2, n, p=[0.1, 0.9]),\n 'v09': np.random.choice(2, n, p=[0.2, 0.8]),\n 'v10': np.random.choice(2, n, p=[0.4, 0.6]), \n 'v11': np.random.choice(3, n, p=[0.1, 0.2, 0.7]),\n 'v12': np.random.choice(3, n, p=[0.3, 0.3, 0.4]),\n 'v13': np.random.choice(5, n, p=[0.1, 0.2, 0.3, 0.2, 0.2 ]),\n 'v14': np.random.choice(5, n, p=[0.2, 0.2, 0.2, 0.2, 0.2]),\n 'v15': np.random.choice(10, n, p=[0.1, 0.1, 0.1, 0.1, 0.1,0.1, 0.1, 0.1, 0.1, 0.1]) \\\n }\n\n df = pd.DataFrame(data) \n\n\n target = \\\n 0.1 * df.v01 + \\\n -0.1 * df.v03 + \\\n 0.015 * df.v04 + \\\n -0.00015* df.v06 + \\\n 0.12 * df.v07 + \\\n 0.11 * df.v08 + \\\n -0.13 * df.v10 + \\\n -0.05 * (df.v11==1) + \\\n 0.13 * (df.v13==1) + \\\n -0.12 * (df.v13==3) + \\\n 0.16 * (df.v14==1) + \\\n -0.02 * (df.v14==2) + \\\n 0.01 * (df.v15==2) + \\\n -0.02 * (df.v15==4) + \\\n 0.04 * (df.v15==6) + \\\n -0.04 * (df.v15==8) + \\\n 0.1 * df.v01 * df.v02 + \\\n -0.1 * df.v02 * df.v03 + \\\n 0.000005* df.v04 * df.v06 + \\\n -0.15 * df.v03 * df.v08 + \\\n 0.12 * df.v07 * df.v09 + \\\n 0.11 * df.v10 * df.v11 + \\\n -0.13 * df.v12 * df.v14 + \\\n -0.05 * (df.v11==2) * df.v01 + \\\n 0.13 * (df.v13==1) * df.v02 + \\\n -0.12 * (df.v13==2) * df.v03 + \\\n 0.16 * (df.v14==2) * df.v04 + 
\\\n -0.02 * (df.v14==3) * df.v05 + \\\n 0.00001 * (df.v15==1) * df.v06 + \\\n -0.02 * (df.v15==5) * df.v07 + \\\n 0.04 * (df.v15==7) * df.v08 + \\\n -0.04 * (df.v15==5) * df.v09 + \\\n 0.04 * (df.v15==4) * (df.v13==8) + \\\n -0.04 * (df.v15==3) * (df.v13==8) + \\\n 0.04 * (df.v15==2) * (df.v14==8) + \\\n -0.04 * (df.v15==1) * (df.v14==8) + \\\n 0.04 * (df.v15==2) * (df.v12==8) + \\\n -0.04 * (df.v15==3) * (df.v13==8)\n\n #df['target']=target\n df['target']=(np.exp(target)/(1+np.exp(target))>0.5).astype(int)\n \n return df\n", 39 | "outputs": [], 40 | "cell_type": "code" 41 | }, 42 | { 43 | "execution_count": 47, 44 | "metadata": { 45 | "collapsed": false, 46 | "scrolled": true 47 | }, 48 | "source": "data_train = gen_data(n=20000)\ndata_test = gen_data(n=100000)\n\n\ndata_train.head()\n", 49 | "outputs": [ 50 | { 51 | "execution_count": 47, 52 | "metadata": {}, 53 | "data": { 54 | "text/html": "
", 55 | "text/plain": " v01 v02 v03 v04 v05 v06 v07 \\\n0 -0.581274 0.299583 0.672571 0.879624 1.417787 12273.412584 11.577250 \n1 0.102360 0.942390 1.358432 1.289167 3.519207 12273.412584 8.978558 \n2 -0.131057 0.796321 1.218057 1.704464 0.582041 12273.412584 10.950337 \n3 -1.664699 0.972988 2.604471 1.165724 4.582620 12273.412584 12.149130 \n4 1.018567 0.796813 1.549261 5.730283 0.854525 12273.412584 8.379034 \n\n v08 v09 v10 v11 v12 v13 v14 v15 target \n0 1 0 1 0 2 4 1 3 0 \n1 1 1 1 2 1 1 2 6 1 \n2 1 1 1 2 0 3 3 0 1 \n3 1 1 1 1 0 2 3 0 0 \n4 1 1 0 0 0 3 0 9 1 " 56 | }, 57 | "output_type": "execute_result" 58 | } 59 | ], 60 | "cell_type": "code" 61 | }, 62 | { 63 | "execution_count": null, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "source": "@hidden_cell\n\ncredentials_1 = {\n 'host':'awh-yp-small03.services.dal.bluemix.net',\n 'port':'50000',\n 'user':'dash110459',\n 'password':\"\"\"cc7fcfe60374\"\"\",\n 'database':'BLUDB'\n}", 68 | "outputs": [], 69 | "cell_type": "code" 70 | }, 71 | { 72 | "execution_count": 48, 73 | "metadata": { 74 | "collapsed": false, 75 | "scrolled": true 76 | }, 77 | "source": "import ibmdbpy\nfrom ibmdbpy import IdaDataBase\n\n\n\nidadb = IdaDataBase(dsn=\"DASHDB;Database=BLUDB;Hostname=\" + credentials_1[\"host\"] + \";Port=50000;PROTOCOL=TCPIP;UID=\" + credentials_1[\"user\"] + \";PWD=\" + credentials_1[\"password\"])\nidadf = idadb.as_idadataframe(data_train, \"DATA_TRAIN\", clear_existing=True)\nidadf = idadb.as_idadataframe(data_test, \"DATA_TEST\", clear_existing=True)", 78 | "outputs": [ 79 | { 80 | "name": "stderr", 81 | "output_type": "stream", 82 | "text": "Exception AttributeError: \"Cursor instance has no attribute 'closed'\" in > ignored\n" 83 | }, 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": "DataFrame will be splitted into 40 chunks. (500 rows per chunk)\nUploaded: 40/40... [DONE]\n" 88 | }, 89 | { 90 | "name": "stderr", 91 | "output_type": "stream", 92 | "text": "Exception AttributeError: \"Cursor instance has no attribute 'closed'\" in > ignored\n" 93 | }, 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": "DataFrame will be splitted into 200 chunks. (500 rows per chunk)\nUploaded: 200/200... 
[DONE]\n" 98 | } 99 | ], 100 | "cell_type": "code" 101 | } 102 | ] 103 | } -------------------------------------------------------------------------------- /other/SF Usage 1B records.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat_minor": 0, 3 | "metadata": { 4 | "language_info": { 5 | "pygments_lexer": "ipython2", 6 | "name": "python", 7 | "file_extension": ".py", 8 | "codemirror_mode": { 9 | "name": "ipython", 10 | "version": 2 11 | }, 12 | "mimetype": "text/x-python", 13 | "nbconvert_exporter": "python", 14 | "version": "2.7.11" 15 | }, 16 | "kernelspec": { 17 | "language": "python", 18 | "name": "python2", 19 | "display_name": "Python 2 with Spark 1.6" 20 | } 21 | }, 22 | "nbformat": 4, 23 | "cells": [ 24 | { 25 | "execution_count": 7, 26 | "metadata": { 27 | "collapsed": true 28 | }, 29 | "source": "import pandas as pd\nimport numpy as np", 30 | "outputs": [], 31 | "cell_type": "code" 32 | }, 33 | { 34 | "execution_count": 8, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "source": "import ibmdbpy\nfrom ibmdbpy import IdaDataBase,IdaDataFrame\n\ncredentials_1 = {\n 'host':'dashdb-entry-yp-dal09-08.services.dal.bluemix.net',\n 'port':'50000',\n 'user':'dash8753',\n 'password':\"\"\"ddd6463d0ddc\"\"\",\n 'database':'BLUDB'\n}\n\nidadb = IdaDataBase(dsn=\"DASHDB;Database=BLUDB;Hostname=\" + credentials_1[\"host\"] + \";Port=50000;PROTOCOL=TCPIP;UID=\" + credentials_1[\"user\"] + \";PWD=\" + credentials_1[\"password\"])\n\n", 39 | "outputs": [], 40 | "cell_type": "code" 41 | }, 42 | { 43 | "execution_count": 9, 44 | "metadata": { 45 | "collapsed": false, 46 | "scrolled": true 47 | }, 48 | "source": "date_range=np.hstack((\n np.arange(20160701,20160731),\n np.arange(20160801,20160831),\n np.arange(20160901,20160930), \n np.arange(20161001,20161031),\n np.arange(20161101,20161130),\n np.arange(20161201,20161231)))\n\npivot=['VOICE,INCOMING','VOICE,OUTGOING','SMS,INCOMING','SMS,OUTGOING','DATA,INCOMING','DATA,OUTGOING']\n \nbase=np.transpose([np.tile(pivot, len(date_range)), np.repeat(date_range, len(pivot))])\nbase[:,[0, 1]] = base[:,[1, 0]]\nbase = np.hstack((base[:,:1], map(lambda x: x.split(','), base[:,1])))\n\nID = 11111111\nrecs = np.random.random_integers(500,base.shape[0])\none_ID=base[np.sort(np.random.choice(range(base.shape[0]), recs, replace=False)),:]\n\nnums=np.reshape(np.maximum(0,np.random.normal(1,10,recs*4)),(-1,4))\n\n\n#from timeit import default_timer as timer\n\n#start = timer()\n#res=pd.DataFrame(np.c_[ (np.repeat(ID,recs), one_ID, nums)],columns=['ID','date','cdr_type_name','cdr_type_direction','tot_num_times','tot_duration','total_up_down','tot_costs'])\n#res=res.apply(lambda x: pd.to_numeric(x, errors='ignore'))\n#end = timer()\n#print(end - start) \n\n#start = timer()\nres=pd.concat([pd.DataFrame(np.repeat(ID,recs),columns=['subs_id']),\n pd.DataFrame(one_ID,columns=['prd_id','cdr_type_name','cdr_direction']),\n pd.DataFrame(nums,columns=['tot_num_times','tot_duration','total_up_down','tot_costs'])], axis=1)\nres['prd_id']=pd.to_datetime(res['prd_id'], format='%Y%m%d', errors='ignore')\n#end = timer()\n#print(end - start) \n\n#idadb.as_idadataframe(res, \"SF_USAGE\", clear_existing=True)", 49 | "outputs": [ 50 | { 51 | "name": "stderr", 52 | "output_type": "stream", 53 | "text": "Exception AttributeError: \"Cursor instance has no attribute 'closed'\" in > ignored\n" 54 | }, 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": "Uploading 987 rows (maxnrow was set to 1000)\n" 59 
| }, 60 | { 61 | "execution_count": 9, 62 | "metadata": {}, 63 | "data": { 64 | "text/plain": "" 65 | }, 66 | "output_type": "execute_result" 67 | } 68 | ], 69 | "cell_type": "code" 70 | }, 71 | { 72 | "execution_count": 11, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "source": "%%capture\n\nID = 11235879\n\n\nSF_USAGE = IdaDataFrame(idadb, 'SF_USAGE')\nfor i in range(15232):\n ID+=1\n recs = np.random.random_integers(500,base.shape[0])\n one_ID=base[np.sort(np.random.choice(range(base.shape[0]), recs, replace=False)),:]\n\n nums=np.reshape(np.maximum(0,np.random.normal(1,10,recs*4)),(-1,4))\n res=pd.concat([pd.DataFrame(np.repeat(ID,recs),columns=['subs_id']),\n pd.DataFrame(one_ID,columns=['prd_id','cdr_type_name','cdr_direction']),\n pd.DataFrame(nums,columns=['tot_num_times','tot_duration','total_up_down','tot_costs'])], axis=1)\n res['prd_id']=pd.to_datetime(res['prd_id'], format='%Y%m%d', errors='ignore')\n \n idadb.append(SF_USAGE,res);\n SF_USAGE.commit()", 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": "Uploading 836 rows (maxnrow was set to 1000)\nUploading 903 rows (maxnrow was set to 1000)\nUploading 740 rows (maxnrow was set to 1000)\nUploading 937 rows (maxnrow was set to 1000)\nUploading 883 rows (maxnrow was set to 1000)\nUploading 732 rows (maxnrow was set to 1000)\nUploading 1046 rows (maxnrow was set to 1000)\nUploading 663 rows (maxnrow was set to 1000)\nUploading 812 rows (maxnrow was set to 1000)\nUploading 901 rows (maxnrow was set to 1000)\nUploading 596 rows (maxnrow was set to 1000)\nUploading 525 rows (maxnrow was set to 1000)\nUploading 553 rows (maxnrow was set to 1000)\nUploading 982 rows (maxnrow was set to 1000)\nUploading 808 rows (maxnrow was set to 1000)\nUploading 759 rows (maxnrow was set to 1000)\nUploading 769 rows (maxnrow was set to 1000)\nUploading 954 rows (maxnrow was set to 1000)\nUploading 667 rows (maxnrow was set to 1000)\nUploading 675 rows (maxnrow was set to 1000)\nUploading 912 rows (maxnrow was set to 1000)\n" 82 | }, 83 | { 84 | "ename": "KeyboardInterrupt", 85 | "evalue": "", 86 | "output_type": "error", 87 | "traceback": [ 88 | "\u001b[0;31m\u001b[0m", 89 | "\u001b[0;31mKeyboardInterrupt\u001b[0mTraceback (most recent call last)", 90 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'prd_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_datetime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mres\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'prd_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'%Y%m%d'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'ignore'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0midadb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSF_USAGE\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mres\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m;\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0mSF_USAGE\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 91 | 
"\u001b[0;32m/usr/local/src/bluemix_jupyter_bundle.v33/notebook/lib/python2.7/site-packages/ibmdbpy/base.pyc\u001b[0m in \u001b[0;36mappend\u001b[0;34m(self, idadf, df, maxnrow)\u001b[0m\n\u001b[1;32m 1382\u001b[0m \u001b[0;32mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Uploading %s rows (maxnrow was set to %s)\"\u001b[0m\u001b[0;34m%\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmaxnrow\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1383\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1384\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_insert_into_database\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midadf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtablename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msilent\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1385\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1386\u001b[0m \u001b[0;32mraise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 92 | "\u001b[0;32m/usr/local/src/bluemix_jupyter_bundle.v33/notebook/lib/python2.7/site-packages/ibmdbpy/base.pyc\u001b[0m in \u001b[0;36m_insert_into_database\u001b[0;34m(self, dataframe, tablename, silent)\u001b[0m\n\u001b[1;32m 1926\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1927\u001b[0m \u001b[0mvalue_string\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;34m'%s,'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1928\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mvalue_string\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m','\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1929\u001b[0m \u001b[0mvalue_string\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalue_string\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1930\u001b[0m \u001b[0mrow_string\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;34m\"(%s),\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mvalue_string\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 93 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 94 | ] 95 | } 96 | ], 97 | "cell_type": "code" 98 | } 99 | ] 100 | } --------------------------------------------------------------------------------