├── .gitignore ├── 01_simple_requests.py ├── 02_html_parsing.py ├── 03_advanced_parsing.py ├── 04_reading_csv.py ├── 05_analyzing_dataframe.py ├── 06_exploring_data.py ├── 07_text_tfidf.py ├── 08_dictvectorizer.py ├── 09_scaling.py ├── 10_pca.py ├── 11_svm.py ├── 12_regression.py ├── 13_clustering.py ├── 14_cross_validation.py ├── LICENSE ├── README.md ├── data-cloud.png ├── data-workshop-notebook.ipynb ├── pydata_slides.pdf ├── requirements.txt ├── sample.csv └── sample2.csv /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | -------------------------------------------------------------------------------- /01_simple_requests.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | # Fetch a page and print the first 65 characters of its HTML 4 | print(requests.get("http://example.com").text[:65] + " ...") 5 | 6 | # Query the Google Books API and print the first result for "machine learning" 7 | print(requests.get("https://www.googleapis.com/books/v1/volumes", params={"q": "machine learning"}).json()['items'][0]) -------------------------------------------------------------------------------- /02_html_parsing.py: -------------------------------------------------------------------------------- 1 | import lxml.html 2 | 3 | page = lxml.html.parse("http://www.blocket.se/stockholm?q=apple") 4 | # ^ This is probably illegal. Blocket, please don't sue me!
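5 | # find_class() returns all elements carrying the given CSS class; rows that lack a link or an image are skipped by the guard below.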
6 | items_data = [] 7 | for el in page.getroot().find_class("item_row"): 8 | links = el.find_class("item_link") 9 | images = el.find_class("item_image") 10 | if links and images: 11 | items_data.append({"name": links[0].text, 12 | "image": images[0].attrib['src']}) 13 | 14 | print(items_data) -------------------------------------------------------------------------------- /03_advanced_parsing.py: -------------------------------------------------------------------------------- 1 | import lxml.html 2 | 3 | page = lxml.html.parse("http://www.blocket.se/stockholm?q=apple") 4 | 5 | print(len(page.xpath('//a'))) # number of links in the page 6 | print(page.xpath('//img[@class = "item_image"]/@src')) # products' images -------------------------------------------------------------------------------- /04_reading_csv.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | 3 | df = pandas.read_csv('sample.csv') 4 | 5 | # Display the DataFrame 6 | print(df) 7 | print() 8 | 9 | # DataFrame's columns 10 | print(df.columns) 11 | print() 12 | 13 | # Values of a given column 14 | print(df['Model']) 15 | print() -------------------------------------------------------------------------------- /05_analyzing_dataframe.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | 3 | df = pandas.read_csv('sample.csv') 4 | 5 | # Any missing values? 6 | print(df['Price']) 7 | print(df['Description']) 8 | 9 | # Fill missing descriptions with a placeholder, and missing prices by a linear interpolation 10 | df['Description'] = df['Description'].fillna("No description is available.") 11 | df['Price'] = df['Price'].interpolate() 12 | 13 | print(df) -------------------------------------------------------------------------------- /06_exploring_data.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | df = pandas.read_csv('sample2.csv') 6 | 7 | # This table has 3 columns: Office, Year, Sales 8 | print(df.columns) 9 | 10 | # It's really easy to query data with Pandas: 11 | print(df[(df['Office'] == 'Stockholm') & (df['Sales'] > 260)]) 12 | 13 | # It's also easy to do aggregations... 14 | aggregated_sales = df.groupby('Year').sum() 15 | print(aggregated_sales) 16 |
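17 | # groupby('Year').sum() yields one row per year, with Stockholm's and New York's sales added together.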
18 | # ... and generate plots 19 | aggregated_sales.plot(kind='bar') 20 | 21 | plt.show() -------------------------------------------------------------------------------- /07_text_tfidf.py: -------------------------------------------------------------------------------- 1 | from sklearn import feature_extraction 2 | 3 | corpus = ['Cats really are great.', 4 | 'I like cats but I still prefer dogs.', 5 | 'Dogs are the best.', 6 | 'I like trains', 7 | ] 8 | 9 | tfidf = feature_extraction.text.TfidfVectorizer() 10 | 11 | print(tfidf.fit_transform(corpus)) 12 | print(tfidf.get_feature_names()) -------------------------------------------------------------------------------- /08_dictvectorizer.py: -------------------------------------------------------------------------------- 1 | from sklearn import feature_extraction 2 | 3 | data = [{"weight": 60., "sex": 'female', "student": True}, 4 | {"weight": 80.1, "sex": 'male', "student": False}, 5 | {"weight": 65.3, "sex": 'male', "student": True}, 6 | {"weight": 58.5, "sex": 'female', "student": False}] 7 | 8 | vectorizer = feature_extraction.DictVectorizer(sparse=False) 9 | 10 | vectors = vectorizer.fit_transform(data) 11 | print(vectors) 12 | print(vectorizer.get_feature_names()) -------------------------------------------------------------------------------- /09_scaling.py: -------------------------------------------------------------------------------- 1 | from sklearn import preprocessing 2 | 3 | data = [[10., 2345., 0., 2.], 4 | [3., -3490., 0.1, 1.99], 5 | [13., 3903., -0.2, 2.11]] 6 | 7 | print(preprocessing.normalize(data)) -------------------------------------------------------------------------------- /10_pca.py: -------------------------------------------------------------------------------- 1 | from sklearn import decomposition 2 | 3 | data = [[0.3, 0.2, 0.4, 0.32], 4 | [0.3, 0.5, 1.0, 0.19], 5 | [0.3, -0.4, -0.8, 0.22]] 6 | 7 | pca = decomposition.PCA() 8 | print(pca.fit_transform(data)) 9 | print(pca.explained_variance_ratio_) -------------------------------------------------------------------------------- /11_svm.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets 2 | from sklearn import svm 3 | 4 | iris = datasets.load_iris() 5 | 6 | X = iris.data[:, :2] 7 | y = iris.target 8 | 9 | # Training the model 10 | clf = svm.SVC(kernel='rbf') 11 | clf.fit(X, y) 12 | 13 | # Doing predictions 14 | new_data = [[4.85, 3.1], [5.61, 3.02]] 15 | print(clf.predict(new_data)) -------------------------------------------------------------------------------- /12_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import linear_model 3 | import matplotlib.pyplot as plt 4 | 5 | def f(x): 6 | return x + np.random.random() * 3.
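7 | 8 | # np.random.random() is uniform on [0, 1), so f adds noise in [0, 3) to the line y = x; the fitted slope should come out close to 1.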
9 | X = np.arange(0, 5, 0.5) 10 | X = X.reshape((len(X), 1)) 11 | y = list(map(f, X)) # map() is lazy in Python 3; materialize it before fitting 12 | 13 | clf = linear_model.LinearRegression() 14 | clf.fit(X, y) 15 | 16 | new_X = np.arange(0.2, 5.2, 0.3) 17 | new_X = new_X.reshape((len(new_X), 1)) 18 | new_y = clf.predict(new_X) 19 | 20 | plt.scatter(X, y, color='g', label='Training data') 21 | 22 | plt.plot(new_X, new_y, '.-', label='Predicted') 23 | 24 | plt.legend() 25 | plt.show() -------------------------------------------------------------------------------- /13_clustering.py: -------------------------------------------------------------------------------- 1 | from sklearn.cluster import DBSCAN 2 | from sklearn.datasets import make_blobs # importable directly from sklearn.datasets; the samples_generator module was removed in later scikit-learn releases 3 | from sklearn.preprocessing import StandardScaler 4 | import matplotlib.pyplot as plt 5 | 6 | ############################################################################## 7 | # Generate sample data 8 | centers = [[1, 1], [-1, -1], [1, -1]] 9 | X, labels_true = make_blobs(n_samples=200, centers=centers, cluster_std=0.4, 10 | random_state=0) 11 | X = StandardScaler().fit_transform(X) 12 | 13 | ############################################################################## 14 | # Compute DBSCAN (a label of -1 marks noise points) 15 | db = DBSCAN(eps=0.3, min_samples=10).fit(X) 16 | labels = db.labels_ 17 | 18 | plt.scatter(X[:, 0], X[:, 1], c=labels) 19 | plt.show() 20 | print(labels) -------------------------------------------------------------------------------- /14_cross_validation.py: -------------------------------------------------------------------------------- 1 | from sklearn import svm, cross_validation, datasets # cross_validation became sklearn.model_selection in scikit-learn 0.18 2 | 3 | iris = datasets.load_iris() 4 | X, y = iris.data, iris.target 5 | 6 | model = svm.SVC() 7 | print(cross_validation.cross_val_score(model, X, y, scoring='precision_weighted')) # plain 'precision' is ambiguous for the 3-class iris target 8 | print(cross_validation.cross_val_score(model, X, y, scoring='mean_squared_error')) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files.
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data-processing and Machine Learning with Python 2 | 3 | ![Learn how to do data-processing and machine learning with Python](https://github.com/halflings/python-data-workshop/blob/master/data-cloud.png) 4 | 5 | Code and slides for a workshop I originally hosted at KTH (and also at the Swedish Bioinformatics Workshop), showcasing Python and its data-processing and machine-learning capabilities. 6 | 7 | You can [view the slides on Speaker Deck](https://speakerdeck.com/halflings/data-processing-and-machine-learning-with-python), [download them](https://github.com/halflings/python-data-workshop/blob/master/pydata_slides.pdf) and, *highly recommended*, have a look at the accompanying [IPython notebook](http://nbviewer.ipython.org/github/halflings/python-data-workshop/blob/master/data-workshop-notebook.ipynb). 8 | -------------------------------------------------------------------------------- /data-cloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/halflings/python-data-workshop/25f8cfe4f403bbeaacb9d52c25b0b53a46f999ed/data-cloud.png -------------------------------------------------------------------------------- /data-workshop-notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np\n", 14 | "plt.style.use('ggplot')\n", 15 | "plt.rcParams['figure.figsize'] = 16, 9" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "# Data analytics and machine learning with Python" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "# I - Acquiring data" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "### A simple HTTP request" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": { 43 | "collapsed": false, 44 | "scrolled": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "import requests\n", 49 | "\n", 50 | "print(requests.get(\"http://example.com\").text)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "### Communicating with APIs" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "response = requests.get(\"https://www.googleapis.com/books/v1/volumes\", params={\"q\":\"machine learning\"})\n", 69 | "raw_data = response.json()\n", 70 | "titles = [item['volumeInfo']['title'] for item in raw_data['items']]\n", 71 | "titles" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "### Parsing websites" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "import lxml.html\n", 90 | "\n", 91 | "page = lxml.html.parse(\"http://www.blocket.se/stockholm?q=apple\")\n", 92 | "# ^ This is probably illegal. 
Blocket, please don't sue me!\n", 93 | "items_data = []\n", 94 | "for el in page.getroot().find_class(\"item_row\"):\n", 95 | " links = el.find_class(\"item_link\")\n", 96 | " images = el.find_class(\"item_image\")\n", 97 | " prices = el.find_class(\"list_price\")\n", 98 | " if links and images and prices and prices[0].text:\n", 99 | " items_data.append({\"name\": links[0].text,\n", 100 | " \"image\": images[0].attrib['src'],\n", 101 | " \"price\": int(prices[0].text.split(\":\")[0].replace(\" \", \"\"))})\n", 102 | "items_data" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "### Reading local files (CSV/JSON)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "import pandas\n", 121 | "\n", 122 | "df = pandas.read_csv('sample.csv')" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "# Display the DataFrame\n", 134 | "df" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": false 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "# DataFrame's columns\n", 146 | "df.columns" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "collapsed": false 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "# Values of a given column\n", 158 | "df.Model" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "# Analyzing the dataframe" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "collapsed": false 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "# Any missing values?\n", 177 | "df['Price']" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "collapsed": false 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "df['Description']" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": { 195 | "collapsed": false 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "# Fill missing prices by a linear interpolation\n", 200 | "df['Description'] = df['Description'].fillna(\"No description is available.\")\n", 201 | "df['Price'] = df['Price'].interpolate()\n", 202 | "\n", 203 | "df" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "# II - Exploring data" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "import matplotlib.pyplot as plt\n", 222 | "\n", 223 | "df = pandas.read_csv('sample2.csv')\n", 224 | "\n", 225 | "df" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": { 232 | "collapsed": false 233 | }, 234 | "outputs": [], 235 | "source": [ 236 | "# This table has 3 columns: Office, Year, Sales\n", 237 | "df.columns" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "collapsed": false 245 | }, 246 | "outputs": [], 247 | "source": [ 248 | "# It's really easy to query data with Pandas:\n", 249 | "df[(df['Office'] == 'Stockholm') & (df['Sales'] > 260)]" 250 | 
] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": { 256 | "collapsed": false 257 | }, 258 | "outputs": [], 259 | "source": [ 260 | "# It's also easy to do aggregations...\n", 261 | "aggregated_stockholm_sales = df[df.Office == 'Stockholm'].groupby('Year').sum()\n", 262 | "aggregated_stockholm_sales" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "collapsed": false 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "aggregated_ny_sales = df[df.Office == 'New York'].groupby('Year').sum()\n", 274 | "# ... and generate plots\n", 275 | "aggregated_stockholm_sales.plot(kind='bar')\n", 276 | "aggregated_ny_sales.plot(kind='bar', color='g')" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "# Machine learning" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "## Feature extraction" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": { 297 | "collapsed": false 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "from sklearn import feature_extraction" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "### Extracting features from text" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": { 315 | "collapsed": false 316 | }, 317 | "outputs": [], 318 | "source": [ 319 | "corpus = ['Cats? I love cats!',\n", 320 | " 'I love dogs.',\n", 321 | " 'I hate cats :(',\n", 322 | " 'I love trains',\n", 323 | " ]\n", 324 | "\n", 325 | "tfidf = feature_extraction.text.TfidfVectorizer()\n", 326 | "\n", 327 | "print(tfidf.fit_transform(corpus).toarray())\n", 328 | "print(tfidf.get_feature_names())" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "### Dict vectorizer" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": { 342 | "collapsed": false 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "import json\n", 347 | "\n", 348 | "\n", 349 | "data = [json.loads(\"\"\"{\"weight\": 194.0, \"sex\": \"female\", \"student\": true}\"\"\"),\n", 350 | " {\"weight\": 60., \"sex\": 'female', \"student\": True},\n", 351 | " {\"weight\": 80.1, \"sex\": 'male', \"student\": False},\n", 352 | " {\"weight\": 65.3, \"sex\": 'male', \"student\": True},\n", 353 | " {\"weight\": 58.5, \"sex\": 'female', \"student\": False}]\n", 354 | "\n", 355 | "vectorizer = feature_extraction.DictVectorizer(sparse=False)\n", 356 | "\n", 357 | "vectors = vectorizer.fit_transform(data)\n", 358 | "print(vectors)\n", 359 | "print(vectorizer.get_feature_names())" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "### Pre-processing" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "##### Scaling" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "collapsed": false 381 | }, 382 | "outputs": [], 383 | "source": [ 384 | "from sklearn import preprocessing\n", 385 | "\n", 386 | "data = [[10., 2345., 0., 2.],\n", 387 | " [3., -3490., 0.1, 1.99],\n", 388 | " [13., 3903., -0.2, 2.11]]\n", 389 | "\n", 390 | "preprocessing.normalize(data)" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": {}, 396 | "source": [ 397 | 
"##### Dimensionality reduction" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": { 404 | "collapsed": false 405 | }, 406 | "outputs": [], 407 | "source": [ 408 | "from sklearn import decomposition\n", 409 | "\n", 410 | "data = [[0.3, 0.2, 0.4, 0.32],\n", 411 | " [0.3, 0.5, 1.0, 0.19],\n", 412 | " [0.3, -0.4, -0.8, 0.22]]\n", 413 | "\n", 414 | "pca = decomposition.PCA()\n", 415 | "print(pca.fit_transform(data))\n", 416 | "print(pca.explained_variance_ratio_)" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "# Machine learning models" 424 | ] 425 | }, 426 | { 427 | "cell_type": "markdown", 428 | "metadata": {}, 429 | "source": [ 430 | "## Classification (SVM)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": null, 436 | "metadata": { 437 | "collapsed": true 438 | }, 439 | "outputs": [], 440 | "source": [ 441 | "from sklearn import datasets\n", 442 | "from sklearn import svm" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": { 449 | "collapsed": false 450 | }, 451 | "outputs": [], 452 | "source": [ 453 | "iris = datasets.load_iris()\n", 454 | "\n", 455 | "X = iris.data[:, :2]\n", 456 | "y = iris.target\n", 457 | "\n", 458 | "plt.scatter(X[:, 0], X[:, 1], color=['rgb'[v] for v in y])\n", 459 | "\n", 460 | "to_predict = np.array([[4.35, 3.1], [5.61, 2.42]])\n", 461 | "plt.scatter(to_predict[:, 0], to_predict[:, 1], color='purple')" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": { 468 | "collapsed": false 469 | }, 470 | "outputs": [], 471 | "source": [ 472 | "# Training the model\n", 473 | "clf = svm.SVC(kernel='rbf')\n", 474 | "clf.fit(X, y)\n", 475 | "\n", 476 | "# Doing predictions\n", 477 | "print(clf.predict(to_predict))" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [ 484 | "## Regression (linear regression)" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": { 491 | "collapsed": false 492 | }, 493 | "outputs": [], 494 | "source": [ 495 | "import numpy as np\n", 496 | "from sklearn import linear_model\n", 497 | "import matplotlib.pyplot as plt\n", 498 | "\n", 499 | "def f(x):\n", 500 | " return x + np.random.random() * 3.\n", 501 | "\n", 502 | "X = np.arange(0, 5, 0.5)\n", 503 | "X = X.reshape((len(X), 1))\n", 504 | "y = list(map(f, X))\n", 505 | "\n", 506 | "clf = linear_model.LinearRegression()\n", 507 | "clf.fit(X, y)" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": { 514 | "collapsed": false 515 | }, 516 | "outputs": [], 517 | "source": [ 518 | "new_X = np.arange(0.2, 5.2, 0.3)\n", 519 | "new_X = new_X.reshape((len(new_X), 1))\n", 520 | "new_y = clf.predict(new_X)\n", 521 | "\n", 522 | "plt.scatter(X, y, color='g', label='Training data')\n", 523 | "\n", 524 | "plt.plot(new_X, new_y, '.-', label='Predicted')\n", 525 | "plt.legend()" 526 | ] 527 | }, 528 | { 529 | "cell_type": "markdown", 530 | "metadata": {}, 531 | "source": [ 532 | "## Clustering (DBScan)" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "metadata": { 539 | "collapsed": false 540 | }, 541 | "outputs": [], 542 | "source": [ 543 | "from sklearn.cluster import DBSCAN\n", 544 | "from sklearn.datasets.samples_generator import make_blobs\n", 545 | "from sklearn.preprocessing import StandardScaler\n", 
546 | "\n", 547 | "# Generate sample data\n", 548 | "centers = [[1, 1], [-1, -1], [1, -1]]\n", 549 | "X, labels_true = make_blobs(n_samples=200, centers=centers, cluster_std=0.3,\n", 550 | " random_state=0)\n", 551 | "plt.scatter(X[:, 0], X[:, 1])" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "metadata": { 558 | "collapsed": false 559 | }, 560 | "outputs": [], 561 | "source": [ 562 | "# Compute DBSCAN\n", 563 | "db = DBSCAN(eps=0.3, min_samples=10).fit(X)\n", 564 | "db.labels_" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": { 571 | "collapsed": false 572 | }, 573 | "outputs": [], 574 | "source": [ 575 | "import matplotlib.pyplot as plt\n", 576 | "plt.scatter(X[:, 0], X[:, 1], c=['rgbw'[v] for v in db.labels_])" 577 | ] 578 | }, 579 | { 580 | "cell_type": "markdown", 581 | "metadata": {}, 582 | "source": [ 583 | "## Cross-validation" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": null, 589 | "metadata": { 590 | "collapsed": false 591 | }, 592 | "outputs": [], 593 | "source": [ 594 | "from sklearn import svm, cross_validation, datasets\n", 595 | "\n", 596 | "iris = datasets.load_iris()\n", 597 | "X, y = iris.data, iris.target\n", 598 | "\n", 599 | "model = svm.SVC()\n", 600 | "print(cross_validation.cross_val_score(model, X, y, scoring='precision_weighted'))\n", 601 | "print(cross_validation.cross_val_score(model, X, y, scoring='mean_squared_error'))" 602 | ] 603 | }, 604 | { 605 | "cell_type": "markdown", 606 | "metadata": {}, 607 | "source": [ 608 | "# A more complex Machine Learning pipeline: \"what's cooking?\"\n", 609 | "This is a basic solution I wrote for the Kaggle competition \"What's cooking?\", where the goal is to predict which type of cuisine a meal belongs to, based on its list of ingredients.\n", 610 | "\n", 611 | "You'll need more advanced features and methods to win a Kaggle competition, but this already gets you 90% there."
612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": null, 617 | "metadata": { 618 | "collapsed": false 619 | }, 620 | "outputs": [], 621 | "source": [ 622 | "from collections import Counter\n", 623 | "import json\n", 624 | "\n", 625 | "import pandas as pd\n", 626 | "import scipy.sparse\n", 627 | "import sklearn.pipeline\n", 628 | "import sklearn.cross_validation\n", 629 | "import sklearn.feature_extraction\n", 630 | "import sklearn.naive_bayes\n", 631 | "\n", 632 | "def open_dataset(path):\n", 633 | " with open(path) as file:\n", 634 | " data = json.load(file)\n", 635 | " df = pd.DataFrame(data).set_index('id')\n", 636 | " return df\n", 637 | "\n", 638 | "df = open_dataset('train.json')\n", 639 | "\n", 640 | "pipeline = sklearn.pipeline.make_pipeline(sklearn.feature_extraction.DictVectorizer(), sklearn.feature_extraction.text.TfidfTransformer(sublinear_tf=True))\n", 641 | "pipeline_bis = sklearn.pipeline.make_pipeline(sklearn.feature_extraction.DictVectorizer(), sklearn.feature_extraction.text.TfidfTransformer(sublinear_tf=True))\n", 642 | "\n", 643 | "def map_term_count(ingredients):\n", 644 | " return Counter(sum((i.split(' ') for i in ingredients), []))\n", 645 | "X = pipeline.fit_transform(df.ingredients.apply(Counter))\n", 646 | "X = scipy.sparse.hstack([X, pipeline_bis.fit_transform(df.ingredients.apply(map_term_count))])\n", 647 | "y = df.cuisine.values\n", 648 | "\n", 649 | "model = sklearn.naive_bayes.MultinomialNB(alpha=0.1)\n", 650 | "\n", 651 | "# Cross-validation\n", 652 | "score = sklearn.cross_validation.cross_val_score(model, X, y, cv=2)\n", 653 | "print(score)\n", 654 | "\n", 655 | "# Running on the test dataset\n", 656 | "t_df = open_dataset('test.json')\n", 657 | "X_test = pipeline.transform(t_df.ingredients.apply(Counter))\n", 658 | "X_test = scipy.sparse.hstack([X_test, pipeline_bis.transform(t_df.ingredients.apply(map_term_count))])\n", 659 | "\n", 660 | "model.fit(X, y)\n", 661 | "\n", 662 | "predictions = model.predict(X_test)\n", 663 | "result_df = pd.DataFrame(index=t_df.index)\n", 664 | "result_df['cuisine'] = pd.Series(predictions, index=result_df.index)\n", 665 | "\n", 666 | "result_df['ingredients'] = t_df['ingredients']\n", 667 | "result_df" 668 | ] 669 | }, 670 | { 671 | "cell_type": "markdown", 672 | "metadata": {}, 673 | "source": [ 674 | "## Thanks for following! 
I hope you learned a thing or two :-)\n", 675 | "\n", 676 | "Feel free to ask any questions, or contact me on [kachkach.com](http://www.kachkach.com) / [@halflings](http://github.com/halflings)" 677 | ] 678 | } 679 | ], 680 | "metadata": { 681 | "kernelspec": { 682 | "display_name": "Python 3", 683 | "language": "python", 684 | "name": "python3" 685 | }, 686 | "language_info": { 687 | "codemirror_mode": { 688 | "name": "ipython", 689 | "version": 3 690 | }, 691 | "file_extension": ".py", 692 | "mimetype": "text/x-python", 693 | "name": "python", 694 | "nbconvert_exporter": "python", 695 | "pygments_lexer": "ipython3", 696 | "version": "3.4.3" 697 | } 698 | }, 699 | "nbformat": 4, 700 | "nbformat_minor": 0 701 | } 702 | -------------------------------------------------------------------------------- /pydata_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/halflings/python-data-workshop/25f8cfe4f403bbeaacb9d52c25b0b53a46f999ed/pydata_slides.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | lxml 3 | pandas 4 | scikit-learn 5 | matplotlib 6 | numpy 7 | scipy 8 | -------------------------------------------------------------------------------- /sample.csv: -------------------------------------------------------------------------------- 1 | Year,Make,Model,Description,Price 2 | 1997,Ford,E350,"ac, abs, moon",3000.00 3 | 1999,Chevy,"Venture ""Extended Edition""","",4900.00 4 | 1999,Chevy,"Venture ""Extended Edition, Very Large""",,5000.00 5 | 1996,Jeep,Grand Cherokee,"MUST SELL! 6 | air, moon roof, loaded" -------------------------------------------------------------------------------- /sample2.csv: -------------------------------------------------------------------------------- 1 | Office,Year,Sales 2 | Stockholm,2004,200 3 | Stockholm,2005,250 4 | Stockholm,2006,255 5 | Stockholm,2007,260 6 | Stockholm,2008,264 7 | Stockholm,2009,274 8 | Stockholm,2010,330 9 | Stockholm,2011,364 10 | New York,2004,432 11 | New York,2005,469 12 | New York,2006,480 13 | New York,2007,438 14 | New York,2008,330 15 | New York,2009,280 16 | New York,2010,299 17 | New York,2011,230 --------------------------------------------------------------------------------