├── .gitignore ├── 01_simple_requests.py ├── 02_html_parsing.py ├── 03_advanced_parsing.py ├── 04_reading_csv.py ├── 05_analyzing_dataframe.py ├── 06_exploring_data.py ├── 07_text_tfidf.py ├── 08_dictvectorizer.py ├── 09_scaling.py ├── 10_pca.py ├── 11_svm.py ├── 12_regression.py ├── 13_clustering.py ├── 14_cross_validation.py ├── LICENSE ├── README.md ├── data-cloud.png ├── data-workshop-notebook.ipynb ├── pydata_slides.pdf ├── requirements.txt ├── sample.csv └── sample2.csv /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | -------------------------------------------------------------------------------- /01_simple_requests.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | # Fetch a page and print the first 65 characters of its HTML 4 | print(requests.get("http://example.com").text[:65] + " ...") 5 | 6 | # Query the Google Books API and print the first result for "machine learning" 7 | print(requests.get("https://www.googleapis.com/books/v1/volumes", params={"q": "machine learning"}).json()['items'][0]) -------------------------------------------------------------------------------- /02_html_parsing.py: -------------------------------------------------------------------------------- 1 | import lxml.html 2 | 3 | page = lxml.html.parse("http://www.blocket.se/stockholm?q=apple") 4 | # ^ This is probably illegal. Blocket, please don't sue me!
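5 | # find_class() returns all elements carrying the given CSS class; rows that lack a link or an image are skipped by the guard below.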
6 | items_data = [] 7 | for el in page.getroot().find_class("item_row"): 8 | links = el.find_class("item_link") 9 | images = el.find_class("item_image") 10 | if links and images: 11 | items_data.append({"name": links[0].text, 12 | "image": images[0].attrib['src']}) 13 | 14 | print(items_data) -------------------------------------------------------------------------------- /03_advanced_parsing.py: -------------------------------------------------------------------------------- 1 | import lxml.html 2 | 3 | page = lxml.html.parse("http://www.blocket.se/stockholm?q=apple") 4 | 5 | print(len(page.xpath('//a'))) # number of links in the page 6 | print(page.xpath('//img[@class = "item_image"]/@src')) # products' images -------------------------------------------------------------------------------- /04_reading_csv.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | 3 | df = pandas.read_csv('sample.csv') 4 | 5 | # Display the DataFrame 6 | print(df) 7 | print() 8 | 9 | # DataFrame's columns 10 | print(df.columns) 11 | print() 12 | 13 | # Values of a given column 14 | print(df['Model']) 15 | print() -------------------------------------------------------------------------------- /05_analyzing_dataframe.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | 3 | df = pandas.read_csv('sample.csv') 4 | 5 | # Any missing values? 6 | print(df['Price']) 7 | print(df['Description']) 8 | 9 | # Fill missing descriptions with a placeholder, and missing prices by a linear interpolation 10 | df['Description'] = df['Description'].fillna("No description is available.") 11 | df['Price'] = df['Price'].interpolate() 12 | 13 | print(df) -------------------------------------------------------------------------------- /06_exploring_data.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | df = pandas.read_csv('sample2.csv') 6 | 7 | # This table has 3 columns: Office, Year, Sales 8 | print(df.columns) 9 | 10 | # It's really easy to query data with Pandas: 11 | print(df[(df['Office'] == 'Stockholm') & (df['Sales'] > 260)]) 12 | 13 | # It's also easy to do aggregations... 14 | aggregated_sales = df.groupby('Year').sum() 15 | print(aggregated_sales) 16 |
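17 | # groupby('Year').sum() yields one row per year, with Stockholm's and New York's sales added together.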
18 | # ... and generate plots 19 | aggregated_sales.plot(kind='bar') 20 | 21 | plt.show() -------------------------------------------------------------------------------- /07_text_tfidf.py: -------------------------------------------------------------------------------- 1 | from sklearn import feature_extraction 2 | 3 | corpus = ['Cats really are great.', 4 | 'I like cats but I still prefer dogs.', 5 | 'Dogs are the best.', 6 | 'I like trains', 7 | ] 8 | 9 | tfidf = feature_extraction.text.TfidfVectorizer() 10 | 11 | print(tfidf.fit_transform(corpus)) 12 | print(tfidf.get_feature_names()) -------------------------------------------------------------------------------- /08_dictvectorizer.py: -------------------------------------------------------------------------------- 1 | from sklearn import feature_extraction 2 | 3 | data = [{"weight": 60., "sex": 'female', "student": True}, 4 | {"weight": 80.1, "sex": 'male', "student": False}, 5 | {"weight": 65.3, "sex": 'male', "student": True}, 6 | {"weight": 58.5, "sex": 'female', "student": False}] 7 | 8 | vectorizer = feature_extraction.DictVectorizer(sparse=False) 9 | 10 | vectors = vectorizer.fit_transform(data) 11 | print(vectors) 12 | print(vectorizer.get_feature_names()) -------------------------------------------------------------------------------- /09_scaling.py: -------------------------------------------------------------------------------- 1 | from sklearn import preprocessing 2 | 3 | data = [[10., 2345., 0., 2.], 4 | [3., -3490., 0.1, 1.99], 5 | [13., 3903., -0.2, 2.11]] 6 | 7 | print(preprocessing.normalize(data)) -------------------------------------------------------------------------------- /10_pca.py: -------------------------------------------------------------------------------- 1 | from sklearn import decomposition 2 | 3 | data = [[0.3, 0.2, 0.4, 0.32], 4 | [0.3, 0.5, 1.0, 0.19], 5 | [0.3, -0.4, -0.8, 0.22]] 6 | 7 | pca = decomposition.PCA() 8 | print(pca.fit_transform(data)) 9 | print(pca.explained_variance_ratio_) -------------------------------------------------------------------------------- /11_svm.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets 2 | from sklearn import svm 3 | 4 | iris = datasets.load_iris() 5 | 6 | X = iris.data[:, :2] 7 | y = iris.target 8 | 9 | # Training the model 10 | clf = svm.SVC(kernel='rbf') 11 | clf.fit(X, y) 12 | 13 | # Doing predictions 14 | new_data = [[4.85, 3.1], [5.61, 3.02]] 15 | print(clf.predict(new_data)) -------------------------------------------------------------------------------- /12_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import linear_model 3 | import matplotlib.pyplot as plt 4 | 5 | def f(x): 6 | return x + np.random.random() * 3.
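7 | 8 | # np.random.random() is uniform on [0, 1), so f adds noise in [0, 3) to the line y = x; the fitted slope should come out close to 1.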
9 | X = np.arange(0, 5, 0.5) 10 | X = X.reshape((len(X), 1)) 11 | y = list(map(f, X)) # map() is lazy in Python 3; materialize it before fitting 12 | 13 | clf = linear_model.LinearRegression() 14 | clf.fit(X, y) 15 | 16 | new_X = np.arange(0.2, 5.2, 0.3) 17 | new_X = new_X.reshape((len(new_X), 1)) 18 | new_y = clf.predict(new_X) 19 | 20 | plt.scatter(X, y, color='g', label='Training data') 21 | 22 | plt.plot(new_X, new_y, '.-', label='Predicted') 23 | 24 | plt.legend() 25 | plt.show() -------------------------------------------------------------------------------- /13_clustering.py: -------------------------------------------------------------------------------- 1 | from sklearn.cluster import DBSCAN 2 | from sklearn.datasets import make_blobs # importable directly from sklearn.datasets; the samples_generator module was removed in later scikit-learn releases 3 | from sklearn.preprocessing import StandardScaler 4 | import matplotlib.pyplot as plt 5 | 6 | ############################################################################## 7 | # Generate sample data 8 | centers = [[1, 1], [-1, -1], [1, -1]] 9 | X, labels_true = make_blobs(n_samples=200, centers=centers, cluster_std=0.4, 10 | random_state=0) 11 | X = StandardScaler().fit_transform(X) 12 | 13 | ############################################################################## 14 | # Compute DBSCAN (a label of -1 marks noise points) 15 | db = DBSCAN(eps=0.3, min_samples=10).fit(X) 16 | labels = db.labels_ 17 | 18 | plt.scatter(X[:, 0], X[:, 1], c=labels) 19 | plt.show() 20 | print(labels) -------------------------------------------------------------------------------- /14_cross_validation.py: -------------------------------------------------------------------------------- 1 | from sklearn import svm, cross_validation, datasets # cross_validation became sklearn.model_selection in scikit-learn 0.18 2 | 3 | iris = datasets.load_iris() 4 | X, y = iris.data, iris.target 5 | 6 | model = svm.SVC() 7 | print(cross_validation.cross_val_score(model, X, y, scoring='precision_weighted')) # plain 'precision' is ambiguous for the 3-class iris target 8 | print(cross_validation.cross_val_score(model, X, y, scoring='mean_squared_error')) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files.
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data-processing and Machine Learning with Python 2 | 3 | ![Learn how to do data-processing and machine learning with Python](https://github.com/halflings/python-data-workshop/blob/master/data-cloud.png) 4 | 5 | Code and slides for a workshop I originally hosted at KTH (and also at the Swedish Bioinformatics Workshop), showcasing Python and its data-processing and machine-learning capabilities. 6 | 7 | You can [view the slides on Speaker Deck](https://speakerdeck.com/halflings/data-processing-and-machine-learning-with-python), [download them](https://github.com/halflings/python-data-workshop/blob/master/pydata_slides.pdf) and, *highly recommended*, have a look at the accompanying [IPython notebook](http://nbviewer.ipython.org/github/halflings/python-data-workshop/blob/master/data-workshop-notebook.ipynb). 8 | -------------------------------------------------------------------------------- /data-cloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/halflings/python-data-workshop/25f8cfe4f403bbeaacb9d52c25b0b53a46f999ed/data-cloud.png -------------------------------------------------------------------------------- /data-workshop-notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np\n", 14 | "plt.style.use('ggplot')\n", 15 | "plt.rcParams['figure.figsize'] = 16, 9" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "# Data analytics and machine learning with Python" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "# I - Acquiring data" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "### A simple HTTP request" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": { 43 | "collapsed": false, 44 | "scrolled": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "import requests\n", 49 | "\n", 50 | "print(requests.get(\"http://example.com\").text)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "### Communicating with APIs" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "response = requests.get(\"https://www.googleapis.com/books/v1/volumes\", params={\"q\":\"machine learning\"})\n", 69 | "raw_data = response.json()\n", 70 | "titles = [item['volumeInfo']['title'] for item in raw_data['items']]\n", 71 | "titles" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "### Parsing websites" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "import lxml.html\n", 90 | "\n", 91 | "page = lxml.html.parse(\"http://www.blocket.se/stockholm?q=apple\")\n", 92 | "# ^ This is probably illegal. 
Blocket, please don't sue me!\n", 93 | "items_data = []\n", 94 | "for el in page.getroot().find_class(\"item_row\"):\n", 95 | " links = el.find_class(\"item_link\")\n", 96 | " images = el.find_class(\"item_image\")\n", 97 | " prices = el.find_class(\"list_price\")\n", 98 | " if links and images and prices and prices[0].text:\n", 99 | " items_data.append({\"name\": links[0].text,\n", 100 | " \"image\": images[0].attrib['src'],\n", 101 | " \"price\": int(prices[0].text.split(\":\")[0].replace(\" \", \"\"))})\n", 102 | "items_data" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "### Reading local files (CSV/JSON)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "import pandas\n", 121 | "\n", 122 | "df = pandas.read_csv('sample.csv')" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "# Display the DataFrame\n", 134 | "df" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": false 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "# DataFrame's columns\n", 146 | "df.columns" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "collapsed": false 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "# Values of a given column\n", 158 | "df.Model" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "# Analyzing the dataframe" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "collapsed": false 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "# Any missing values?\n", 177 | "df['Price']" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "collapsed": false 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "df['Description']" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": { 195 | "collapsed": false 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "# Fill missing prices by a linear interpolation\n", 200 | "df['Description'] = df['Description'].fillna(\"No description is available.\")\n", 201 | "df['Price'] = df['Price'].interpolate()\n", 202 | "\n", 203 | "df" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "# II - Exploring data" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "import matplotlib.pyplot as plt\n", 222 | "\n", 223 | "df = pandas.read_csv('sample2.csv')\n", 224 | "\n", 225 | "df" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": { 232 | "collapsed": false 233 | }, 234 | "outputs": [], 235 | "source": [ 236 | "# This table has 3 columns: Office, Year, Sales\n", 237 | "df.columns" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "collapsed": false 245 | }, 246 | "outputs": [], 247 | "source": [ 248 | "# It's really easy to query data with Pandas:\n", 249 | "df[(df['Office'] == 'Stockholm') & (df['Sales'] > 260)]" 250 | 
] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": { 256 | "collapsed": false 257 | }, 258 | "outputs": [], 259 | "source": [ 260 | "# It's also easy to do aggregations...\n", 261 | "aggregated_stockholm_sales = df[df.Office == 'Stockholm'].groupby('Year').sum()\n", 262 | "aggregated_stockholm_sales" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "collapsed": false 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "aggregated_ny_sales = df[df.Office == 'New York'].groupby('Year').sum()\n", 274 | "# ... and generate plots\n", 275 | "aggregated_stockholm_sales.plot(kind='bar')\n", 276 | "aggregated_ny_sales.plot(kind='bar', color='g')" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "# Machine learning" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "## Feature extraction" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": { 297 | "collapsed": false 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "from sklearn import feature_extraction" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "### Extracting features from text" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": { 315 | "collapsed": false 316 | }, 317 | "outputs": [], 318 | "source": [ 319 | "corpus = ['Cats? I love cats!',\n", 320 | " 'I love dogs.',\n", 321 | " 'I hate cats :(',\n", 322 | " 'I love trains',\n", 323 | " ]\n", 324 | "\n", 325 | "tfidf = feature_extraction.text.TfidfVectorizer()\n", 326 | "\n", 327 | "print(tfidf.fit_transform(corpus).toarray())\n", 328 | "print(tfidf.get_feature_names())" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "### Dict vectorizer" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": { 342 | "collapsed": false 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "import json\n", 347 | "\n", 348 | "\n", 349 | "data = [json.loads(\"\"\"{\"weight\": 194.0, \"sex\": \"female\", \"student\": true}\"\"\"),\n", 350 | " {\"weight\": 60., \"sex\": 'female', \"student\": True},\n", 351 | " {\"weight\": 80.1, \"sex\": 'male', \"student\": False},\n", 352 | " {\"weight\": 65.3, \"sex\": 'male', \"student\": True},\n", 353 | " {\"weight\": 58.5, \"sex\": 'female', \"student\": False}]\n", 354 | "\n", 355 | "vectorizer = feature_extraction.DictVectorizer(sparse=False)\n", 356 | "\n", 357 | "vectors = vectorizer.fit_transform(data)\n", 358 | "print(vectors)\n", 359 | "print(vectorizer.get_feature_names())" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "### Pre-processing" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "##### Scaling" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "collapsed": false 381 | }, 382 | "outputs": [], 383 | "source": [ 384 | "from sklearn import preprocessing\n", 385 | "\n", 386 | "data = [[10., 2345., 0., 2.],\n", 387 | " [3., -3490., 0.1, 1.99],\n", 388 | " [13., 3903., -0.2, 2.11]]\n", 389 | "\n", 390 | "preprocessing.normalize(data)" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": {}, 396 | "source": [ 397 | 
"##### Dimensionality reduction" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": { 404 | "collapsed": false 405 | }, 406 | "outputs": [], 407 | "source": [ 408 | "from sklearn import decomposition\n", 409 | "\n", 410 | "data = [[0.3, 0.2, 0.4, 0.32],\n", 411 | " [0.3, 0.5, 1.0, 0.19],\n", 412 | " [0.3, -0.4, -0.8, 0.22]]\n", 413 | "\n", 414 | "pca = decomposition.PCA()\n", 415 | "print(pca.fit_transform(data))\n", 416 | "print(pca.explained_variance_ratio_)" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "# Machine learning models" 424 | ] 425 | }, 426 | { 427 | "cell_type": "markdown", 428 | "metadata": {}, 429 | "source": [ 430 | "## Classification (SVM)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": null, 436 | "metadata": { 437 | "collapsed": true 438 | }, 439 | "outputs": [], 440 | "source": [ 441 | "from sklearn import datasets\n", 442 | "from sklearn import svm" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": { 449 | "collapsed": false 450 | }, 451 | "outputs": [], 452 | "source": [ 453 | "iris = datasets.load_iris()\n", 454 | "\n", 455 | "X = iris.data[:, :2]\n", 456 | "y = iris.target\n", 457 | "\n", 458 | "plt.scatter(X[:, 0], X[:, 1], color=['rgb'[v] for v in y])\n", 459 | "\n", 460 | "to_predict = np.array([[4.35, 3.1], [5.61, 2.42]])\n", 461 | "plt.scatter(to_predict[:, 0], to_predict[:, 1], color='purple')" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": { 468 | "collapsed": false 469 | }, 470 | "outputs": [], 471 | "source": [ 472 | "# Training the model\n", 473 | "clf = svm.SVC(kernel='rbf')\n", 474 | "clf.fit(X, y)\n", 475 | "\n", 476 | "# Doing predictions\n", 477 | "print(clf.predict(to_predict))" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [ 484 | "## Regression (linear regression)" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": { 491 | "collapsed": false 492 | }, 493 | "outputs": [], 494 | "source": [ 495 | "import numpy as np\n", 496 | "from sklearn import linear_model\n", 497 | "import matplotlib.pyplot as plt\n", 498 | "\n", 499 | "def f(x):\n", 500 | " return x + np.random.random() * 3.\n", 501 | "\n", 502 | "X = np.arange(0, 5, 0.5)\n", 503 | "X = X.reshape((len(X), 1))\n", 504 | "y = list(map(f, X))\n", 505 | "\n", 506 | "clf = linear_model.LinearRegression()\n", 507 | "clf.fit(X, y)" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": { 514 | "collapsed": false 515 | }, 516 | "outputs": [], 517 | "source": [ 518 | "new_X = np.arange(0.2, 5.2, 0.3)\n", 519 | "new_X = new_X.reshape((len(new_X), 1))\n", 520 | "new_y = clf.predict(new_X)\n", 521 | "\n", 522 | "plt.scatter(X, y, color='g', label='Training data')\n", 523 | "\n", 524 | "plt.plot(new_X, new_y, '.-', label='Predicted')\n", 525 | "plt.legend()" 526 | ] 527 | }, 528 | { 529 | "cell_type": "markdown", 530 | "metadata": {}, 531 | "source": [ 532 | "## Clustering (DBScan)" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "metadata": { 539 | "collapsed": false 540 | }, 541 | "outputs": [], 542 | "source": [ 543 | "from sklearn.cluster import DBSCAN\n", 544 | "from sklearn.datasets.samples_generator import make_blobs\n", 545 | "from sklearn.preprocessing import StandardScaler\n", 
546 | "\n", 547 | "# Generate sample data\n", 548 | "centers = [[1, 1], [-1, -1], [1, -1]]\n", 549 | "X, labels_true = make_blobs(n_samples=200, centers=centers, cluster_std=0.3,\n", 550 | " random_state=0)\n", 551 | "plt.scatter(X[:, 0], X[:, 1])" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "metadata": { 558 | "collapsed": false 559 | }, 560 | "outputs": [], 561 | "source": [ 562 | "# Compute DBSCAN\n", 563 | "db = DBSCAN(eps=0.3, min_samples=10).fit(X)\n", 564 | "db.labels_" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": { 571 | "collapsed": false 572 | }, 573 | "outputs": [], 574 | "source": [ 575 | "import matplotlib.pyplot as plt\n", 576 | "plt.scatter(X[:, 0], X[:, 1], c=['rgbw'[v] for v in db.labels_])" 577 | ] 578 | }, 579 | { 580 | "cell_type": "markdown", 581 | "metadata": {}, 582 | "source": [ 583 | "## Cross-validation" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": null, 589 | "metadata": { 590 | "collapsed": false 591 | }, 592 | "outputs": [], 593 | "source": [ 594 | "from sklearn import svm, cross_validation, datasets\n", 595 | "\n", 596 | "iris = datasets.load_iris()\n", 597 | "X, y = iris.data, iris.target\n", 598 | "\n", 599 | "model = svm.SVC()\n", 600 | "print(cross_validation.cross_val_score(model, X, y, scoring='precision_weighted'))\n", 601 | "print(cross_validation.cross_val_score(model, X, y, scoring='mean_squared_error'))" 602 | ] 603 | }, 604 | { 605 | "cell_type": "markdown", 606 | "metadata": {}, 607 | "source": [ 608 | "# A more complex Machine Learning pipeline: \"what's cooking?\"\n", 609 | "This is a basic solution I wrote for the Kaggle competition \"What's cooking?\", where the goal is to predict which type of cuisine a meal belongs to, based on its list of ingredients.\n", 610 | "\n", 611 | "You'll need more advanced features and methods to win a Kaggle competition, but this already gets you 90% there."
612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": null, 617 | "metadata": { 618 | "collapsed": false 619 | }, 620 | "outputs": [], 621 | "source": [ 622 | "from collections import Counter\n", 623 | "import json\n", 624 | "\n", 625 | "import pandas as pd\n", 626 | "import scipy.sparse\n", 627 | "import sklearn.pipeline\n", 628 | "import sklearn.cross_validation\n", 629 | "import sklearn.feature_extraction\n", 630 | "import sklearn.naive_bayes\n", 631 | "\n", 632 | "def open_dataset(path):\n", 633 | " with open(path) as file:\n", 634 | " data = json.load(file)\n", 635 | " df = pd.DataFrame(data).set_index('id')\n", 636 | " return df\n", 637 | "\n", 638 | "df = open_dataset('train.json')\n", 639 | "\n", 640 | "pipeline = sklearn.pipeline.make_pipeline(sklearn.feature_extraction.DictVectorizer(), sklearn.feature_extraction.text.TfidfTransformer(sublinear_tf=True))\n", 641 | "pipeline_bis = sklearn.pipeline.make_pipeline(sklearn.feature_extraction.DictVectorizer(), sklearn.feature_extraction.text.TfidfTransformer(sublinear_tf=True))\n", 642 | "\n", 643 | "def map_term_count(ingredients):\n", 644 | " return Counter(sum((i.split(' ') for i in ingredients), []))\n", 645 | "X = pipeline.fit_transform(df.ingredients.apply(Counter))\n", 646 | "X = scipy.sparse.hstack([X, pipeline_bis.fit_transform(df.ingredients.apply(map_term_count))])\n", 647 | "y = df.cuisine.values\n", 648 | "\n", 649 | "model = sklearn.naive_bayes.MultinomialNB(alpha=0.1)\n", 650 | "\n", 651 | "# Cross-validation\n", 652 | "score = sklearn.cross_validation.cross_val_score(model, X, y, cv=2)\n", 653 | "print(score)\n", 654 | "\n", 655 | "# Running on the test dataset\n", 656 | "t_df = open_dataset('test.json')\n", 657 | "X_test = pipeline.transform(t_df.ingredients.apply(Counter))\n", 658 | "X_test = scipy.sparse.hstack([X_test, pipeline_bis.transform(t_df.ingredients.apply(map_term_count))])\n", 659 | "\n", 660 | "model.fit(X, y)\n", 661 | "\n", 662 | "predictions = model.predict(X_test)\n", 663 | "result_df = pd.DataFrame(index=t_df.index)\n", 664 | "result_df['cuisine'] = pd.Series(predictions, index=result_df.index)\n", 665 | "\n", 666 | "result_df['ingredients'] = t_df['ingredients']\n", 667 | "result_df" 668 | ] 669 | }, 670 | { 671 | "cell_type": "markdown", 672 | "metadata": {}, 673 | "source": [ 674 | "## Thanks for following! 
I hope you learned a thing or two :-)\n", 675 | "\n", 676 | "Feel free to ask any questions, or contact me on [kachkach.com](http://www.kachkach.com) / [@halflings](http://github.com/halflings)" 677 | ] 678 | } 679 | ], 680 | "metadata": { 681 | "kernelspec": { 682 | "display_name": "Python 3", 683 | "language": "python", 684 | "name": "python3" 685 | }, 686 | "language_info": { 687 | "codemirror_mode": { 688 | "name": "ipython", 689 | "version": 3 690 | }, 691 | "file_extension": ".py", 692 | "mimetype": "text/x-python", 693 | "name": "python", 694 | "nbconvert_exporter": "python", 695 | "pygments_lexer": "ipython3", 696 | "version": "3.4.3" 697 | } 698 | }, 699 | "nbformat": 4, 700 | "nbformat_minor": 0 701 | } 702 | -------------------------------------------------------------------------------- /pydata_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/halflings/python-data-workshop/25f8cfe4f403bbeaacb9d52c25b0b53a46f999ed/pydata_slides.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | lxml 3 | pandas 4 | scikit-learn 5 | matplotlib 6 | numpy 7 | scipy 8 | -------------------------------------------------------------------------------- /sample.csv: -------------------------------------------------------------------------------- 1 | Year,Make,Model,Description,Price 2 | 1997,Ford,E350,"ac, abs, moon",3000.00 3 | 1999,Chevy,"Venture ""Extended Edition""","",4900.00 4 | 1999,Chevy,"Venture ""Extended Edition, Very Large""",,5000.00 5 | 1996,Jeep,Grand Cherokee,"MUST SELL! 6 | air, moon roof, loaded" -------------------------------------------------------------------------------- /sample2.csv: -------------------------------------------------------------------------------- 1 | Office,Year,Sales 2 | Stockholm,2004,200 3 | Stockholm,2005,250 4 | Stockholm,2006,255 5 | Stockholm,2007,260 6 | Stockholm,2008,264 7 | Stockholm,2009,274 8 | Stockholm,2010,330 9 | Stockholm,2011,364 10 | New York,2004,432 11 | New York,2005,469 12 | New York,2006,480 13 | New York,2007,438 14 | New York,2008,330 15 | New York,2009,280 16 | New York,2010,299 17 | New York,2011,230 --------------------------------------------------------------------------------