├── LICENSE ├── README.md ├── dss-2016 ├── README.md ├── churn_prediction │ ├── README.md │ ├── churn-tutorial-explore-explain.ipynb │ ├── churn-tutorial.ipynb │ └── user-activity-data.ipynb ├── deep_learning │ ├── Deep_Learning_for_Image_Analysis.ipynb │ └── README.md ├── lead_scoring │ ├── README.md │ └── lead_scoring_tutorial.ipynb └── recommendation_systems │ ├── README.md │ ├── Recommender DeepDive - Part 1.ipynb │ ├── Recommender DeepDive - Part 2.ipynb │ ├── book-recommender-exercises.ipynb │ └── book-recommender-solutions.ipynb ├── notebooks ├── AnyGivenSunday.ipynb ├── airline_on-time_performance.ipynb ├── autotagging_hacker_news_posts.ipynb ├── bank_lead_scoring_demo.ipynb ├── basic_recommender_functionalities.ipynb ├── build_imagenet_deeplearning.ipynb ├── customer-churn-prediction.ipynb ├── data_mining_web_session_logs.ipynb ├── datapipeline_recsys_intro.ipynb ├── datas_messy_clean_it.ipynb ├── deep_text_learning.ipynb ├── deploy-scikit-learn-in-ps.ipynb ├── feature-engineering.ipynb ├── feature_engineering_with_graphlab_create.ipynb ├── five_line_recommender.ipynb ├── follow_the_cryptocurrency.ipynb ├── food_retrieval-public.ipynb ├── fraud-detection.ipynb ├── getting_started_with_graphlab_create.ipynb ├── getting_started_with_python.ipynb ├── graph_analytics_movies.ipynb ├── image_similarity.ipynb ├── intro-regression.ipynb ├── introduction_to_sframes.ipynb ├── kaggle_bike_share_prediction.ipynb ├── linear_regression_benchmark.ipynb ├── link_prediction.ipynb ├── machine_learning_with_graphLab_create.ipynb ├── model_parameter_search.ipynb ├── predictive_services_intro.ipynb ├── product_matching.ipynb ├── reading_data_from_impala.ipynb ├── recsys_explicit_rating.ipynb ├── recsys_rank_10K_song.ipynb ├── reddit_analysis.ipynb ├── sentiment_classifier.ipynb ├── sherlock_text_analytics.ipynb └── spark_and_graphlab_create.ipynb ├── strata-nyc-2015 ├── README.md ├── deep-learning │ ├── Deep Learning for Image Classification and Finding Similar Images.ipynb │ ├── image_similarity.ipynb │ └── images │ │ ├── AA1.png │ │ ├── alexnet.png │ │ ├── cifar.png │ │ ├── evaluate.png │ │ ├── extract_features.png │ │ ├── improve.png │ │ ├── linear.png │ │ ├── load.png │ │ ├── quadratic.png │ │ ├── spiral.1-2.2-2-2-2-2-2.jpg │ │ ├── train.png │ │ ├── workflow1.png │ │ ├── workflow2.png │ │ ├── workflow3.png │ │ └── workflow4.png ├── deployment │ ├── images │ │ ├── left.png │ │ ├── middle.png │ │ ├── predictive_services_overview.png │ │ └── right.png │ ├── predictive_services.ipynb │ └── scikit_deployment.ipynb ├── feature_engineering │ ├── Feature Engineering for Text Data.ipynb │ └── PCA demo.ipynb └── recommendation-systems │ ├── README.md │ ├── Recommender DeepDive - Part 1.ipynb │ ├── Recommender DeepDive - Part 2.ipynb │ ├── book-recommender-exercises.ipynb │ ├── book-recommender-solutions.ipynb │ ├── strata-nyc-2015-recommendation-systems.key │ └── strata-nyc-2015-recommendation-systems.pptx ├── strata-sj-2016 ├── README.md ├── deep-learning │ ├── Deep Learning for Image Classification and Finding Similar Images.ipynb │ ├── Strata-SJ-2016-Deeplearning.pptx │ ├── image_similarity.ipynb │ └── images │ │ ├── AA1.png │ │ ├── alexnet.png │ │ ├── cifar.png │ │ ├── evaluate.png │ │ ├── extract_features.png │ │ ├── improve.png │ │ ├── linear.png │ │ ├── load.png │ │ ├── quadratic.png │ │ ├── spiral.1-2.2-2-2-2-2-2.jpg │ │ ├── train.png │ │ ├── workflow1.png │ │ ├── workflow2.png │ │ ├── workflow3.png │ │ └── workflow4.png ├── intro-ml │ ├── getting-started-with-sframes.ipynb │ └── 
sentiment_analysis.ipynb ├── ml-in-production │ ├── deploy-dress-recommender.ipynb │ ├── deploy-scikit-learn.ipynb │ ├── images │ │ ├── left.png │ │ ├── middle.png │ │ ├── predictive_services_overview.png │ │ └── right.png │ └── ml-production.key ├── recommendation-systems │ ├── README.md │ ├── Recommender DeepDive - Part 1.ipynb │ ├── Recommender DeepDive - Part 2.ipynb │ ├── book-recommender-exercises.ipynb │ ├── book-recommender-solutions.ipynb │ ├── strata-sj-2016-recommendation-systems.key │ └── strata-sj-2016-recommendation-systems.pptx └── time-series │ ├── anomaly_detection.ipynb │ ├── forecasting_basics.ipynb │ ├── interactive_plot.py │ ├── time_series_analysis_public.pptx │ └── time_series_data_object.ipynb └── webinars ├── README.md ├── pattern-mining ├── demo.ipynb ├── deployment.ipynb └── images │ ├── left.png │ ├── middle.png │ ├── predictive_services_overview.png │ └── right.png └── product-reviews ├── README.md ├── helper_util.py └── text_demo.ipynb /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Turi Tutorials 2 | 3 | This repository contains materials for demos, tutorials, and talks by Turi. 4 | You can browse the notebooks using Github's own notebook viewer. Note that some 5 | images may not be rendered correctly. 6 | 7 | If you'd like to run it, you may register for GraphLab Create 8 | (https://turi.com/download/), then follow instructions to install. 
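Once GraphLab Create is installed, a minimal sketch of how a session with these notebooks might start (assuming a Python 2.7 environment, which is what the notebooks in this repository target; the notebook path is just one example from the `notebooks/` directory):

```python
# Launch Jupyter from the repository root, e.g.:
#   jupyter notebook notebooks/getting_started_with_graphlab_create.ipynb
import graphlab as gl

# Quick sanity check that the installation works
sf = gl.SFrame({'user_id': [1, 2, 3], 'rating': [5, 3, 4]})
print(sf)
```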
9 | 10 | - GraphLab Create User Guide: https://turi.com/learn/userguide 11 | - GraphLab Forum: http://forum.turi.com/categories/graphlab-create 12 | 13 | # Events 14 | 15 | - Webinars [[Notebooks]](webinars/README.md) 16 | - Strata + Hadoop World, New York City 2015 [[Event Page]](http://strataconf.com/big-data-conference-ny-2015/public/schedule/detail/43217) [[Tutorials]](strata-nyc-2015/README.md) 17 | - Strata + Hadoop World, San Jose, 2016 [[Event Page]](http://conferences.oreilly.com/strata/hadoop-big-data-ca/public/schedule/detail/47056). 18 | - Data Science Summit, San Francisco, 2016 19 | -------------------------------------------------------------------------------- /dss-2016/README.md: -------------------------------------------------------------------------------- 1 | # Data Science Summit 2016, San Francisco 2 | 3 | This directory contains demo notebooks used for the collection of **machine 4 | learning tutorials** at the [Data Science Summit 5 | 2016](https://conf.turi.com/2016/us/). 6 | 7 | The tutorials introduce machine learning via real applications like: 8 | - recommender systems [Event 9 | Page](https://turi.com/events/training/2016-dss-personalization-tutorial.html) 10 | - object detection with deep learning [Event 11 | Page](https://turi.com/events/training/2016-dss-image-apps-tutorial.html) 12 | - predicting customer churn [Event 13 | Page](https://turi.com/events/training/2016-dss-customer-intelligence-tutorial.html) 14 | - lead scoring [Event 15 | Page](https://turi.com/events/training/2016-dss-customer-intelligence-tutorial.html) 16 | 17 | ## Setup Instructions 18 | 19 | You can browse the notebooks using Github's notebook viewer, but please note 20 | that some images may not be rendered correctly. 21 | 22 | Follow these steps to set up and run the notebooks on your own machine. 23 | 24 | - [Download](https://turi.com/download/) GraphLab Create v2.0.1 and then follow 25 | instructions to [install](https://turi.com/download/install.html). 26 | 27 | - Download and unzip the datasets 28 | [here](https://s3-us-west-2.amazonaws.com/turi-tutorials/TURI.zip). 29 | 30 | - [Install Jupyter 31 | notebook](http://jupyter.readthedocs.org/en/latest/install.html) (needed only 32 | if you install GraphLab Create via command line) 33 | 34 | ## Handy references 35 | 36 | - [GraphLab Create User Guide](https://turi.com/learn/userguide) 37 | - [GraphLab Forum](http://forum.turi.com/categories/graphlab-create) 38 | -------------------------------------------------------------------------------- /dss-2016/churn_prediction/README.md: -------------------------------------------------------------------------------- 1 | Churn Prediction 2 | ================= 3 | 4 | User-activity data records typically contain user actions performed on a 5 | website, service, or product. These user activity transactions tell us a lot 6 | about the user's current interests & preferences. Knowing these interests and 7 | preferences can help businesses make better decisions. 8 | 9 | Recommender systems, fraud detection, churn prediction, and lead scoring are 10 | examples of data products that rely on user-activity data. In this 11 | two-part tutorial, you will first learn how to work with user activity data 12 | and then learn about two specific examples of applications that can leverage 13 | user activity data: churn prediction and lead scoring. 14 | 15 | Churn prediction is the task of identifying users that are likely to stop 16 | using a service, product or website.
Lead scoring is the task of prioritizing 17 | users based on the probability that they will start using a service, 18 | product or website. In the first part of the tutorial, you will learn to: 19 | - Train a model to forecast user churn 20 | - Explore and evaluate predictions made by the model 21 | - Consume predictions made by the model in an external application 22 | 23 | -------------------------------------------------------------------------------- /dss-2016/deep_learning/README.md: -------------------------------------------------------------------------------- 1 | Deep Learning 2 | ============= 3 | 4 | Deep Learning methods have been driving state-of-the-art results in computer 5 | vision, speech recognition, and natural language processing. This is important 6 | in areas such as face detection, photo organization, and machine 7 | translation. As a result, user experiences for applications have improved 8 | dramatically. 9 | 10 | In this tutorial, I'll be deconstructing an image application using GraphLab 11 | Create, and introducing Deep Learning concepts in the process. The focus will 12 | be more on practical usage and less on theory. 13 | -------------------------------------------------------------------------------- /dss-2016/lead_scoring/README.md: -------------------------------------------------------------------------------- 1 | Lead Scoring 2 | ============ 3 | 4 | Prioritizing new leads is critical for sales and marketing teams. Modern 5 | machine learning methods use historical data and state-of-the-art classifiers 6 | to learn a probabilistic relationship between sales account features and 7 | conversion outcome, enabling us to predict with very high accuracy which open 8 | accounts are likely to convert. This improvement in accuracy translates 9 | directly into more efficient use of sales and marketing resources. 10 | 11 | In this tutorial, I'll discuss the major themes in predictive lead scoring and 12 | walk through Python code for building a lead scoring application. The demo uses 13 | Turi's new Lead Scoring Toolkit, but the focus is on the design and 14 | implementation principles for the lead scoring task. 15 | -------------------------------------------------------------------------------- /dss-2016/lead_scoring/lead_scoring_tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 1. Introduction" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "**The scenario**: suppose we run an online travel agency. We would like to convince our users to book overseas vacations, rather than domestic ones. Each of the users in this dataset will definitely book *something* at the end of a given trial period, i.e. we are only looking at engaged customers.\n", 15 | "\n", 16 | "**Goals**:\n", 17 | "1. predict which new users are most likely to book an overseas trip,\n", 18 | "2. generate segmentation rules to group similar users based on features and propensity to convert.\n", 19 | "\n", 20 | "**Data**: mimics the [AirBnB challenge on Kaggle](https://www.kaggle.com/c/airbnb-recruiting-new-user-bookings).\n", 21 | "- Users\n", 22 | "- Website or app sessions.\n", 23 | "\n", 24 | "I've simulated data that's very similar in terms of features and distributions, but I've added timestamps to the sessions, and changed the target from country to a binary domestic vs.
international variable.\n", 25 | "\n", 26 | "**Sections**:\n", 27 | "1. Introduction\n", 28 | "2. The basic scenario - account data only\n", 29 | "3. What's happening under the hood?\n", 30 | "4. Incorporating activity data." 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "from __future__ import print_function\n", 42 | "import graphlab as gl" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "# 2. The basic scenario" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Import the data: sales accounts" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "- **Sales accounts need not be synonymous with users**, although that is the case here. At Turi, our sales accounts consist of a mix of individual users, companies, and teams within large companies.\n", 64 | "\n", 65 | "- **The accounts dataset typically comes from a customer relationship management (CRM) tool**, like Salesforce, SAP, or Hubspot. In practice there is an extra step here of extracting the data from that system into an SFrame. " 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": false 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "users = gl.SFrame('synthetic_airbnb_users.sfr')\n", 77 | "users.print_rows(3)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "users['status'].sketch_summary()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "## Encode the target variable" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Three types of accounts.\n", 103 | "- **Successful accounts**, i.e conversions, are coded as 1.\n", 104 | "- **Failed accounts** are coded as -1.\n", 105 | "- **Open accounts**, i.e. accounts that have not been decided, are coded as 0.\n", 106 | "\n", 107 | "Together, successful and failed accounts constitute the **training accounts**." 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "status_code = {'international': 1,\n", 119 | " 'domestic': -1,\n", 120 | " 'new': 0}\n", 121 | "\n", 122 | "users['outcome'] = users['status'].apply(lambda x: status_code[x])\n", 123 | "users[['status', 'outcome']].print_rows(10)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## Define the schema" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "In a complex problem like lead scoring, there are potentially many columns with \"meaning\". To help the lead scoring tool recognize these columns, we define a dictionary that maps standard lead scoring inputs to the columns in our particular dataset." 
138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "collapsed": true 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "user_schema = {'conversion_status': 'outcome',\n", 149 | " 'account_id': 'id',\n", 150 | " 'features': ['gender', 'age', 'signup_method', 'signup_app',\n", 151 | " 'first_device_type', 'first_browser']}" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "## Create the lead scoring tool" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "**All accounts are passed to the tool when it's created. There is no separate `predict` method.**\n", 166 | "- We typically want to score the same set of open accounts each day during the trial period.\n", 167 | "- Very rarely do we want to predict lead scores for different accounts.\n", 168 | "- It makes more sense to keep the open accounts in the model, so we can incrementally update the lead scores and market segments, as new data comes in.\n", 169 | "- The `update` method is not yet implemented :(" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "collapsed": false, 177 | "scrolled": true 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "scorer = gl.lead_scoring.create(users, user_schema)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## Retrieve the model output and export" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "There's a lot of stuff in the lead scoring model's summary. Let's focus on the accessible fields, three in particular:\n", 196 | "- **open_account_scores**: conversion probability and market segment for *open accounts*\n", 197 | "- **training_account_scores**: conversion probability and market segment for *existing successes and failures*\n", 198 | "- **segment_descriptions**: definitions and summary statistics for the market segments" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": false 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "print(scorer)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "collapsed": false 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "scorer.open_account_scores.head(3)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "collapsed": false 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "scorer.open_account_scores.topk('conversion_prob', k=3)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": false 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "scorer.training_account_scores.head(3)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "scorer.segment_descriptions.head(3)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": { 260 | "collapsed": false 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "scorer.segment_descriptions[['segment_id', 'segment_features']].print_rows(max_column_width=65)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": 
[ 271 | "To get the training or open accounts that belong to a particular market segment, use the respective SFrame's `filter_by` method." 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": { 278 | "collapsed": false 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "seg = scorer.training_account_scores.filter_by(8, 'segment_id').head(3)\n", 283 | "print(seg)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "# 3. What's happening under the hood?" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "## The scoring model: gradient boosted trees" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": { 304 | "collapsed": false 305 | }, 306 | "outputs": [], 307 | "source": [ 308 | "print(scorer.scoring_model)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "Additional keyword arguments to the lead scoring `create` function are passed through to the gradient boosted trees model." 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "collapsed": false 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "scorer2 = gl.lead_scoring.create(users, user_schema, max_iterations=20, verbose=False)\n", 327 | "print(\"Original num trees:\", scorer.scoring_model.num_trees)\n", 328 | "print(\"New num trees:\", scorer2.scoring_model.num_trees)" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "## Validating the scoring model " 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "By default, the gradient boosted trees model withholds ??? percent of the training accounts as a validation set. The validation accuracy can be accessed as a user." 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": { 349 | "collapsed": false 350 | }, 351 | "outputs": [], 352 | "source": [ 353 | "print(\"Validation accuracy:\", scorer.scoring_model.validation_accuracy)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "## The segmentation model: decision tree" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": { 367 | "collapsed": false 368 | }, 369 | "outputs": [], 370 | "source": [ 371 | "print(scorer.segmentation_model)" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "Because training the lead scoring tool can take some time with large datasets, the number of segments can be changed *after* a lead scoring tool has been created. This function **creates a new model**, the original model is **immutable**." 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": { 385 | "collapsed": false 386 | }, 387 | "outputs": [], 388 | "source": [ 389 | "scorer2 = scorer.resize_segmentation_model(max_segments=20)\n", 390 | "\n", 391 | "print(\"original number of segments:\", scorer.segment_descriptions.num_rows())\n", 392 | "print(\"new number of segments:\", scorer2.segment_descriptions.num_rows())" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "# 4. 
Incorporating activity data" 400 | ] 401 | }, 402 | { 403 | "cell_type": "markdown", 404 | "metadata": {}, 405 | "source": [ 406 | "**Account activity data** describes interactions between accounts and aspects of your business, like web assets, email campaigns, or products. Conceptually, each interaction involves at a minimum:\n", 407 | "- an account\n", 408 | "- a timestamp\n", 409 | "\n", 410 | "Interactions may also have:\n", 411 | "- an \"item\"\n", 412 | "- a user\n", 413 | "- other features" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": { 420 | "collapsed": false 421 | }, 422 | "outputs": [], 423 | "source": [ 424 | "sessions = gl.SFrame('synthetic_airbnb_sessions.sfr')\n", 425 | "sessions = gl.TimeSeries(sessions, index='timestamp')\n", 426 | "sessions.head(5)" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": {}, 432 | "source": [ 433 | "As with the accounts table, we need to indicate which columns in the activity table mean what. If we had a column indicating which user was involved, we could specify that as well here. In this scenario, we don't have users that are distinct from accounts." 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": { 440 | "collapsed": true 441 | }, 442 | "outputs": [], 443 | "source": [ 444 | "session_schema = {'account_id': 'user_id',\n", 445 | " 'item': 'action_detail'}" 446 | ] 447 | }, 448 | { 449 | "cell_type": "markdown", 450 | "metadata": {}, 451 | "source": [ 452 | "## Define relevant dates" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | "source": [ 459 | "To use account activity data, a lead scoring tool needs to know the time window for each account's relevant interactions. There are three key dates for each account.\n", 460 | "\n", 461 | "- **open date**: when a new sales account was created\n", 462 | "- **close date**: when the *trial period* ends for a new sales account\n", 463 | "- **decision date**: when a final decision was reached by a training account, either success (conversion) or failure. May be *before or after* the close date.\n", 464 | "\n", 465 | "The **trial duration** is the difference between the open date and the close date. The lead scoring tool in GLC assumes this is fixed for all accounts, but in general this need not be the case.\n", 466 | "\n", 467 | "Open accounts do not have a decision date yet, by definition. They may or may not be still within the trial period." 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": { 474 | "collapsed": true 475 | }, 476 | "outputs": [], 477 | "source": [ 478 | "user_schema.update({'open_date': 'date_account_created',\n", 479 | " 'decision_date': 'booking_date'})" 480 | ] 481 | }, 482 | { 483 | "cell_type": "markdown", 484 | "metadata": {}, 485 | "source": [ 486 | "The trial duration is represented by an instance of the `datetime` package's `timedelta` class." 
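For readers less familiar with `timedelta`, here is a quick, self-contained illustration of how such a duration behaves (standard-library Python only; the 30-day value matches the `trial_duration` used in the next cell):

```python
import datetime as dt

trial_duration = dt.timedelta(days=30)

# timedelta supports arithmetic with datetime objects, which is how a close
# date can be derived from an account's open date.
open_date = dt.datetime(2016, 1, 15)
close_date = open_date + trial_duration
print(close_date)   # 2016-02-14 00:00:00
```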
487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "## Create the lead scoring tool " 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": { 500 | "collapsed": false 501 | }, 502 | "outputs": [], 503 | "source": [ 504 | "import datetime as dt\n", 505 | "\n", 506 | "scorer3 = gl.lead_scoring.create(users, user_schema,\n", 507 | " sessions, session_schema,\n", 508 | " trial_duration=dt.timedelta(days=30))" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": { 515 | "collapsed": false 516 | }, 517 | "outputs": [], 518 | "source": [ 519 | "print(scorer3)" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "## Under the hood: date-based data validation" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": {}, 532 | "source": [ 533 | "**Invalid accounts** have a decision date earlier than their open date. This is impossible, and these accounts are simply dropped from the set of training accounts." 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "metadata": { 540 | "collapsed": false 541 | }, 542 | "outputs": [], 543 | "source": [ 544 | "invalid_ids = scorer3.invalid_accounts\n", 545 | "print(invalid_ids)\n", 546 | "\n", 547 | "invalid_accounts = users.filter_by(invalid_ids, 'id')\n", 548 | "invalid_accounts[['id', 'date_account_created', 'booking_date']].print_rows(3)" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "**Implicit failure accounts** are accounts that are *open*, but have been open for so long they are extremely unlikely to convert.\n", 556 | "\n", 557 | "- The threshold for implicit failure is the 95th percentile of the time it took training accounts to reach a decision, or the trial period duration, whichever is longer.\n", 558 | "\n", 559 | "- Implicit failures are inluded in *both* the training and open account output, because they are used to train the scoring and segmentation models, but are technically still open.\n", 560 | "\n", 561 | "- The user **doesn't *have* to explicitly specify failure accounts** - the model can do that automatically." 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": null, 567 | "metadata": { 568 | "collapsed": false 569 | }, 570 | "outputs": [], 571 | "source": [ 572 | "print(scorer3.num_implicit_failures)" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "metadata": {}, 578 | "source": [ 579 | "## Under the hood: activity-based feature engineering " 580 | ] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": {}, 585 | "source": [ 586 | "The lead scoring tool constructs account-level features based on the number of interactions, items, and users (not applicable in this scenario) per day that the accounts are open (up to the maximum of the trial duration). The names of these features are accessible as a model field." 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": { 593 | "collapsed": false 594 | }, 595 | "outputs": [], 596 | "source": [ 597 | "scorer3.final_features" 598 | ] 599 | }, 600 | { 601 | "cell_type": "markdown", 602 | "metadata": {}, 603 | "source": [ 604 | "The values for these features are included in the primary model outputs (`training_account_scores` and `open_account_scores`)." 
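The exact feature construction is internal to the toolkit, but conceptually the aggregation resembles the following sketch (a rough illustration only, using the session columns from this tutorial and GraphLab Create's standard `groupby`/`aggregate` API; it is not the toolkit's actual implementation):

```python
import graphlab as gl
from graphlab import aggregate as agg

# Count interactions and distinct items per account from the raw sessions table.
raw_sessions = gl.SFrame('synthetic_airbnb_sessions.sfr')
per_account = raw_sessions.groupby('user_id', {
    'num_interactions': agg.COUNT(),
    'num_distinct_items': agg.COUNT_DISTINCT('action_detail')})

# Dividing these counts by the number of days each account has been open
# (capped at the trial duration) gives per-day rates like those listed in
# scorer3.final_features.
per_account.head(5)
```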
605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": { 611 | "collapsed": false 612 | }, 613 | "outputs": [], 614 | "source": [ 615 | "scorer3.open_account_scores.print_rows(3)" 616 | ] 617 | }, 618 | { 619 | "cell_type": "markdown", 620 | "metadata": {}, 621 | "source": [ 622 | "The activity-based features are also used to define market segments." 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": { 629 | "collapsed": false 630 | }, 631 | "outputs": [], 632 | "source": [ 633 | "cols = ['segment_features', 'median_conversion_prob', 'num_training_accounts']\n", 634 | "scorer3.segment_descriptions[cols].print_rows(max_row_width=80, max_column_width=60)" 635 | ] 636 | }, 637 | { 638 | "cell_type": "markdown", 639 | "metadata": {}, 640 | "source": [ 641 | "## Results: improved validation accuracy" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": null, 647 | "metadata": { 648 | "collapsed": false 649 | }, 650 | "outputs": [], 651 | "source": [ 652 | "print(\"Account-only validation accuracy:\", scorer.scoring_model.validation_accuracy)\n", 653 | "print(\"Validation accuracy including activity features:\", scorer3.scoring_model.validation_accuracy)" 654 | ] 655 | } 656 | ], 657 | "metadata": { 658 | "kernelspec": { 659 | "display_name": "Python 2", 660 | "language": "python", 661 | "name": "python2" 662 | }, 663 | "language_info": { 664 | "codemirror_mode": { 665 | "name": "ipython", 666 | "version": 2 667 | }, 668 | "file_extension": ".py", 669 | "mimetype": "text/x-python", 670 | "name": "python", 671 | "nbconvert_exporter": "python", 672 | "pygments_lexer": "ipython2", 673 | "version": "2.7.12" 674 | } 675 | }, 676 | "nbformat": 4, 677 | "nbformat_minor": 0 678 | } 679 | -------------------------------------------------------------------------------- /dss-2016/recommendation_systems/README.md: -------------------------------------------------------------------------------- 1 | # Data Science Summit 2016 2 | 3 | Countless online services use recommender systems to provide personalization to 4 | their users. This is important for selling related items, increasing user 5 | engagement, and so on. 6 | 7 | In this tutorial, you will learn 8 | - the key machine learning concepts that underpin most modern recommender systems 9 | - how to build your own recommender system using off-the-shelf tools 10 | - the strengths and weaknesses of collaborative filtering and content-based 11 | approaches, as well as hybrid methods 12 | - how to explore, explain, and evaluate your recommender models 13 | 14 | -------------------------------------------------------------------------------- /dss-2016/recommendation_systems/book-recommender-exercises.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import graphlab as gl" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "The following code snippet will parse the books data provided at the training." 
19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [ 28 | { 29 | "name": "stderr", 30 | "output_type": "stream", 31 | "text": [ 32 | "[INFO] This commercial license of GraphLab Create is assigned to engr@turi.com.\n", 33 | "\n", 34 | "[INFO] Start server at: ipc:///tmp/graphlab_server-41686 - Server binary: /Users/chris/miniconda/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1443482376.log\n", 35 | "[INFO] GraphLab Server Version: 1.6.1\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "import os\n", 41 | "if os.path.exists('books/ratings'):\n", 42 | " ratings = gl.SFrame('books/ratings')\n", 43 | " items = gl.SFrame('books/items')\n", 44 | " users = gl.SFrame('books/users')\n", 45 | "else:\n", 46 | " ratings = gl.SFrame.read_csv('books/book-ratings.csv')\n", 47 | " ratings.save('books/ratings')\n", 48 | " items = gl.SFrame.read_csv('books/book-data.csv')\n", 49 | " items.save('books/items')\n", 50 | " users = gl.SFrame.read_csv('books/user-data.csv')\n", 51 | " users.save('books/users')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "Visually explore the above data using GraphLab Canvas." 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Recommendation systems" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "In this section we will make a model that can be used to recommend new tags to users." 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "### Creating a Model" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "Use `gl.recommender.create()` to create a model that can be used to recommend tags to each user." 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "outputs": [], 96 | "source": [] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Print a summary of the model by simply entering the name of the object." 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [], 112 | "source": [] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "Get all unique users from the first 10000 observations and save them as a variable called `users`." 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [], 128 | "source": [] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "Get 20 recommendations for each user in your list of users. Save these as a new SFrame called `recs`." 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "outputs": [], 144 | "source": [] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "## Inspecting your model" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "Get an SFrame of the 20 most similar items for each observed item." 
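One possible starting point, if you get stuck (this assumes the recommender created earlier in the notebook is stored in a variable named `m`; `get_similar_items` is the relevant method on GraphLab Create recommender models):

```python
# Top-20 most similar items for every item observed during training
similar_items = m.get_similar_items(k=20)
similar_items.head(5)
```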
158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": { 164 | "collapsed": false 165 | }, 166 | "outputs": [], 167 | "source": [] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "This dataset has multiple rows corresponding to the same book, e.g., in situations where reprintings were done by different publishers in different year.\n", 174 | "\n", 175 | "For each unique value of 'book' in the `items` SFrame, select one of the of the available values for `author`, `publisher`, and `year`. Hint: Try using [`SFrame.groupby`](https://turi.com/products/create/docs/graphlab.data_structures.html#module-graphlab.aggregate) and [`gl.aggregate.SELECT_ONE`](https://turi.com/products/create/docs/graphlab.data_structures.html#graphlab.aggregate.SELECT_ONE)." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "collapsed": false 183 | }, 184 | "outputs": [], 185 | "source": [] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "Computing the number of times each book was rated, and add a column containing these counts to the `items` SFrame using `SFrame.join`." 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": false 199 | }, 200 | "outputs": [], 201 | "source": [] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "Print the first few books, sorted by the number of times they have been rated. Do these values make sense?" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [], 217 | "source": [] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "Now print the most similar items per item, sorted by the most common books. Hint: Join the two SFrames you created above." 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "collapsed": false 231 | }, 232 | "outputs": [], 233 | "source": [] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Experimenting with other models" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "Create a dataset called `implicit` that contains only ratings data where `rating` was 4 or greater." 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": { 253 | "collapsed": false 254 | }, 255 | "outputs": [], 256 | "source": [] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "Create a train/test split of the `implicit` data created above. Hint: Use [random_split_by_user](https://turi.com/products/create/docs/generated/graphlab.recommender.random_split_by_user.html#graphlab.recommender.random_split_by_user)." 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "collapsed": false 270 | }, 271 | "outputs": [], 272 | "source": [] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "Print the first 5 rows of the training set." 
279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "collapsed": false 286 | }, 287 | "outputs": [], 288 | "source": [] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "Create a `ranking_factorization_recommender` model using just the training set and 20 factors." 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": { 301 | "collapsed": false 302 | }, 303 | "outputs": [], 304 | "source": [] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "Evaluate how well this model recommends items that were seen in the test set you created above. Hint: Check out `m.evaluate_precision_recall()`." 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": { 317 | "collapsed": false 318 | }, 319 | "outputs": [], 320 | "source": [] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "Create an SFrame containing only one observation, where 'Billy Bob' has rated 'Animal Farm' with score 5.0." 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": { 333 | "collapsed": false 334 | }, 335 | "outputs": [], 336 | "source": [] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "Use this data when querying for recommendations for the user 'Billy Bob'." 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": { 349 | "collapsed": false 350 | }, 351 | "outputs": [], 352 | "source": [] 353 | } 354 | ], 355 | "metadata": { 356 | "kernelspec": { 357 | "display_name": "Python 2", 358 | "language": "python", 359 | "name": "python2" 360 | }, 361 | "language_info": { 362 | "codemirror_mode": { 363 | "name": "ipython", 364 | "version": 2 365 | }, 366 | "file_extension": ".py", 367 | "mimetype": "text/x-python", 368 | "name": "python", 369 | "nbconvert_exporter": "python", 370 | "pygments_lexer": "ipython2", 371 | "version": "2.7.11" 372 | } 373 | }, 374 | "nbformat": 4, 375 | "nbformat_minor": 0 376 | } 377 | -------------------------------------------------------------------------------- /notebooks/deploy-scikit-learn-in-ps.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:0bd1413cc432876e4c643cf0cea23530ea8ff9ab987036dd795f5daf5df9291a" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 1, 14 | "metadata": {}, 15 | "source": [ 16 | "Deploy scikit-learn model with Turi Predictive Services" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "Deploying models created using scikit-learn in a Turi Predictive Service is very easy. This notebook walks you through the step-by-step process. The notebook has three sections: \n", 24 | "\n", 25 | "1. Create a Predictive Service\n", 26 | "2. Create a scikit-learn model and deploy it to a Predictive Service\n", 27 | "3. Query the model through CURL or a Predictive Service Client\n", 28 | "\n", 29 | "If you are deploying a model in an existing Predictive Service instance you can go to step two directly." 
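In that case, rather than creating a new deployment, you would typically load the existing one from its state path and skip ahead. A brief sketch (the state path below is a placeholder, and the exact `load` signature may vary slightly across GraphLab Create versions):

```python
import graphlab as gl

# Load a previously created Predictive Service from its S3 state path,
# then continue with step two below.
ps = gl.deploy.predictive_service.load('s3://<your-bucket>/predictive_service/ps')
ps.get_status()
```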
30 | ] 31 | }, 32 | { 33 | "cell_type": "heading", 34 | "level": 2, 35 | "metadata": {}, 36 | "source": [ 37 | "Prerequisites" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "Apart from GraphLab Create you will, naturally, need scikit-learn installed in your current Python environment. The most straightforward way to do that is to use conda:\n", 45 | "```\n", 46 | "conda install scikit-learn\n", 47 | "```\n", 48 | "You will also need a valid AWS account in order to set up a predictive service." 49 | ] 50 | }, 51 | { 52 | "cell_type": "heading", 53 | "level": 2, 54 | "metadata": {}, 55 | "source": [ 56 | "Step one: Create a Predictive Service\n", 57 | "" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "This section shows you how to deploy a Predictive Service to EC2. The EC2 instances used by the Predictive Service will be launched in your own AWS account, so you will be responsible for the cost. \n", 65 | "\n", 66 | "To create a Predictive Service in Amazon AWS, we first configure the EC2 Config object, which contains the configuration parameters required for launching a Predictive Service cluster in EC2. These fields are optional and include the region, instance type, CIDR rules etc. Predictive Service uses this configuration for service creation.\n", 67 | "\n", 68 | "Having configured our EC2 Config object, we're ready to launch a Predictive Service Deployment, There are a few aspects of the Predictive Service that can be customized:\n", 69 | "* Number of nodes in the service - By default the number of hosts (`num_hosts`) is 1. To obtain good cache utility and high availability, we recommended setting num_hosts to at least 3.\n", 70 | "* State path to persist service state and service logs. This is a s3 location. \n", 71 | "* Port to be used by the server.\n", 72 | "* Other settings, such as SSL credentials etc.\n", 73 | "\n", 74 | "The following code snippet shows you how to create a Predictive Service. You will have to replace the ps_state_path and credentials for your Predictive Service." 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "collapsed": false, 80 | "input": [ 81 | "import graphlab as gl\n", 82 | "\n", 83 | "# make sure to replace the following with your own information\n", 84 | "ps_state_path = 's3:///predictive_service/ps'\n", 85 | "\n", 86 | "# Create an EC2 config\n", 87 | "# You can either specify your AWS credentials using environment variables, or\n", 88 | "# set them as arguments to this object's constructor\n", 89 | "ec2_config = gl.deploy.Ec2Config(\n", 90 | " aws_access_key_id='',\n", 91 | " aws_secret_access_key='')\n", 92 | "\n", 93 | "# use the EC2 config to launch a new Predictive Service\n", 94 | "# num_hosts specifies how many hosts the Predictive Service cluster has. 
You can scale up and down later after initial creation.\n", 95 | "ps = gl.deploy.predictive_service.create(\n", 96 | " name='sklearn-predictive-service',\n", 97 | " ec2_config=ec2_config,\n", 98 | " state_path=ps_state_path,\n", 99 | " num_hosts=1)" 100 | ], 101 | "language": "python", 102 | "metadata": {}, 103 | "outputs": [] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "collapsed": false, 108 | "input": [ 109 | "# once the Predictive Service is successfully created, you can query the service status\n", 110 | "ps.get_status()" 111 | ], 112 | "language": "python", 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "metadata": {}, 117 | "output_type": "pyout", 118 | "prompt_number": 5, 119 | "text": [ 120 | "[{u'cache': {u'healthy': True, u'num_keys': 0, u'type': u'local'},\n", 121 | " u'dns_name': u'ec2-52-34-231-117.us-west-2.compute.amazonaws.com',\n", 122 | " u'id': u'i-992d1540',\n", 123 | " u'models': [],\n", 124 | " u'num_hosts': 1,\n", 125 | " u'reason': u'N/A',\n", 126 | " u'service_version': u'1.7.1',\n", 127 | " u'state': u'InService'}]" 128 | ] 129 | } 130 | ], 131 | "prompt_number": 5 132 | }, 133 | { 134 | "cell_type": "heading", 135 | "level": 2, 136 | "metadata": {}, 137 | "source": [ 138 | "Step two: Create a scikit-learn model and deploy to Predictive Service\n", 139 | "" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "Let's train a simple random forest model and deploy it in the Predictive Service" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "collapsed": false, 152 | "input": [ 153 | "from sklearn.ensemble import RandomForestClassifier\n", 154 | "X = [[0, 0], [1, 1]]\n", 155 | "Y = [0, 1]\n", 156 | "clf = RandomForestClassifier(n_estimators=10)\n", 157 | "clf = clf.fit(X, Y)" 158 | ], 159 | "language": "python", 160 | "metadata": {}, 161 | "outputs": [], 162 | "prompt_number": 6 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "We can expose the trained model as a REST endpoint in the Predictive Service. This will allow other applications to consume the predictions from the model. \n", 169 | "\n", 170 | "In order to do that, we wrap the model object in a Python function and add it to the Predictive Service. In the function you may add your own logic for transform input to the model, ensemble different models or manipulate output before returning. Checkout our [user guide](https://turi.com/learn/userguide/deployment/pred-working-with-objects.html) for more details.\n", 171 | "\n", 172 | "The result of the function needs to be a JSON serializable object." 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "collapsed": false, 178 | "input": [ 179 | "def classify(x):\n", 180 | " prediction = clf.predict(x)\n", 181 | "\n", 182 | " # convert into a json serializable value\n", 183 | " return list(prediction)\n", 184 | "\n", 185 | "# add your predictive function that wraps scikit-learn model\n", 186 | "ps.add('classify', classify)" 187 | ], 188 | "language": "python", 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "output_type": "stream", 193 | "stream": "stderr", 194 | "text": [ 195 | "[INFO] Endpoint 'classify' is added. Use apply_changes() to deploy all pending changes, or continue with other modification.\n" 196 | ] 197 | } 198 | ], 199 | "prompt_number": 7 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "You may do a test query before really deploying it to production. 
This will help detect errors in the function before deploying it to the Predictive Service. \n", 206 | "\n", 207 | "The response to a query is a JSON object with the following keys:\n", 208 | " \n", 209 | " * response: the actual response from the query;\n", 210 | " * uuid: the unique identifier for your query. The 'uuid' is useful when you need to correlate the query with other data you may have for future model tuning.\n", 211 | " * version: the model version. This is useful when you are updating the model and want to know exactly which version served your query." 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "collapsed": false, 217 | "input": [ 218 | "ps.test_query('classify', x=[[0,0],[1,1]])" 219 | ], 220 | "language": "python", 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "output_type": "stream", 225 | "stream": "stderr", 226 | "text": [ 227 | "[INFO] Input data serializable.\n" 228 | ] 229 | }, 230 | { 231 | "output_type": "stream", 232 | "stream": "stderr", 233 | "text": [ 234 | "[INFO] Trying to serve classify\n" 235 | ] 236 | }, 237 | { 238 | "output_type": "stream", 239 | "stream": "stderr", 240 | "text": [ 241 | "[INFO] Query results serializable.\n" 242 | ] 243 | }, 244 | { 245 | "metadata": {}, 246 | "output_type": "pyout", 247 | "prompt_number": 8, 248 | "text": [ 249 | "{u'response': [0, 1],\n", 250 | " u'uuid': u'9277467f-fc55-40ee-8125-403f95660840',\n", 251 | " u'version': 1}" 252 | ] 253 | } 254 | ], 255 | "prompt_number": 8 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "The result is as expected. Let's apply the changes, and the predictive model is ready to go!" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "collapsed": false, 267 | "input": [ 268 | "# This will push the custom query to the Predictive Service. Since the update is asynchronous, you may need to wait \n", 269 | "# a little while before the model is fully deployed.\n", 270 | "ps.apply_changes()" 271 | ], 272 | "language": "python", 273 | "metadata": {}, 274 | "outputs": [] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "Check status and make sure the deployed custom predictive object is fully operational:" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "collapsed": false, 286 | "input": [ 287 | "# There are other ways to query the status; check the API documentation for more details\n", 288 | "ps.get_status('model')" 289 | ], 290 | "language": "python", 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "html": [ 295 | "
\n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | "
nameexpected versiontypereasonnode.i-992d1540
classify1modelN/A1 (Loaded successfully)
\n", 311 | "[? rows x 5 columns]
Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.
You can use len(sf) to force materialization.\n", 312 | "
" 313 | ], 314 | "metadata": {}, 315 | "output_type": "pyout", 316 | "prompt_number": 10, 317 | "text": [ 318 | "Columns:\n", 319 | "\tname\tstr\n", 320 | "\texpected version\tint\n", 321 | "\ttype\tstr\n", 322 | "\treason\tstr\n", 323 | "\tnode.i-992d1540\tstr\n", 324 | "\n", 325 | "Rows: Unknown\n", 326 | "\n", 327 | "Data:\n", 328 | "+----------+------------------+-------+--------+-------------------------+\n", 329 | "| name | expected version | type | reason | node.i-992d1540 |\n", 330 | "+----------+------------------+-------+--------+-------------------------+\n", 331 | "| classify | 1 | model | N/A | 1 (Loaded successfully) |\n", 332 | "+----------+------------------+-------+--------+-------------------------+\n", 333 | "[? rows x 5 columns]\n", 334 | "Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.\n", 335 | "You can use len(sf) to force materialization." 336 | ] 337 | } 338 | ], 339 | "prompt_number": 10 340 | }, 341 | { 342 | "cell_type": "code", 343 | "collapsed": false, 344 | "input": [ 345 | "# test query to make sure the model works fine\n", 346 | "ps.query('classify', x=[[0,0],[1,1]])" 347 | ], 348 | "language": "python", 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "metadata": {}, 353 | "output_type": "pyout", 354 | "prompt_number": 11, 355 | "text": [ 356 | "{u'from_cache': False,\n", 357 | " u'model': u'classify',\n", 358 | " u'response': [0, 1],\n", 359 | " u'uuid': u'05c773c5-a3e0-4783-bca6-a8925fb20a0c',\n", 360 | " u'version': 1}" 361 | ] 362 | } 363 | ], 364 | "prompt_number": 11 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "Now other applications can interact with our model! In the next section we will illustrate how to consume the model. We can also use other APIs like `ps.update()` to update a model and `ps.remove()` to remove a model." 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "Turi Predictive Services includes a stand-alone Python client for those who just want to query a running service. We will show you how to use the client in the following section. The client takes a configuration file containing the endpoint of the Predictive Service and API key used by client. You can generate the Python client configuration using the following call and hand off the configuration file to your consumer." 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "collapsed": false, 383 | "input": [ 384 | "# Generate a client configuration file for Predictive Service Client to consume\n", 385 | "# It is a good practice to config a CNAME entry in your DNS provider to have a well known endpoint\n", 386 | "# like https://models.companyname.com to point to the Predictive Service so that the consumer of\n", 387 | "# the Predictive Service do not need to change their code when you make modifications to your\n", 388 | "# Predictive Service\n", 389 | "# Here we use None only for demo purpose\n", 390 | "ps.save_client_config(file_path='/tmp/ps_client.conf', predictive_service_cname = None)" 391 | ], 392 | "language": "python", 393 | "metadata": {}, 394 | "outputs": [] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": {}, 399 | "source": [ 400 | "Once generated, the ps_client.conf file may be passed along to your client side developer. We will show you how to use the file in next section." 
401 | ] 402 | }, 403 | { 404 | "cell_type": "heading", 405 | "level": 2, 406 | "metadata": {}, 407 | "source": [ 408 | "Step three: query the model through the REST API and the Python client\n", 409 | "" 410 | ] 411 | }, 412 | { 413 | "cell_type": "heading", 414 | "level": 3, 415 | "metadata": {}, 416 | "source": [ 417 | "Query through REST" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "The model query is exposed through a REST API. The endpoint URL is:\n", 425 | "\n", 426 | " http(s):///query/\n", 427 | " \n", 428 | "You can find out the endpoint URL base by simply printing the `ps` object, and copying the *Load Balancer DNS Name*.\n", 429 | "\n", 430 | "The HTTP call for querying a model or method is a POST call, requiring a JSON-serialized string in the following format as payload:\n", 431 | "\n", 432 | " { \"data\": }\n", 433 | "\n", 434 | "You also need a valid API key, which you can retrieve through `ps.api_key`.\n", 435 | "\n", 436 | "Here is a sample curl command to query the `classify` method that we deployed in this notebook:\n", 437 | "\n", 438 | " curl -u api_key: -d '{\"data\": {\"x\": [[0,0],[1,1]]}}'\n", 439 | " http:///query/classify" 440 | ] 441 | }, 442 | { 443 | "cell_type": "heading", 444 | "level": 3, 445 | "metadata": {}, 446 | "source": [ 447 | "Query through Python" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "We also ship a Python client package with which you may easily consume the model. To install the package, do:\n", 455 | " \n", 456 | " pip install GraphLab-Service-Client \n", 457 | " \n", 458 | "After that you may consume the deployed model:" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "collapsed": false, 464 | "input": [ 465 | "from graphlab_service_client import PredictiveServiceClient\n", 466 | "\n", 467 | "# the configuration is saved through ps.save_client_config()\n", 468 | "client = PredictiveServiceClient(config_file='/tmp/ps_client.conf')\n", 469 | "\n", 470 | "client.query('classify', x=[[0,0], [1,1]])" 471 | ], 472 | "language": "python", 473 | "metadata": {}, 474 | "outputs": [ 475 | { 476 | "metadata": {}, 477 | "output_type": "pyout", 478 | "prompt_number": 14, 479 | "text": [ 480 | "{u'from_cache': True,\n", 481 | " u'model': u'classify',\n", 482 | " u'response': [0, 1],\n", 483 | " u'uuid': u'661f8381-e01c-414c-9fe6-4738bfaa28c2',\n", 484 | " u'version': 1}" 485 | ] 486 | } 487 | ], 488 | "prompt_number": 14 489 | }, 490 | { 491 | "cell_type": "heading", 492 | "level": 2, 493 | "metadata": {}, 494 | "source": [ 495 | "Shutting down the predictive service" 496 | ] 497 | }, 498 | { 499 | "cell_type": "markdown", 500 | "metadata": {}, 501 | "source": [ 502 | "If you don't need to keep your predictive service around for further tasks, make sure to terminate it to avoid incurring unnecessary costs:" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "collapsed": false, 508 | "input": [ 509 | "ps.terminate_service()" 510 | ], 511 | "language": "python", 512 | "metadata": {}, 513 | "outputs": [ 514 | { 515 | "output_type": "stream", 516 | "stream": "stderr", 517 | "text": [ 518 | "[INFO] Deleting load balancer: sklearn-predictive-service\n" 519 | ] 520 | }, 521 | { 522 | "output_type": "stream", 523 | "stream": "stderr", 524 | "text": [ 525 | "[INFO] Terminating EC2 host(s) [u'i-992d1540'] in us-west-2\n" 526 | ] 527 | }, 528 | { 529 | "output_type": "stream", 530 | "stream": "stderr", 531 | "text": [ 532 | "[INFO] Deleting state 
data.\n" 533 | ] 534 | }, 535 | { 536 | "output_type": "stream", 537 | "stream": "stderr", 538 | "text": [ 539 | "[INFO] Deleting s3 state data.\n" 540 | ] 541 | }, 542 | { 543 | "output_type": "stream", 544 | "stream": "stderr", 545 | "text": [ 546 | "[INFO] Deleting keys: []\n" 547 | ] 548 | }, 549 | { 550 | "output_type": "stream", 551 | "stream": "stderr", 552 | "text": [ 553 | "[INFO] Deleting keys: [u'user/scikit-ps/predictive_objects/classify/1/pickle_archive', u'user/scikit-ps-new/predictive_objects/classify/1/version']\n" 554 | ] 555 | }, 556 | { 557 | "output_type": "stream", 558 | "stream": "stderr", 559 | "text": [ 560 | "[INFO] Deleted reference to PredictiveService('sklearn-predictive-service') from current session.\n" 561 | ] 562 | } 563 | ], 564 | "prompt_number": 15 565 | }, 566 | { 567 | "cell_type": "heading", 568 | "level": 2, 569 | "metadata": {}, 570 | "source": [ 571 | "Where to go from here" 572 | ] 573 | }, 574 | { 575 | "cell_type": "markdown", 576 | "metadata": {}, 577 | "source": [ 578 | "This notebook gives you a peek at what Turi Predictive Service can offer. For a more detailed look at its functionality, check out our [user guide](https://turi.com/learn/userguide/#Deployment). If you have any questions, post them in our [forum](http://forum.turi.com) and we will be happy to assist you!" 579 | ] 580 | } 581 | ], 582 | "metadata": {} 583 | } 584 | ] 585 | } 586 | -------------------------------------------------------------------------------- /notebooks/reading_data_from_impala.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:8e45c6ddfe9c7874a75b074e334654d87c2e84aa0cea4a827d1c4118f578b081" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "# Reading data from Impala\n", 16 | "\n", 17 | "GraphLab Create supports loading data from many standard data formats (CSV, Avro, JSON) and data stores such as S3 and HDFS. We also have an ODBC connector, which works seamlessly for reading data directly from [Cloudera's Impala](http://www.cloudera.com/content/cloudera/en/products-and-services/cdh/impala.html).\n", 18 | "\n", 19 | "Before trying this on your own computer, you'll need to make sure that you have the [Cloudera ODBC driver](http://www.cloudera.com/content/cloudera/en/downloads/connectors/impala/odbc/impala-odbc-v2-5-23.html) installed.\n", 20 | "\n", 21 | "Let's take a look at how simple it is to stream results from Impala queries directly into our scalable data structure, the SFrame." 
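(Aside: a minimal sketch of the other loading paths mentioned above; the file paths, bucket, and host names below are placeholders, not real locations.)

```python
import graphlab as gl

# CSV can be read from local disk or directly from a remote store such as S3
sf_local = gl.SFrame.read_csv('/path/to/local_file.csv')      # placeholder path
sf_s3 = gl.SFrame.read_csv('s3://your-bucket/path/file.csv')  # placeholder bucket

# a previously saved SFrame can be loaded straight from HDFS as well
sf_hdfs = gl.SFrame('hdfs://namenode:8020/path/to/saved_sframe')  # placeholder cluster
```

The ODBC route below follows the same spirit: connect once, then stream query results straight into an SFrame.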
22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "collapsed": false, 27 | "input": [ 28 | "import graphlab as gl" 29 | ], 30 | "language": "python", 31 | "metadata": {}, 32 | "outputs": [], 33 | "prompt_number": 4 34 | }, 35 | { 36 | "cell_type": "code", 37 | "collapsed": false, 38 | "input": [ 39 | "# configure your ODBC connection\n", 40 | "db = gl.connect_odbc(\"DRIVER=/opt/cloudera/impalaodbc/lib/universal/\" \\\n", 41 | " \"libclouderaimpalaodbc.dylib;HOST=10.10.2.15;PORT=21050\")" 42 | ], 43 | "language": "python", 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "output_type": "stream", 48 | "stream": "stderr", 49 | "text": [ 50 | "[INFO] Start server at: ipc:///tmp/graphlab_server-29804 - Server binary: /Users/rlvoyer/Envs/glc_pypi_1.3/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1423871383.log\n" 51 | ] 52 | }, 53 | { 54 | "output_type": "stream", 55 | "stream": "stderr", 56 | "text": [ 57 | "[INFO] GraphLab Server Version: 1.3.0\n" 58 | ] 59 | } 60 | ], 61 | "prompt_number": 3 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "Cloudera Impala uses SQL as its query language. We can run a standard SQL DESCRIBE query to get a sense for what the data looks like." 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "collapsed": false, 73 | "input": [ 74 | "# run a DESCRIBE query against the Amazon product titles table\n", 75 | "gl.SFrame.from_odbc(db, \"DESCRIBE titles\")" 76 | ], 77 | "language": "python", 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "html": [ 82 | "
\n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | "
nametypecomment
idxbigint
product_idstring
num_reviewsint
pricestring
simple_categorystring
titlestring
category_list_0string
category_list_1string
category_list_2string
category_list_3string
.........
\n", 144 | "[15 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.\n", 145 | "
" 146 | ], 147 | "metadata": {}, 148 | "output_type": "pyout", 149 | "prompt_number": 5, 150 | "text": [ 151 | "Columns:\n", 152 | "\tname\tstr\n", 153 | "\ttype\tstr\n", 154 | "\tcomment\tstr\n", 155 | "\n", 156 | "Rows: 15\n", 157 | "\n", 158 | "Data:\n", 159 | "+-----------------+--------+---------+\n", 160 | "| name | type | comment |\n", 161 | "+-----------------+--------+---------+\n", 162 | "| idx | bigint | |\n", 163 | "| product_id | string | |\n", 164 | "| num_reviews | int | |\n", 165 | "| price | string | |\n", 166 | "| simple_category | string | |\n", 167 | "| title | string | |\n", 168 | "| category_list_0 | string | |\n", 169 | "| category_list_1 | string | |\n", 170 | "| category_list_2 | string | |\n", 171 | "| category_list_3 | string | |\n", 172 | "| ... | ... | ... |\n", 173 | "+-----------------+--------+---------+\n", 174 | "[15 rows x 3 columns]\n", 175 | "Note: Only the head of the SFrame is printed.\n", 176 | "You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns." 177 | ] 178 | } 179 | ], 180 | "prompt_number": 5 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "Cool! Now let's stream some data into an SFrame." 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "collapsed": false, 192 | "input": [ 193 | "# run a simple SELECT to get titles for all products with more than 100 reviews\n", 194 | "titles_sf = gl.SFrame.from_odbc(db, \"SELECT title, num_reviews, simple_category FROM titles WHERE num_reviews > 25\")\n", 195 | "titles_sf" 196 | ], 197 | "language": "python", 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "html": [ 202 | "
\n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | "
titlenum_reviewssimple_category
reality166Music
keeping heart on pine
ridg ...
26Books
eric meyer on css:
mastering the languag ...
68Books
pierrot le fou (1969)52Movies & TV
the life of john wesley
hardin as written by ...
27Books
snakes on a train
(unrated director's ...
26Movies & TV
t2 : infiltra35Books
drop dead fred [region 2]
(1991) ...
161Movies & TV
loser goes first: my
thirty-something year ...
32Books
irresistible (banning
sisters trilogy) ...
29Books
.........
\n", 264 | "[71639 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.\n", 265 | "
" 266 | ], 267 | "metadata": {}, 268 | "output_type": "pyout", 269 | "prompt_number": 14, 270 | "text": [ 271 | "Columns:\n", 272 | "\ttitle\tstr\n", 273 | "\tnum_reviews\tint\n", 274 | "\tsimple_category\tstr\n", 275 | "\n", 276 | "Rows: 71639\n", 277 | "\n", 278 | "Data:\n", 279 | "+-------------------------------+-------------+-----------------+\n", 280 | "| title | num_reviews | simple_category |\n", 281 | "+-------------------------------+-------------+-----------------+\n", 282 | "| reality | 166 | Music |\n", 283 | "| keeping heart on pine ridg | 26 | Books |\n", 284 | "| eric meyer on css: masteri... | 68 | Books |\n", 285 | "| pierrot le fou (1969) | 52 | Movies & TV |\n", 286 | "| the life of john wesley ha... | 27 | Books |\n", 287 | "| snakes on a train (unrated... | 26 | Movies & TV |\n", 288 | "| t2 : infiltra | 35 | Books |\n", 289 | "| drop dead fred [region 2] ... | 161 | Movies & TV |\n", 290 | "| loser goes first: my thirt... | 32 | Books |\n", 291 | "| irresistible (banning sist... | 29 | Books |\n", 292 | "| ... | ... | ... |\n", 293 | "+-------------------------------+-------------+-----------------+\n", 294 | "[71639 rows x 3 columns]\n", 295 | "Note: Only the head of the SFrame is printed.\n", 296 | "You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns." 297 | ] 298 | } 299 | ], 300 | "prompt_number": 14 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "We can use GraphLab Canvas to visualize the data." 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "collapsed": false, 312 | "input": [ 313 | "titles_sf.show()" 314 | ], 315 | "language": "python", 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "output_type": "stream", 320 | "stream": "stdout", 321 | "text": [ 322 | "Canvas is accessible via web browser at the URL: http://localhost:63103/index.html\n" 323 | ] 324 | } 325 | ], 326 | "prompt_number": 15 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "And now that we have our data in an SFrame, we're ready to start training predictive models, and deploying them to production!" 333 | ] 334 | } 335 | ], 336 | "metadata": {} 337 | } 338 | ] 339 | } -------------------------------------------------------------------------------- /strata-nyc-2015/README.md: -------------------------------------------------------------------------------- 1 | # Strata + Hadoop World, New York City, 2015 2 | 3 | This directory contains demo notebooks used for **Machine Learning 101**, an all-day tutorial at [Strata + Hadoop World, New York City, 2015](http://strataconf.com/big-data-conference-ny-2015/public/schedule/detail/43217). 4 | The course is designed to introduce machine learning via real applications like 5 | - building a recommender 6 | - image analysis using deep learning. 7 | 8 | Along the way, we also cover feature engineering and deploying machine learning models as a predictive service. . 9 | 10 | ## Setup Instructions 11 | 12 | You can browse the notebooks using Github iPython notebook viewer. Note that some images may not be rendered correctly If you'd like to run it, follow these steps to set up your machine. 13 | 14 | - [Download](https://turi.com/download/) GraphLab Create and then follow instructions to [install](https://turi.com/download/install.html). 
15 | - Download and unzip the datasets [[831MB]](https://static.turi.com/datasets/ml101_datasets_stratanyc_2015.zip) 16 | 17 | ## Handy references 18 | 19 | - [GraphLab Create User Guide](https://turi.com/learn/userguide) 20 | - [GraphLab Forum](http://forum.turi.com/categories/graphlab-create) 21 | -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/AA1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/AA1.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/alexnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/alexnet.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/cifar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/cifar.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/evaluate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/evaluate.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/extract_features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/extract_features.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/improve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/improve.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/linear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/linear.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/load.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/load.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/quadratic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/quadratic.png 
-------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/spiral.1-2.2-2-2-2-2-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/spiral.1-2.2-2-2-2-2-2.jpg -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/train.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/workflow1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/workflow1.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/workflow2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/workflow2.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/workflow3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/workflow3.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/workflow4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/workflow4.png -------------------------------------------------------------------------------- /strata-nyc-2015/deployment/images/left.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deployment/images/left.png -------------------------------------------------------------------------------- /strata-nyc-2015/deployment/images/middle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deployment/images/middle.png -------------------------------------------------------------------------------- /strata-nyc-2015/deployment/images/predictive_services_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deployment/images/predictive_services_overview.png -------------------------------------------------------------------------------- /strata-nyc-2015/deployment/images/right.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deployment/images/right.png -------------------------------------------------------------------------------- /strata-nyc-2015/deployment/scikit_deployment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Introduction to ML Deployment\n", 8 | "==================\n", 9 | "\n", 10 | "Deploying models created using python in a Turi Predictive Service is very easy. This notebook walks you through the step-by-step process. \n", 11 | "\n", 12 | "\n", 13 | "\n", 14 | "-----------------------\n", 15 | "\n", 16 | "Deployment Steps\n", 17 | "=========\n", 18 | "The notebook has three sections: \n", 19 | "\n", 20 | "1. Create a model\n", 21 | "2. Create a predictive service\n", 22 | "3. Query the model\n", 23 | "\n", 24 | "If you are deploying a model in an existing Predictive Service instance you can go to step (2) directly.\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## 1. Create a model \n", 32 | "\n", 33 | "Let's train a simple random forest model and deploy it in the Predictive Service.\n", 34 | "\n", 35 | "" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 5, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/plain": [ 48 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 49 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 50 | " min_samples_leaf=1, min_samples_split=2,\n", 51 | " min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n", 52 | " oob_score=False, random_state=None, verbose=0,\n", 53 | " warm_start=False)" 54 | ] 55 | }, 56 | "execution_count": 5, 57 | "metadata": {}, 58 | "output_type": "execute_result" 59 | } 60 | ], 61 | "source": [ 62 | "from sklearn.ensemble import RandomForestClassifier\n", 63 | "from sklearn.datasets import load_iris\n", 64 | "iris = load_iris()\n", 65 | "\n", 66 | "model = RandomForestClassifier(n_estimators=10)\n", 67 | "model = model.fit(iris['data'], iris['target'])\n", 68 | "model" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "We can expose the trained model as a REST endpoint. This will allow other applications to consume the predictions from the model. \n", 76 | "\n", 77 | "In order to do that, we wrap the model object in a Python function and add it to the Predictive Service. In the function you may add your own logic for transform input to the model, ensemble different models or manipulate output before returning. Checkout out [user guide](https://turi.com/learn/userguide/#Deployment) for more details.\n", 78 | "\n", 79 | "The result of the function needs to be a **JSON serializable** object." 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 46, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "def classify(x):\n", 91 | " prediction = model.predict(x)\n", 92 | "\n", 93 | " # convert into a json serializable value\n", 94 | " return list(prediction)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "## 2. Create a Predictive Service (One time) \n", 102 | "\n", 103 | "This section shows you how to deploy a Predictive Service to EC2. 
The EC2 instances used by the Predictive Service will be launched in your own AWS account, so you will be responsible for the cost. \n", 104 | "\n", 105 | "" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "To create a Predictive Service in Amazon AWS, we first configure the EC2 Config object, which contains the configuration parameters required for launching a Predictive Service cluster in EC2. These fields are optional and include the region, instance type, CIDR rules etc. Predictive Service uses this configuration for service creation.\n", 113 | "\n", 114 | "Having configured our EC2 Config object, we're ready to launch a Predictive Service Deployment, There are a few aspects of the Predictive Service that can be customized:\n", 115 | "* Number of nodes in the service - By default the number of hosts (`num_hosts`) is 1. To obtain good cache utility and high availability, we recommended setting num_hosts to at least 3.\n", 116 | "* State path to persist service state and service logs. This is a s3 location. \n", 117 | "* Port to be used by the server.\n", 118 | "* Other settings, such as SSL credentials etc.\n", 119 | "\n", 120 | "The following code snippet shows you how to create a Predictive Service. You will have to replace the ps_state_path and credentials for your Predictive Service." 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 1, 126 | "metadata": { 127 | "collapsed": false 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "import graphlab as gl\n", 132 | "\n", 133 | "# Replace with your path.\n", 134 | "ps_state_path = 's3:///predictive_service/ps'\n", 135 | "\n", 136 | "# Set your AWS credentials.\n", 137 | "gl.aws.set_credentials(, )\n", 138 | "\n", 139 | "# Create an EC2 config\n", 140 | "ec2_config = gl.deploy.Ec2Config()\n", 141 | "\n", 142 | "# Launch a predictive service\n", 143 | "ps = gl.deploy.predictive_service.create(name = 'sklearn-predictive-service', \n", 144 | " ec2_config = ec2_config, state_path = ps_state_path, num_hosts = 1)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "### Load an already created service" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 47, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [ 161 | { 162 | "name": "stderr", 163 | "output_type": "stream", 164 | "text": [ 165 | "[WARNING] Overwritting existing Predictive Service \"demolab-one-six\" in local session.\n" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "import graphlab as gl\n", 171 | "ps = gl.deploy.predictive_service.load('s3://gl-demo-usw2/predictive_service/demolab/ps-1.6')" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 62, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/plain": [ 184 | "Name : demolab-one-six\n", 185 | "State Path : s3://gl-demo-usw2/predictive_service/demolab/ps-1.6\n", 186 | "Description : None\n", 187 | "API Key : b437e588-0f2b-45e1-81c8-ce3acfa81ade\n", 188 | "CORS origin : *\n", 189 | "Global Cache State : enabled\n", 190 | "Load Balancer DNS Name: demolab-one-six-2015364754.us-west-2.elb.amazonaws.com\n", 191 | "\n", 192 | "Deployed endpoints:\n", 193 | "\tname: freshdress_kw_search, version: 3, type: alias, cache: disabled, description: Alias for freshdress_kw_search_model\n", 194 | "\tname: yelp_sentiment_most_extreme_for_place, version: 2, type: model, cache: 
enabled, description: \n", 195 | "\tname: classify-sklearn, version: 2, type: model, cache: enabled, description: \n", 196 | "\tname: freshdress_more_like_image_bw, version: 1, type: model, cache: enabled, description: \n", 197 | "\tname: freshdress_kw_search_model, version: 2, type: model, cache: enabled, description: \n", 198 | "\tname: composite_recommender_query, version: 1, type: model, cache: disabled, description: \n", 199 | "\tname: freshdress_describe, version: 2, type: alias, cache: disabled, description: Alias for freshdress_describe_image_basic\n", 200 | "\tname: freshdress_more_like_image_bow, version: 3, type: model, cache: enabled, description: \n", 201 | "\tname: yelp_sentiment_predict_text, version: 2, type: model, cache: enabled, description: \n", 202 | "\tname: freshdress_describe_image_basic, version: 1, type: model, cache: enabled, description: \n", 203 | "\tname: freshdress_more_like_image_color, version: 1, type: model, cache: enabled, description: \n", 204 | "\tname: freshdress_more_like_image, version: 5, type: alias, cache: disabled, description: Alias for freshdress_more_like_image_tfidf\n", 205 | "\tname: yelp_sentiment_most_extreme, version: 2, type: model, cache: enabled, description: \n", 206 | "\tname: freshdress_more_like_image_tfidf, version: 1, type: model, cache: enabled, description: \n", 207 | "\tname: composite_recommender_explanation, version: 1, type: model, cache: disabled, description: \n", 208 | "\tname: yelp_sentiment_summary, version: 2, type: model, cache: enabled, description: \n", 209 | "\n", 210 | "No Pending changes." 211 | ] 212 | }, 213 | "execution_count": 62, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "ps" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 53, 225 | "metadata": { 226 | "collapsed": false 227 | }, 228 | "outputs": [ 229 | { 230 | "name": "stderr", 231 | "output_type": "stream", 232 | "text": [ 233 | "[INFO] Endpoint 'classify-sklearn' is updated. Use apply_changes to deploy all pending changes, or continue other modification.\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "# ps.add('classify-sklearn', classify) (If done for the first time)\n", 239 | "ps.update('classify-sklearn', classify)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 55, 245 | "metadata": { 246 | "collapsed": false 247 | }, 248 | "outputs": [ 249 | { 250 | "name": "stderr", 251 | "output_type": "stream", 252 | "text": [ 253 | "[INFO] There are no pending changes. No action is taken.\n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "ps.apply_changes()" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "## Query the model \n", 266 | "\n", 267 | "You may do a test query before really deploying it to production. This will help detect errors in the function before deploying it the Predictive Service. 
\n", 268 | "\n", 269 | "" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 56, 275 | "metadata": { 276 | "collapsed": false 277 | }, 278 | "outputs": [ 279 | { 280 | "name": "stderr", 281 | "output_type": "stream", 282 | "text": [ 283 | "[INFO] Input data serializable.\n", 284 | "[INFO] Trying to serve classify-sklearn\n", 285 | "[INFO] Query results serializable.\n" 286 | ] 287 | }, 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "{u'response': [0],\n", 292 | " u'uuid': u'88947cb8-4646-489d-8360-81ce1d54004e',\n", 293 | " u'version': 1}" 294 | ] 295 | }, 296 | "execution_count": 56, 297 | "metadata": {}, 298 | "output_type": "execute_result" 299 | } 300 | ], 301 | "source": [ 302 | "ps.test_query('classify-sklearn', x=[5.1, 3.5, 1.4, 0.2])" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "Now, let us query the real service." 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 69, 315 | "metadata": { 316 | "collapsed": false 317 | }, 318 | "outputs": [ 319 | { 320 | "data": { 321 | "text/plain": [ 322 | "{u'from_cache': True,\n", 323 | " u'model': u'classify-sklearn',\n", 324 | " u'response': [0],\n", 325 | " u'uuid': u'8afd2f01-6d37-4fd0-8788-5141f92459dd',\n", 326 | " u'version': 2}" 327 | ] 328 | }, 329 | "execution_count": 69, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "# test query to make sure the model works fine\n", 336 | "ps.query('classify-sklearn', x=[5.1, 3.5, 1.4, 0.2])" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "### Query from external applications via REST\n", 344 | "\n", 345 | "Now other applications can interact with our model! In the next section we will illustrate how to consume the model. We can also use other APIs like ps.update() to update a mode, ps.remove() to remove a model." 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "The model query is exposed through REST API. The path is:\n", 353 | "\n", 354 | " http(s):///data/\n", 355 | " \n", 356 | "And the payload is a JSON serialized string in the following format:\n", 357 | "\n", 358 | " {\"api_key\": ,\n", 359 | " \"data\": }\n", 360 | "\n", 361 | "Here the 'api key' may be obtained through ps.api_key, and data is the actual data passed to the custom predictive object in the Predictive Service. 
It will be passed to the query using **kwargs format\n", 362 | "\n", 363 | "Here is a sample curl command to query your model:\n", 364 | "\n", 365 | " curl -X POST -d '{\"api_key\":\"b437e588-0f2b-45e1-81c8-ce3acfa81ade\", \"data\":{\"x\":[5.1, 3.5, 1.4, 0.2]}}' http://demolab-one-six-2015364754.us-west-2.elb.amazonaws.com/query/classify-sklearn\n", 366 | " \n", 367 | " \n", 368 | "You can also query though Python using the **requests module**" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "### Query through Python" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 77, 381 | "metadata": { 382 | "collapsed": true 383 | }, 384 | "outputs": [], 385 | "source": [ 386 | "import json\n", 387 | "import requests\n", 388 | "\n", 389 | "def restful_query(x):\n", 390 | " headers = {'content-type': 'application/json'}\n", 391 | " payload = {'api_key':'b437e588-0f2b-45e1-81c8-ce3acfa81ade', \"data\":{\"x\": x}}\n", 392 | " end_point = 'http://demolab-one-six-2015364754.us-west-2.elb.amazonaws.com/query/classify-sklearn'\n", 393 | " return requests.post(end_point, json.dumps(payload), headers=headers).json()" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 78, 399 | "metadata": { 400 | "collapsed": false 401 | }, 402 | "outputs": [ 403 | { 404 | "data": { 405 | "text/plain": [ 406 | "{u'from_cache': True,\n", 407 | " u'model': u'classify-sklearn',\n", 408 | " u'response': [0],\n", 409 | " u'uuid': u'ea1a4314-4795-4ca6-9822-70774e4fdafd',\n", 410 | " u'version': 2}" 411 | ] 412 | }, 413 | "execution_count": 78, 414 | "metadata": {}, 415 | "output_type": "execute_result" 416 | } 417 | ], 418 | "source": [ 419 | "restful_query([5.1, 3.5, 1.4, 0.2])" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 80, 425 | "metadata": { 426 | "collapsed": false 427 | }, 428 | "outputs": [ 429 | { 430 | "data": { 431 | "text/plain": [ 432 | "{u'from_cache': False,\n", 433 | " u'model': u'classify-sklearn',\n", 434 | " u'response': [0],\n", 435 | " u'uuid': u'a96dc4e6-b3de-4e72-9526-e12174ea58af',\n", 436 | " u'version': 2}" 437 | ] 438 | }, 439 | "execution_count": 80, 440 | "metadata": {}, 441 | "output_type": "execute_result" 442 | } 443 | ], 444 | "source": [ 445 | "restful_query([5.1, 3.5, 1.4, 0.3])" 446 | ] 447 | } 448 | ], 449 | "metadata": { 450 | "kernelspec": { 451 | "display_name": "Python 2", 452 | "language": "python", 453 | "name": "python2" 454 | }, 455 | "language_info": { 456 | "codemirror_mode": { 457 | "name": "ipython", 458 | "version": 2 459 | }, 460 | "file_extension": ".py", 461 | "mimetype": "text/x-python", 462 | "name": "python", 463 | "nbconvert_exporter": "python", 464 | "pygments_lexer": "ipython2", 465 | "version": "2.7.10" 466 | } 467 | }, 468 | "nbformat": 4, 469 | "nbformat_minor": 0 470 | } 471 | -------------------------------------------------------------------------------- /strata-nyc-2015/recommendation-systems/README.md: -------------------------------------------------------------------------------- 1 | # Strata + Hadoop World, New York City, 2015 2 | 3 | This directory contains demo notebooks used for the "Introduction to Recommender Systems", the second session of **Machine Learning 101**, an all-day tutorial at [Strata + Hadoop World, New York City, 2015](http://strataconf.com/big-data-conference-ny-2015/public/schedule/detail/43217). 
4 | 5 | In this session we 6 | 7 | - give an introduction to recommendation systems, 8 | - show how easy it is to get started 9 | - provide examples and slides 10 | 11 | Along the way, we also cover feature engineering and deploying machine learning models as a predictive service. 12 | 13 | ## Setup Instructions 14 | 15 | You can browse the notebooks using Github IPython notebook viewer. Note that some images may not be rendered correctly. If you'd like to run it, follow these steps to set up your machine. 16 | 17 | - [Download](https://turi.com/download/) GraphLab Create and then follow instructions to [install](https://turi.com/download/install.html). 18 | - Download and unzip the datasets [[831MB]](https://static.turi.com/datasets/ml101_datasets_stratanyc_2015.zip) 19 | 20 | ## Handy references 21 | 22 | - [GraphLab Create User Guide](https://turi.com/learn/userguide) 23 | - [GraphLab Forum](http://forum.turi.com/categories/graphlab-create) 24 | -------------------------------------------------------------------------------- /strata-nyc-2015/recommendation-systems/book-recommender-exercises.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import graphlab as gl" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "The following code snippet will parse the books data provided at the training." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [ 28 | { 29 | "name": "stderr", 30 | "output_type": "stream", 31 | "text": [ 32 | "[INFO] This commercial license of GraphLab Create is assigned to engr@turi.com.\n", 33 | "\n", 34 | "[INFO] Start server at: ipc:///tmp/graphlab_server-41686 - Server binary: /Users/chris/miniconda/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1443482376.log\n", 35 | "[INFO] GraphLab Server Version: 1.6.1\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "import os\n", 41 | "if os.path.exists('data/books/ratings'):\n", 42 | " ratings = gl.SFrame('data/books/ratings')\n", 43 | " items = gl.SFrame('data/books/items')\n", 44 | " users = gl.SFrame('data/books/users')\n", 45 | "else:\n", 46 | " ratings = gl.SFrame.read_csv('data/books/book-ratings.csv')\n", 47 | " ratings.save('data/books/ratings')\n", 48 | " items = gl.SFrame.read_csv('data/books/book-data.csv')\n", 49 | " items.save('data/books/items')\n", 50 | " users = gl.SFrame.read_csv('data/books/user-data.csv')\n", 51 | " users.save('data/books/users')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "Visually explore the above data using GraphLab Canvas." 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Recommendation systems" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "In this section we will make a model that can be used to recommend new tags to users." 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "### Creating a Model" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "Use `gl.recommender.create()` to create a model that can be used to recommend tags to each user." 
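(A minimal sketch of one possible solution for this exercise. The column names 'name', 'book', and 'rating' are assumptions about this dataset, not confirmed by the notebook; check `ratings.column_names()` and adjust before running.)

```python
# train a default recommender on the ratings data
# NOTE: the user, item, and target column names below are assumed
m = gl.recommender.create(ratings,
                          user_id='name',
                          item_id='book',
                          target='rating')
```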
87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "outputs": [], 96 | "source": [] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Print a summary of the model by simply entering the name of the object." 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [], 112 | "source": [] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "Get all unique users from the first 10000 observations and save them as a variable called `users`." 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [], 128 | "source": [] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "Get 20 recommendations for each user in your list of users. Save these as a new SFrame called `recs`." 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "outputs": [], 144 | "source": [] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "## Inspecting your model" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "Get an SFrame of the 20 most similar items for each observed item." 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": { 164 | "collapsed": false 165 | }, 166 | "outputs": [], 167 | "source": [] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "This dataset has multiple rows corresponding to the same book, e.g., in situations where reprintings were done by different publishers in different year.\n", 174 | "\n", 175 | "For each unique value of 'book' in the `items` SFrame, select one of the of the available values for `author`, `publisher`, and `year`. Hint: Try using [`SFrame.groupby`](https://turi.com/products/create/docs/graphlab.data_structures.html#module-graphlab.aggregate) and [`gl.aggregate.SELECT_ONE`](https://turi.com/products/create/docs/graphlab.data_structures.html#graphlab.aggregate.SELECT_ONE)." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "collapsed": false 183 | }, 184 | "outputs": [], 185 | "source": [] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "Computing the number of times each book was rated, and add a column containing these counts to the `items` SFrame using `SFrame.join`." 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": false 199 | }, 200 | "outputs": [], 201 | "source": [] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "Print the first few books, sorted by the number of times they have been rated. Do these values make sense?" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [], 217 | "source": [] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "Now print the most similar items per item, sorted by the most common books. 
Hint: Join the two SFrames you created above." 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "collapsed": false 231 | }, 232 | "outputs": [], 233 | "source": [] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Experimenting with other models" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "Create a dataset called `implicit` that contains only ratings data where `rating` was 4 or greater." 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": { 253 | "collapsed": false 254 | }, 255 | "outputs": [], 256 | "source": [] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "Create a train/test split of the `implicit` data created above. Hint: Use [random_split_by_user](http://graphlab.com/products/create/docs/generated/graphlab.recommender.random_split_by_user.html#graphlab.recommender.random_split_by_user)." 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "collapsed": false 270 | }, 271 | "outputs": [], 272 | "source": [] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "Print the first 5 rows of the training set." 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "collapsed": false 286 | }, 287 | "outputs": [], 288 | "source": [] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "Create a `ranking_factorization_recommender` model using just the training set and 20 factors." 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": { 301 | "collapsed": false 302 | }, 303 | "outputs": [], 304 | "source": [] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "Evaluate how well this model recommends items that were seen in the test set you created above. Hint: Check out `m.evaluate_precision_recall()`." 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": { 317 | "collapsed": false 318 | }, 319 | "outputs": [], 320 | "source": [] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "Create an SFrame containing only one observation, where 'Billy Bob' has rated 'Animal Farm' with score 5.0." 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": { 333 | "collapsed": false 334 | }, 335 | "outputs": [], 336 | "source": [] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "Use this data when querying for recommendations for the user 'Billy Bob'." 
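(A hedged sketch of these last two steps, assuming `m` is the recommender trained earlier and that the ratings columns are named 'name', 'book', and 'rating'; adjust the names to match your data.)

```python
# one hypothetical new observation: 'Billy Bob' rated 'Animal Farm' 5.0
new_obs = gl.SFrame({'name': ['Billy Bob'],
                     'book': ['Animal Farm'],
                     'rating': [5.0]})

# pass the extra observation in at query time via new_observation_data
m.recommend(users=['Billy Bob'], new_observation_data=new_obs)
```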
343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": { 349 | "collapsed": false 350 | }, 351 | "outputs": [], 352 | "source": [] 353 | } 354 | ], 355 | "metadata": { 356 | "kernelspec": { 357 | "display_name": "Python 2", 358 | "language": "python", 359 | "name": "python2" 360 | }, 361 | "language_info": { 362 | "codemirror_mode": { 363 | "name": "ipython", 364 | "version": 2 365 | }, 366 | "file_extension": ".py", 367 | "mimetype": "text/x-python", 368 | "name": "python", 369 | "nbconvert_exporter": "python", 370 | "pygments_lexer": "ipython2", 371 | "version": "2.7.10" 372 | } 373 | }, 374 | "nbformat": 4, 375 | "nbformat_minor": 0 376 | } 377 | -------------------------------------------------------------------------------- /strata-nyc-2015/recommendation-systems/strata-nyc-2015-recommendation-systems.key: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/recommendation-systems/strata-nyc-2015-recommendation-systems.key -------------------------------------------------------------------------------- /strata-nyc-2015/recommendation-systems/strata-nyc-2015-recommendation-systems.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/recommendation-systems/strata-nyc-2015-recommendation-systems.pptx -------------------------------------------------------------------------------- /strata-sj-2016/README.md: -------------------------------------------------------------------------------- 1 | # Strata + Hadoop World, San Jose, 2016 2 | 3 | This directory contains demo notebooks used for **Practical Machine Learning**, an all-day tutorial 4 | at [Strata + Hadoop World, San Jose, 2016](http://conferences.oreilly.com/strata/hadoop-big-data-ca/public/schedule/detail/47056). 5 | The course is designed to introduce machine learning via real applications like 6 | - building a recommender 7 | - detecting anomalies 8 | - analyzing time series data 9 | - image analysis using deep learning 10 | - predicting customer churn 11 | - deploying machine learning 12 | 13 | ## Setup Instructions 14 | 15 | You can browse the notebooks using Github's notebook viewer, but please note that 16 | some images may not be rendered correctly. Follow these 17 | steps to set up and run the notebooks on your own machine. 18 | 19 | - [Download](https://turi.com/download/) GraphLab Create v1.8.5 and then follow instructions to [install](https://turi.com/download/install.html). 
20 | - Download and unzip the datasets [[831MB]](https://static.turi.com/datasets/ml101_datasets_stratasj_2016.zip) 21 | - [Install Jupyter notebook](http://jupyter.readthedocs.org/en/latest/install.html) (needed only if you install GraphLab Create via command line) 22 | 23 | ## Handy references 24 | 25 | - [GraphLab Create User Guide](https://turi.com/learn/userguide) 26 | - [GraphLab Forum](http://forum.turi.com/categories/graphlab-create) 27 | -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/Strata-SJ-2016-Deeplearning.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/Strata-SJ-2016-Deeplearning.pptx -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/AA1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/AA1.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/alexnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/alexnet.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/cifar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/cifar.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/evaluate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/evaluate.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/extract_features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/extract_features.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/improve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/improve.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/linear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/linear.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/load.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/load.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/quadratic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/quadratic.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/spiral.1-2.2-2-2-2-2-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/spiral.1-2.2-2-2-2-2-2.jpg -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/train.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/workflow1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/workflow1.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/workflow2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/workflow2.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/workflow3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/workflow3.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/workflow4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/workflow4.png -------------------------------------------------------------------------------- /strata-sj-2016/ml-in-production/deploy-scikit-learn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Introduction to ML Deployment\n", 8 | "==================\n", 9 | "\n", 10 | "Deploying models created using python in a Turi Predictive Service is very easy. This notebook walks you through the step-by-step process. \n", 11 | "\n", 12 | "\n", 13 | "\n", 14 | "-----------------------\n", 15 | "\n", 16 | "Deployment Steps\n", 17 | "=========\n", 18 | "The notebook has three sections: \n", 19 | "\n", 20 | "1. Create a model\n", 21 | "2. Create a predictive service\n", 22 | "3. 
Query the model\n", 23 | "\n", 24 | "If you are deploying a model in an existing Predictive Service instance you can go to step (2) directly.\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Preliminaries\n", 32 | "For the following code, you will need to install the following packages:\n", 33 | "\n", 34 | "```\n", 35 | "pip install graphlab-create\n", 36 | "pip install sklearn\n", 37 | "pip install numpy\n", 38 | "pip install scipy\n", 39 | "```" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## 1. Create a model \n", 47 | "\n", 48 | "Let's train a simple random forest model and deploy it in the Predictive Service.\n", 49 | "\n", 50 | "" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "from sklearn.ensemble import RandomForestClassifier\n", 62 | "from sklearn.datasets import load_iris\n", 63 | "iris = load_iris()\n", 64 | "\n", 65 | "model = RandomForestClassifier(n_estimators=10)\n", 66 | "model = model.fit(iris['data'], iris['target'])\n", 67 | "model" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "We can expose the trained model as a REST endpoint. This will allow other applications to consume the predictions from the model. \n", 75 | "\n", 76 | "In order to do that, we wrap the model object in a Python function and add it to the Predictive Service. In the function you may add your own logic to transform the input to the model, ensemble different models, or manipulate the output before returning. Check out the [user guide](https://turi.com/learn/userguide/#Deployment) for more details.\n", 77 | "\n", 78 | "The result of the function needs to be a **JSON serializable** object." 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "def classify(x):\n", 90 | " prediction = model.predict(x)\n", 91 | "\n", 92 | " # convert into a json serializable value\n", 93 | " return list(prediction)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "## 2. Create a Predictive Service (One time) \n", 101 | "\n", 102 | "This section shows you how to deploy a Predictive Service to EC2. The EC2 instances used by the Predictive Service will be launched in your own AWS account, so you will be responsible for the cost. \n", 103 | "\n", 104 | "" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "To create a Predictive Service in Amazon AWS, we first configure the EC2 Config object, which contains the configuration parameters required for launching a Predictive Service cluster in EC2. These fields are optional and include the region, instance type, CIDR rules, etc. Predictive Service uses this configuration for service creation.\n", 112 | "\n", 113 | "Having configured our EC2 Config object, we're ready to launch a Predictive Service Deployment. There are a few aspects of the Predictive Service that can be customized:\n", 114 | "* Number of nodes in the service - By default the number of hosts (`num_hosts`) is 1. To obtain good cache utility and high availability, we recommend setting `num_hosts` to at least 3.\n", 115 | "* State path to persist service state and service logs. This is an S3 location.
\n", 116 | "* Port to be used by the server.\n", 117 | "* Other settings, such as SSL credentials etc.\n", 118 | "\n", 119 | "The following code snippet shows you how to create a Predictive Service. You will have to replace the ps_state_path and credentials for your Predictive Service." 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "import graphlab as gl\n", 131 | "\n", 132 | "# Replace with your path.\n", 133 | "ps_state_path = 's3:///predictive_service/ps'\n", 134 | "\n", 135 | "# Set your AWS credentials.\n", 136 | "gl.aws.set_credentials(, )\n", 137 | "\n", 138 | "# Create an EC2 config\n", 139 | "ec2 = gl.deploy.Ec2Config()\n", 140 | "\n", 141 | "# Launch a predictive service\n", 142 | "ps = gl.deploy.predictive_service.create(\n", 143 | " name='sklearn-predictive-service', \n", 144 | " ec2_config=ec2, state_path=ps_state_path, num_hosts=1)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "### Load an already created service" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "import graphlab as gl\n", 163 | "ps = gl.deploy.predictive_service.load('s3://gl-demo-usw2/predictive_service/demolab/ps-1.8.4')" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "ps" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "# ps.add('classify-sklearn', classify, description='Classify an iris based on petal and sepal dimensions')\n", 186 | "ps.update('classify-sklearn', classify, description='Classify an iris based on petal and sepal dimensions')" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": { 193 | "collapsed": false 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "ps.apply_changes()" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "## Query the model \n", 205 | "\n", 206 | "You may do a test query before really deploying it to production. This will help detect errors in the function before deploying it the Predictive Service. \n", 207 | "\n", 208 | "" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "collapsed": false 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "ps.test_query('classify-sklearn', x=[5.1, 3.5, 1.4, 0.2])" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "Now, let us query the real service." 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": false 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "# test query to make sure the model works fine\n", 238 | "ps.query('classify-sklearn', x=[5.1, 3.5, 1.4, 0.2])" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "### Query from external applications via REST\n", 246 | "\n", 247 | "Now other applications can interact with our model! In the next section we will illustrate how to consume the model. 
We can also use other APIs like ps.update() to update a mode, ps.remove() to remove a model." 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "The model query is exposed through REST API. The url to query is:\n", 255 | "\n", 256 | " http(s):///query/\n", 257 | "\n", 258 | "The predictive service uses basic access authentication to authorize the client to query. The client needs to provide the service's API key in the HTTP header as the password for user name `api_key`. The 'api key' may be obtained through ps.api_key\n", 259 | "\n", 260 | "The payload is a JSON serialized string in the following format:\n", 261 | "\n", 262 | " { \"data\": }\n", 263 | "\n", 264 | "The data is the actual data passed to the custom predictive object in the Predictive Service. It will be passed to the query using **kwargs format**.\n", 265 | "\n", 266 | "Here is a sample curl command to query your model:\n", 267 | "\n", 268 | " curl -u api_key:b0a1c056-30b9-4468-9b8d-c07289017228 -d '{\"data\":{\"x\":[5.1, 3.5, 1.4, 0.2]}}' http://demolab-one-six-2015364754.us-west-2.elb.amazonaws.com/query/classify-sklearn\n", 269 | "\n", 270 | " \n", 271 | "You can also query though Python using the **requests module**" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "### Query through Python" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "collapsed": true 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "import json\n", 290 | "import requests\n", 291 | "from requests.auth import HTTPBasicAuth\n", 292 | "\n", 293 | "def restful_query(x):\n", 294 | " headers = {'content-type': 'application/json'}\n", 295 | " payload = {\"data\":{\"x\": x}}\n", 296 | " end_point = 'http://demolab-ps-one-eight-four-810335136.us-west-2.elb.amazonaws.com/query/classify-sklearn'\n", 297 | " return requests.post(\n", 298 | " end_point,\n", 299 | " json.dumps(payload),\n", 300 | " headers=headers,\n", 301 | " auth=HTTPBasicAuth('api_key', '9d97391e-8be7-47a9-8b72-34ecc9f0ad60')).json()" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": { 308 | "collapsed": false 309 | }, 310 | "outputs": [], 311 | "source": [ 312 | "restful_query([5.1, 3.5, 1.4, 0.2])" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": false 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "restful_query([5.1, 3.5, 1.4, 0.3])" 324 | ] 325 | } 326 | ], 327 | "metadata": { 328 | "kernelspec": { 329 | "display_name": "Python 2", 330 | "language": "python", 331 | "name": "python2" 332 | }, 333 | "language_info": { 334 | "codemirror_mode": { 335 | "name": "ipython", 336 | "version": 2 337 | }, 338 | "file_extension": ".py", 339 | "mimetype": "text/x-python", 340 | "name": "python", 341 | "nbconvert_exporter": "python", 342 | "pygments_lexer": "ipython2", 343 | "version": "2.7.11" 344 | } 345 | }, 346 | "nbformat": 4, 347 | "nbformat_minor": 0 348 | } 349 | -------------------------------------------------------------------------------- /strata-sj-2016/ml-in-production/images/left.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/ml-in-production/images/left.png -------------------------------------------------------------------------------- 
/strata-sj-2016/ml-in-production/images/middle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/ml-in-production/images/middle.png -------------------------------------------------------------------------------- /strata-sj-2016/ml-in-production/images/predictive_services_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/ml-in-production/images/predictive_services_overview.png -------------------------------------------------------------------------------- /strata-sj-2016/ml-in-production/images/right.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/ml-in-production/images/right.png -------------------------------------------------------------------------------- /strata-sj-2016/ml-in-production/ml-production.key: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/ml-in-production/ml-production.key -------------------------------------------------------------------------------- /strata-sj-2016/recommendation-systems/README.md: -------------------------------------------------------------------------------- 1 | # Strata + Hadoop World, San Jose 2016 2 | 3 | This directory contains demo notebooks used for the "Introduction to Recommender Systems", the second session of **Machine Learning 101**, an all-day tutorial at [Strata + Hadoop World, New York City, 2015](http://strataconf.com/big-data-conference-ny-2015/public/schedule/detail/43217). 4 | 5 | In this session we 6 | 7 | - give an introduction to recommendation systems, 8 | - show how easy it is to get started 9 | - provide examples and slides 10 | 11 | Along the way, we also cover feature engineering and deploying machine learning models as a predictive service. 12 | 13 | ## Setup Instructions 14 | 15 | You can browse the notebooks using Github IPython notebook viewer. Note that some images may not be rendered correctly. If you'd like to run it, follow these steps to set up your machine. 16 | 17 | - [Download](https://turi.com/download/) GraphLab Create and then follow instructions to [install](https://turi.com/download/install.html). 18 | - Download and unzip the datasets 19 | 20 | ## Handy references 21 | 22 | - [GraphLab Create User Guide](https://turi.com/learn/userguide) 23 | - [GraphLab Forum](http://forum.turi.com/categories/graphlab-create) 24 | -------------------------------------------------------------------------------- /strata-sj-2016/recommendation-systems/book-recommender-exercises.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import graphlab as gl" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "The following code snippet will parse the books data provided at the training." 
19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [ 28 | { 29 | "name": "stderr", 30 | "output_type": "stream", 31 | "text": [ 32 | "[INFO] This commercial license of GraphLab Create is assigned to engr@turi.com.\n", 33 | "\n", 34 | "[INFO] Start server at: ipc:///tmp/graphlab_server-41686 - Server binary: /Users/chris/miniconda/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1443482376.log\n", 35 | "[INFO] GraphLab Server Version: 1.6.1\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "import os\n", 41 | "if os.path.exists('data/books/ratings'):\n", 42 | " ratings = gl.SFrame('data/books/ratings')\n", 43 | " items = gl.SFrame('data/books/items')\n", 44 | " users = gl.SFrame('data/books/users')\n", 45 | "else:\n", 46 | " ratings = gl.SFrame.read_csv('data/books/book-ratings.csv')\n", 47 | " ratings.save('data/books/ratings')\n", 48 | " items = gl.SFrame.read_csv('data/books/book-data.csv')\n", 49 | " items.save('data/books/items')\n", 50 | " users = gl.SFrame.read_csv('data/books/user-data.csv')\n", 51 | " users.save('data/books/users')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "Visually explore the above data using GraphLab Canvas." 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Recommendation systems" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "In this section we will make a model that can be used to recommend new tags to users." 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "### Creating a Model" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "Use `gl.recommender.create()` to create a model that can be used to recommend tags to each user." 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "outputs": [], 96 | "source": [] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Print a summary of the model by simply entering the name of the object." 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [], 112 | "source": [] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "Get all unique users from the first 10000 observations and save them as a variable called `users`." 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [], 128 | "source": [] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "Get 20 recommendations for each user in your list of users. Save these as a new SFrame called `recs`." 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "outputs": [], 144 | "source": [] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "## Inspecting your model" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "Get an SFrame of the 20 most similar items for each observed item." 
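(One way the empty cells above could be filled in — a sketch only, building on the `import graphlab as gl` and the `ratings` SFrame loaded earlier in this notebook. `gl.recommender.create` picks a model type automatically; the user and item column names are assumed here to be `name` and `book`.)

```python
# Explore the data visually in GraphLab Canvas.
ratings.show()

# Let the toolkit pick a recommender; 'name' and 'book' are assumed column names.
m = gl.recommender.create(ratings, user_id='name', item_id='book', target='rating')

# Summary of the trained model.
print m

# Unique users among the first 10,000 observations.
users = ratings['name'][:10000].unique()

# Top-20 recommendations for each of those users.
recs = m.recommend(users=users, k=20)

# The 20 most similar items for each observed item.
sims = m.get_similar_items(k=20)
```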
158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": { 164 | "collapsed": false 165 | }, 166 | "outputs": [], 167 | "source": [] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "This dataset has multiple rows corresponding to the same book, e.g., in situations where reprintings were done by different publishers in different years.\n", 174 | "\n", 175 | "For each unique value of 'book' in the `items` SFrame, select one of the available values for `author`, `publisher`, and `year`. Hint: Try using [`SFrame.groupby`](https://turi.com/products/create/docs/graphlab.data_structures.html#module-graphlab.aggregate) and [`gl.aggregate.SELECT_ONE`](https://turi.com/products/create/docs/graphlab.data_structures.html#graphlab.aggregate.SELECT_ONE)." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "collapsed": false 183 | }, 184 | "outputs": [], 185 | "source": [] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "Compute the number of times each book was rated, and add a column containing these counts to the `items` SFrame using `SFrame.join`." 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": false 199 | }, 200 | "outputs": [], 201 | "source": [] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "Print the first few books, sorted by the number of times they have been rated. Do these values make sense?" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [], 217 | "source": [] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "Now print the most similar items per item, sorted by the most common books. Hint: Join the two SFrames you created above." 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "collapsed": false 231 | }, 232 | "outputs": [], 233 | "source": [] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Experimenting with other models" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "Create a dataset called `implicit` that contains only ratings data where `rating` was 4 or greater." 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": { 253 | "collapsed": false 254 | }, 255 | "outputs": [], 256 | "source": [] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "Create a train/test split of the `implicit` data created above. Hint: Use [random_split_by_user](http://graphlab.com/products/create/docs/generated/graphlab.recommender.random_split_by_user.html#graphlab.recommender.random_split_by_user)." 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "collapsed": false 270 | }, 271 | "outputs": [], 272 | "source": [] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "Print the first 5 rows of the training set."
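(A sketch for the de-duplication and implicit-data exercises above, under the same assumed column names as the earlier sketch; `random_split_by_user` is assumed to live under `gl.recommender.util` in recent GraphLab Create releases.)

```python
# One row per unique book, picking one of the available author/publisher/year values.
unique_items = items.groupby('book',
                             {'author': gl.aggregate.SELECT_ONE('author'),
                              'publisher': gl.aggregate.SELECT_ONE('publisher'),
                              'year': gl.aggregate.SELECT_ONE('year')})

# Number of ratings per book, joined onto the de-duplicated items and sorted.
num_ratings = ratings.groupby('book', {'num_ratings': gl.aggregate.COUNT()})
unique_items = unique_items.join(num_ratings, on='book')
unique_items.sort('num_ratings', ascending=False).print_rows(10)

# Similar items for the most commonly rated books (uses `sims` from the earlier sketch).
sims.join(num_ratings, on='book').sort('num_ratings', ascending=False).print_rows(10)

# Keep only ratings of 4 or greater as implicit "likes", then split by user.
implicit = ratings[ratings['rating'] >= 4]
train, test = gl.recommender.util.random_split_by_user(
    implicit, user_id='name', item_id='book')
train.print_rows(5)
```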
279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "collapsed": false 286 | }, 287 | "outputs": [], 288 | "source": [] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "Create a `ranking_factorization_recommender` model using just the training set and 20 factors." 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": { 301 | "collapsed": false 302 | }, 303 | "outputs": [], 304 | "source": [] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "Evaluate how well this model recommends items that were seen in the test set you created above. Hint: Check out `m.evaluate_precision_recall()`." 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": { 317 | "collapsed": false 318 | }, 319 | "outputs": [], 320 | "source": [] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "Create an SFrame containing only one observation, where 'Billy Bob' has rated 'Animal Farm' with score 5.0." 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": { 333 | "collapsed": false 334 | }, 335 | "outputs": [], 336 | "source": [] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "Use this data when querying for recommendations for the user 'Billy Bob'." 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": { 349 | "collapsed": false 350 | }, 351 | "outputs": [], 352 | "source": [] 353 | } 354 | ], 355 | "metadata": { 356 | "kernelspec": { 357 | "display_name": "Python 2", 358 | "language": "python", 359 | "name": "python2" 360 | }, 361 | "language_info": { 362 | "codemirror_mode": { 363 | "name": "ipython", 364 | "version": 2 365 | }, 366 | "file_extension": ".py", 367 | "mimetype": "text/x-python", 368 | "name": "python", 369 | "nbconvert_exporter": "python", 370 | "pygments_lexer": "ipython2", 371 | "version": "2.7.10" 372 | } 373 | }, 374 | "nbformat": 4, 375 | "nbformat_minor": 0 376 | } 377 | -------------------------------------------------------------------------------- /strata-sj-2016/recommendation-systems/strata-sj-2016-recommendation-systems.key: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/recommendation-systems/strata-sj-2016-recommendation-systems.key -------------------------------------------------------------------------------- /strata-sj-2016/recommendation-systems/strata-sj-2016-recommendation-systems.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/recommendation-systems/strata-sj-2016-recommendation-systems.pptx -------------------------------------------------------------------------------- /strata-sj-2016/time-series/anomaly_detection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 1. 
Load and inspect the data: Oklahoma earthquake stats" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import graphlab as gl\n", 19 | "\n", 20 | "okla_daily = gl.load_timeseries('working_data/ok_daily_stats.ts')\n", 21 | "\n", 22 | "print \"Number of rows:\", len(okla_daily)\n", 23 | "print \"Start:\", okla_daily.min_time\n", 24 | "print \"End:\", okla_daily.max_time\n", 25 | "okla_daily.print_rows(3)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import matplotlib.pyplot as plt\n", 37 | "%matplotlib notebook\n", 38 | "plt.style.use('ggplot')\n", 39 | "\n", 40 | "fig, ax = plt.subplots()\n", 41 | "ax.plot(okla_daily['time'], okla_daily['count'], color='dodgerblue')\n", 42 | "ax.set_ylabel('Number of quakes')\n", 43 | "ax.set_xlabel('Date')\n", 44 | "fig.autofmt_xdate()\n", 45 | "fig.show()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "---" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "# 2. Let the toolkit choose the model " 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "from graphlab.toolkits import anomaly_detection\n", 71 | "\n", 72 | "model = anomaly_detection.create(okla_daily, features=['count'])\n", 73 | "\n", 74 | "print model" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "---" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "# 3. The simple thresholding model" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "threshold = 5\n", 100 | "anomaly_mask = okla_daily['count'] >= threshold\n", 101 | "\n", 102 | "anomaly_scores = okla_daily[['count']]\n", 103 | "anomaly_scores['threshold_score'] = anomaly_mask" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "anomaly_scores.tail(8).print_rows()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "---" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "# 4. 
The moving Z-score model" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "from graphlab.toolkits.anomaly_detection import moving_zscore\n", 140 | "\n", 141 | "zscore_model = moving_zscore.create(okla_daily, feature='count',\n", 142 | " window_size=30,\n", 143 | " min_observations=15)\n", 144 | "\n", 145 | "print zscore_model" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "zscore_model.scores.tail(3)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": { 163 | "collapsed": false 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "zscore_model.scores.head(3)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": { 174 | "collapsed": false 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "anomaly_scores['outlier_score'] = zscore_model.scores['anomaly_score']\n", 179 | "anomaly_scores.tail(5).print_rows()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": { 186 | "collapsed": false, 187 | "scrolled": false 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "fig, ax = plt.subplots(2, sharex=True)\n", 192 | "ax[0].plot(anomaly_scores['time'], anomaly_scores['count'], color='dodgerblue')\n", 193 | "ax[0].set_ylabel('# quakes')\n", 194 | "\n", 195 | "ax[1].plot(anomaly_scores['time'], anomaly_scores['outlier_score'], color='orchid')\n", 196 | "ax[1].set_ylabel('outlier score')\n", 197 | "\n", 198 | "ax[1].set_xlabel('Date')\n", 199 | "fig.autofmt_xdate()\n", 200 | "fig.show()" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "---" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "# 5. 
The Bayesian changepoint model" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": { 221 | "collapsed": false 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "from graphlab.toolkits.anomaly_detection import bayesian_changepoints\n", 226 | "\n", 227 | "changept_model = bayesian_changepoints.create(okla_daily, feature='count',\n", 228 | " expected_runlength=2000, lag=7)\n", 229 | "\n", 230 | "print changept_model" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": { 237 | "collapsed": false 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "anomaly_scores['changepoint_score'] = changept_model.scores['changepoint_score']\n", 242 | "anomaly_scores.head(5)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "fig, ax = plt.subplots(3, sharex=True)\n", 254 | "ax[0].plot(anomaly_scores['time'], anomaly_scores['count'], color='dodgerblue')\n", 255 | "ax[0].set_ylabel('# quakes')\n", 256 | "\n", 257 | "ax[1].plot(anomaly_scores['time'], anomaly_scores['outlier_score'], color='orchid')\n", 258 | "ax[1].set_ylabel('outlier score')\n", 259 | "\n", 260 | "ax[2].plot(anomaly_scores['time'], anomaly_scores['changepoint_score'], color='orchid')\n", 261 | "ax[2].set_ylabel('changepoint score')\n", 262 | "\n", 263 | "ax[2].set_xlabel('Date')\n", 264 | "fig.autofmt_xdate()\n", 265 | "fig.show()" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "---" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "# 6. How to use the anomaly scores" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "## Option 1: choose an anomaly threshold *a priori*" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "- Slightly better than choosing a threshold in the original feature space.\n", 294 | "- For Bayesian changepoint detection, where the scores are probabilities, there is a natural threshold of 0.5." 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": { 301 | "collapsed": false 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "threshold = 0.5\n", 306 | "anom_mask = anomaly_scores['changepoint_score'] >= threshold\n", 307 | "\n", 308 | "anomalies = anomaly_scores[anom_mask]\n", 309 | "\n", 310 | "print \"Number of anomalies:\", len(anomalies)\n", 311 | "anomalies.head(5)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "## Option 2: choose the top-k anomalies" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "If you have a fixed budget for investigating and acting on anomalies, this is a good way to go." 
326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": { 332 | "collapsed": false 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "anomalies = anomaly_scores.to_sframe().topk('changepoint_score', k=5)\n", 337 | "\n", 338 | "print \"Number of anomalies:\", len(anomalies)\n", 339 | "anomalies.head(5)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "## Option 3: look at the anomaly distribution and choose a threshold" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": { 353 | "collapsed": false 354 | }, 355 | "outputs": [], 356 | "source": [ 357 | "anomaly_scores['changepoint_score'].show()" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": { 364 | "collapsed": false 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "threshold = 0.072\n", 369 | "anom_mask = anomaly_scores['changepoint_score'] >= threshold\n", 370 | "\n", 371 | "anomalies = anomaly_scores[anom_mask]\n", 372 | "\n", 373 | "print \"Number of anomalies:\", len(anomalies)\n", 374 | "anomalies.head(5)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "## Option 4: get fancy with plotting " 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": { 388 | "collapsed": false, 389 | "scrolled": false 390 | }, 391 | "outputs": [], 392 | "source": [ 393 | "from interactive_plot import LineDrawer\n", 394 | "\n", 395 | "fig, ax = plt.subplots(3, sharex=True)\n", 396 | "guide_lines = []\n", 397 | "threshold_lines = []\n", 398 | "\n", 399 | "p = ax[0].plot(anomaly_scores['time'], anomaly_scores['count'],\n", 400 | " color='dodgerblue')\n", 401 | "ax[0].set_ylabel('# quakes')\n", 402 | "\n", 403 | "line, = ax[0].plot((anomaly_scores.min_time, anomaly_scores.min_time),\n", 404 | " ax[0].get_ylim(), lw=1, ls='--', color='black')\n", 405 | "guide_lines.append(line)\n", 406 | "\n", 407 | "ax[1].plot(anomaly_scores['time'], anomaly_scores['outlier_score'],\n", 408 | " color='orchid')\n", 409 | "ax[1].set_ylabel('outlier score')\n", 410 | "line, = ax[1].plot((anomaly_scores.min_time, anomaly_scores.min_time),\n", 411 | " ax[1].get_ylim(), lw=1, ls='--', color='black')\n", 412 | "guide_lines.append(line)\n", 413 | "\n", 414 | "ax[2].plot(anomaly_scores['time'], anomaly_scores['changepoint_score'],\n", 415 | " color='orchid')\n", 416 | "ax[2].set_ylabel('changepoint score')\n", 417 | "ax[2].set_xlabel('Date')\n", 418 | "line, = ax[2].plot((anomaly_scores.min_time, anomaly_scores.min_time), (0., 1.),\n", 419 | " lw=1, ls='--', color='black')\n", 420 | "guide_lines.append(line)\n", 421 | "\n", 422 | "for a in ax:\n", 423 | " line, = a.plot(anomaly_scores.range, (0., 0.), lw=1, ls='--',\n", 424 | " color='black')\n", 425 | " threshold_lines.append(line)\n", 426 | "\n", 427 | "plot_scores = anomaly_scores[['count', 'outlier_score', 'changepoint_score']]\n", 428 | "interactive_thresholder = LineDrawer(plot_scores, guide_lines, threshold_lines)\n", 429 | "interactive_thresholder.connect()\n", 430 | "fig.autofmt_xdate()\n", 431 | "fig.show()" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": { 438 | "collapsed": false 439 | }, 440 | "outputs": [], 441 | "source": [ 442 | "interactive_thresholder.anoms.print_rows(10)" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": {}, 448 | 
"source": [ 449 | "---" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "# 7. Updating the model with new data" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": { 463 | "collapsed": false 464 | }, 465 | "outputs": [], 466 | "source": [ 467 | "okla_new = gl.load_timeseries('working_data/ok_daily_update.ts')\n", 468 | "okla_new.print_rows(20)" 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": {}, 474 | "source": [ 475 | "Why do we want to update the model, rather than training a new one?\n", 476 | "1. Because we've updated our parameters using the data we've seen already.\n", 477 | "2. Updating simplifies the drudgery of prepending the data to get a final score for the lags in the previous data set." 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": { 484 | "collapsed": false 485 | }, 486 | "outputs": [], 487 | "source": [ 488 | "changept_model2 = changept_model.update(okla_new)\n", 489 | "\n", 490 | "print changept_model2" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": { 497 | "collapsed": false 498 | }, 499 | "outputs": [], 500 | "source": [ 501 | "changept_model2.scores.print_rows(20)" 502 | ] 503 | } 504 | ], 505 | "metadata": { 506 | "kernelspec": { 507 | "display_name": "Python 2", 508 | "language": "python", 509 | "name": "python2" 510 | }, 511 | "language_info": { 512 | "codemirror_mode": { 513 | "name": "ipython", 514 | "version": 2 515 | }, 516 | "file_extension": ".py", 517 | "mimetype": "text/x-python", 518 | "name": "python", 519 | "nbconvert_exporter": "python", 520 | "pygments_lexer": "ipython2", 521 | "version": "2.7.11" 522 | } 523 | }, 524 | "nbformat": 4, 525 | "nbformat_minor": 0 526 | } 527 | -------------------------------------------------------------------------------- /strata-sj-2016/time-series/forecasting_basics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 1. Load and inspect the data: daily global earthquakes " 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Load the main dataset: Feb. 2, 2013 - Mar. 15, 2016" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import graphlab as gl\n", 26 | "\n", 27 | "daily_stats = gl.load_timeseries('working_data/global_daily_stats.ts')\n", 28 | "\n", 29 | "print \"Number of rows:\", len(daily_stats)\n", 30 | "print \"Start:\", daily_stats.min_time\n", 31 | "print \"End:\", daily_stats.max_time\n", 32 | "daily_stats.print_rows(3)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Load the recent data: Mar. 16, 2016 - Mar. 22, 2016" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "The first point in this dataset is our forecasting goal. Pretend it's March 15, and we don't know the count of earthquakes for March 16th." 
47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "daily_update = gl.load_timeseries('working_data/global_daily_update.ts')\n", 58 | "daily_update.print_rows()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Visualize the data with GraphLab Canvas" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": false 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "daily_stats.to_sframe().show()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "## Visualize the data with matplotlib" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": false, 91 | "scrolled": false 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "import matplotlib.pyplot as plt\n", 96 | "%matplotlib notebook\n", 97 | "plt.style.use('ggplot')\n", 98 | "\n", 99 | "fig, ax = plt.subplots()\n", 100 | "ax.plot(daily_stats['time'], daily_stats['count'], color='dodgerblue')\n", 101 | "ax.set_xlabel('Date')\n", 102 | "ax.set_ylabel('Number of earthquakes')\n", 103 | "fig.autofmt_xdate()\n", 104 | "fig.show()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "---" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "# 2. A naive baseline: the grand mean" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "baseline_forecast = daily_stats['count'].mean()\n", 130 | "\n", 131 | "print baseline_forecast" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "---" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "# 3. 
The autoregressive model " 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "## Create lagged features" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "collapsed": false 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "daily_stats['lag1_count'] = daily_stats.shift(1)['count']\n", 164 | "daily_stats['lag2_count'] = daily_stats.shift(2)['count']\n", 165 | "\n", 166 | "daily_stats.print_rows(3)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "## Train the model" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": { 180 | "collapsed": false, 181 | "scrolled": false 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "train_counts = daily_stats[2:].to_sframe()\n", 186 | "\n", 187 | "ar_model = gl.linear_regression.create(train_counts, target='count',\n", 188 | " features=['lag1_count', 'lag2_count'],\n", 189 | " l2_penalty=0., validation_set=None,\n", 190 | " verbose=False)\n", 191 | "\n", 192 | "print ar_model" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "collapsed": false 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "train_counts.tail(5).print_rows()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "## Get a forecast from the model " 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "## Construct the input dataset first.\n", 222 | "sf_forecast = gl.SFrame({'lag1_count': [daily_stats['count'][-1]],\n", 223 | " 'lag2_count': [daily_stats['count'][-2]]})\n", 224 | "\n", 225 | "## Compute the model's forecast\n", 226 | "ar_forecast = ar_model.predict(sf_forecast)\n", 227 | "print ar_forecast[0]" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "****" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "# 4. The gradient-boosted trees model" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "## Split the timestamp into parts" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": { 255 | "collapsed": false 256 | }, 257 | "outputs": [], 258 | "source": [ 259 | "date_parts = daily_stats.index.split_datetime(column_name_prefix='date',\n", 260 | " limit=['year', 'month', 'day'])" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "## Create lags for *observed* features" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "To forecast tomorrow's earthqauke count:\n", 275 | "- we do know what the date will be, so no need to lag,\n", 276 | "- we don't know what the max and average magnitude will be, so we need to lag." 
277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [], 286 | "source": [ 287 | "daily_stats['lag1_avg_mag'] = daily_stats.shift(1)['avg_mag']\n", 288 | "daily_stats['lag1_max_mag'] = daily_stats.shift(1)['max_mag']" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": { 295 | "collapsed": false 296 | }, 297 | "outputs": [], 298 | "source": [ 299 | "sf_train = daily_stats.to_sframe()\n", 300 | "sf_train = sf_train.add_columns(date_parts)\n", 301 | "\n", 302 | "sf_train.print_rows(3)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "## Train the model " 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": { 316 | "collapsed": false 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "feature_list = ['lag1_avg_mag', 'lag1_max_mag', 'lag1_count',\n", 321 | " 'date.year', 'date.month', 'date.day']\n", 322 | "\n", 323 | "# Remove the row with no lagged features.\n", 324 | "sf_train = sf_train[1:]\n", 325 | "\n", 326 | "gbt_model = gl.boosted_trees_regression.create(sf_train, target='count',\n", 327 | " features=feature_list,\n", 328 | " max_iterations=20,\n", 329 | " validation_set=None,\n", 330 | " verbose=False)\n", 331 | "\n", 332 | "print gbt_model" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "## Compute the model's forecast " 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": false 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "## Prepend the last couple rows of the training data.\n", 351 | "ts_forecast = daily_stats[daily_update.column_names()][-2:].union(daily_update)\n", 352 | "\n", 353 | "## Create the lagged features.\n", 354 | "ts_forecast['lag1_avg_mag'] = ts_forecast.shift(1)['avg_mag']\n", 355 | "ts_forecast['lag1_max_mag'] = ts_forecast.shift(1)['max_mag']\n", 356 | "ts_forecast['lag1_count'] = ts_forecast.shift(1)['count']\n", 357 | "\n", 358 | "## Split the timestamp into date parts.\n", 359 | "new_date_parts = ts_forecast.index.split_datetime(column_name_prefix='date',\n", 360 | " limit=['year', 'month', 'day'])\n", 361 | "\n", 362 | "## Add the date parts to the dataset.\n", 363 | "sf_forecast = ts_forecast.to_sframe().add_columns(new_date_parts)\n", 364 | "\n", 365 | "sf_forecast.print_rows(3)" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": { 372 | "collapsed": false 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "gbt_forecast = gbt_model.predict(sf_forecast)\n", 377 | "gbt_forecast[2]" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "--- " 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "# 5. And the winner is... 
" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": { 398 | "collapsed": false 399 | }, 400 | "outputs": [], 401 | "source": [ 402 | "print \"Actual value for March 16:\", daily_update['count'][0]\n", 403 | "print \"\\nBaseline forecast:\", baseline_forecast\n", 404 | "print \"AR model forecast:\", ar_forecast[0]\n", 405 | "print \"GBT forecast:\", gbt_forecast[2], \"\\t(*** winner ***)\"" 406 | ] 407 | } 408 | ], 409 | "metadata": { 410 | "kernelspec": { 411 | "display_name": "Python 2", 412 | "language": "python", 413 | "name": "python2" 414 | }, 415 | "language_info": { 416 | "codemirror_mode": { 417 | "name": "ipython", 418 | "version": 2 419 | }, 420 | "file_extension": ".py", 421 | "mimetype": "text/x-python", 422 | "name": "python", 423 | "nbconvert_exporter": "python", 424 | "pygments_lexer": "ipython2", 425 | "version": "2.7.11" 426 | } 427 | }, 428 | "nbformat": 4, 429 | "nbformat_minor": 0 430 | } 431 | -------------------------------------------------------------------------------- /strata-sj-2016/time-series/interactive_plot.py: -------------------------------------------------------------------------------- 1 | 2 | import matplotlib.pyplot as _plt 3 | from matplotlib.widgets import Button 4 | _plt.style.use('ggplot') 5 | 6 | 7 | ## Plot an interactive version. 8 | class LineDrawer(object): 9 | def __init__(self, scores, guide_lines, threshold_lines): 10 | self.guide_lines = guide_lines 11 | self.threshold_lines = threshold_lines 12 | self.figure = self.guide_lines[0].figure 13 | self.scores = scores 14 | 15 | self.anoms = self.scores[:0] 16 | self.anom_plot = self.figure.axes[0].plot(self.anoms['time'], 17 | self.anoms['count'], 18 | color='red', lw=0, marker='o', 19 | markersize=10, 20 | alpha=0.7) 21 | 22 | def connect(self): 23 | """Connect to the event.""" 24 | self.cid_press = self.figure.canvas.mpl_connect('button_press_event', 25 | self.on_press) 26 | 27 | def disconnect(self): 28 | """Disconnect the event bindings.""" 29 | self.figure.canvas.mpl_disconnect(self.cid_press) 30 | 31 | def on_press(self, event): 32 | """Store the location data when the mouse button is pressed.""" 33 | 34 | if event.inaxes == self.figure.axes[0]: 35 | self.threshold_lines[0].set_ydata((event.ydata, event.ydata)) 36 | self.threshold_lines[1].set_ydata((0., 0.)) 37 | self.threshold_lines[2].set_ydata((0., 0.)) 38 | 39 | col = self.scores.value_col_names[0] 40 | 41 | elif event.inaxes == self.figure.axes[1]: 42 | self.threshold_lines[1].set_ydata((event.ydata, event.ydata)) 43 | self.threshold_lines[0].set_ydata((0., 0.)) 44 | self.threshold_lines[2].set_ydata((0., 0.)) 45 | 46 | col = self.scores.value_col_names[1] 47 | 48 | elif event.inaxes == self.figure.axes[2]: 49 | self.threshold_lines[2].set_ydata((event.ydata, event.ydata)) 50 | self.threshold_lines[0].set_ydata((0., 0.)) 51 | self.threshold_lines[1].set_ydata((0., 0.)) 52 | 53 | col = self.scores.value_col_names[2] 54 | 55 | else: 56 | return 57 | 58 | ## Print the anomalies from the selected horizontal threshold. 59 | mask = self.scores[col] >= event.ydata 60 | self.anoms = self.scores[mask] 61 | 62 | ## Replot the anomalies on the first axes. 63 | self.anom_plot[0].set_data((list(self.anoms['time']), 64 | list(self.anoms['count']))) 65 | 66 | ## Re-position the vertical guide lines. 67 | for line in self.guide_lines: 68 | line.set_xdata((event.xdata, event.xdata)) 69 | 70 | ## Re-draw the whole figure. 
71 | self.figure.canvas.draw() 72 | 73 | -------------------------------------------------------------------------------- /strata-sj-2016/time-series/time_series_analysis_public.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/time-series/time_series_analysis_public.pptx -------------------------------------------------------------------------------- /strata-sj-2016/time-series/time_series_data_object.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 1. The data: global earthquake events" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "I pulled the data from the **USGS Advanced National Seismic System Comprehensive Earthquake Catalog**.\n", 15 | "- http://earthquake.usgs.gov/data/\n", 16 | "- http://earthquake.usgs.gov/earthquakes/search/\n", 17 | "- Global seismic events\n", 18 | "- Magnitude 2.5+\n", 19 | "- Real-time data starts February 2, 2013.\n", 20 | "- Main pull goes through March 15, 2016.\n", 21 | "- Update pull goes from March 16 - March 22, 2016." 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Load the data" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "collapsed": false, 36 | "scrolled": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "import datetime as dt\n", 41 | "import graphlab as gl\n", 42 | "\n", 43 | "sf = gl.SFrame.read_csv('raw_data/global_earthquakes.csv', verbose=False)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Inspect the data visually " 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "sf.show()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "useful_columns = ['time', 'latitude', 'longitude', 'mag', 'type', 'location']\n", 73 | "sf = sf[useful_columns]" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "mask = sf['type'] == 'nuclear explosion'\n", 85 | "sf[mask]" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## A small bit of data cleaning" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": { 99 | "collapsed": false 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "mask = sf['type'] == 'earthquake'\n", 104 | "sf = sf[mask]\n", 105 | "sf = sf.remove_column('type')\n", 106 | "print \"Number of earthquake events:\", sf.num_rows()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "---" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "# 2. 
Convert to a `TimeSeries` object " 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "## Format the timestamp " 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "sf['time'] = sf['time'].str_to_datetime(str_format='%Y-%m-%dT%H:%M:%s%ZP')\n", 139 | "sf['time'] = sf['time'].apply(lambda x: x.replace(tzinfo=None))" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "## Convert from `SFrame` to `TimeSeries`" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "collapsed": false 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "quakes = gl.TimeSeries(sf, index='time')" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "---" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "# 3. Basic `TimeSeries` operations" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "## Many operations are just like `SFrame` " 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": { 185 | "collapsed": false 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "quakes.print_rows(3)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": { 196 | "collapsed": false 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "quakes[4:7].print_rows()" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "## Some operations are a little different " 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "### Column subsets retain the time index" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": { 221 | "collapsed": false 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "quakes[['latitude', 'longitude']].print_rows(3)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "## Some operations are unique to `TimeSeries`" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Row slicing with a `datetime` interval" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": { 246 | "collapsed": false 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "start = dt.datetime(2014, 5, 1)\n", 251 | "end = dt.datetime(2014, 5, 2)\n", 252 | "\n", 253 | "quakes.slice(start, end).print_rows(3)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "### Working with the time index " 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": { 267 | "collapsed": false 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "print \"Earliest timestamp:\", quakes.min_time\n", 272 | "print \"Latest timestamp:\", quakes.max_time\n", 273 | "print \"Timestamp range:\", quakes.range" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": { 280 | "collapsed": false 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "print \"Index column:\", quakes.index_col_name\n", 285 | "print \"Value 
columns:\", quakes.value_col_names" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": { 292 | "collapsed": false 293 | }, 294 | "outputs": [], 295 | "source": [ 296 | "print quakes.index[:3]" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": { 303 | "collapsed": false 304 | }, 305 | "outputs": [], 306 | "source": [ 307 | "big_one = quakes.argmax('mag')\n", 308 | "quakes[big_one]" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "## We can always go back to an `SFrame`" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "collapsed": false 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "sf2 = quakes.to_sframe()\n", 327 | "print type(sf2)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "---" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "# 4. Appending more data" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": {}, 347 | "source": [ 348 | "## Read in new data and preprocess" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": { 355 | "collapsed": false 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "sf_recent = gl.SFrame.read_csv('raw_data/global_earthquakes_recent.csv', verbose=False)\n", 360 | "\n", 361 | "# Trim away the columns we're not interested in.\n", 362 | "sf_recent = sf_recent[useful_columns]\n", 363 | "\n", 364 | "# Remove any non-earthquake events.\n", 365 | "mask = sf_recent['type'] == 'earthquake'\n", 366 | "sf_recent = sf_recent[mask]\n", 367 | "sf_recent = sf_recent.remove_column('type')\n", 368 | "\n", 369 | "# Convert the timestamp to a `datetime` type.\n", 370 | "sf_recent['time'] = sf_recent['time'].str_to_datetime(str_format='%Y-%m-%dT%H:%M:%s%ZP')\n", 371 | "sf_recent['time'] = sf_recent['time'].apply(lambda x: x.replace(tzinfo=None))\n", 372 | "\n", 373 | "# Convert to a `TimeSeries` object.\n", 374 | "recent_quakes = gl.TimeSeries(sf_recent, index='time')\n", 375 | "recent_quakes.print_rows(3)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "## Get the union of the two datasets" 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": {}, 388 | "source": [ 389 | "- If the indexes don't overlap, this is equivalent to `SFrame.append`.\n", 390 | "- If there is an overlap, `TimeSeries.union` enforces time order." 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "metadata": { 397 | "collapsed": false 398 | }, 399 | "outputs": [], 400 | "source": [ 401 | "all_quakes = quakes.union(recent_quakes)\n", 402 | "\n", 403 | "print all_quakes.min_time\n", 404 | "print all_quakes.max_time" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "---" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "# 5. 
Grouping observations by value" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": { 425 | "collapsed": false 426 | }, 427 | "outputs": [], 428 | "source": [ 429 | "grp = quakes.group('location')\n", 430 | "print grp" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "The **`group_info`** `SFrame` tells us what the group names are and how many observations are in each group." 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": { 444 | "collapsed": false 445 | }, 446 | "outputs": [], 447 | "source": [ 448 | "grp.group_info().topk('group_size', k=8)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "The **`get_group`** method lets us isolate just the observations for any group." 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": null, 461 | "metadata": { 462 | "collapsed": false 463 | }, 464 | "outputs": [], 465 | "source": [ 466 | "oklahoma_quakes = grp.get_group('Oklahoma')\n", 467 | "oklahoma_quakes.print_rows(3)" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "---" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": {}, 480 | "source": [ 481 | "# 6. Grouping observations by time component " 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "metadata": {}, 487 | "source": [ 488 | "The **`date_part`** attribute of a `TimeSeries` object lets us specify components of a datetime value." 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": { 495 | "collapsed": false, 496 | "scrolled": true 497 | }, 498 | "outputs": [], 499 | "source": [ 500 | "grp = quakes.group(quakes.date_part.HOUR)\n", 501 | "hour_counts = grp.group_info()\n", 502 | "hour_counts.print_rows(5)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "metadata": { 509 | "collapsed": false 510 | }, 511 | "outputs": [], 512 | "source": [ 513 | "import matplotlib.pyplot as plt\n", 514 | "%matplotlib notebook\n", 515 | "plt.style.use('ggplot')\n", 516 | "\n", 517 | "fig, ax = plt.subplots()\n", 518 | "ax.bar(hour_counts['time.hour'], hour_counts['group_size'], color='dodgerblue')\n", 519 | "ax.set_xlabel('Hour of the day')\n", 520 | "ax.set_ylabel('Number of earthquakes')\n", 521 | "fig.show()" 522 | ] 523 | }, 524 | { 525 | "cell_type": "markdown", 526 | "metadata": {}, 527 | "source": [ 528 | "---" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": {}, 534 | "source": [ 535 | "# 7. Resampling " 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": {}, 541 | "source": [ 542 | "Four things happen with the **`resample`** method:\n", 543 | "1. A new time index is created with uniformly spaced intervals.\n", 544 | "2. Each observation is mapped to an interval.\n", 545 | "3. **Downsampling**: *aggregate* statistics are computed within each interval.\n", 546 | "4. **Upsampling**: values are *interpolated* for empty intervals." 
547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": null, 552 | "metadata": { 553 | "collapsed": false 554 | }, 555 | "outputs": [], 556 | "source": [ 557 | "import graphlab.aggregate as agg\n", 558 | "\n", 559 | "daily_stats = quakes.resample(period=dt.timedelta(days=1),\n", 560 | " upsample_method='none',\n", 561 | " downsample_method={'count': agg.COUNT('latitude'),\n", 562 | " 'avg_mag': agg.MEAN('mag'),\n", 563 | " 'max_mag': agg.MAX('mag')})\n", 564 | "\n", 565 | "daily_stats['count'] = daily_stats['count'].fillna(0)\n", 566 | "daily_stats.print_rows(5)" 567 | ] 568 | }, 569 | { 570 | "cell_type": "markdown", 571 | "metadata": {}, 572 | "source": [ 573 | "---" 574 | ] 575 | }, 576 | { 577 | "cell_type": "markdown", 578 | "metadata": {}, 579 | "source": [ 580 | "# 8. Setting up the next notebooks " 581 | ] 582 | }, 583 | { 584 | "cell_type": "markdown", 585 | "metadata": {}, 586 | "source": [ 587 | "- For the modeling notebook, we'll use the global earthquake data, downsampled to daily statistics.\n", 588 | "- For the anomaly detection notebook, we'll use just the Oklahoma data, downsampled to daily statistics. " 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": null, 594 | "metadata": { 595 | "collapsed": true 596 | }, 597 | "outputs": [], 598 | "source": [ 599 | "def compute_daily_stats(data):\n", 600 | " daily = data.resample(period=dt.timedelta(days=1),\n", 601 | " upsample_method='none',\n", 602 | " downsample_method={'count': agg.COUNT('latitude'),\n", 603 | " 'avg_mag': agg.MEAN('mag'),\n", 604 | " 'max_mag': agg.MAX('mag')})\n", 605 | "\n", 606 | " daily['count'] = daily['count'].fillna(0)\n", 607 | " return daily" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": null, 613 | "metadata": { 614 | "collapsed": true 615 | }, 616 | "outputs": [], 617 | "source": [ 618 | "# Save the daily counts and recent daily counts.\n", 619 | "daily_stats.save('working_data/global_daily_stats.ts')\n", 620 | "compute_daily_stats(recent_quakes).save('working_data/global_daily_update.ts')" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": { 627 | "collapsed": true 628 | }, 629 | "outputs": [], 630 | "source": [ 631 | "# Filter just the Oklahoma data from the recent events.\n", 632 | "grp = recent_quakes.group('location')\n", 633 | "recent_oklahoma_quakes = grp.get_group('Oklahoma')" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": null, 639 | "metadata": { 640 | "collapsed": true 641 | }, 642 | "outputs": [], 643 | "source": [ 644 | "# Compute daily stats for the Oklahoma quake events.\n", 645 | "compute_daily_stats(oklahoma_quakes).save('working_data/ok_daily_stats.ts')\n", 646 | "compute_daily_stats(recent_oklahoma_quakes).save('working_data/ok_daily_update.ts')" 647 | ] 648 | } 649 | ], 650 | "metadata": { 651 | "kernelspec": { 652 | "display_name": "Python 2", 653 | "language": "python", 654 | "name": "python2" 655 | }, 656 | "language_info": { 657 | "codemirror_mode": { 658 | "name": "ipython", 659 | "version": 2 660 | }, 661 | "file_extension": ".py", 662 | "mimetype": "text/x-python", 663 | "name": "python", 664 | "nbconvert_exporter": "python", 665 | "pygments_lexer": "ipython2", 666 | "version": "2.7.11" 667 | } 668 | }, 669 | "nbformat": 4, 670 | "nbformat_minor": 0 671 | } 672 | -------------------------------------------------------------------------------- /webinars/README.md: 
-------------------------------------------------------------------------------- 1 | # Turi Webinars 2 | 3 | This repository contains materials for webinars by Turi. You can browse 4 | the notebooks using GitHub's own notebook viewer. Note that some images may not 5 | be rendered correctly. 6 | 7 | If you'd like to run the notebooks, you can register for GraphLab Create 8 | (https://turi.com/download/), then follow the instructions to install. 9 | 10 | - GraphLab Create User Guide: https://turi.com/learn/userguide 11 | - GraphLab Forum: http://forum.turi.com/categories/graphlab-create 12 | -------------------------------------------------------------------------------- /webinars/pattern-mining/images/left.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/webinars/pattern-mining/images/left.png -------------------------------------------------------------------------------- /webinars/pattern-mining/images/middle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/webinars/pattern-mining/images/middle.png -------------------------------------------------------------------------------- /webinars/pattern-mining/images/predictive_services_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/webinars/pattern-mining/images/predictive_services_overview.png -------------------------------------------------------------------------------- /webinars/pattern-mining/images/right.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/webinars/pattern-mining/images/right.png -------------------------------------------------------------------------------- /webinars/product-reviews/README.md: -------------------------------------------------------------------------------- 1 | ## Text analysis with machine learning 2 | 3 | This repository contains a demo of using GraphLab Create for understanding product sentiment using review data. The notebook requires GLC v1.9, which will be released at the end of April 2016. 4 | 5 | See the [live demo](http://demo-baby-product-reviews.turi.com) for an example of how one might use this analysis to compare products. 
6 | 7 | This demo requires the Amazon Baby Products data set hosted here: 8 | [https://github.com/learnml/machine-learning-specialization](https://github.com/learnml/machine-learning-specialization) 9 | -------------------------------------------------------------------------------- /webinars/product-reviews/helper_util.py: -------------------------------------------------------------------------------- 1 | import graphlab as gl 2 | from graphlab.toolkits.text_analytics import trim_rare_words, split_by_sentence, extract_part_of_speech, stopwords, PartOfSpeech 3 | from ipywidgets import widgets 4 | from IPython.display import display, HTML, clear_output 5 | 6 | def search(reviews, query='monitor'): 7 | m = gl._internal.search.create(reviews[['name']].unique().dropna()) 8 | monitors = m.query(query)['name'] 9 | reviews = reviews.filter_by(monitors, 'name') 10 | return reviews 11 | 12 | def get_comparisons(a, b, item_a, item_b, aspects): 13 | 14 | # Compute the number of sentences 15 | a2 = a.groupby('tag', {item_a: gl.aggregate.COUNT}) 16 | b2 = b.groupby('tag', {item_b: gl.aggregate.COUNT}) 17 | counts = a2.join(b2) 18 | 19 | # Compute the mean sentiment 20 | a2 = a.groupby('tag', {item_a: gl.aggregate.AVG('sentiment')}) 21 | b2 = b.groupby('tag', {item_b: gl.aggregate.AVG('sentiment')}) 22 | sentiment = a2.join(b2) 23 | 24 | # Get a list of adjectives 25 | a2 = a.select_columns(['tag', 'adjectives'])\ 26 | .stack('adjectives', 'adjective')\ 27 | .filter_by(aspects, 'adjective', exclude=True)\ 28 | .groupby(['tag'], {item_a: gl.aggregate.CONCAT('adjective')}) 29 | b2 = b.select_columns(['tag', 'adjectives'])\ 30 | .stack('adjectives', 'adjective')\ 31 | .filter_by(aspects, 'adjective', exclude=True)\ 32 | .groupby(['tag'], {item_b: gl.aggregate.CONCAT('adjective')}) 33 | adjectives = a2.join(b2) 34 | 35 | return counts, sentiment, adjectives 36 | 37 | def get_dropdown(reviews): 38 | counts = reviews.groupby('name', gl.aggregate.COUNT).sort('Count', ascending=False) 39 | counts['display_name'] = counts.apply(lambda x: '{} ({})'.format(x['name'], x['Count'])) 40 | counts = counts.head(500) 41 | 42 | from collections import OrderedDict 43 | items = OrderedDict(zip(counts['display_name'], counts['name'])) 44 | item_dropdown = widgets.Dropdown() 45 | item_dropdown.options = items 46 | item_dropdown.value = items.values()[1] 47 | return item_dropdown 48 | 49 | def get_extreme_sentences(tagged, k=100): 50 | 51 | def highlight(sentence, tags, color): 52 | for tag in tags: 53 | html_tag = '<font color="{0}">{1}</font>'.format(color, tag) 54 | sentence = sentence.replace(tag, html_tag) 55 | return sentence 56 | 57 | good = tagged.topk('sentiment', k=k, reverse=False) 58 | good['highlighted'] = good.apply(lambda x: highlight(x['sentence'], x['adjectives'], 'red')) 59 | good['highlighted'] = good.apply(lambda x: highlight(x['highlighted'], [x['tag']], 'green')) 60 | 61 | bad = tagged.topk('sentiment', k=k, reverse=True) 62 | bad['highlighted'] = bad.apply(lambda x: highlight(x['sentence'], x['adjectives'], 'red')) 63 | bad['highlighted'] = bad.apply(lambda x: highlight(x['highlighted'], [x['tag']], 'green')) 64 | 65 | return good, bad 66 | 67 | def print_sentences(sentences): 68 | display(HTML('<br><br>'.join(sentences))) 69 | 70 | 71 | --------------------------------------------------------------------------------