├── LICENSE ├── README.md ├── dss-2016 ├── README.md ├── churn_prediction │ ├── README.md │ ├── churn-tutorial-explore-explain.ipynb │ ├── churn-tutorial.ipynb │ └── user-activity-data.ipynb ├── deep_learning │ ├── Deep_Learning_for_Image_Analysis.ipynb │ └── README.md ├── lead_scoring │ ├── README.md │ └── lead_scoring_tutorial.ipynb └── recommendation_systems │ ├── README.md │ ├── Recommender DeepDive - Part 1.ipynb │ ├── Recommender DeepDive - Part 2.ipynb │ ├── book-recommender-exercises.ipynb │ └── book-recommender-solutions.ipynb ├── notebooks ├── AnyGivenSunday.ipynb ├── airline_on-time_performance.ipynb ├── autotagging_hacker_news_posts.ipynb ├── bank_lead_scoring_demo.ipynb ├── basic_recommender_functionalities.ipynb ├── build_imagenet_deeplearning.ipynb ├── customer-churn-prediction.ipynb ├── data_mining_web_session_logs.ipynb ├── datapipeline_recsys_intro.ipynb ├── datas_messy_clean_it.ipynb ├── deep_text_learning.ipynb ├── deploy-scikit-learn-in-ps.ipynb ├── feature-engineering.ipynb ├── feature_engineering_with_graphlab_create.ipynb ├── five_line_recommender.ipynb ├── follow_the_cryptocurrency.ipynb ├── food_retrieval-public.ipynb ├── fraud-detection.ipynb ├── getting_started_with_graphlab_create.ipynb ├── getting_started_with_python.ipynb ├── graph_analytics_movies.ipynb ├── image_similarity.ipynb ├── intro-regression.ipynb ├── introduction_to_sframes.ipynb ├── kaggle_bike_share_prediction.ipynb ├── linear_regression_benchmark.ipynb ├── link_prediction.ipynb ├── machine_learning_with_graphLab_create.ipynb ├── model_parameter_search.ipynb ├── predictive_services_intro.ipynb ├── product_matching.ipynb ├── reading_data_from_impala.ipynb ├── recsys_explicit_rating.ipynb ├── recsys_rank_10K_song.ipynb ├── reddit_analysis.ipynb ├── sentiment_classifier.ipynb ├── sherlock_text_analytics.ipynb └── spark_and_graphlab_create.ipynb ├── strata-nyc-2015 ├── README.md ├── deep-learning │ ├── Deep Learning for Image Classification and Finding Similar Images.ipynb │ ├── image_similarity.ipynb │ └── images │ │ ├── AA1.png │ │ ├── alexnet.png │ │ ├── cifar.png │ │ ├── evaluate.png │ │ ├── extract_features.png │ │ ├── improve.png │ │ ├── linear.png │ │ ├── load.png │ │ ├── quadratic.png │ │ ├── spiral.1-2.2-2-2-2-2-2.jpg │ │ ├── train.png │ │ ├── workflow1.png │ │ ├── workflow2.png │ │ ├── workflow3.png │ │ └── workflow4.png ├── deployment │ ├── images │ │ ├── left.png │ │ ├── middle.png │ │ ├── predictive_services_overview.png │ │ └── right.png │ ├── predictive_services.ipynb │ └── scikit_deployment.ipynb ├── feature_engineering │ ├── Feature Engineering for Text Data.ipynb │ └── PCA demo.ipynb └── recommendation-systems │ ├── README.md │ ├── Recommender DeepDive - Part 1.ipynb │ ├── Recommender DeepDive - Part 2.ipynb │ ├── book-recommender-exercises.ipynb │ ├── book-recommender-solutions.ipynb │ ├── strata-nyc-2015-recommendation-systems.key │ └── strata-nyc-2015-recommendation-systems.pptx ├── strata-sj-2016 ├── README.md ├── deep-learning │ ├── Deep Learning for Image Classification and Finding Similar Images.ipynb │ ├── Strata-SJ-2016-Deeplearning.pptx │ ├── image_similarity.ipynb │ └── images │ │ ├── AA1.png │ │ ├── alexnet.png │ │ ├── cifar.png │ │ ├── evaluate.png │ │ ├── extract_features.png │ │ ├── improve.png │ │ ├── linear.png │ │ ├── load.png │ │ ├── quadratic.png │ │ ├── spiral.1-2.2-2-2-2-2-2.jpg │ │ ├── train.png │ │ ├── workflow1.png │ │ ├── workflow2.png │ │ ├── workflow3.png │ │ └── workflow4.png ├── intro-ml │ ├── getting-started-with-sframes.ipynb │ └── 
sentiment_analysis.ipynb ├── ml-in-production │ ├── deploy-dress-recommender.ipynb │ ├── deploy-scikit-learn.ipynb │ ├── images │ │ ├── left.png │ │ ├── middle.png │ │ ├── predictive_services_overview.png │ │ └── right.png │ └── ml-production.key ├── recommendation-systems │ ├── README.md │ ├── Recommender DeepDive - Part 1.ipynb │ ├── Recommender DeepDive - Part 2.ipynb │ ├── book-recommender-exercises.ipynb │ ├── book-recommender-solutions.ipynb │ ├── strata-sj-2016-recommendation-systems.key │ └── strata-sj-2016-recommendation-systems.pptx └── time-series │ ├── anomaly_detection.ipynb │ ├── forecasting_basics.ipynb │ ├── interactive_plot.py │ ├── time_series_analysis_public.pptx │ └── time_series_data_object.ipynb └── webinars ├── README.md ├── pattern-mining ├── demo.ipynb ├── deployment.ipynb └── images │ ├── left.png │ ├── middle.png │ ├── predictive_services_overview.png │ └── right.png └── product-reviews ├── README.md ├── helper_util.py └── text_demo.ipynb /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Turi Tutorials 2 | 3 | This repository contains materials for demos, tutorials, and talks by Turi. 4 | You can browse the notebooks using Github's own notebook viewer. Note that some 5 | images may not be rendered correctly. 6 | 7 | If you'd like to run it, you may register for GraphLab Create 8 | (https://turi.com/download/), then follow instructions to install. 
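Once GraphLab Create is installed, a minimal sketch of how a session with these notebooks might start (assuming a Python 2.7 environment, which is what the notebooks in this repository target; the notebook path is just one example from the `notebooks/` directory):

```python
# Launch Jupyter from the repository root, e.g.:
#   jupyter notebook notebooks/getting_started_with_graphlab_create.ipynb
import graphlab as gl

# Quick sanity check that the installation works
sf = gl.SFrame({'user_id': [1, 2, 3], 'rating': [5, 3, 4]})
print(sf)
```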
9 | 10 | - GraphLab Create User Guide: https://turi.com/learn/userguide 11 | - GraphLab Forum: http://forum.turi.com/categories/graphlab-create 12 | 13 | # Events 14 | 15 | - Webinars [[Notebooks]](webinars/README.md) 16 | - Strata + Hadoop World, New York City 2015 [[Event Page]](http://strataconf.com/big-data-conference-ny-2015/public/schedule/detail/43217) [[Tutorials]](strata-nyc-2015/README.md) 17 | - Strata + Hadoop World, San Jose, 2016 [[Event Page]](http://conferences.oreilly.com/strata/hadoop-big-data-ca/public/schedule/detail/47056). 18 | - Data Science Summit, San Francisco, 2016 19 | -------------------------------------------------------------------------------- /dss-2016/README.md: -------------------------------------------------------------------------------- 1 | # Data Science Summit 2016, San Francisco 2 | 3 | This directory contains demo notebooks used for the collection of **machine 4 | learning tutorials** at the [Data Science Summit 5 | 2016](https://conf.turi.com/2016/us/). 6 | 7 | The tutorials introduce machine learning via real applications like: 8 | - recommender systems [Event 9 | Page](https://turi.com/events/training/2016-dss-personalization-tutorial.html) 10 | - object detection with deep learning [Event 11 | Page](https://turi.com/events/training/2016-dss-image-apps-tutorial.html) 12 | - predicting customer churn [Event 13 | Page](https://turi.com/events/training/2016-dss-customer-intelligence-tutorial.html) 14 | - lead scoring [Event 15 | Page](https://turi.com/events/training/2016-dss-customer-intelligence-tutorial.html) 16 | 17 | ## Setup Instructions 18 | 19 | You can browse the notebooks using Github's notebook viewer, but please note 20 | that some images may not be rendered correctly. 21 | 22 | Follow these steps to set up and run the notebooks on your own machine. 23 | 24 | - [Download](https://turi.com/download/) GraphLab Create v2.0.1 and then follow 25 | instructions to [install](https://turi.com/download/install.html). 26 | 27 | - Download and unzip the datasets 28 | [here](https://s3-us-west-2.amazonaws.com/turi-tutorials/TURI.zip). 29 | 30 | - [Install Jupyter 31 | notebook](http://jupyter.readthedocs.org/en/latest/install.html) (needed only 32 | if you install GraphLab Create via command line) 33 | 34 | ## Handy references 35 | 36 | - [GraphLab Create User Guide](https://turi.com/learn/userguide) 37 | - [GraphLab Forum](http://forum.turi.com/categories/graphlab-create) 38 | -------------------------------------------------------------------------------- /dss-2016/churn_prediction/README.md: -------------------------------------------------------------------------------- 1 | Churn Prediction 2 | ================= 3 | 4 | User-activity data records typically contain user actions performed on a 5 | website, service, or product. These user activity transactions tell us a lot 6 | about the user's current interests & preferences. Knowing these interests and 7 | preferences can help businesses make better decisions. 8 | 9 | Recommender systems, fraud detection, churn prediction, and lead scoring are 10 | examples of data products that rely on user-activity data. In this 11 | two-part tutorial, you will first learn how to work with user activity data 12 | and then learn about two specific examples of applications that can leverage 13 | user activity data: churn prediction and lead scoring. 14 | 15 | Churn prediction is the task of identifying users that are likely to stop 16 | using a service, product or website.
Lead scoring is the task of prioritizing 17 | users based on the probability that they will start using a service, 18 | product or website. In the first part of the tutorial, you will learn to: 19 | - Train a model to forecast user churn 20 | - Explore and evaluate predictions made by the model 21 | - Consume predictions made by the model in an external application 22 | 23 | -------------------------------------------------------------------------------- /dss-2016/deep_learning/README.md: -------------------------------------------------------------------------------- 1 | Deep Learning 2 | ============= 3 | 4 | Deep Learning methods have been driving state-of-the-art results in computer 5 | vision, speech recognition, and natural language processing. This is important 6 | in areas such as face detection, photo organization, and machine 7 | translation. As a result, user experiences for applications have improved 8 | dramatically. 9 | 10 | In this tutorial, I'll be deconstructing an image application using GraphLab 11 | Create, and introducing Deep Learning concepts in the process. The focus will 12 | be more on practical usage and less on theory. 13 | -------------------------------------------------------------------------------- /dss-2016/lead_scoring/README.md: -------------------------------------------------------------------------------- 1 | Lead Scoring 2 | ============ 3 | 4 | Prioritizing new leads is critical for sales and marketing teams. Modern 5 | machine learning methods use historical data and state-of-the-art classifiers 6 | to learn a probabilistic relationship between sales account features and 7 | conversion outcome, enabling us to predict with very high accuracy which open 8 | accounts are likely to convert. This improvement in accuracy translates 9 | directly into more efficient use of sales and marketing resources. 10 | 11 | In this tutorial, I'll discuss the major themes in predictive lead scoring and 12 | walk through Python code for building a lead scoring application. The demo uses 13 | Turi's new Lead Scoring Toolkit, but the focus is on the design and 14 | implementation principles for the lead scoring task. 15 | -------------------------------------------------------------------------------- /dss-2016/lead_scoring/lead_scoring_tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 1. Introduction" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "**The scenario**: suppose we run an online travel agency. We would like to convince our users to book overseas vacations, rather than domestic ones. Each of the users in this dataset will definitely book *something* at the end of a given trial period, i.e. we are only looking at engaged customers.\n", 15 | "\n", 16 | "**Goals**:\n", 17 | "1. predict which new users are most likely to book an overseas trip,\n", 18 | "2. generate segmentation rules to group similar users based on features and propensity to convert.\n", 19 | "\n", 20 | "**Data**: mimics the [AirBnB challenge on Kaggle](https://www.kaggle.com/c/airbnb-recruiting-new-user-bookings).\n", 21 | "- Users\n", 22 | "- Website or app sessions.\n", 23 | "\n", 24 | "I've simulated data that's very similar in terms of features and distributions, but I've added timestamps to the sessions, and changed the target from country to a binary domestic vs.
international variable.\n", 25 | "\n", 26 | "**Sections**:\n", 27 | "1. Introduction\n", 28 | "2. The basic scenario - account data only\n", 29 | "3. What's happening under the hood?\n", 30 | "4. Incorporating activity data." 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "from __future__ import print_function\n", 42 | "import graphlab as gl" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "# 2. The basic scenario" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Import the data: sales accounts" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "- **Sales accounts need not be synonymous with users**, although that is the case here. At Turi, our sales accounts consist of a mix of individual users, companies, and teams within large companies.\n", 64 | "\n", 65 | "- **The accounts dataset typically comes from a customer relationship management (CRM) tool**, like Salesforce, SAP, or Hubspot. In practice there is an extra step here of extracting the data from that system into an SFrame. " 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": false 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "users = gl.SFrame('synthetic_airbnb_users.sfr')\n", 77 | "users.print_rows(3)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "users['status'].sketch_summary()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "## Encode the target variable" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Three types of accounts.\n", 103 | "- **Successful accounts**, i.e conversions, are coded as 1.\n", 104 | "- **Failed accounts** are coded as -1.\n", 105 | "- **Open accounts**, i.e. accounts that have not been decided, are coded as 0.\n", 106 | "\n", 107 | "Together, successful and failed accounts constitute the **training accounts**." 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "status_code = {'international': 1,\n", 119 | " 'domestic': -1,\n", 120 | " 'new': 0}\n", 121 | "\n", 122 | "users['outcome'] = users['status'].apply(lambda x: status_code[x])\n", 123 | "users[['status', 'outcome']].print_rows(10)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## Define the schema" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "In a complex problem like lead scoring, there are potentially many columns with \"meaning\". To help the lead scoring tool recognize these columns, we define a dictionary that maps standard lead scoring inputs to the columns in our particular dataset." 
138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "collapsed": true 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "user_schema = {'conversion_status': 'outcome',\n", 149 | " 'account_id': 'id',\n", 150 | " 'features': ['gender', 'age', 'signup_method', 'signup_app',\n", 151 | " 'first_device_type', 'first_browser']}" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "## Create the lead scoring tool" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "**All accounts are passed to the tool when it's created. There is no separate `predict` method.**\n", 166 | "- We typically want to score the same set of open accounts each day during the trial period.\n", 167 | "- Very rarely do we want to predict lead scores for different accounts.\n", 168 | "- It makes more sense to keep the open accounts in the model, so we can incrementally update the lead scores and market segments, as new data comes in.\n", 169 | "- The `update` method is not yet implemented :(" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "collapsed": false, 177 | "scrolled": true 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "scorer = gl.lead_scoring.create(users, user_schema)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## Retrieve the model output and export" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "There's a lot of stuff in the lead scoring model's summary. Let's focus on the accessible fields, three in particular:\n", 196 | "- **open_account_scores**: conversion probability and market segment for *open accounts*\n", 197 | "- **training_account_scores**: conversion probability and market segment for *existing successes and failures*\n", 198 | "- **segment_descriptions**: definitions and summary statistics for the market segments" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": false 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "print(scorer)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "collapsed": false 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "scorer.open_account_scores.head(3)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "collapsed": false 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "scorer.open_account_scores.topk('conversion_prob', k=3)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": false 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "scorer.training_account_scores.head(3)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "scorer.segment_descriptions.head(3)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": { 260 | "collapsed": false 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "scorer.segment_descriptions[['segment_id', 'segment_features']].print_rows(max_column_width=65)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": 
[ 271 | "To get the training or open accounts that belong to a particular market segment, use the respective SFrame's `filter_by` method." 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": { 278 | "collapsed": false 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "seg = scorer.training_account_scores.filter_by(8, 'segment_id').head(3)\n", 283 | "print(seg)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "# 3. What's happening under the hood?" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "## The scoring model: gradient boosted trees" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": { 304 | "collapsed": false 305 | }, 306 | "outputs": [], 307 | "source": [ 308 | "print(scorer.scoring_model)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "Additional keyword arguments to the lead scoring `create` function are passed through to the gradient boosted trees model." 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "collapsed": false 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "scorer2 = gl.lead_scoring.create(users, user_schema, max_iterations=20, verbose=False)\n", 327 | "print(\"Original num trees:\", scorer.scoring_model.num_trees)\n", 328 | "print(\"New num trees:\", scorer2.scoring_model.num_trees)" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "## Validating the scoring model " 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "By default, the gradient boosted trees model withholds ??? percent of the training accounts as a validation set. The validation accuracy can be accessed as a user." 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": { 349 | "collapsed": false 350 | }, 351 | "outputs": [], 352 | "source": [ 353 | "print(\"Validation accuracy:\", scorer.scoring_model.validation_accuracy)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "## The segmentation model: decision tree" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": { 367 | "collapsed": false 368 | }, 369 | "outputs": [], 370 | "source": [ 371 | "print(scorer.segmentation_model)" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "Because training the lead scoring tool can take some time with large datasets, the number of segments can be changed *after* a lead scoring tool has been created. This function **creates a new model**, the original model is **immutable**." 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": { 385 | "collapsed": false 386 | }, 387 | "outputs": [], 388 | "source": [ 389 | "scorer2 = scorer.resize_segmentation_model(max_segments=20)\n", 390 | "\n", 391 | "print(\"original number of segments:\", scorer.segment_descriptions.num_rows())\n", 392 | "print(\"new number of segments:\", scorer2.segment_descriptions.num_rows())" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "# 4. 
Incorporating activity data" 400 | ] 401 | }, 402 | { 403 | "cell_type": "markdown", 404 | "metadata": {}, 405 | "source": [ 406 | "**Account activity data** describes interactions between accounts and aspects of your business, like web assets, email campaigns, or products. Conceptually, each interaction involves at a minimum:\n", 407 | "- an account\n", 408 | "- a timestamp\n", 409 | "\n", 410 | "Interactions may also have:\n", 411 | "- an \"item\"\n", 412 | "- a user\n", 413 | "- other features" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": { 420 | "collapsed": false 421 | }, 422 | "outputs": [], 423 | "source": [ 424 | "sessions = gl.SFrame('synthetic_airbnb_sessions.sfr')\n", 425 | "sessions = gl.TimeSeries(sessions, index='timestamp')\n", 426 | "sessions.head(5)" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": {}, 432 | "source": [ 433 | "As with the accounts table, we need to indicate which columns in the activity table mean what. If we had a column indicating which user was involved, we could specify that as well here. In this scenario, we don't have users that are distinct from accounts." 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": { 440 | "collapsed": true 441 | }, 442 | "outputs": [], 443 | "source": [ 444 | "session_schema = {'account_id': 'user_id',\n", 445 | " 'item': 'action_detail'}" 446 | ] 447 | }, 448 | { 449 | "cell_type": "markdown", 450 | "metadata": {}, 451 | "source": [ 452 | "## Define relevant dates" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | "source": [ 459 | "To use account activity data, a lead scoring tool needs to know the time window for each account's relevant interactions. There are three key dates for each account.\n", 460 | "\n", 461 | "- **open date**: when a new sales account was created\n", 462 | "- **close date**: when the *trial period* ends for a new sales account\n", 463 | "- **decision date**: when a final decision was reached by a training account, either success (conversion) or failure. May be *before or after* the close date.\n", 464 | "\n", 465 | "The **trial duration** is the difference between the open date and the close date. The lead scoring tool in GLC assumes this is fixed for all accounts, but in general this need not be the case.\n", 466 | "\n", 467 | "Open accounts do not have a decision date yet, by definition. They may or may not be still within the trial period." 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": { 474 | "collapsed": true 475 | }, 476 | "outputs": [], 477 | "source": [ 478 | "user_schema.update({'open_date': 'date_account_created',\n", 479 | " 'decision_date': 'booking_date'})" 480 | ] 481 | }, 482 | { 483 | "cell_type": "markdown", 484 | "metadata": {}, 485 | "source": [ 486 | "The trial duration is represented by an instance of the `datetime` package's `timedelta` class." 
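For readers less familiar with `timedelta`, here is a quick, self-contained illustration of how such a duration behaves (standard-library Python only; the 30-day value matches the `trial_duration` used in the next cell):

```python
import datetime as dt

trial_duration = dt.timedelta(days=30)

# timedelta supports arithmetic with datetime objects, which is how a close
# date can be derived from an account's open date.
open_date = dt.datetime(2016, 1, 15)
close_date = open_date + trial_duration
print(close_date)   # 2016-02-14 00:00:00
```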
487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "## Create the lead scoring tool " 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": { 500 | "collapsed": false 501 | }, 502 | "outputs": [], 503 | "source": [ 504 | "import datetime as dt\n", 505 | "\n", 506 | "scorer3 = gl.lead_scoring.create(users, user_schema,\n", 507 | " sessions, session_schema,\n", 508 | " trial_duration=dt.timedelta(days=30))" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": { 515 | "collapsed": false 516 | }, 517 | "outputs": [], 518 | "source": [ 519 | "print(scorer3)" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "## Under the hood: date-based data validation" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": {}, 532 | "source": [ 533 | "**Invalid accounts** have a decision date earlier than their open date. This is impossible, and these accounts are simply dropped from the set of training accounts." 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "metadata": { 540 | "collapsed": false 541 | }, 542 | "outputs": [], 543 | "source": [ 544 | "invalid_ids = scorer3.invalid_accounts\n", 545 | "print(invalid_ids)\n", 546 | "\n", 547 | "invalid_accounts = users.filter_by(invalid_ids, 'id')\n", 548 | "invalid_accounts[['id', 'date_account_created', 'booking_date']].print_rows(3)" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "**Implicit failure accounts** are accounts that are *open*, but have been open for so long they are extremely unlikely to convert.\n", 556 | "\n", 557 | "- The threshold for implicit failure is the 95th percentile of the time it took training accounts to reach a decision, or the trial period duration, whichever is longer.\n", 558 | "\n", 559 | "- Implicit failures are inluded in *both* the training and open account output, because they are used to train the scoring and segmentation models, but are technically still open.\n", 560 | "\n", 561 | "- The user **doesn't *have* to explicitly specify failure accounts** - the model can do that automatically." 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": null, 567 | "metadata": { 568 | "collapsed": false 569 | }, 570 | "outputs": [], 571 | "source": [ 572 | "print(scorer3.num_implicit_failures)" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "metadata": {}, 578 | "source": [ 579 | "## Under the hood: activity-based feature engineering " 580 | ] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": {}, 585 | "source": [ 586 | "The lead scoring tool constructs account-level features based on the number of interactions, items, and users (not applicable in this scenario) per day that the accounts are open (up to the maximum of the trial duration). The names of these features are accessible as a model field." 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": { 593 | "collapsed": false 594 | }, 595 | "outputs": [], 596 | "source": [ 597 | "scorer3.final_features" 598 | ] 599 | }, 600 | { 601 | "cell_type": "markdown", 602 | "metadata": {}, 603 | "source": [ 604 | "The values for these features are included in the primary model outputs (`training_account_scores` and `open_account_scores`)." 
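The exact feature construction is internal to the toolkit, but conceptually the aggregation resembles the following sketch (a rough illustration only, using the session columns from this tutorial and GraphLab Create's standard `groupby`/`aggregate` API; it is not the toolkit's actual implementation):

```python
import graphlab as gl
from graphlab import aggregate as agg

# Count interactions and distinct items per account from the raw sessions table.
raw_sessions = gl.SFrame('synthetic_airbnb_sessions.sfr')
per_account = raw_sessions.groupby('user_id', {
    'num_interactions': agg.COUNT(),
    'num_distinct_items': agg.COUNT_DISTINCT('action_detail')})

# Dividing these counts by the number of days each account has been open
# (capped at the trial duration) gives per-day rates like those listed in
# scorer3.final_features.
per_account.head(5)
```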
605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": { 611 | "collapsed": false 612 | }, 613 | "outputs": [], 614 | "source": [ 615 | "scorer3.open_account_scores.print_rows(3)" 616 | ] 617 | }, 618 | { 619 | "cell_type": "markdown", 620 | "metadata": {}, 621 | "source": [ 622 | "The activity-based features are also used to define market segments." 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": { 629 | "collapsed": false 630 | }, 631 | "outputs": [], 632 | "source": [ 633 | "cols = ['segment_features', 'median_conversion_prob', 'num_training_accounts']\n", 634 | "scorer3.segment_descriptions[cols].print_rows(max_row_width=80, max_column_width=60)" 635 | ] 636 | }, 637 | { 638 | "cell_type": "markdown", 639 | "metadata": {}, 640 | "source": [ 641 | "## Results: improved validation accuracy" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": null, 647 | "metadata": { 648 | "collapsed": false 649 | }, 650 | "outputs": [], 651 | "source": [ 652 | "print(\"Account-only validation accuracy:\", scorer.scoring_model.validation_accuracy)\n", 653 | "print(\"Validation accuracy including activity features:\", scorer3.scoring_model.validation_accuracy)" 654 | ] 655 | } 656 | ], 657 | "metadata": { 658 | "kernelspec": { 659 | "display_name": "Python 2", 660 | "language": "python", 661 | "name": "python2" 662 | }, 663 | "language_info": { 664 | "codemirror_mode": { 665 | "name": "ipython", 666 | "version": 2 667 | }, 668 | "file_extension": ".py", 669 | "mimetype": "text/x-python", 670 | "name": "python", 671 | "nbconvert_exporter": "python", 672 | "pygments_lexer": "ipython2", 673 | "version": "2.7.12" 674 | } 675 | }, 676 | "nbformat": 4, 677 | "nbformat_minor": 0 678 | } 679 | -------------------------------------------------------------------------------- /dss-2016/recommendation_systems/README.md: -------------------------------------------------------------------------------- 1 | # Data Science Summit 2016 2 | 3 | Countless online services use recommender systems to provide personalization to 4 | their users. This is important for selling related items, increasing user 5 | engagement, and so on. 6 | 7 | In this tutorial, you will learn 8 | - the key machine learning concepts that underpin most modern recommender systems 9 | - how to build your own recommender system using off-the-shelf tools 10 | - the strengths and weaknesses of collaborative filtering and content-based 11 | approaches, as well as hybrid methods 12 | - how to explore, explain, and evaluate your recommender models 13 | 14 | -------------------------------------------------------------------------------- /dss-2016/recommendation_systems/book-recommender-exercises.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import graphlab as gl" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "The following code snippet will parse the books data provided at the training." 
19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [ 28 | { 29 | "name": "stderr", 30 | "output_type": "stream", 31 | "text": [ 32 | "[INFO] This commercial license of GraphLab Create is assigned to engr@turi.com.\n", 33 | "\n", 34 | "[INFO] Start server at: ipc:///tmp/graphlab_server-41686 - Server binary: /Users/chris/miniconda/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1443482376.log\n", 35 | "[INFO] GraphLab Server Version: 1.6.1\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "import os\n", 41 | "if os.path.exists('books/ratings'):\n", 42 | " ratings = gl.SFrame('books/ratings')\n", 43 | " items = gl.SFrame('books/items')\n", 44 | " users = gl.SFrame('books/users')\n", 45 | "else:\n", 46 | " ratings = gl.SFrame.read_csv('books/book-ratings.csv')\n", 47 | " ratings.save('books/ratings')\n", 48 | " items = gl.SFrame.read_csv('books/book-data.csv')\n", 49 | " items.save('books/items')\n", 50 | " users = gl.SFrame.read_csv('books/user-data.csv')\n", 51 | " users.save('books/users')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "Visually explore the above data using GraphLab Canvas." 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Recommendation systems" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "In this section we will make a model that can be used to recommend new tags to users." 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "### Creating a Model" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "Use `gl.recommender.create()` to create a model that can be used to recommend tags to each user." 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "outputs": [], 96 | "source": [] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Print a summary of the model by simply entering the name of the object." 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [], 112 | "source": [] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "Get all unique users from the first 10000 observations and save them as a variable called `users`." 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [], 128 | "source": [] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "Get 20 recommendations for each user in your list of users. Save these as a new SFrame called `recs`." 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "outputs": [], 144 | "source": [] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "## Inspecting your model" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "Get an SFrame of the 20 most similar items for each observed item." 
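One possible starting point, if you get stuck (this assumes the recommender created earlier in the notebook is stored in a variable named `m`; `get_similar_items` is the relevant method on GraphLab Create recommender models):

```python
# Top-20 most similar items for every item observed during training
similar_items = m.get_similar_items(k=20)
similar_items.head(5)
```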
158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": { 164 | "collapsed": false 165 | }, 166 | "outputs": [], 167 | "source": [] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "This dataset has multiple rows corresponding to the same book, e.g., in situations where reprintings were done by different publishers in different year.\n", 174 | "\n", 175 | "For each unique value of 'book' in the `items` SFrame, select one of the of the available values for `author`, `publisher`, and `year`. Hint: Try using [`SFrame.groupby`](https://turi.com/products/create/docs/graphlab.data_structures.html#module-graphlab.aggregate) and [`gl.aggregate.SELECT_ONE`](https://turi.com/products/create/docs/graphlab.data_structures.html#graphlab.aggregate.SELECT_ONE)." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "collapsed": false 183 | }, 184 | "outputs": [], 185 | "source": [] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "Computing the number of times each book was rated, and add a column containing these counts to the `items` SFrame using `SFrame.join`." 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": false 199 | }, 200 | "outputs": [], 201 | "source": [] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "Print the first few books, sorted by the number of times they have been rated. Do these values make sense?" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [], 217 | "source": [] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "Now print the most similar items per item, sorted by the most common books. Hint: Join the two SFrames you created above." 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "collapsed": false 231 | }, 232 | "outputs": [], 233 | "source": [] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Experimenting with other models" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "Create a dataset called `implicit` that contains only ratings data where `rating` was 4 or greater." 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": { 253 | "collapsed": false 254 | }, 255 | "outputs": [], 256 | "source": [] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "Create a train/test split of the `implicit` data created above. Hint: Use [random_split_by_user](https://turi.com/products/create/docs/generated/graphlab.recommender.random_split_by_user.html#graphlab.recommender.random_split_by_user)." 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "collapsed": false 270 | }, 271 | "outputs": [], 272 | "source": [] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "Print the first 5 rows of the training set." 
279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "collapsed": false 286 | }, 287 | "outputs": [], 288 | "source": [] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "Create a `ranking_factorization_recommender` model using just the training set and 20 factors." 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": { 301 | "collapsed": false 302 | }, 303 | "outputs": [], 304 | "source": [] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "Evaluate how well this model recommends items that were seen in the test set you created above. Hint: Check out `m.evaluate_precision_recall()`." 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": { 317 | "collapsed": false 318 | }, 319 | "outputs": [], 320 | "source": [] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "Create an SFrame containing only one observation, where 'Billy Bob' has rated 'Animal Farm' with score 5.0." 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": { 333 | "collapsed": false 334 | }, 335 | "outputs": [], 336 | "source": [] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "Use this data when querying for recommendations for the user 'Billy Bob'." 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": { 349 | "collapsed": false 350 | }, 351 | "outputs": [], 352 | "source": [] 353 | } 354 | ], 355 | "metadata": { 356 | "kernelspec": { 357 | "display_name": "Python 2", 358 | "language": "python", 359 | "name": "python2" 360 | }, 361 | "language_info": { 362 | "codemirror_mode": { 363 | "name": "ipython", 364 | "version": 2 365 | }, 366 | "file_extension": ".py", 367 | "mimetype": "text/x-python", 368 | "name": "python", 369 | "nbconvert_exporter": "python", 370 | "pygments_lexer": "ipython2", 371 | "version": "2.7.11" 372 | } 373 | }, 374 | "nbformat": 4, 375 | "nbformat_minor": 0 376 | } 377 | -------------------------------------------------------------------------------- /notebooks/deploy-scikit-learn-in-ps.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:0bd1413cc432876e4c643cf0cea23530ea8ff9ab987036dd795f5daf5df9291a" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 1, 14 | "metadata": {}, 15 | "source": [ 16 | "Deploy scikit-learn model with Turi Predictive Services" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "Deploying models created using scikit-learn in a Turi Predictive Service is very easy. This notebook walks you through the step-by-step process. The notebook has three sections: \n", 24 | "\n", 25 | "1. Create a Predictive Service\n", 26 | "2. Create a scikit-learn model and deploy it to a Predictive Service\n", 27 | "3. Query the model through CURL or a Predictive Service Client\n", 28 | "\n", 29 | "If you are deploying a model in an existing Predictive Service instance you can go to step two directly." 
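In that case, rather than creating a new deployment, you would typically load the existing one from its state path and skip ahead. A brief sketch (the state path below is a placeholder, and the exact `load` signature may vary slightly across GraphLab Create versions):

```python
import graphlab as gl

# Load a previously created Predictive Service from its S3 state path,
# then continue with step two below.
ps = gl.deploy.predictive_service.load('s3://<your-bucket>/predictive_service/ps')
ps.get_status()
```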
30 | ] 31 | }, 32 | { 33 | "cell_type": "heading", 34 | "level": 2, 35 | "metadata": {}, 36 | "source": [ 37 | "Prerequisites" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "Apart from GraphLab Create you will, naturally, need scikit-learn installed in your current Python environment. The most straightforward way to do that is to use conda:\n", 45 | "```\n", 46 | "conda install scikit-learn\n", 47 | "```\n", 48 | "You will also need a valid AWS account in order to set up a predictive service." 49 | ] 50 | }, 51 | { 52 | "cell_type": "heading", 53 | "level": 2, 54 | "metadata": {}, 55 | "source": [ 56 | "Step one: Create a Predictive Service\n", 57 | "" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "This section shows you how to deploy a Predictive Service to EC2. The EC2 instances used by the Predictive Service will be launched in your own AWS account, so you will be responsible for the cost. \n", 65 | "\n", 66 | "To create a Predictive Service in Amazon AWS, we first configure the EC2 Config object, which contains the configuration parameters required for launching a Predictive Service cluster in EC2. These fields are optional and include the region, instance type, CIDR rules etc. Predictive Service uses this configuration for service creation.\n", 67 | "\n", 68 | "Having configured our EC2 Config object, we're ready to launch a Predictive Service Deployment, There are a few aspects of the Predictive Service that can be customized:\n", 69 | "* Number of nodes in the service - By default the number of hosts (`num_hosts`) is 1. To obtain good cache utility and high availability, we recommended setting num_hosts to at least 3.\n", 70 | "* State path to persist service state and service logs. This is a s3 location. \n", 71 | "* Port to be used by the server.\n", 72 | "* Other settings, such as SSL credentials etc.\n", 73 | "\n", 74 | "The following code snippet shows you how to create a Predictive Service. You will have to replace the ps_state_path and credentials for your Predictive Service." 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "collapsed": false, 80 | "input": [ 81 | "import graphlab as gl\n", 82 | "\n", 83 | "# make sure to replace the following with your own information\n", 84 | "ps_state_path = 's3:///predictive_service/ps'\n", 85 | "\n", 86 | "# Create an EC2 config\n", 87 | "# You can either specify your AWS credentials using environment variables, or\n", 88 | "# set them as arguments to this object's constructor\n", 89 | "ec2_config = gl.deploy.Ec2Config(\n", 90 | " aws_access_key_id='',\n", 91 | " aws_secret_access_key='')\n", 92 | "\n", 93 | "# use the EC2 config to launch a new Predictive Service\n", 94 | "# num_hosts specifies how many hosts the Predictive Service cluster has. 
You can scale up and down later after initial creation.\n", 95 | "ps = gl.deploy.predictive_service.create(\n", 96 | " name='sklearn-predictive-service',\n", 97 | " ec2_config=ec2_config,\n", 98 | " state_path=ps_state_path,\n", 99 | " num_hosts=1)" 100 | ], 101 | "language": "python", 102 | "metadata": {}, 103 | "outputs": [] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "collapsed": false, 108 | "input": [ 109 | "# once the Predictive Service is successfully created, you can query the service status\n", 110 | "ps.get_status()" 111 | ], 112 | "language": "python", 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "metadata": {}, 117 | "output_type": "pyout", 118 | "prompt_number": 5, 119 | "text": [ 120 | "[{u'cache': {u'healthy': True, u'num_keys': 0, u'type': u'local'},\n", 121 | " u'dns_name': u'ec2-52-34-231-117.us-west-2.compute.amazonaws.com',\n", 122 | " u'id': u'i-992d1540',\n", 123 | " u'models': [],\n", 124 | " u'num_hosts': 1,\n", 125 | " u'reason': u'N/A',\n", 126 | " u'service_version': u'1.7.1',\n", 127 | " u'state': u'InService'}]" 128 | ] 129 | } 130 | ], 131 | "prompt_number": 5 132 | }, 133 | { 134 | "cell_type": "heading", 135 | "level": 2, 136 | "metadata": {}, 137 | "source": [ 138 | "Step two: Create a scikit-learn model and deploy to Predictive Service\n", 139 | "" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "Let's train a simple random forest model and deploy it in the Predictive Service" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "collapsed": false, 152 | "input": [ 153 | "from sklearn.ensemble import RandomForestClassifier\n", 154 | "X = [[0, 0], [1, 1]]\n", 155 | "Y = [0, 1]\n", 156 | "clf = RandomForestClassifier(n_estimators=10)\n", 157 | "clf = clf.fit(X, Y)" 158 | ], 159 | "language": "python", 160 | "metadata": {}, 161 | "outputs": [], 162 | "prompt_number": 6 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "We can expose the trained model as a REST endpoint in the Predictive Service. This will allow other applications to consume the predictions from the model. \n", 169 | "\n", 170 | "In order to do that, we wrap the model object in a Python function and add it to the Predictive Service. In the function you may add your own logic for transform input to the model, ensemble different models or manipulate output before returning. Checkout our [user guide](https://turi.com/learn/userguide/deployment/pred-working-with-objects.html) for more details.\n", 171 | "\n", 172 | "The result of the function needs to be a JSON serializable object." 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "collapsed": false, 178 | "input": [ 179 | "def classify(x):\n", 180 | " prediction = clf.predict(x)\n", 181 | "\n", 182 | " # convert into a json serializable value\n", 183 | " return list(prediction)\n", 184 | "\n", 185 | "# add your predictive function that wraps scikit-learn model\n", 186 | "ps.add('classify', classify)" 187 | ], 188 | "language": "python", 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "output_type": "stream", 193 | "stream": "stderr", 194 | "text": [ 195 | "[INFO] Endpoint 'classify' is added. Use apply_changes() to deploy all pending changes, or continue with other modification.\n" 196 | ] 197 | } 198 | ], 199 | "prompt_number": 7 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "You may do a test query before really deploying it to production. 
This will help detect errors in the function before deploying it to the Predictive Service. \n", 206 | "\n", 207 | "The response to a query is a JSON object with the following keys:\n", 208 | " \n", 209 | " * response: the actual response from the query;\n", 210 | " * uuid: the unique identifier for your query. The 'uuid' is useful when you need to correlate the query with other data you may have for future model tuning.\n", 211 | " * version: the model version. This is useful when you are updating the model and want to know exactly which version served your query." 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "collapsed": false, 217 | "input": [ 218 | "ps.test_query('classify', x=[[0,0],[1,1]])" 219 | ], 220 | "language": "python", 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "output_type": "stream", 225 | "stream": "stderr", 226 | "text": [ 227 | "[INFO] Input data serializable.\n" 228 | ] 229 | }, 230 | { 231 | "output_type": "stream", 232 | "stream": "stderr", 233 | "text": [ 234 | "[INFO] Trying to serve classify\n" 235 | ] 236 | }, 237 | { 238 | "output_type": "stream", 239 | "stream": "stderr", 240 | "text": [ 241 | "[INFO] Query results serializable.\n" 242 | ] 243 | }, 244 | { 245 | "metadata": {}, 246 | "output_type": "pyout", 247 | "prompt_number": 8, 248 | "text": [ 249 | "{u'response': [0, 1],\n", 250 | " u'uuid': u'9277467f-fc55-40ee-8125-403f95660840',\n", 251 | " u'version': 1}" 252 | ] 253 | } 254 | ], 255 | "prompt_number": 8 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "The result is as expected. Let's apply the changes, and the predictive model is ready to go!" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "collapsed": false, 267 | "input": [ 268 | "# This will push the custom query to the Predictive Service. Since the update is asynchronous, you may need to wait \n", 269 | "# a little while before the model is fully deployed.\n", 270 | "ps.apply_changes()" 271 | ], 272 | "language": "python", 273 | "metadata": {}, 274 | "outputs": [] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "Check status and make sure the deployed custom predictive object is fully operational:" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "collapsed": false, 286 | "input": [ 287 | "# There are other ways to query the status; check the API documentation for more details\n", 288 | "ps.get_status('model')" 289 | ], 290 | "language": "python", 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "html": [ 295 | "
\n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | "
nameexpected versiontypereasonnode.i-992d1540
classify1modelN/A1 (Loaded successfully)
\n", 311 | "[? rows x 5 columns]
Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.
You can use len(sf) to force materialization.\n", 312 | "
" 313 | ], 314 | "metadata": {}, 315 | "output_type": "pyout", 316 | "prompt_number": 10, 317 | "text": [ 318 | "Columns:\n", 319 | "\tname\tstr\n", 320 | "\texpected version\tint\n", 321 | "\ttype\tstr\n", 322 | "\treason\tstr\n", 323 | "\tnode.i-992d1540\tstr\n", 324 | "\n", 325 | "Rows: Unknown\n", 326 | "\n", 327 | "Data:\n", 328 | "+----------+------------------+-------+--------+-------------------------+\n", 329 | "| name | expected version | type | reason | node.i-992d1540 |\n", 330 | "+----------+------------------+-------+--------+-------------------------+\n", 331 | "| classify | 1 | model | N/A | 1 (Loaded successfully) |\n", 332 | "+----------+------------------+-------+--------+-------------------------+\n", 333 | "[? rows x 5 columns]\n", 334 | "Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.\n", 335 | "You can use len(sf) to force materialization." 336 | ] 337 | } 338 | ], 339 | "prompt_number": 10 340 | }, 341 | { 342 | "cell_type": "code", 343 | "collapsed": false, 344 | "input": [ 345 | "# test query to make sure the model works fine\n", 346 | "ps.query('classify', x=[[0,0],[1,1]])" 347 | ], 348 | "language": "python", 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "metadata": {}, 353 | "output_type": "pyout", 354 | "prompt_number": 11, 355 | "text": [ 356 | "{u'from_cache': False,\n", 357 | " u'model': u'classify',\n", 358 | " u'response': [0, 1],\n", 359 | " u'uuid': u'05c773c5-a3e0-4783-bca6-a8925fb20a0c',\n", 360 | " u'version': 1}" 361 | ] 362 | } 363 | ], 364 | "prompt_number": 11 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "Now other applications can interact with our model! In the next section we will illustrate how to consume the model. We can also use other APIs like `ps.update()` to update a model and `ps.remove()` to remove a model." 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "Turi Predictive Services includes a stand-alone Python client for those who just want to query a running service. We will show you how to use the client in the following section. The client takes a configuration file containing the endpoint of the Predictive Service and API key used by client. You can generate the Python client configuration using the following call and hand off the configuration file to your consumer." 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "collapsed": false, 383 | "input": [ 384 | "# Generate a client configuration file for Predictive Service Client to consume\n", 385 | "# It is a good practice to config a CNAME entry in your DNS provider to have a well known endpoint\n", 386 | "# like https://models.companyname.com to point to the Predictive Service so that the consumer of\n", 387 | "# the Predictive Service do not need to change their code when you make modifications to your\n", 388 | "# Predictive Service\n", 389 | "# Here we use None only for demo purpose\n", 390 | "ps.save_client_config(file_path='/tmp/ps_client.conf', predictive_service_cname = None)" 391 | ], 392 | "language": "python", 393 | "metadata": {}, 394 | "outputs": [] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": {}, 399 | "source": [ 400 | "Once generated, the ps_client.conf file may be passed along to your client side developer. We will show you how to use the file in next section." 
401 | ] 402 | }, 403 | { 404 | "cell_type": "heading", 405 | "level": 2, 406 | "metadata": {}, 407 | "source": [ 408 | "Step three: query the model through the REST API and the Python client\n", 409 | "" 410 | ] 411 | }, 412 | { 413 | "cell_type": "heading", 414 | "level": 3, 415 | "metadata": {}, 416 | "source": [ 417 | "Query through REST" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "The model query is exposed through a REST API. The endpoint URL is:\n", 425 | "\n", 426 | " http(s):///query/\n", 427 | " \n", 428 | "You can find out the endpoint URL base by simply printing the `ps` object, and copying the *Load Balancer DNS Name*.\n", 429 | "\n", 430 | "The HTTP call for querying a model or method is a POST call, requiring a JSON-serialized string in the following format as payload:\n", 431 | "\n", 432 | " { \"data\": }\n", 433 | "\n", 434 | "You also need a valid API key, which you can retrieve through `ps.api_key`.\n", 435 | "\n", 436 | "Here is a sample curl command to query the `classify` method that we deployed in this notebook:\n", 437 | "\n", 438 | " curl -u api_key: -d '{\"data\": {\"x\": [[0,0],[1,1]]}}'\n", 439 | " http:///query/classify" 440 | ] 441 | }, 442 | { 443 | "cell_type": "heading", 444 | "level": 3, 445 | "metadata": {}, 446 | "source": [ 447 | "Query through Python" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "We also ship a Python client package with which you may easily consume the model. To install the package, do:\n", 455 | " \n", 456 | " pip install GraphLab-Service-Client \n", 457 | " \n", 458 | "After that you may consume the deployed model:" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "collapsed": false, 464 | "input": [ 465 | "from graphlab_service_client import PredictiveServiceClient\n", 466 | "\n", 467 | "# the configuration is saved through ps.save_client_config()\n", 468 | "client = PredictiveServiceClient(config_file='/tmp/ps_client.conf')\n", 469 | "\n", 470 | "client.query('classify', x=[[0,0], [1,1]])" 471 | ], 472 | "language": "python", 473 | "metadata": {}, 474 | "outputs": [ 475 | { 476 | "metadata": {}, 477 | "output_type": "pyout", 478 | "prompt_number": 14, 479 | "text": [ 480 | "{u'from_cache': True,\n", 481 | " u'model': u'classify',\n", 482 | " u'response': [0, 1],\n", 483 | " u'uuid': u'661f8381-e01c-414c-9fe6-4738bfaa28c2',\n", 484 | " u'version': 1}" 485 | ] 486 | } 487 | ], 488 | "prompt_number": 14 489 | }, 490 | { 491 | "cell_type": "heading", 492 | "level": 2, 493 | "metadata": {}, 494 | "source": [ 495 | "Shutting down the predictive service" 496 | ] 497 | }, 498 | { 499 | "cell_type": "markdown", 500 | "metadata": {}, 501 | "source": [ 502 | "If you don't need to keep your predictive service around for further tasks, make sure to terminate it to avoid incurring unnecessary costs:" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "collapsed": false, 508 | "input": [ 509 | "ps.terminate_service()" 510 | ], 511 | "language": "python", 512 | "metadata": {}, 513 | "outputs": [ 514 | { 515 | "output_type": "stream", 516 | "stream": "stderr", 517 | "text": [ 518 | "[INFO] Deleting load balancer: sklearn-predictive-service\n" 519 | ] 520 | }, 521 | { 522 | "output_type": "stream", 523 | "stream": "stderr", 524 | "text": [ 525 | "[INFO] Terminating EC2 host(s) [u'i-992d1540'] in us-west-2\n" 526 | ] 527 | }, 528 | { 529 | "output_type": "stream", 530 | "stream": "stderr", 531 | "text": [ 532 | "[INFO] Deleting state 
data.\n" 533 | ] 534 | }, 535 | { 536 | "output_type": "stream", 537 | "stream": "stderr", 538 | "text": [ 539 | "[INFO] Deleting s3 state data.\n" 540 | ] 541 | }, 542 | { 543 | "output_type": "stream", 544 | "stream": "stderr", 545 | "text": [ 546 | "[INFO] Deleting keys: []\n" 547 | ] 548 | }, 549 | { 550 | "output_type": "stream", 551 | "stream": "stderr", 552 | "text": [ 553 | "[INFO] Deleting keys: [u'user/scikit-ps/predictive_objects/classify/1/pickle_archive', u'user/scikit-ps-new/predictive_objects/classify/1/version']\n" 554 | ] 555 | }, 556 | { 557 | "output_type": "stream", 558 | "stream": "stderr", 559 | "text": [ 560 | "[INFO] Deleted reference to PredictiveService('sklearn-predictive-service') from current session.\n" 561 | ] 562 | } 563 | ], 564 | "prompt_number": 15 565 | }, 566 | { 567 | "cell_type": "heading", 568 | "level": 2, 569 | "metadata": {}, 570 | "source": [ 571 | "Where to go from here" 572 | ] 573 | }, 574 | { 575 | "cell_type": "markdown", 576 | "metadata": {}, 577 | "source": [ 578 | "This notebook gives you a peek at what Turi Predictive Service can offer. For a more detailed look at its functionality, check out our [user guide](https://turi.com/learn/userguide/#Deployment). If you have any questions, post them in our [forum](http://forum.turi.com) and we will be happy to assist you!" 579 | ] 580 | } 581 | ], 582 | "metadata": {} 583 | } 584 | ] 585 | } 586 | -------------------------------------------------------------------------------- /notebooks/reading_data_from_impala.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:8e45c6ddfe9c7874a75b074e334654d87c2e84aa0cea4a827d1c4118f578b081" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "# Reading data from Impala\n", 16 | "\n", 17 | "GraphLab Create supports loading data from many standard data formats (CSV, Avro, JSON) and data stores such as S3 and HDFS. We also have an ODBC connector, which works seamlessly for reading data directly from [Cloudera's Impala](http://www.cloudera.com/content/cloudera/en/products-and-services/cdh/impala.html).\n", 18 | "\n", 19 | "Before trying this on your own computer, you'll need to make sure that you have the [Cloudera ODBC driver](http://www.cloudera.com/content/cloudera/en/downloads/connectors/impala/odbc/impala-odbc-v2-5-23.html) installed.\n", 20 | "\n", 21 | "Let's take a look at how simple it is to stream results from Impala queries directly into our scalable data structure, the SFrame." 
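(Aside: a minimal sketch of the other loading paths mentioned above; the file paths, bucket, and host names below are placeholders, not real locations.)

```python
import graphlab as gl

# CSV can be read from local disk or directly from a remote store such as S3
sf_local = gl.SFrame.read_csv('/path/to/local_file.csv')      # placeholder path
sf_s3 = gl.SFrame.read_csv('s3://your-bucket/path/file.csv')  # placeholder bucket

# a previously saved SFrame can be loaded straight from HDFS as well
sf_hdfs = gl.SFrame('hdfs://namenode:8020/path/to/saved_sframe')  # placeholder cluster
```

The ODBC route below follows the same spirit: connect once, then stream query results straight into an SFrame.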
22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "collapsed": false, 27 | "input": [ 28 | "import graphlab as gl" 29 | ], 30 | "language": "python", 31 | "metadata": {}, 32 | "outputs": [], 33 | "prompt_number": 4 34 | }, 35 | { 36 | "cell_type": "code", 37 | "collapsed": false, 38 | "input": [ 39 | "# configure your ODBC connection\n", 40 | "db = gl.connect_odbc(\"DRIVER=/opt/cloudera/impalaodbc/lib/universal/\" \\\n", 41 | " \"libclouderaimpalaodbc.dylib;HOST=10.10.2.15;PORT=21050\")" 42 | ], 43 | "language": "python", 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "output_type": "stream", 48 | "stream": "stderr", 49 | "text": [ 50 | "[INFO] Start server at: ipc:///tmp/graphlab_server-29804 - Server binary: /Users/rlvoyer/Envs/glc_pypi_1.3/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1423871383.log\n" 51 | ] 52 | }, 53 | { 54 | "output_type": "stream", 55 | "stream": "stderr", 56 | "text": [ 57 | "[INFO] GraphLab Server Version: 1.3.0\n" 58 | ] 59 | } 60 | ], 61 | "prompt_number": 3 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "Cloudera Impala uses SQL as its query language. We can run a standard SQL DESCRIBE query to get a sense for what the data looks like." 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "collapsed": false, 73 | "input": [ 74 | "# run a DESCRIBE query against the Amazon product titles table\n", 75 | "gl.SFrame.from_odbc(db, \"DESCRIBE titles\")" 76 | ], 77 | "language": "python", 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "html": [ 82 | "
\n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | "
nametypecomment
idxbigint
product_idstring
num_reviewsint
pricestring
simple_categorystring
titlestring
category_list_0string
category_list_1string
category_list_2string
category_list_3string
.........
\n", 144 | "[15 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.\n", 145 | "
" 146 | ], 147 | "metadata": {}, 148 | "output_type": "pyout", 149 | "prompt_number": 5, 150 | "text": [ 151 | "Columns:\n", 152 | "\tname\tstr\n", 153 | "\ttype\tstr\n", 154 | "\tcomment\tstr\n", 155 | "\n", 156 | "Rows: 15\n", 157 | "\n", 158 | "Data:\n", 159 | "+-----------------+--------+---------+\n", 160 | "| name | type | comment |\n", 161 | "+-----------------+--------+---------+\n", 162 | "| idx | bigint | |\n", 163 | "| product_id | string | |\n", 164 | "| num_reviews | int | |\n", 165 | "| price | string | |\n", 166 | "| simple_category | string | |\n", 167 | "| title | string | |\n", 168 | "| category_list_0 | string | |\n", 169 | "| category_list_1 | string | |\n", 170 | "| category_list_2 | string | |\n", 171 | "| category_list_3 | string | |\n", 172 | "| ... | ... | ... |\n", 173 | "+-----------------+--------+---------+\n", 174 | "[15 rows x 3 columns]\n", 175 | "Note: Only the head of the SFrame is printed.\n", 176 | "You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns." 177 | ] 178 | } 179 | ], 180 | "prompt_number": 5 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "Cool! Now let's stream some data into an SFrame." 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "collapsed": false, 192 | "input": [ 193 | "# run a simple SELECT to get titles for all products with more than 100 reviews\n", 194 | "titles_sf = gl.SFrame.from_odbc(db, \"SELECT title, num_reviews, simple_category FROM titles WHERE num_reviews > 25\")\n", 195 | "titles_sf" 196 | ], 197 | "language": "python", 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "html": [ 202 | "
\n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | "
titlenum_reviewssimple_category
reality166Music
keeping heart on pine
ridg ...
26Books
eric meyer on css:
mastering the languag ...
68Books
pierrot le fou (1969)52Movies & TV
the life of john wesley
hardin as written by ...
27Books
snakes on a train
(unrated director's ...
26Movies & TV
t2 : infiltra35Books
drop dead fred [region 2]
(1991) ...
161Movies & TV
loser goes first: my
thirty-something year ...
32Books
irresistible (banning
sisters trilogy) ...
29Books
.........
\n", 264 | "[71639 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.\n", 265 | "
" 266 | ], 267 | "metadata": {}, 268 | "output_type": "pyout", 269 | "prompt_number": 14, 270 | "text": [ 271 | "Columns:\n", 272 | "\ttitle\tstr\n", 273 | "\tnum_reviews\tint\n", 274 | "\tsimple_category\tstr\n", 275 | "\n", 276 | "Rows: 71639\n", 277 | "\n", 278 | "Data:\n", 279 | "+-------------------------------+-------------+-----------------+\n", 280 | "| title | num_reviews | simple_category |\n", 281 | "+-------------------------------+-------------+-----------------+\n", 282 | "| reality | 166 | Music |\n", 283 | "| keeping heart on pine ridg | 26 | Books |\n", 284 | "| eric meyer on css: masteri... | 68 | Books |\n", 285 | "| pierrot le fou (1969) | 52 | Movies & TV |\n", 286 | "| the life of john wesley ha... | 27 | Books |\n", 287 | "| snakes on a train (unrated... | 26 | Movies & TV |\n", 288 | "| t2 : infiltra | 35 | Books |\n", 289 | "| drop dead fred [region 2] ... | 161 | Movies & TV |\n", 290 | "| loser goes first: my thirt... | 32 | Books |\n", 291 | "| irresistible (banning sist... | 29 | Books |\n", 292 | "| ... | ... | ... |\n", 293 | "+-------------------------------+-------------+-----------------+\n", 294 | "[71639 rows x 3 columns]\n", 295 | "Note: Only the head of the SFrame is printed.\n", 296 | "You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns." 297 | ] 298 | } 299 | ], 300 | "prompt_number": 14 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "We can use GraphLab Canvas to visualize the data." 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "collapsed": false, 312 | "input": [ 313 | "titles_sf.show()" 314 | ], 315 | "language": "python", 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "output_type": "stream", 320 | "stream": "stdout", 321 | "text": [ 322 | "Canvas is accessible via web browser at the URL: http://localhost:63103/index.html\n" 323 | ] 324 | } 325 | ], 326 | "prompt_number": 15 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "And now that we have our data in an SFrame, we're ready to start training predictive models, and deploying them to production!" 333 | ] 334 | } 335 | ], 336 | "metadata": {} 337 | } 338 | ] 339 | } -------------------------------------------------------------------------------- /strata-nyc-2015/README.md: -------------------------------------------------------------------------------- 1 | # Strata + Hadoop World, New York City, 2015 2 | 3 | This directory contains demo notebooks used for **Machine Learning 101**, an all-day tutorial at [Strata + Hadoop World, New York City, 2015](http://strataconf.com/big-data-conference-ny-2015/public/schedule/detail/43217). 4 | The course is designed to introduce machine learning via real applications like 5 | - building a recommender 6 | - image analysis using deep learning. 7 | 8 | Along the way, we also cover feature engineering and deploying machine learning models as a predictive service. . 9 | 10 | ## Setup Instructions 11 | 12 | You can browse the notebooks using Github iPython notebook viewer. Note that some images may not be rendered correctly If you'd like to run it, follow these steps to set up your machine. 13 | 14 | - [Download](https://turi.com/download/) GraphLab Create and then follow instructions to [install](https://turi.com/download/install.html). 
15 | - Download and unzip the datasets [[831MB]](https://static.turi.com/datasets/ml101_datasets_stratanyc_2015.zip) 16 | 17 | ## Handy references 18 | 19 | - [GraphLab Create User Guide](https://turi.com/learn/userguide) 20 | - [GraphLab Forum](http://forum.turi.com/categories/graphlab-create) 21 | -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/AA1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/AA1.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/alexnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/alexnet.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/cifar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/cifar.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/evaluate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/evaluate.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/extract_features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/extract_features.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/improve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/improve.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/linear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/linear.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/load.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/load.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/quadratic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/quadratic.png 
-------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/spiral.1-2.2-2-2-2-2-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/spiral.1-2.2-2-2-2-2-2.jpg -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/train.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/workflow1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/workflow1.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/workflow2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/workflow2.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/workflow3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/workflow3.png -------------------------------------------------------------------------------- /strata-nyc-2015/deep-learning/images/workflow4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deep-learning/images/workflow4.png -------------------------------------------------------------------------------- /strata-nyc-2015/deployment/images/left.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deployment/images/left.png -------------------------------------------------------------------------------- /strata-nyc-2015/deployment/images/middle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deployment/images/middle.png -------------------------------------------------------------------------------- /strata-nyc-2015/deployment/images/predictive_services_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deployment/images/predictive_services_overview.png -------------------------------------------------------------------------------- /strata-nyc-2015/deployment/images/right.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/deployment/images/right.png -------------------------------------------------------------------------------- /strata-nyc-2015/deployment/scikit_deployment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Introduction to ML Deployment\n", 8 | "==================\n", 9 | "\n", 10 | "Deploying models created using python in a Turi Predictive Service is very easy. This notebook walks you through the step-by-step process. \n", 11 | "\n", 12 | "\n", 13 | "\n", 14 | "-----------------------\n", 15 | "\n", 16 | "Deployment Steps\n", 17 | "=========\n", 18 | "The notebook has three sections: \n", 19 | "\n", 20 | "1. Create a model\n", 21 | "2. Create a predictive service\n", 22 | "3. Query the model\n", 23 | "\n", 24 | "If you are deploying a model in an existing Predictive Service instance you can go to step (2) directly.\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## 1. Create a model \n", 32 | "\n", 33 | "Let's train a simple random forest model and deploy it in the Predictive Service.\n", 34 | "\n", 35 | "" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 5, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/plain": [ 48 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 49 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 50 | " min_samples_leaf=1, min_samples_split=2,\n", 51 | " min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n", 52 | " oob_score=False, random_state=None, verbose=0,\n", 53 | " warm_start=False)" 54 | ] 55 | }, 56 | "execution_count": 5, 57 | "metadata": {}, 58 | "output_type": "execute_result" 59 | } 60 | ], 61 | "source": [ 62 | "from sklearn.ensemble import RandomForestClassifier\n", 63 | "from sklearn.datasets import load_iris\n", 64 | "iris = load_iris()\n", 65 | "\n", 66 | "model = RandomForestClassifier(n_estimators=10)\n", 67 | "model = model.fit(iris['data'], iris['target'])\n", 68 | "model" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "We can expose the trained model as a REST endpoint. This will allow other applications to consume the predictions from the model. \n", 76 | "\n", 77 | "In order to do that, we wrap the model object in a Python function and add it to the Predictive Service. In the function you may add your own logic for transform input to the model, ensemble different models or manipulate output before returning. Checkout out [user guide](https://turi.com/learn/userguide/#Deployment) for more details.\n", 78 | "\n", 79 | "The result of the function needs to be a **JSON serializable** object." 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 46, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "def classify(x):\n", 91 | " prediction = model.predict(x)\n", 92 | "\n", 93 | " # convert into a json serializable value\n", 94 | " return list(prediction)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "## 2. Create a Predictive Service (One time) \n", 102 | "\n", 103 | "This section shows you how to deploy a Predictive Service to EC2. 
The EC2 instances used by the Predictive Service will be launched in your own AWS account, so you will be responsible for the cost. \n", 104 | "\n", 105 | "" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "To create a Predictive Service in Amazon AWS, we first configure the EC2 Config object, which contains the configuration parameters required for launching a Predictive Service cluster in EC2. These fields are optional and include the region, instance type, CIDR rules etc. Predictive Service uses this configuration for service creation.\n", 113 | "\n", 114 | "Having configured our EC2 Config object, we're ready to launch a Predictive Service Deployment, There are a few aspects of the Predictive Service that can be customized:\n", 115 | "* Number of nodes in the service - By default the number of hosts (`num_hosts`) is 1. To obtain good cache utility and high availability, we recommended setting num_hosts to at least 3.\n", 116 | "* State path to persist service state and service logs. This is a s3 location. \n", 117 | "* Port to be used by the server.\n", 118 | "* Other settings, such as SSL credentials etc.\n", 119 | "\n", 120 | "The following code snippet shows you how to create a Predictive Service. You will have to replace the ps_state_path and credentials for your Predictive Service." 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 1, 126 | "metadata": { 127 | "collapsed": false 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "import graphlab as gl\n", 132 | "\n", 133 | "# Replace with your path.\n", 134 | "ps_state_path = 's3:///predictive_service/ps'\n", 135 | "\n", 136 | "# Set your AWS credentials.\n", 137 | "gl.aws.set_credentials(, )\n", 138 | "\n", 139 | "# Create an EC2 config\n", 140 | "ec2_config = gl.deploy.Ec2Config()\n", 141 | "\n", 142 | "# Launch a predictive service\n", 143 | "ps = gl.deploy.predictive_service.create(name = 'sklearn-predictive-service', \n", 144 | " ec2_config = ec2_config, state_path = ps_state_path, num_hosts = 1)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "### Load an already created service" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 47, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [ 161 | { 162 | "name": "stderr", 163 | "output_type": "stream", 164 | "text": [ 165 | "[WARNING] Overwritting existing Predictive Service \"demolab-one-six\" in local session.\n" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "import graphlab as gl\n", 171 | "ps = gl.deploy.predictive_service.load('s3://gl-demo-usw2/predictive_service/demolab/ps-1.6')" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 62, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/plain": [ 184 | "Name : demolab-one-six\n", 185 | "State Path : s3://gl-demo-usw2/predictive_service/demolab/ps-1.6\n", 186 | "Description : None\n", 187 | "API Key : b437e588-0f2b-45e1-81c8-ce3acfa81ade\n", 188 | "CORS origin : *\n", 189 | "Global Cache State : enabled\n", 190 | "Load Balancer DNS Name: demolab-one-six-2015364754.us-west-2.elb.amazonaws.com\n", 191 | "\n", 192 | "Deployed endpoints:\n", 193 | "\tname: freshdress_kw_search, version: 3, type: alias, cache: disabled, description: Alias for freshdress_kw_search_model\n", 194 | "\tname: yelp_sentiment_most_extreme_for_place, version: 2, type: model, cache: 
enabled, description: \n", 195 | "\tname: classify-sklearn, version: 2, type: model, cache: enabled, description: \n", 196 | "\tname: freshdress_more_like_image_bw, version: 1, type: model, cache: enabled, description: \n", 197 | "\tname: freshdress_kw_search_model, version: 2, type: model, cache: enabled, description: \n", 198 | "\tname: composite_recommender_query, version: 1, type: model, cache: disabled, description: \n", 199 | "\tname: freshdress_describe, version: 2, type: alias, cache: disabled, description: Alias for freshdress_describe_image_basic\n", 200 | "\tname: freshdress_more_like_image_bow, version: 3, type: model, cache: enabled, description: \n", 201 | "\tname: yelp_sentiment_predict_text, version: 2, type: model, cache: enabled, description: \n", 202 | "\tname: freshdress_describe_image_basic, version: 1, type: model, cache: enabled, description: \n", 203 | "\tname: freshdress_more_like_image_color, version: 1, type: model, cache: enabled, description: \n", 204 | "\tname: freshdress_more_like_image, version: 5, type: alias, cache: disabled, description: Alias for freshdress_more_like_image_tfidf\n", 205 | "\tname: yelp_sentiment_most_extreme, version: 2, type: model, cache: enabled, description: \n", 206 | "\tname: freshdress_more_like_image_tfidf, version: 1, type: model, cache: enabled, description: \n", 207 | "\tname: composite_recommender_explanation, version: 1, type: model, cache: disabled, description: \n", 208 | "\tname: yelp_sentiment_summary, version: 2, type: model, cache: enabled, description: \n", 209 | "\n", 210 | "No Pending changes." 211 | ] 212 | }, 213 | "execution_count": 62, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "ps" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 53, 225 | "metadata": { 226 | "collapsed": false 227 | }, 228 | "outputs": [ 229 | { 230 | "name": "stderr", 231 | "output_type": "stream", 232 | "text": [ 233 | "[INFO] Endpoint 'classify-sklearn' is updated. Use apply_changes to deploy all pending changes, or continue other modification.\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "# ps.add('classify-sklearn', classify) (If done for the first time)\n", 239 | "ps.update('classify-sklearn', classify)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 55, 245 | "metadata": { 246 | "collapsed": false 247 | }, 248 | "outputs": [ 249 | { 250 | "name": "stderr", 251 | "output_type": "stream", 252 | "text": [ 253 | "[INFO] There are no pending changes. No action is taken.\n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "ps.apply_changes()" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "## Query the model \n", 266 | "\n", 267 | "You may do a test query before really deploying it to production. This will help detect errors in the function before deploying it the Predictive Service. 
\n", 268 | "\n", 269 | "" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 56, 275 | "metadata": { 276 | "collapsed": false 277 | }, 278 | "outputs": [ 279 | { 280 | "name": "stderr", 281 | "output_type": "stream", 282 | "text": [ 283 | "[INFO] Input data serializable.\n", 284 | "[INFO] Trying to serve classify-sklearn\n", 285 | "[INFO] Query results serializable.\n" 286 | ] 287 | }, 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "{u'response': [0],\n", 292 | " u'uuid': u'88947cb8-4646-489d-8360-81ce1d54004e',\n", 293 | " u'version': 1}" 294 | ] 295 | }, 296 | "execution_count": 56, 297 | "metadata": {}, 298 | "output_type": "execute_result" 299 | } 300 | ], 301 | "source": [ 302 | "ps.test_query('classify-sklearn', x=[5.1, 3.5, 1.4, 0.2])" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "Now, let us query the real service." 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 69, 315 | "metadata": { 316 | "collapsed": false 317 | }, 318 | "outputs": [ 319 | { 320 | "data": { 321 | "text/plain": [ 322 | "{u'from_cache': True,\n", 323 | " u'model': u'classify-sklearn',\n", 324 | " u'response': [0],\n", 325 | " u'uuid': u'8afd2f01-6d37-4fd0-8788-5141f92459dd',\n", 326 | " u'version': 2}" 327 | ] 328 | }, 329 | "execution_count": 69, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "# test query to make sure the model works fine\n", 336 | "ps.query('classify-sklearn', x=[5.1, 3.5, 1.4, 0.2])" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "### Query from external applications via REST\n", 344 | "\n", 345 | "Now other applications can interact with our model! In the next section we will illustrate how to consume the model. We can also use other APIs like ps.update() to update a mode, ps.remove() to remove a model." 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "The model query is exposed through REST API. The path is:\n", 353 | "\n", 354 | " http(s):///data/\n", 355 | " \n", 356 | "And the payload is a JSON serialized string in the following format:\n", 357 | "\n", 358 | " {\"api_key\": ,\n", 359 | " \"data\": }\n", 360 | "\n", 361 | "Here the 'api key' may be obtained through ps.api_key, and data is the actual data passed to the custom predictive object in the Predictive Service. 
It will be passed to the query using **kwargs format\n", 362 | "\n", 363 | "Here is a sample curl command to query your model:\n", 364 | "\n", 365 | " curl -X POST -d '{\"api_key\":\"b437e588-0f2b-45e1-81c8-ce3acfa81ade\", \"data\":{\"x\":[5.1, 3.5, 1.4, 0.2]}}' http://demolab-one-six-2015364754.us-west-2.elb.amazonaws.com/query/classify-sklearn\n", 366 | " \n", 367 | " \n", 368 | "You can also query though Python using the **requests module**" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "### Query through Python" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 77, 381 | "metadata": { 382 | "collapsed": true 383 | }, 384 | "outputs": [], 385 | "source": [ 386 | "import json\n", 387 | "import requests\n", 388 | "\n", 389 | "def restful_query(x):\n", 390 | " headers = {'content-type': 'application/json'}\n", 391 | " payload = {'api_key':'b437e588-0f2b-45e1-81c8-ce3acfa81ade', \"data\":{\"x\": x}}\n", 392 | " end_point = 'http://demolab-one-six-2015364754.us-west-2.elb.amazonaws.com/query/classify-sklearn'\n", 393 | " return requests.post(end_point, json.dumps(payload), headers=headers).json()" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 78, 399 | "metadata": { 400 | "collapsed": false 401 | }, 402 | "outputs": [ 403 | { 404 | "data": { 405 | "text/plain": [ 406 | "{u'from_cache': True,\n", 407 | " u'model': u'classify-sklearn',\n", 408 | " u'response': [0],\n", 409 | " u'uuid': u'ea1a4314-4795-4ca6-9822-70774e4fdafd',\n", 410 | " u'version': 2}" 411 | ] 412 | }, 413 | "execution_count": 78, 414 | "metadata": {}, 415 | "output_type": "execute_result" 416 | } 417 | ], 418 | "source": [ 419 | "restful_query([5.1, 3.5, 1.4, 0.2])" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 80, 425 | "metadata": { 426 | "collapsed": false 427 | }, 428 | "outputs": [ 429 | { 430 | "data": { 431 | "text/plain": [ 432 | "{u'from_cache': False,\n", 433 | " u'model': u'classify-sklearn',\n", 434 | " u'response': [0],\n", 435 | " u'uuid': u'a96dc4e6-b3de-4e72-9526-e12174ea58af',\n", 436 | " u'version': 2}" 437 | ] 438 | }, 439 | "execution_count": 80, 440 | "metadata": {}, 441 | "output_type": "execute_result" 442 | } 443 | ], 444 | "source": [ 445 | "restful_query([5.1, 3.5, 1.4, 0.3])" 446 | ] 447 | } 448 | ], 449 | "metadata": { 450 | "kernelspec": { 451 | "display_name": "Python 2", 452 | "language": "python", 453 | "name": "python2" 454 | }, 455 | "language_info": { 456 | "codemirror_mode": { 457 | "name": "ipython", 458 | "version": 2 459 | }, 460 | "file_extension": ".py", 461 | "mimetype": "text/x-python", 462 | "name": "python", 463 | "nbconvert_exporter": "python", 464 | "pygments_lexer": "ipython2", 465 | "version": "2.7.10" 466 | } 467 | }, 468 | "nbformat": 4, 469 | "nbformat_minor": 0 470 | } 471 | -------------------------------------------------------------------------------- /strata-nyc-2015/recommendation-systems/README.md: -------------------------------------------------------------------------------- 1 | # Strata + Hadoop World, New York City, 2015 2 | 3 | This directory contains demo notebooks used for the "Introduction to Recommender Systems", the second session of **Machine Learning 101**, an all-day tutorial at [Strata + Hadoop World, New York City, 2015](http://strataconf.com/big-data-conference-ny-2015/public/schedule/detail/43217). 
4 | 5 | In this session we 6 | 7 | - give an introduction to recommendation systems, 8 | - show how easy it is to get started 9 | - provide examples and slides 10 | 11 | Along the way, we also cover feature engineering and deploying machine learning models as a predictive service. 12 | 13 | ## Setup Instructions 14 | 15 | You can browse the notebooks using Github IPython notebook viewer. Note that some images may not be rendered correctly. If you'd like to run it, follow these steps to set up your machine. 16 | 17 | - [Download](https://turi.com/download/) GraphLab Create and then follow instructions to [install](https://turi.com/download/install.html). 18 | - Download and unzip the datasets [[831MB]](https://static.turi.com/datasets/ml101_datasets_stratanyc_2015.zip) 19 | 20 | ## Handy references 21 | 22 | - [GraphLab Create User Guide](https://turi.com/learn/userguide) 23 | - [GraphLab Forum](http://forum.turi.com/categories/graphlab-create) 24 | -------------------------------------------------------------------------------- /strata-nyc-2015/recommendation-systems/book-recommender-exercises.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import graphlab as gl" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "The following code snippet will parse the books data provided at the training." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [ 28 | { 29 | "name": "stderr", 30 | "output_type": "stream", 31 | "text": [ 32 | "[INFO] This commercial license of GraphLab Create is assigned to engr@turi.com.\n", 33 | "\n", 34 | "[INFO] Start server at: ipc:///tmp/graphlab_server-41686 - Server binary: /Users/chris/miniconda/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1443482376.log\n", 35 | "[INFO] GraphLab Server Version: 1.6.1\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "import os\n", 41 | "if os.path.exists('data/books/ratings'):\n", 42 | " ratings = gl.SFrame('data/books/ratings')\n", 43 | " items = gl.SFrame('data/books/items')\n", 44 | " users = gl.SFrame('data/books/users')\n", 45 | "else:\n", 46 | " ratings = gl.SFrame.read_csv('data/books/book-ratings.csv')\n", 47 | " ratings.save('data/books/ratings')\n", 48 | " items = gl.SFrame.read_csv('data/books/book-data.csv')\n", 49 | " items.save('data/books/items')\n", 50 | " users = gl.SFrame.read_csv('data/books/user-data.csv')\n", 51 | " users.save('data/books/users')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "Visually explore the above data using GraphLab Canvas." 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Recommendation systems" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "In this section we will make a model that can be used to recommend new tags to users." 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "### Creating a Model" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "Use `gl.recommender.create()` to create a model that can be used to recommend tags to each user." 
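(A minimal sketch of one possible solution for this exercise. The column names 'name', 'book', and 'rating' are assumptions about this dataset, not confirmed by the notebook; check `ratings.column_names()` and adjust before running.)

```python
# train a default recommender on the ratings data
# NOTE: the user, item, and target column names below are assumed
m = gl.recommender.create(ratings,
                          user_id='name',
                          item_id='book',
                          target='rating')
```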
87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "outputs": [], 96 | "source": [] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Print a summary of the model by simply entering the name of the object." 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [], 112 | "source": [] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "Get all unique users from the first 10000 observations and save them as a variable called `users`." 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [], 128 | "source": [] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "Get 20 recommendations for each user in your list of users. Save these as a new SFrame called `recs`." 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "outputs": [], 144 | "source": [] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "## Inspecting your model" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "Get an SFrame of the 20 most similar items for each observed item." 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": { 164 | "collapsed": false 165 | }, 166 | "outputs": [], 167 | "source": [] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "This dataset has multiple rows corresponding to the same book, e.g., in situations where reprintings were done by different publishers in different year.\n", 174 | "\n", 175 | "For each unique value of 'book' in the `items` SFrame, select one of the of the available values for `author`, `publisher`, and `year`. Hint: Try using [`SFrame.groupby`](https://turi.com/products/create/docs/graphlab.data_structures.html#module-graphlab.aggregate) and [`gl.aggregate.SELECT_ONE`](https://turi.com/products/create/docs/graphlab.data_structures.html#graphlab.aggregate.SELECT_ONE)." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "collapsed": false 183 | }, 184 | "outputs": [], 185 | "source": [] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "Computing the number of times each book was rated, and add a column containing these counts to the `items` SFrame using `SFrame.join`." 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": false 199 | }, 200 | "outputs": [], 201 | "source": [] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "Print the first few books, sorted by the number of times they have been rated. Do these values make sense?" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [], 217 | "source": [] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "Now print the most similar items per item, sorted by the most common books. 
Hint: Join the two SFrames you created above." 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "collapsed": false 231 | }, 232 | "outputs": [], 233 | "source": [] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Experimenting with other models" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "Create a dataset called `implicit` that contains only ratings data where `rating` was 4 or greater." 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": { 253 | "collapsed": false 254 | }, 255 | "outputs": [], 256 | "source": [] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "Create a train/test split of the `implicit` data created above. Hint: Use [random_split_by_user](http://graphlab.com/products/create/docs/generated/graphlab.recommender.random_split_by_user.html#graphlab.recommender.random_split_by_user)." 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "collapsed": false 270 | }, 271 | "outputs": [], 272 | "source": [] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "Print the first 5 rows of the training set." 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "collapsed": false 286 | }, 287 | "outputs": [], 288 | "source": [] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "Create a `ranking_factorization_recommender` model using just the training set and 20 factors." 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": { 301 | "collapsed": false 302 | }, 303 | "outputs": [], 304 | "source": [] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "Evaluate how well this model recommends items that were seen in the test set you created above. Hint: Check out `m.evaluate_precision_recall()`." 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": { 317 | "collapsed": false 318 | }, 319 | "outputs": [], 320 | "source": [] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "Create an SFrame containing only one observation, where 'Billy Bob' has rated 'Animal Farm' with score 5.0." 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": { 333 | "collapsed": false 334 | }, 335 | "outputs": [], 336 | "source": [] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "Use this data when querying for recommendations for the user 'Billy Bob'." 
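(A hedged sketch of these last two steps, assuming `m` is the recommender trained earlier and that the ratings columns are named 'name', 'book', and 'rating'; adjust the names to match your data.)

```python
# one hypothetical new observation: 'Billy Bob' rated 'Animal Farm' 5.0
new_obs = gl.SFrame({'name': ['Billy Bob'],
                     'book': ['Animal Farm'],
                     'rating': [5.0]})

# pass the extra observation in at query time via new_observation_data
m.recommend(users=['Billy Bob'], new_observation_data=new_obs)
```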
343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": { 349 | "collapsed": false 350 | }, 351 | "outputs": [], 352 | "source": [] 353 | } 354 | ], 355 | "metadata": { 356 | "kernelspec": { 357 | "display_name": "Python 2", 358 | "language": "python", 359 | "name": "python2" 360 | }, 361 | "language_info": { 362 | "codemirror_mode": { 363 | "name": "ipython", 364 | "version": 2 365 | }, 366 | "file_extension": ".py", 367 | "mimetype": "text/x-python", 368 | "name": "python", 369 | "nbconvert_exporter": "python", 370 | "pygments_lexer": "ipython2", 371 | "version": "2.7.10" 372 | } 373 | }, 374 | "nbformat": 4, 375 | "nbformat_minor": 0 376 | } 377 | -------------------------------------------------------------------------------- /strata-nyc-2015/recommendation-systems/strata-nyc-2015-recommendation-systems.key: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/recommendation-systems/strata-nyc-2015-recommendation-systems.key -------------------------------------------------------------------------------- /strata-nyc-2015/recommendation-systems/strata-nyc-2015-recommendation-systems.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-nyc-2015/recommendation-systems/strata-nyc-2015-recommendation-systems.pptx -------------------------------------------------------------------------------- /strata-sj-2016/README.md: -------------------------------------------------------------------------------- 1 | # Strata + Hadoop World, San Jose, 2016 2 | 3 | This directory contains demo notebooks used for **Practical Machine Learning**, an all-day tutorial 4 | at [Strata + Hadoop World, San Jose, 2016](http://conferences.oreilly.com/strata/hadoop-big-data-ca/public/schedule/detail/47056). 5 | The course is designed to introduce machine learning via real applications like 6 | - building a recommender 7 | - detecting anomalies 8 | - analyzing time series data 9 | - image analysis using deep learning 10 | - predicting customer churn 11 | - deploying machine learning 12 | 13 | ## Setup Instructions 14 | 15 | You can browse the notebooks using Github's notebook viewer, but please note that 16 | some images may not be rendered correctly. Follow these 17 | steps to set up and run the notebooks on your own machine. 18 | 19 | - [Download](https://turi.com/download/) GraphLab Create v1.8.5 and then follow instructions to [install](https://turi.com/download/install.html). 
20 | - Download and unzip the datasets [[831MB]](https://static.turi.com/datasets/ml101_datasets_stratasj_2016.zip) 21 | - [Install Jupyter notebook](http://jupyter.readthedocs.org/en/latest/install.html) (needed only if you install GraphLab Create via command line) 22 | 23 | ## Handy references 24 | 25 | - [GraphLab Create User Guide](https://turi.com/learn/userguide) 26 | - [GraphLab Forum](http://forum.turi.com/categories/graphlab-create) 27 | -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/Strata-SJ-2016-Deeplearning.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/Strata-SJ-2016-Deeplearning.pptx -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/AA1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/AA1.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/alexnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/alexnet.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/cifar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/cifar.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/evaluate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/evaluate.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/extract_features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/extract_features.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/improve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/improve.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/linear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/linear.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/load.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/load.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/quadratic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/quadratic.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/spiral.1-2.2-2-2-2-2-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/spiral.1-2.2-2-2-2-2-2.jpg -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/train.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/workflow1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/workflow1.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/workflow2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/workflow2.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/workflow3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/workflow3.png -------------------------------------------------------------------------------- /strata-sj-2016/deep-learning/images/workflow4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/deep-learning/images/workflow4.png -------------------------------------------------------------------------------- /strata-sj-2016/ml-in-production/deploy-scikit-learn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Introduction to ML Deployment\n", 8 | "==================\n", 9 | "\n", 10 | "Deploying models created using python in a Turi Predictive Service is very easy. This notebook walks you through the step-by-step process. \n", 11 | "\n", 12 | "\n", 13 | "\n", 14 | "-----------------------\n", 15 | "\n", 16 | "Deployment Steps\n", 17 | "=========\n", 18 | "The notebook has three sections: \n", 19 | "\n", 20 | "1. Create a model\n", 21 | "2. Create a predictive service\n", 22 | "3. 
Query the model\n", 23 | "\n", 24 | "If you are deploying a model in an existing Predictive Service instance you can go to step (2) directly.\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Preliminaries\n", 32 | "For the following code, you will need to install the following packages:\n", 33 | "\n", 34 | "```\n", 35 | "pip install graphlab-create\n", 36 | "pip install sklearn\n", 37 | "pip install numpy\n", 38 | "pip install scipy\n", 39 | "```" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## 1. Create a model \n", 47 | "\n", 48 | "Let's train a simple random forest model and deploy it in the Predictive Service.\n", 49 | "\n", 50 | "" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "from sklearn.ensemble import RandomForestClassifier\n", 62 | "from sklearn.datasets import load_iris\n", 63 | "iris = load_iris()\n", 64 | "\n", 65 | "model = RandomForestClassifier(n_estimators=10)\n", 66 | "model = model.fit(iris['data'], iris['target'])\n", 67 | "model" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "We can expose the trained model as a REST endpoint. This will allow other applications to consume the predictions from the model. \n", 75 | "\n", 76 | "In order to do that, we wrap the model object in a Python function and add it to the Predictive Service. In the function you may add your own logic to transform the input to the model, ensemble different models, or manipulate the output before returning. Check out the [user guide](https://turi.com/learn/userguide/#Deployment) for more details.\n", 77 | "\n", 78 | "The result of the function needs to be a **JSON serializable** object." 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "def classify(x):\n", 90 | " prediction = model.predict(x)\n", 91 | "\n", 92 | " # convert into a json serializable value\n", 93 | " return list(prediction)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "## 2. Create a Predictive Service (One time) \n", 101 | "\n", 102 | "This section shows you how to deploy a Predictive Service to EC2. The EC2 instances used by the Predictive Service will be launched in your own AWS account, so you will be responsible for the cost. \n", 103 | "\n", 104 | "" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "To create a Predictive Service in Amazon AWS, we first configure the EC2 Config object, which contains the configuration parameters required for launching a Predictive Service cluster in EC2. These fields are optional and include the region, instance type, CIDR rules, etc. Predictive Service uses this configuration for service creation.\n", 112 | "\n", 113 | "Having configured our EC2 Config object, we're ready to launch a Predictive Service Deployment. There are a few aspects of the Predictive Service that can be customized:\n", 114 | "* Number of nodes in the service - By default the number of hosts (`num_hosts`) is 1. To obtain good cache utility and high availability, we recommend setting `num_hosts` to at least 3.\n", 115 | "* State path to persist service state and service logs. This is an S3 location.
\n", 116 | "* Port to be used by the server.\n", 117 | "* Other settings, such as SSL credentials etc.\n", 118 | "\n", 119 | "The following code snippet shows you how to create a Predictive Service. You will have to replace the ps_state_path and credentials for your Predictive Service." 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "import graphlab as gl\n", 131 | "\n", 132 | "# Replace with your path.\n", 133 | "ps_state_path = 's3:///predictive_service/ps'\n", 134 | "\n", 135 | "# Set your AWS credentials.\n", 136 | "gl.aws.set_credentials(, )\n", 137 | "\n", 138 | "# Create an EC2 config\n", 139 | "ec2 = gl.deploy.Ec2Config()\n", 140 | "\n", 141 | "# Launch a predictive service\n", 142 | "ps = gl.deploy.predictive_service.create(\n", 143 | " name='sklearn-predictive-service', \n", 144 | " ec2_config=ec2, state_path=ps_state_path, num_hosts=1)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "### Load an already created service" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "import graphlab as gl\n", 163 | "ps = gl.deploy.predictive_service.load('s3://gl-demo-usw2/predictive_service/demolab/ps-1.8.4')" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "ps" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "# ps.add('classify-sklearn', classify, description='Classify an iris based on petal and sepal dimensions')\n", 186 | "ps.update('classify-sklearn', classify, description='Classify an iris based on petal and sepal dimensions')" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": { 193 | "collapsed": false 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "ps.apply_changes()" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "## Query the model \n", 205 | "\n", 206 | "You may do a test query before really deploying it to production. This will help detect errors in the function before deploying it the Predictive Service. \n", 207 | "\n", 208 | "" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "collapsed": false 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "ps.test_query('classify-sklearn', x=[5.1, 3.5, 1.4, 0.2])" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "Now, let us query the real service." 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": false 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "# test query to make sure the model works fine\n", 238 | "ps.query('classify-sklearn', x=[5.1, 3.5, 1.4, 0.2])" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "### Query from external applications via REST\n", 246 | "\n", 247 | "Now other applications can interact with our model! In the next section we will illustrate how to consume the model. 
We can also use other APIs like ps.update() to update a mode, ps.remove() to remove a model." 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "The model query is exposed through REST API. The url to query is:\n", 255 | "\n", 256 | " http(s):///query/\n", 257 | "\n", 258 | "The predictive service uses basic access authentication to authorize the client to query. The client needs to provide the service's API key in the HTTP header as the password for user name `api_key`. The 'api key' may be obtained through ps.api_key\n", 259 | "\n", 260 | "The payload is a JSON serialized string in the following format:\n", 261 | "\n", 262 | " { \"data\": }\n", 263 | "\n", 264 | "The data is the actual data passed to the custom predictive object in the Predictive Service. It will be passed to the query using **kwargs format**.\n", 265 | "\n", 266 | "Here is a sample curl command to query your model:\n", 267 | "\n", 268 | " curl -u api_key:b0a1c056-30b9-4468-9b8d-c07289017228 -d '{\"data\":{\"x\":[5.1, 3.5, 1.4, 0.2]}}' http://demolab-one-six-2015364754.us-west-2.elb.amazonaws.com/query/classify-sklearn\n", 269 | "\n", 270 | " \n", 271 | "You can also query though Python using the **requests module**" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "### Query through Python" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "collapsed": true 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "import json\n", 290 | "import requests\n", 291 | "from requests.auth import HTTPBasicAuth\n", 292 | "\n", 293 | "def restful_query(x):\n", 294 | " headers = {'content-type': 'application/json'}\n", 295 | " payload = {\"data\":{\"x\": x}}\n", 296 | " end_point = 'http://demolab-ps-one-eight-four-810335136.us-west-2.elb.amazonaws.com/query/classify-sklearn'\n", 297 | " return requests.post(\n", 298 | " end_point,\n", 299 | " json.dumps(payload),\n", 300 | " headers=headers,\n", 301 | " auth=HTTPBasicAuth('api_key', '9d97391e-8be7-47a9-8b72-34ecc9f0ad60')).json()" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": { 308 | "collapsed": false 309 | }, 310 | "outputs": [], 311 | "source": [ 312 | "restful_query([5.1, 3.5, 1.4, 0.2])" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": false 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "restful_query([5.1, 3.5, 1.4, 0.3])" 324 | ] 325 | } 326 | ], 327 | "metadata": { 328 | "kernelspec": { 329 | "display_name": "Python 2", 330 | "language": "python", 331 | "name": "python2" 332 | }, 333 | "language_info": { 334 | "codemirror_mode": { 335 | "name": "ipython", 336 | "version": 2 337 | }, 338 | "file_extension": ".py", 339 | "mimetype": "text/x-python", 340 | "name": "python", 341 | "nbconvert_exporter": "python", 342 | "pygments_lexer": "ipython2", 343 | "version": "2.7.11" 344 | } 345 | }, 346 | "nbformat": 4, 347 | "nbformat_minor": 0 348 | } 349 | -------------------------------------------------------------------------------- /strata-sj-2016/ml-in-production/images/left.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/ml-in-production/images/left.png -------------------------------------------------------------------------------- 
/strata-sj-2016/ml-in-production/images/middle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/ml-in-production/images/middle.png -------------------------------------------------------------------------------- /strata-sj-2016/ml-in-production/images/predictive_services_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/ml-in-production/images/predictive_services_overview.png -------------------------------------------------------------------------------- /strata-sj-2016/ml-in-production/images/right.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/ml-in-production/images/right.png -------------------------------------------------------------------------------- /strata-sj-2016/ml-in-production/ml-production.key: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/ml-in-production/ml-production.key -------------------------------------------------------------------------------- /strata-sj-2016/recommendation-systems/README.md: -------------------------------------------------------------------------------- 1 | # Strata + Hadoop World, San Jose 2016 2 | 3 | This directory contains demo notebooks used for the "Introduction to Recommender Systems", the second session of **Machine Learning 101**, an all-day tutorial at [Strata + Hadoop World, New York City, 2015](http://strataconf.com/big-data-conference-ny-2015/public/schedule/detail/43217). 4 | 5 | In this session we 6 | 7 | - give an introduction to recommendation systems, 8 | - show how easy it is to get started 9 | - provide examples and slides 10 | 11 | Along the way, we also cover feature engineering and deploying machine learning models as a predictive service. 12 | 13 | ## Setup Instructions 14 | 15 | You can browse the notebooks using Github IPython notebook viewer. Note that some images may not be rendered correctly. If you'd like to run it, follow these steps to set up your machine. 16 | 17 | - [Download](https://turi.com/download/) GraphLab Create and then follow instructions to [install](https://turi.com/download/install.html). 18 | - Download and unzip the datasets 19 | 20 | ## Handy references 21 | 22 | - [GraphLab Create User Guide](https://turi.com/learn/userguide) 23 | - [GraphLab Forum](http://forum.turi.com/categories/graphlab-create) 24 | -------------------------------------------------------------------------------- /strata-sj-2016/recommendation-systems/book-recommender-exercises.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import graphlab as gl" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "The following code snippet will parse the books data provided at the training." 
19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [ 28 | { 29 | "name": "stderr", 30 | "output_type": "stream", 31 | "text": [ 32 | "[INFO] This commercial license of GraphLab Create is assigned to engr@turi.com.\n", 33 | "\n", 34 | "[INFO] Start server at: ipc:///tmp/graphlab_server-41686 - Server binary: /Users/chris/miniconda/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1443482376.log\n", 35 | "[INFO] GraphLab Server Version: 1.6.1\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "import os\n", 41 | "if os.path.exists('data/books/ratings'):\n", 42 | " ratings = gl.SFrame('data/books/ratings')\n", 43 | " items = gl.SFrame('data/books/items')\n", 44 | " users = gl.SFrame('data/books/users')\n", 45 | "else:\n", 46 | " ratings = gl.SFrame.read_csv('data/books/book-ratings.csv')\n", 47 | " ratings.save('data/books/ratings')\n", 48 | " items = gl.SFrame.read_csv('data/books/book-data.csv')\n", 49 | " items.save('data/books/items')\n", 50 | " users = gl.SFrame.read_csv('data/books/user-data.csv')\n", 51 | " users.save('data/books/users')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "Visually explore the above data using GraphLab Canvas." 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Recommendation systems" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "In this section we will make a model that can be used to recommend new tags to users." 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "### Creating a Model" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "Use `gl.recommender.create()` to create a model that can be used to recommend tags to each user." 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "outputs": [], 96 | "source": [] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Print a summary of the model by simply entering the name of the object." 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [], 112 | "source": [] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "Get all unique users from the first 10000 observations and save them as a variable called `users`." 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [], 128 | "source": [] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "Get 20 recommendations for each user in your list of users. Save these as a new SFrame called `recs`." 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "outputs": [], 144 | "source": [] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "## Inspecting your model" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "Get an SFrame of the 20 most similar items for each observed item." 
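(One way the empty cells above could be filled in — a sketch only, building on the `import graphlab as gl` and the `ratings` SFrame loaded earlier in this notebook. `gl.recommender.create` picks a model type automatically; the user and item column names are assumed here to be `name` and `book`.)

```python
# Explore the data visually in GraphLab Canvas.
ratings.show()

# Let the toolkit pick a recommender; 'name' and 'book' are assumed column names.
m = gl.recommender.create(ratings, user_id='name', item_id='book', target='rating')

# Summary of the trained model.
print m

# Unique users among the first 10,000 observations.
users = ratings['name'][:10000].unique()

# Top-20 recommendations for each of those users.
recs = m.recommend(users=users, k=20)

# The 20 most similar items for each observed item.
sims = m.get_similar_items(k=20)
```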
158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": { 164 | "collapsed": false 165 | }, 166 | "outputs": [], 167 | "source": [] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "This dataset has multiple rows corresponding to the same book, e.g., in situations where reprintings were done by different publishers in different years.\n", 174 | "\n", 175 | "For each unique value of 'book' in the `items` SFrame, select one of the available values for `author`, `publisher`, and `year`. Hint: Try using [`SFrame.groupby`](https://turi.com/products/create/docs/graphlab.data_structures.html#module-graphlab.aggregate) and [`gl.aggregate.SELECT_ONE`](https://turi.com/products/create/docs/graphlab.data_structures.html#graphlab.aggregate.SELECT_ONE)." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "collapsed": false 183 | }, 184 | "outputs": [], 185 | "source": [] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "Compute the number of times each book was rated, and add a column containing these counts to the `items` SFrame using `SFrame.join`." 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": false 199 | }, 200 | "outputs": [], 201 | "source": [] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "Print the first few books, sorted by the number of times they have been rated. Do these values make sense?" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [], 217 | "source": [] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "Now print the most similar items per item, sorted by the most common books. Hint: Join the two SFrames you created above." 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "collapsed": false 231 | }, 232 | "outputs": [], 233 | "source": [] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Experimenting with other models" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "Create a dataset called `implicit` that contains only ratings data where `rating` was 4 or greater." 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": { 253 | "collapsed": false 254 | }, 255 | "outputs": [], 256 | "source": [] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "Create a train/test split of the `implicit` data created above. Hint: Use [random_split_by_user](http://graphlab.com/products/create/docs/generated/graphlab.recommender.random_split_by_user.html#graphlab.recommender.random_split_by_user)." 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "collapsed": false 270 | }, 271 | "outputs": [], 272 | "source": [] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "Print the first 5 rows of the training set."
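(A sketch for the de-duplication and implicit-data exercises above, under the same assumed column names as the earlier sketch; `random_split_by_user` is assumed to live under `gl.recommender.util` in recent GraphLab Create releases.)

```python
# One row per unique book, picking one of the available author/publisher/year values.
unique_items = items.groupby('book',
                             {'author': gl.aggregate.SELECT_ONE('author'),
                              'publisher': gl.aggregate.SELECT_ONE('publisher'),
                              'year': gl.aggregate.SELECT_ONE('year')})

# Number of ratings per book, joined onto the de-duplicated items and sorted.
num_ratings = ratings.groupby('book', {'num_ratings': gl.aggregate.COUNT()})
unique_items = unique_items.join(num_ratings, on='book')
unique_items.sort('num_ratings', ascending=False).print_rows(10)

# Similar items for the most commonly rated books (uses `sims` from the earlier sketch).
sims.join(num_ratings, on='book').sort('num_ratings', ascending=False).print_rows(10)

# Keep only ratings of 4 or greater as implicit "likes", then split by user.
implicit = ratings[ratings['rating'] >= 4]
train, test = gl.recommender.util.random_split_by_user(
    implicit, user_id='name', item_id='book')
train.print_rows(5)
```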
279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "collapsed": false 286 | }, 287 | "outputs": [], 288 | "source": [] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "Create a `ranking_factorization_recommender` model using just the training set and 20 factors." 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": { 301 | "collapsed": false 302 | }, 303 | "outputs": [], 304 | "source": [] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "Evaluate how well this model recommends items that were seen in the test set you created above. Hint: Check out `m.evaluate_precision_recall()`." 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": { 317 | "collapsed": false 318 | }, 319 | "outputs": [], 320 | "source": [] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "Create an SFrame containing only one observation, where 'Billy Bob' has rated 'Animal Farm' with score 5.0." 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": { 333 | "collapsed": false 334 | }, 335 | "outputs": [], 336 | "source": [] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "Use this data when querying for recommendations for the user 'Billy Bob'." 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": { 349 | "collapsed": false 350 | }, 351 | "outputs": [], 352 | "source": [] 353 | } 354 | ], 355 | "metadata": { 356 | "kernelspec": { 357 | "display_name": "Python 2", 358 | "language": "python", 359 | "name": "python2" 360 | }, 361 | "language_info": { 362 | "codemirror_mode": { 363 | "name": "ipython", 364 | "version": 2 365 | }, 366 | "file_extension": ".py", 367 | "mimetype": "text/x-python", 368 | "name": "python", 369 | "nbconvert_exporter": "python", 370 | "pygments_lexer": "ipython2", 371 | "version": "2.7.10" 372 | } 373 | }, 374 | "nbformat": 4, 375 | "nbformat_minor": 0 376 | } 377 | -------------------------------------------------------------------------------- /strata-sj-2016/recommendation-systems/strata-sj-2016-recommendation-systems.key: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/recommendation-systems/strata-sj-2016-recommendation-systems.key -------------------------------------------------------------------------------- /strata-sj-2016/recommendation-systems/strata-sj-2016-recommendation-systems.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/recommendation-systems/strata-sj-2016-recommendation-systems.pptx -------------------------------------------------------------------------------- /strata-sj-2016/time-series/anomaly_detection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 1. 
Load and inspect the data: Oklahoma earthquake stats" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import graphlab as gl\n", 19 | "\n", 20 | "okla_daily = gl.load_timeseries('working_data/ok_daily_stats.ts')\n", 21 | "\n", 22 | "print \"Number of rows:\", len(okla_daily)\n", 23 | "print \"Start:\", okla_daily.min_time\n", 24 | "print \"End:\", okla_daily.max_time\n", 25 | "okla_daily.print_rows(3)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import matplotlib.pyplot as plt\n", 37 | "%matplotlib notebook\n", 38 | "plt.style.use('ggplot')\n", 39 | "\n", 40 | "fig, ax = plt.subplots()\n", 41 | "ax.plot(okla_daily['time'], okla_daily['count'], color='dodgerblue')\n", 42 | "ax.set_ylabel('Number of quakes')\n", 43 | "ax.set_xlabel('Date')\n", 44 | "fig.autofmt_xdate()\n", 45 | "fig.show()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "---" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "# 2. Let the toolkit choose the model " 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "from graphlab.toolkits import anomaly_detection\n", 71 | "\n", 72 | "model = anomaly_detection.create(okla_daily, features=['count'])\n", 73 | "\n", 74 | "print model" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "---" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "# 3. The simple thresholding model" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "threshold = 5\n", 100 | "anomaly_mask = okla_daily['count'] >= threshold\n", 101 | "\n", 102 | "anomaly_scores = okla_daily[['count']]\n", 103 | "anomaly_scores['threshold_score'] = anomaly_mask" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "anomaly_scores.tail(8).print_rows()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "---" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "# 4. 
The moving Z-score model" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "from graphlab.toolkits.anomaly_detection import moving_zscore\n", 140 | "\n", 141 | "zscore_model = moving_zscore.create(okla_daily, feature='count',\n", 142 | " window_size=30,\n", 143 | " min_observations=15)\n", 144 | "\n", 145 | "print zscore_model" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "zscore_model.scores.tail(3)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": { 163 | "collapsed": false 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "zscore_model.scores.head(3)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": { 174 | "collapsed": false 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "anomaly_scores['outlier_score'] = zscore_model.scores['anomaly_score']\n", 179 | "anomaly_scores.tail(5).print_rows()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": { 186 | "collapsed": false, 187 | "scrolled": false 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "fig, ax = plt.subplots(2, sharex=True)\n", 192 | "ax[0].plot(anomaly_scores['time'], anomaly_scores['count'], color='dodgerblue')\n", 193 | "ax[0].set_ylabel('# quakes')\n", 194 | "\n", 195 | "ax[1].plot(anomaly_scores['time'], anomaly_scores['outlier_score'], color='orchid')\n", 196 | "ax[1].set_ylabel('outlier score')\n", 197 | "\n", 198 | "ax[1].set_xlabel('Date')\n", 199 | "fig.autofmt_xdate()\n", 200 | "fig.show()" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "---" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "# 5. 
The Bayesian changepoint model" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": { 221 | "collapsed": false 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "from graphlab.toolkits.anomaly_detection import bayesian_changepoints\n", 226 | "\n", 227 | "changept_model = bayesian_changepoints.create(okla_daily, feature='count',\n", 228 | " expected_runlength=2000, lag=7)\n", 229 | "\n", 230 | "print changept_model" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": { 237 | "collapsed": false 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "anomaly_scores['changepoint_score'] = changept_model.scores['changepoint_score']\n", 242 | "anomaly_scores.head(5)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "fig, ax = plt.subplots(3, sharex=True)\n", 254 | "ax[0].plot(anomaly_scores['time'], anomaly_scores['count'], color='dodgerblue')\n", 255 | "ax[0].set_ylabel('# quakes')\n", 256 | "\n", 257 | "ax[1].plot(anomaly_scores['time'], anomaly_scores['outlier_score'], color='orchid')\n", 258 | "ax[1].set_ylabel('outlier score')\n", 259 | "\n", 260 | "ax[2].plot(anomaly_scores['time'], anomaly_scores['changepoint_score'], color='orchid')\n", 261 | "ax[2].set_ylabel('changepoint score')\n", 262 | "\n", 263 | "ax[2].set_xlabel('Date')\n", 264 | "fig.autofmt_xdate()\n", 265 | "fig.show()" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "---" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "# 6. How to use the anomaly scores" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "## Option 1: choose an anomaly threshold *a priori*" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "- Slightly better than choosing a threshold in the original feature space.\n", 294 | "- For Bayesian changepoint detection, where the scores are probabilities, there is a natural threshold of 0.5." 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": { 301 | "collapsed": false 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "threshold = 0.5\n", 306 | "anom_mask = anomaly_scores['changepoint_score'] >= threshold\n", 307 | "\n", 308 | "anomalies = anomaly_scores[anom_mask]\n", 309 | "\n", 310 | "print \"Number of anomalies:\", len(anomalies)\n", 311 | "anomalies.head(5)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "## Option 2: choose the top-k anomalies" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "If you have a fixed budget for investigating and acting on anomalies, this is a good way to go." 
326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": { 332 | "collapsed": false 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "anomalies = anomaly_scores.to_sframe().topk('changepoint_score', k=5)\n", 337 | "\n", 338 | "print \"Number of anomalies:\", len(anomalies)\n", 339 | "anomalies.head(5)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "## Option 3: look at the anomaly distribution and choose a threshold" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": { 353 | "collapsed": false 354 | }, 355 | "outputs": [], 356 | "source": [ 357 | "anomaly_scores['changepoint_score'].show()" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": { 364 | "collapsed": false 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "threshold = 0.072\n", 369 | "anom_mask = anomaly_scores['changepoint_score'] >= threshold\n", 370 | "\n", 371 | "anomalies = anomaly_scores[anom_mask]\n", 372 | "\n", 373 | "print \"Number of anomalies:\", len(anomalies)\n", 374 | "anomalies.head(5)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "## Option 4: get fancy with plotting " 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": { 388 | "collapsed": false, 389 | "scrolled": false 390 | }, 391 | "outputs": [], 392 | "source": [ 393 | "from interactive_plot import LineDrawer\n", 394 | "\n", 395 | "fig, ax = plt.subplots(3, sharex=True)\n", 396 | "guide_lines = []\n", 397 | "threshold_lines = []\n", 398 | "\n", 399 | "p = ax[0].plot(anomaly_scores['time'], anomaly_scores['count'],\n", 400 | " color='dodgerblue')\n", 401 | "ax[0].set_ylabel('# quakes')\n", 402 | "\n", 403 | "line, = ax[0].plot((anomaly_scores.min_time, anomaly_scores.min_time),\n", 404 | " ax[0].get_ylim(), lw=1, ls='--', color='black')\n", 405 | "guide_lines.append(line)\n", 406 | "\n", 407 | "ax[1].plot(anomaly_scores['time'], anomaly_scores['outlier_score'],\n", 408 | " color='orchid')\n", 409 | "ax[1].set_ylabel('outlier score')\n", 410 | "line, = ax[1].plot((anomaly_scores.min_time, anomaly_scores.min_time),\n", 411 | " ax[1].get_ylim(), lw=1, ls='--', color='black')\n", 412 | "guide_lines.append(line)\n", 413 | "\n", 414 | "ax[2].plot(anomaly_scores['time'], anomaly_scores['changepoint_score'],\n", 415 | " color='orchid')\n", 416 | "ax[2].set_ylabel('changepoint score')\n", 417 | "ax[2].set_xlabel('Date')\n", 418 | "line, = ax[2].plot((anomaly_scores.min_time, anomaly_scores.min_time), (0., 1.),\n", 419 | " lw=1, ls='--', color='black')\n", 420 | "guide_lines.append(line)\n", 421 | "\n", 422 | "for a in ax:\n", 423 | " line, = a.plot(anomaly_scores.range, (0., 0.), lw=1, ls='--',\n", 424 | " color='black')\n", 425 | " threshold_lines.append(line)\n", 426 | "\n", 427 | "plot_scores = anomaly_scores[['count', 'outlier_score', 'changepoint_score']]\n", 428 | "interactive_thresholder = LineDrawer(plot_scores, guide_lines, threshold_lines)\n", 429 | "interactive_thresholder.connect()\n", 430 | "fig.autofmt_xdate()\n", 431 | "fig.show()" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": { 438 | "collapsed": false 439 | }, 440 | "outputs": [], 441 | "source": [ 442 | "interactive_thresholder.anoms.print_rows(10)" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": {}, 448 | 
"source": [ 449 | "---" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "# 7. Updating the model with new data" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": { 463 | "collapsed": false 464 | }, 465 | "outputs": [], 466 | "source": [ 467 | "okla_new = gl.load_timeseries('working_data/ok_daily_update.ts')\n", 468 | "okla_new.print_rows(20)" 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": {}, 474 | "source": [ 475 | "Why do we want to update the model, rather than training a new one?\n", 476 | "1. Because we've updated our parameters using the data we've seen already.\n", 477 | "2. Updating simplifies the drudgery of prepending the data to get a final score for the lags in the previous data set." 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": { 484 | "collapsed": false 485 | }, 486 | "outputs": [], 487 | "source": [ 488 | "changept_model2 = changept_model.update(okla_new)\n", 489 | "\n", 490 | "print changept_model2" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": { 497 | "collapsed": false 498 | }, 499 | "outputs": [], 500 | "source": [ 501 | "changept_model2.scores.print_rows(20)" 502 | ] 503 | } 504 | ], 505 | "metadata": { 506 | "kernelspec": { 507 | "display_name": "Python 2", 508 | "language": "python", 509 | "name": "python2" 510 | }, 511 | "language_info": { 512 | "codemirror_mode": { 513 | "name": "ipython", 514 | "version": 2 515 | }, 516 | "file_extension": ".py", 517 | "mimetype": "text/x-python", 518 | "name": "python", 519 | "nbconvert_exporter": "python", 520 | "pygments_lexer": "ipython2", 521 | "version": "2.7.11" 522 | } 523 | }, 524 | "nbformat": 4, 525 | "nbformat_minor": 0 526 | } 527 | -------------------------------------------------------------------------------- /strata-sj-2016/time-series/forecasting_basics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 1. Load and inspect the data: daily global earthquakes " 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Load the main dataset: Feb. 2, 2013 - Mar. 15, 2016" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import graphlab as gl\n", 26 | "\n", 27 | "daily_stats = gl.load_timeseries('working_data/global_daily_stats.ts')\n", 28 | "\n", 29 | "print \"Number of rows:\", len(daily_stats)\n", 30 | "print \"Start:\", daily_stats.min_time\n", 31 | "print \"End:\", daily_stats.max_time\n", 32 | "daily_stats.print_rows(3)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Load the recent data: Mar. 16, 2016 - Mar. 22, 2016" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "The first point in this dataset is our forecasting goal. Pretend it's March 15, and we don't know the count of earthquakes for March 16th." 
47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "daily_update = gl.load_timeseries('working_data/global_daily_update.ts')\n", 58 | "daily_update.print_rows()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Visualize the data with GraphLab Canvas" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": false 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "daily_stats.to_sframe().show()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "## Visualize the data with matplotlib" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": false, 91 | "scrolled": false 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "import matplotlib.pyplot as plt\n", 96 | "%matplotlib notebook\n", 97 | "plt.style.use('ggplot')\n", 98 | "\n", 99 | "fig, ax = plt.subplots()\n", 100 | "ax.plot(daily_stats['time'], daily_stats['count'], color='dodgerblue')\n", 101 | "ax.set_xlabel('Date')\n", 102 | "ax.set_ylabel('Number of earthquakes')\n", 103 | "fig.autofmt_xdate()\n", 104 | "fig.show()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "---" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "# 2. A naive baseline: the grand mean" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "baseline_forecast = daily_stats['count'].mean()\n", 130 | "\n", 131 | "print baseline_forecast" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "---" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "# 3. 
The autoregressive model " 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "## Create lagged features" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "collapsed": false 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "daily_stats['lag1_count'] = daily_stats.shift(1)['count']\n", 164 | "daily_stats['lag2_count'] = daily_stats.shift(2)['count']\n", 165 | "\n", 166 | "daily_stats.print_rows(3)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "## Train the model" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": { 180 | "collapsed": false, 181 | "scrolled": false 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "train_counts = daily_stats[2:].to_sframe()\n", 186 | "\n", 187 | "ar_model = gl.linear_regression.create(train_counts, target='count',\n", 188 | " features=['lag1_count', 'lag2_count'],\n", 189 | " l2_penalty=0., validation_set=None,\n", 190 | " verbose=False)\n", 191 | "\n", 192 | "print ar_model" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "collapsed": false 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "train_counts.tail(5).print_rows()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "## Get a forecast from the model " 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "## Construct the input dataset first.\n", 222 | "sf_forecast = gl.SFrame({'lag1_count': [daily_stats['count'][-1]],\n", 223 | " 'lag2_count': [daily_stats['count'][-2]]})\n", 224 | "\n", 225 | "## Compute the model's forecast\n", 226 | "ar_forecast = ar_model.predict(sf_forecast)\n", 227 | "print ar_forecast[0]" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "****" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "# 4. The gradient-boosted trees model" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "## Split the timestamp into parts" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": { 255 | "collapsed": false 256 | }, 257 | "outputs": [], 258 | "source": [ 259 | "date_parts = daily_stats.index.split_datetime(column_name_prefix='date',\n", 260 | " limit=['year', 'month', 'day'])" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "## Create lags for *observed* features" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "To forecast tomorrow's earthqauke count:\n", 275 | "- we do know what the date will be, so no need to lag,\n", 276 | "- we don't know what the max and average magnitude will be, so we need to lag." 
277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [], 286 | "source": [ 287 | "daily_stats['lag1_avg_mag'] = daily_stats.shift(1)['avg_mag']\n", 288 | "daily_stats['lag1_max_mag'] = daily_stats.shift(1)['max_mag']" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": { 295 | "collapsed": false 296 | }, 297 | "outputs": [], 298 | "source": [ 299 | "sf_train = daily_stats.to_sframe()\n", 300 | "sf_train = sf_train.add_columns(date_parts)\n", 301 | "\n", 302 | "sf_train.print_rows(3)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "## Train the model " 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": { 316 | "collapsed": false 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "feature_list = ['lag1_avg_mag', 'lag1_max_mag', 'lag1_count',\n", 321 | " 'date.year', 'date.month', 'date.day']\n", 322 | "\n", 323 | "# Remove the row with no lagged features.\n", 324 | "sf_train = sf_train[1:]\n", 325 | "\n", 326 | "gbt_model = gl.boosted_trees_regression.create(sf_train, target='count',\n", 327 | " features=feature_list,\n", 328 | " max_iterations=20,\n", 329 | " validation_set=None,\n", 330 | " verbose=False)\n", 331 | "\n", 332 | "print gbt_model" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "## Compute the model's forecast " 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": false 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "## Prepend the last couple rows of the training data.\n", 351 | "ts_forecast = daily_stats[daily_update.column_names()][-2:].union(daily_update)\n", 352 | "\n", 353 | "## Create the lagged features.\n", 354 | "ts_forecast['lag1_avg_mag'] = ts_forecast.shift(1)['avg_mag']\n", 355 | "ts_forecast['lag1_max_mag'] = ts_forecast.shift(1)['max_mag']\n", 356 | "ts_forecast['lag1_count'] = ts_forecast.shift(1)['count']\n", 357 | "\n", 358 | "## Split the timestamp into date parts.\n", 359 | "new_date_parts = ts_forecast.index.split_datetime(column_name_prefix='date',\n", 360 | " limit=['year', 'month', 'day'])\n", 361 | "\n", 362 | "## Add the date parts to the dataset.\n", 363 | "sf_forecast = ts_forecast.to_sframe().add_columns(new_date_parts)\n", 364 | "\n", 365 | "sf_forecast.print_rows(3)" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": { 372 | "collapsed": false 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "gbt_forecast = gbt_model.predict(sf_forecast)\n", 377 | "gbt_forecast[2]" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "--- " 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "# 5. And the winner is... 
" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": { 398 | "collapsed": false 399 | }, 400 | "outputs": [], 401 | "source": [ 402 | "print \"Actual value for March 16:\", daily_update['count'][0]\n", 403 | "print \"\\nBaseline forecast:\", baseline_forecast\n", 404 | "print \"AR model forecast:\", ar_forecast[0]\n", 405 | "print \"GBT forecast:\", gbt_forecast[2], \"\\t(*** winner ***)\"" 406 | ] 407 | } 408 | ], 409 | "metadata": { 410 | "kernelspec": { 411 | "display_name": "Python 2", 412 | "language": "python", 413 | "name": "python2" 414 | }, 415 | "language_info": { 416 | "codemirror_mode": { 417 | "name": "ipython", 418 | "version": 2 419 | }, 420 | "file_extension": ".py", 421 | "mimetype": "text/x-python", 422 | "name": "python", 423 | "nbconvert_exporter": "python", 424 | "pygments_lexer": "ipython2", 425 | "version": "2.7.11" 426 | } 427 | }, 428 | "nbformat": 4, 429 | "nbformat_minor": 0 430 | } 431 | -------------------------------------------------------------------------------- /strata-sj-2016/time-series/interactive_plot.py: -------------------------------------------------------------------------------- 1 | 2 | import matplotlib.pyplot as _plt 3 | from matplotlib.widgets import Button 4 | _plt.style.use('ggplot') 5 | 6 | 7 | ## Plot an interactive version. 8 | class LineDrawer(object): 9 | def __init__(self, scores, guide_lines, threshold_lines): 10 | self.guide_lines = guide_lines 11 | self.threshold_lines = threshold_lines 12 | self.figure = self.guide_lines[0].figure 13 | self.scores = scores 14 | 15 | self.anoms = self.scores[:0] 16 | self.anom_plot = self.figure.axes[0].plot(self.anoms['time'], 17 | self.anoms['count'], 18 | color='red', lw=0, marker='o', 19 | markersize=10, 20 | alpha=0.7) 21 | 22 | def connect(self): 23 | """Connect to the event.""" 24 | self.cid_press = self.figure.canvas.mpl_connect('button_press_event', 25 | self.on_press) 26 | 27 | def disconnect(self): 28 | """Disconnect the event bindings.""" 29 | self.figure.canvas.mpl_disconnect(self.cid_press) 30 | 31 | def on_press(self, event): 32 | """Store the location data when the mouse button is pressed.""" 33 | 34 | if event.inaxes == self.figure.axes[0]: 35 | self.threshold_lines[0].set_ydata((event.ydata, event.ydata)) 36 | self.threshold_lines[1].set_ydata((0., 0.)) 37 | self.threshold_lines[2].set_ydata((0., 0.)) 38 | 39 | col = self.scores.value_col_names[0] 40 | 41 | elif event.inaxes == self.figure.axes[1]: 42 | self.threshold_lines[1].set_ydata((event.ydata, event.ydata)) 43 | self.threshold_lines[0].set_ydata((0., 0.)) 44 | self.threshold_lines[2].set_ydata((0., 0.)) 45 | 46 | col = self.scores.value_col_names[1] 47 | 48 | elif event.inaxes == self.figure.axes[2]: 49 | self.threshold_lines[2].set_ydata((event.ydata, event.ydata)) 50 | self.threshold_lines[0].set_ydata((0., 0.)) 51 | self.threshold_lines[1].set_ydata((0., 0.)) 52 | 53 | col = self.scores.value_col_names[2] 54 | 55 | else: 56 | return 57 | 58 | ## Print the anomalies from the selected horizontal threshold. 59 | mask = self.scores[col] >= event.ydata 60 | self.anoms = self.scores[mask] 61 | 62 | ## Replot the anomalies on the first axes. 63 | self.anom_plot[0].set_data((list(self.anoms['time']), 64 | list(self.anoms['count']))) 65 | 66 | ## Re-position the vertical guide lines. 67 | for line in self.guide_lines: 68 | line.set_xdata((event.xdata, event.xdata)) 69 | 70 | ## Re-draw the whole figure. 
71 | self.figure.canvas.draw() 72 | 73 | -------------------------------------------------------------------------------- /strata-sj-2016/time-series/time_series_analysis_public.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/strata-sj-2016/time-series/time_series_analysis_public.pptx -------------------------------------------------------------------------------- /strata-sj-2016/time-series/time_series_data_object.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 1. The data: global earthquake events" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "I pulled the data from the **USGS Advanced National Seismic System Comprehensive Earthquake Catalog**.\n", 15 | "- http://earthquake.usgs.gov/data/\n", 16 | "- http://earthquake.usgs.gov/earthquakes/search/\n", 17 | "- Global seismic events\n", 18 | "- Magnitude 2.5+\n", 19 | "- Real-time data starts February 2, 2013.\n", 20 | "- Main pull goes through March 15, 2016.\n", 21 | "- Update pull goes from March 16 - March 22, 2016." 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Load the data" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "collapsed": false, 36 | "scrolled": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "import datetime as dt\n", 41 | "import graphlab as gl\n", 42 | "\n", 43 | "sf = gl.SFrame.read_csv('raw_data/global_earthquakes.csv', verbose=False)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Inspect the data visually " 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "sf.show()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "useful_columns = ['time', 'latitude', 'longitude', 'mag', 'type', 'location']\n", 73 | "sf = sf[useful_columns]" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "mask = sf['type'] == 'nuclear explosion'\n", 85 | "sf[mask]" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## A small bit of data cleaning" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": { 99 | "collapsed": false 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "mask = sf['type'] == 'earthquake'\n", 104 | "sf = sf[mask]\n", 105 | "sf = sf.remove_column('type')\n", 106 | "print \"Number of earthquake events:\", sf.num_rows()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "---" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "# 2. 
Convert to a `TimeSeries` object " 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "## Format the timestamp " 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "sf['time'] = sf['time'].str_to_datetime(str_format='%Y-%m-%dT%H:%M:%s%ZP')\n", 139 | "sf['time'] = sf['time'].apply(lambda x: x.replace(tzinfo=None))" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "## Convert from `SFrame` to `TimeSeries`" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "collapsed": false 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "quakes = gl.TimeSeries(sf, index='time')" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "---" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "# 3. Basic `TimeSeries` operations" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "## Many operations are just like `SFrame` " 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": { 185 | "collapsed": false 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "quakes.print_rows(3)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": { 196 | "collapsed": false 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "quakes[4:7].print_rows()" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "## Some operations are a little different " 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "### Column subsets retain the time index" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": { 221 | "collapsed": false 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "quakes[['latitude', 'longitude']].print_rows(3)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "## Some operations are unique to `TimeSeries`" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Row slicing with a `datetime` interval" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": { 246 | "collapsed": false 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "start = dt.datetime(2014, 5, 1)\n", 251 | "end = dt.datetime(2014, 5, 2)\n", 252 | "\n", 253 | "quakes.slice(start, end).print_rows(3)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "### Working with the time index " 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": { 267 | "collapsed": false 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "print \"Earliest timestamp:\", quakes.min_time\n", 272 | "print \"Latest timestamp:\", quakes.max_time\n", 273 | "print \"Timestamp range:\", quakes.range" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": { 280 | "collapsed": false 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "print \"Index column:\", quakes.index_col_name\n", 285 | "print \"Value 
columns:\", quakes.value_col_names" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": { 292 | "collapsed": false 293 | }, 294 | "outputs": [], 295 | "source": [ 296 | "print quakes.index[:3]" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": { 303 | "collapsed": false 304 | }, 305 | "outputs": [], 306 | "source": [ 307 | "big_one = quakes.argmax('mag')\n", 308 | "quakes[big_one]" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "## We can always go back to an `SFrame`" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "collapsed": false 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "sf2 = quakes.to_sframe()\n", 327 | "print type(sf2)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "---" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "# 4. Appending more data" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": {}, 347 | "source": [ 348 | "## Read in new data and preprocess" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": { 355 | "collapsed": false 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "sf_recent = gl.SFrame.read_csv('raw_data/global_earthquakes_recent.csv', verbose=False)\n", 360 | "\n", 361 | "# Trim away the columns we're not interested in.\n", 362 | "sf_recent = sf_recent[useful_columns]\n", 363 | "\n", 364 | "# Remove any non-earthquake events.\n", 365 | "mask = sf_recent['type'] == 'earthquake'\n", 366 | "sf_recent = sf_recent[mask]\n", 367 | "sf_recent = sf_recent.remove_column('type')\n", 368 | "\n", 369 | "# Convert the timestamp to a `datetime` type.\n", 370 | "sf_recent['time'] = sf_recent['time'].str_to_datetime(str_format='%Y-%m-%dT%H:%M:%s%ZP')\n", 371 | "sf_recent['time'] = sf_recent['time'].apply(lambda x: x.replace(tzinfo=None))\n", 372 | "\n", 373 | "# Convert to a `TimeSeries` object.\n", 374 | "recent_quakes = gl.TimeSeries(sf_recent, index='time')\n", 375 | "recent_quakes.print_rows(3)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "## Get the union of the two datasets" 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": {}, 388 | "source": [ 389 | "- If the indexes don't overlap, this is equivalent to `SFrame.append`.\n", 390 | "- If there is an overlap, `TimeSeries.union` enforces time order." 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "metadata": { 397 | "collapsed": false 398 | }, 399 | "outputs": [], 400 | "source": [ 401 | "all_quakes = quakes.union(recent_quakes)\n", 402 | "\n", 403 | "print all_quakes.min_time\n", 404 | "print all_quakes.max_time" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "---" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "# 5. 
Grouping observations by value" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": { 425 | "collapsed": false 426 | }, 427 | "outputs": [], 428 | "source": [ 429 | "grp = quakes.group('location')\n", 430 | "print grp" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "The **`group_info`** `SFrame` tells us what the group names are and how many observations are in each group." 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": { 444 | "collapsed": false 445 | }, 446 | "outputs": [], 447 | "source": [ 448 | "grp.group_info().topk('group_size', k=8)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "The **`get_group`** method lets us isolate just the observations for any group." 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": null, 461 | "metadata": { 462 | "collapsed": false 463 | }, 464 | "outputs": [], 465 | "source": [ 466 | "oklahoma_quakes = grp.get_group('Oklahoma')\n", 467 | "oklahoma_quakes.print_rows(3)" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "---" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": {}, 480 | "source": [ 481 | "# 6. Grouping observations by time component " 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "metadata": {}, 487 | "source": [ 488 | "The **`date_part`** attribute of a `TimeSeries` object lets us specify components of a datetime value." 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": { 495 | "collapsed": false, 496 | "scrolled": true 497 | }, 498 | "outputs": [], 499 | "source": [ 500 | "grp = quakes.group(quakes.date_part.HOUR)\n", 501 | "hour_counts = grp.group_info()\n", 502 | "hour_counts.print_rows(5)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "metadata": { 509 | "collapsed": false 510 | }, 511 | "outputs": [], 512 | "source": [ 513 | "import matplotlib.pyplot as plt\n", 514 | "%matplotlib notebook\n", 515 | "plt.style.use('ggplot')\n", 516 | "\n", 517 | "fig, ax = plt.subplots()\n", 518 | "ax.bar(hour_counts['time.hour'], hour_counts['group_size'], color='dodgerblue')\n", 519 | "ax.set_xlabel('Hour of the day')\n", 520 | "ax.set_ylabel('Number of earthquakes')\n", 521 | "fig.show()" 522 | ] 523 | }, 524 | { 525 | "cell_type": "markdown", 526 | "metadata": {}, 527 | "source": [ 528 | "---" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": {}, 534 | "source": [ 535 | "# 7. Resampling " 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": {}, 541 | "source": [ 542 | "Four things happen with the **`resample`** method:\n", 543 | "1. A new time index is created with uniformly spaced intervals.\n", 544 | "2. Each observation is mapped to an interval.\n", 545 | "3. **Downsampling**: *aggregate* statistics are computed within each interval.\n", 546 | "4. **Upsampling**: values are *interpolated* for empty intervals." 
547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": null, 552 | "metadata": { 553 | "collapsed": false 554 | }, 555 | "outputs": [], 556 | "source": [ 557 | "import graphlab.aggregate as agg\n", 558 | "\n", 559 | "daily_stats = quakes.resample(period=dt.timedelta(days=1),\n", 560 | " upsample_method='none',\n", 561 | " downsample_method={'count': agg.COUNT('latitude'),\n", 562 | " 'avg_mag': agg.MEAN('mag'),\n", 563 | " 'max_mag': agg.MAX('mag')})\n", 564 | "\n", 565 | "daily_stats['count'] = daily_stats['count'].fillna(0)\n", 566 | "daily_stats.print_rows(5)" 567 | ] 568 | }, 569 | { 570 | "cell_type": "markdown", 571 | "metadata": {}, 572 | "source": [ 573 | "---" 574 | ] 575 | }, 576 | { 577 | "cell_type": "markdown", 578 | "metadata": {}, 579 | "source": [ 580 | "# 8. Setting up the next notebooks " 581 | ] 582 | }, 583 | { 584 | "cell_type": "markdown", 585 | "metadata": {}, 586 | "source": [ 587 | "- For the modeling notebook, we'll use the global earthquake data, downsampled to daily statistics.\n", 588 | "- For the anomaly detection notebook, we'll use just the Oklahoma data, downsampled to daily statistics. " 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": null, 594 | "metadata": { 595 | "collapsed": true 596 | }, 597 | "outputs": [], 598 | "source": [ 599 | "def compute_daily_stats(data):\n", 600 | " daily = data.resample(period=dt.timedelta(days=1),\n", 601 | " upsample_method='none',\n", 602 | " downsample_method={'count': agg.COUNT('latitude'),\n", 603 | " 'avg_mag': agg.MEAN('mag'),\n", 604 | " 'max_mag': agg.MAX('mag')})\n", 605 | "\n", 606 | " daily['count'] = daily['count'].fillna(0)\n", 607 | " return daily" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": null, 613 | "metadata": { 614 | "collapsed": true 615 | }, 616 | "outputs": [], 617 | "source": [ 618 | "# Save the daily counts and recent daily counts.\n", 619 | "daily_stats.save('working_data/global_daily_stats.ts')\n", 620 | "compute_daily_stats(recent_quakes).save('working_data/global_daily_update.ts')" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": { 627 | "collapsed": true 628 | }, 629 | "outputs": [], 630 | "source": [ 631 | "# Filter just the Oklahoma data from the recent events.\n", 632 | "grp = recent_quakes.group('location')\n", 633 | "recent_oklahoma_quakes = grp.get_group('Oklahoma')" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": null, 639 | "metadata": { 640 | "collapsed": true 641 | }, 642 | "outputs": [], 643 | "source": [ 644 | "# Compute daily stats for the Oklahoma quake events.\n", 645 | "compute_daily_stats(oklahoma_quakes).save('working_data/ok_daily_stats.ts')\n", 646 | "compute_daily_stats(recent_oklahoma_quakes).save('working_data/ok_daily_update.ts')" 647 | ] 648 | } 649 | ], 650 | "metadata": { 651 | "kernelspec": { 652 | "display_name": "Python 2", 653 | "language": "python", 654 | "name": "python2" 655 | }, 656 | "language_info": { 657 | "codemirror_mode": { 658 | "name": "ipython", 659 | "version": 2 660 | }, 661 | "file_extension": ".py", 662 | "mimetype": "text/x-python", 663 | "name": "python", 664 | "nbconvert_exporter": "python", 665 | "pygments_lexer": "ipython2", 666 | "version": "2.7.11" 667 | } 668 | }, 669 | "nbformat": 4, 670 | "nbformat_minor": 0 671 | } 672 | -------------------------------------------------------------------------------- /webinars/README.md: 
-------------------------------------------------------------------------------- 1 | # Turi Webinars 2 | 3 | This repository contains materials for webinars by Turi. You can browse 4 | the notebooks using GitHub's own notebook viewer. Note that some images may not 5 | be rendered correctly. 6 | 7 | If you'd like to run the notebooks, you can register for GraphLab Create 8 | (https://turi.com/download/), then follow the instructions to install. 9 | 10 | - GraphLab Create User Guide: https://turi.com/learn/userguide 11 | - GraphLab Forum: http://forum.turi.com/categories/graphlab-create 12 | -------------------------------------------------------------------------------- /webinars/pattern-mining/images/left.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/webinars/pattern-mining/images/left.png -------------------------------------------------------------------------------- /webinars/pattern-mining/images/middle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/webinars/pattern-mining/images/middle.png -------------------------------------------------------------------------------- /webinars/pattern-mining/images/predictive_services_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/webinars/pattern-mining/images/predictive_services_overview.png -------------------------------------------------------------------------------- /webinars/pattern-mining/images/right.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turi-code/tutorials/74273496f96fa11f67136210a736dce62e635e12/webinars/pattern-mining/images/right.png -------------------------------------------------------------------------------- /webinars/product-reviews/README.md: -------------------------------------------------------------------------------- 1 | ## Text analysis with machine learning 2 | 3 | This repository contains a demo of using GraphLab Create for understanding product sentiment using review data. The notebook requires GLC v1.9, which will be released at the end of April 2016. 4 | 5 | See the [live demo](http://demo-baby-product-reviews.turi.com) for an example of how one might use this analysis to compare products. 
6 | 7 | This demo requires the Amazon Baby Products data set hosted here: 8 | [https://github.com/learnml/machine-learning-specialization](https://github.com/learnml/machine-learning-specialization) 9 | -------------------------------------------------------------------------------- /webinars/product-reviews/helper_util.py: -------------------------------------------------------------------------------- 1 | import graphlab as gl 2 | from graphlab.toolkits.text_analytics import trim_rare_words, split_by_sentence, extract_part_of_speech, stopwords, PartOfSpeech 3 | from ipywidgets import widgets 4 | from IPython.display import display, HTML, clear_output 5 | 6 | def search(reviews, query='monitor'): 7 | m = gl._internal.search.create(reviews[['name']].unique().dropna()) 8 | monitors = m.query(query)['name'] 9 | reviews = reviews.filter_by(monitors, 'name') 10 | return reviews 11 | 12 | def get_comparisons(a, b, item_a, item_b, aspects): 13 | 14 | # Compute the number of sentences 15 | a2 = a.groupby('tag', {item_a: gl.aggregate.COUNT}) 16 | b2 = b.groupby('tag', {item_b: gl.aggregate.COUNT}) 17 | counts = a2.join(b2) 18 | 19 | # Compute the mean sentiment 20 | a2 = a.groupby('tag', {item_a: gl.aggregate.AVG('sentiment')}) 21 | b2 = b.groupby('tag', {item_b: gl.aggregate.AVG('sentiment')}) 22 | sentiment = a2.join(b2) 23 | 24 | # Get a list of adjectives 25 | a2 = a.select_columns(['tag', 'adjectives'])\ 26 | .stack('adjectives', 'adjective')\ 27 | .filter_by(aspects, 'adjective', exclude=True)\ 28 | .groupby(['tag'], {item_a: gl.aggregate.CONCAT('adjective')}) 29 | b2 = b.select_columns(['tag', 'adjectives'])\ 30 | .stack('adjectives', 'adjective')\ 31 | .filter_by(aspects, 'adjective', exclude=True)\ 32 | .groupby(['tag'], {item_b: gl.aggregate.CONCAT('adjective')}) 33 | adjectives = a2.join(b2) 34 | 35 | return counts, sentiment, adjectives 36 | 37 | def get_dropdown(reviews): 38 | counts = reviews.groupby('name', gl.aggregate.COUNT).sort('Count', ascending=False) 39 | counts['display_name'] = counts.apply(lambda x: '{} ({})'.format(x['name'], x['Count'])) 40 | counts = counts.head(500) 41 | 42 | from collections import OrderedDict 43 | items = OrderedDict(zip(counts['display_name'], counts['name'])) 44 | item_dropdown = widgets.Dropdown() 45 | item_dropdown.options = items 46 | item_dropdown.value = items.values()[1] 47 | return item_dropdown 48 | 49 | def get_extreme_sentences(tagged, k=100): 50 | 51 | def highlight(sentence, tags, color): 52 | for tag in tags: 53 | html_tag = '<font color="{0}">{1}</font>'.format(color, tag) 54 | sentence = sentence.replace(tag, html_tag) 55 | return sentence 56 | 57 | good = tagged.topk('sentiment', k=k, reverse=False) 58 | good['highlighted'] = good.apply(lambda x: highlight(x['sentence'], x['adjectives'], 'red')) 59 | good['highlighted'] = good.apply(lambda x: highlight(x['highlighted'], [x['tag']], 'green')) 60 | 61 | bad = tagged.topk('sentiment', k=k, reverse=True) 62 | bad['highlighted'] = bad.apply(lambda x: highlight(x['sentence'], x['adjectives'], 'red')) 63 | bad['highlighted'] = bad.apply(lambda x: highlight(x['highlighted'], [x['tag']], 'green')) 64 | 65 | return good, bad 66 | 67 | def print_sentences(sentences): 68 | display(HTML('<br><br>'.join(sentences))) 69 | 70 | 71 | --------------------------------------------------------------------------------