├── images ├── cpv1.mp4 └── dask.svg ├── data ├── california │ ├── part-00000-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-105-1-c000.snappy.parquet │ ├── part-00001-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-106-1-c000.snappy.parquet │ ├── part-00002-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-107-1-c000.snappy.parquet │ ├── part-00003-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-108-1-c000.snappy.parquet │ ├── part-00004-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-109-1-c000.snappy.parquet │ ├── part-00005-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-110-1-c000.snappy.parquet │ ├── part-00006-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-111-1-c000.snappy.parquet │ └── part-00007-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-112-1-c000.snappy.parquet └── breast_cancer.csv ├── binder └── environment.yml ├── LICENSE ├── README.md ├── .gitignore ├── 00-Intro.ipynb ├── 01-QuickTour.ipynb ├── 03-E2E.ipynb └── 02-LessQuickTour.ipynb /images/cpv1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adbreind/distributed-tour-2022/main/images/cpv1.mp4 -------------------------------------------------------------------------------- /data/california/part-00000-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-105-1-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adbreind/distributed-tour-2022/main/data/california/part-00000-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-105-1-c000.snappy.parquet -------------------------------------------------------------------------------- /data/california/part-00001-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-106-1-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adbreind/distributed-tour-2022/main/data/california/part-00001-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-106-1-c000.snappy.parquet -------------------------------------------------------------------------------- /data/california/part-00002-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-107-1-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adbreind/distributed-tour-2022/main/data/california/part-00002-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-107-1-c000.snappy.parquet -------------------------------------------------------------------------------- /data/california/part-00003-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-108-1-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adbreind/distributed-tour-2022/main/data/california/part-00003-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-108-1-c000.snappy.parquet -------------------------------------------------------------------------------- /data/california/part-00004-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-109-1-c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/adbreind/distributed-tour-2022/main/data/california/part-00004-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-109-1-c000.snappy.parquet -------------------------------------------------------------------------------- /data/california/part-00005-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-110-1-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adbreind/distributed-tour-2022/main/data/california/part-00005-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-110-1-c000.snappy.parquet -------------------------------------------------------------------------------- /data/california/part-00006-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-111-1-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adbreind/distributed-tour-2022/main/data/california/part-00006-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-111-1-c000.snappy.parquet -------------------------------------------------------------------------------- /data/california/part-00007-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-112-1-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adbreind/distributed-tour-2022/main/data/california/part-00007-tid-2595799468439767928-8acbc669-35b8-49ef-ae03-df77016f96f8-112-1-c000.snappy.parquet -------------------------------------------------------------------------------- /binder/environment.yml: -------------------------------------------------------------------------------- 1 | name: tour 2 | channels: 3 | - conda-forge 4 | - cyclus 5 | - defaults 6 | dependencies: 7 | - python=3.9 8 | - jupyterlab 9 | - dask 10 | - distributed 11 | - dask-labextension 12 | - pyspark=3.3.1 13 | - java-jdk 14 | - tqdm 15 | - python-graphviz 16 | - xgboost=1.6.2 17 | - pip: 18 | - ray[air]==2.0.1 19 | - xgboost_ray 20 | 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 adbreind 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## ODSC West 2022 Session 2 | 3 | ### Democratizing Distributed Compute and Machine Learning: A Tour of Three Frameworks 4 | 5 | __Please note: this is a "point in time" discussion of these frameworks from October 2022; due to the rapid evolution of these tools, this content may well be outdated by the time you look at it.__ 6 | 7 | Run this in binder: 8 | * https://tinyurl.com/odscwest22 (GESIS link, gives you 8GB RAM) 9 | * https://tinyurl.com/odscwest22g (GKE link, a little snappier startup, but only 2GB RAM) 10 | 11 | 12 | "Democratizing" has become a buzzword, and why not? Institutions of all types are discovering that almost every job role touches a bit of large-scale data analysis or data science, and sometimes more than just a bit! In this talk we'll look at the patterns, strengths, and weaknesses of three different open-source tools, which all claim to make large-scale computation simpler, easier, and more accessible to more people. 13 | 14 | Our exploration will reveal not only major differences at the technical level, but also differences in culture, documentation, usability, open-source governance, and other areas. How easy are they to use, for real people in real organizations? 15 | 16 | We'll look at... 17 | 18 | ... Apache Spark, a well established cluster computing tool suited to many kinds of work. Among other languages, Apache Spark boasts SparkSQL, which allows a huge number of SQL-capable folks to work on big data. 19 | 20 | ... Ray, a newer, multi language framework from UC Berkeley's RISE lab. Ray focuses on simplifying the scaffolding beneath distributed task graphs and actor sets so that users can focus on simple distributed training, tuning, reinforcement learning, and more. 21 | 22 | ... Dask, a Python-native library and part of the SciPy ecosystem dedicated to scaling popular tools like Pandas and NumPy. Dask lets users apply their existing Python knowledge by supporting elements of the Pandas, NumPy, and scikit-learn APIs ... and also extends to scheduling custom task graphs. 23 | 24 | All of these projects focus in some way on ease of use, and all have expanded the abilities of normal humans to work with data at scale. But they are also each quite different. This workshop will feature hands-on coding (with supplied notebooks), to help you think about what's easy, what's hard, what life is like with these tools, and which ones may be right for your organization. 25 | 26 | Focus: Mostly Technical / Some Business 27 | 28 | Experience level: Intermediate 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /00-Intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "dXnXTrORzdnk" 8 | }, 9 | "source": [ 10 | "# ODSC West 2022:
Democratizing Distributed Compute and Machine Learning: A Tour of Three Frameworks\n", 11 | "\n", 12 | "> \"Democratizing\" has become a buzzword, and why not? Institutions of all types are discovering that almost every job role touches a bit of large-scale data analysis or data science, and sometimes more than just a bit! In this talk we'll look at the patterns, strengths, and weaknesses of three different open-source tools, which all claim to make large-scale computation simpler, easier, and more accessible to more people.\n", 13 | ">\n", 14 | ">Our exploration will reveal not only major differences at the technical level, but also differences in culture, documentation, usability, open-source governance, and other areas. How easy are they to use, for real people in real organizations?\n", 15 | ">\n", 16 | ">We'll look at...\n", 17 | ">\n", 18 | "> * __Apache Spark__, a well established cluster computing tool suited to many kinds of work. Among other languages, Apache Spark boasts SparkSQL, which allows a huge number of SQL-capable folks to work on big data.\n", 19 | "> * __Ray__, a newer, multi language framework from UC Berkeley's RISE lab. Ray focuses on simplifying the scaffolding beneath distributed task graphs and actor sets so that users can focus on simple distributed training, tuning, reinforcement learning, and more.\n", 20 | "> * __Dask__, a Python-native library and part of the SciPy ecosystem dedicated to scaling popular tools like Pandas and NumPy. Dask lets users apply their existing Python knowledge by supporting elements of the Pandas, NumPy, and scikit-learn APIs ... and also extends to scheduling custom task graphs.\n", 21 | ">\n", 22 | "> All of these projects focus in some way on ease of use, and all have expanded the abilities of normal humans to work with data at scale. But they are also each quite different. This workshop will feature hands-on coding (with supplied notebooks), to help you think about what's easy, what's hard, what life is like with these tools, and which ones may be right for your organization.\n", 23 | ">\n", 24 | "> *Focus: Mostly Technical / Some Business ... Experience level: Intermediate*\n", 25 | "\n", 26 | "## Tour Itinerary\n", 27 | "1. Intro\n", 28 | "1. Overview of platforms: 3 frameworks in 14 minutes\n", 29 | "1. Strengths, weaknesses, differences: 3 frameworks in 59 minutes\n", 30 | "1. End-to-end example using multiple frameworks\n", 31 | "1. 
Recommendations, speculations, inside baseball, Q & A\n", 32 | "\n", 33 | "\n", 34 | "### Instructor: Adam Breindel\n", 35 | "\n", 36 | "#### Contact: https://www.linkedin.com/in/adbreind - adbreind@gmail.com\n", 37 | "\n", 38 | "\n", 39 | "\n", 40 | "* 25 years building systems for startups and large enterprises including enterprise, mobile, web, data, machine learning\n", 41 | "* 15 years teaching front- and back-end technology\n", 42 | "\n", 43 | "#### Interesting projects...\n", 44 | "* My first full-time job in tech involved streaming neural net fraud scoring (debit cards)\n", 45 | "* Realtime & offline analytics for banking\n", 46 | "* Music synchronization and licensing for networked jukeboxes\n", 47 | "\n", 48 | "#### Industries\n", 49 | "* Finance / Insurance, Travel, Media / Entertainment, Government, Energy, Tech" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [] 58 | } 59 | ], 60 | "metadata": { 61 | "accelerator": "GPU", 62 | "colab": { 63 | "collapsed_sections": [], 64 | "name": "01QI-Intro.ipynb", 65 | "provenance": [ 66 | { 67 | "file_id": "1tKG1FZPbCHSqp4kJx52wgnG9f7RW0Cvf", 68 | "timestamp": 1553029369447 69 | } 70 | ], 71 | "version": "0.3.2" 72 | }, 73 | "kernelspec": { 74 | "display_name": "Python 3 (ipykernel)", 75 | "language": "python", 76 | "name": "python3" 77 | }, 78 | "language_info": { 79 | "codemirror_mode": { 80 | "name": "ipython", 81 | "version": 3 82 | }, 83 | "file_extension": ".py", 84 | "mimetype": "text/x-python", 85 | "name": "python", 86 | "nbconvert_exporter": "python", 87 | "pygments_lexer": "ipython3", 88 | "version": "3.9.13" 89 | } 90 | }, 91 | "nbformat": 4, 92 | "nbformat_minor": 4 93 | } 94 | -------------------------------------------------------------------------------- /01-QuickTour.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "a2e622c6-1731-472e-9fd2-006b40fb387f", 6 | "metadata": {}, 7 | "source": [ 8 | "# Three frameworks in 14 minutes (more or less)\n", 9 | "\n", 10 | "Background: these comparisons are not a \"horserace\" -- i.e., these tools are not directly comparable in functionality, so it's not a ranking, and none of them is inherently much faster than the others.\n", 11 | "\n", 12 | "The goal is to get a sense for which tools are best for which tasks so we can use one or all of them to build the systems we need to build.\n", 13 | "\n", 14 | "(chronologically...)\n", 15 | "\n", 16 | "## Apache Spark\n", 17 | "\n", 18 | "Probably the most famous of these tools, Spark...\n", 19 | "* was created around 2009 at UC Berkeley as a successor/companion to Hadoop big data tools\n", 20 | " * motivation was leveraging memory and network (vs. 
storage)\n", 21 | "* was fast but not easily usable in the earliest iterations (-2015)\n", 22 | "* embraced SQL and dataframe patterns for users (plus better internals for performance) beginning in 2015 \n", 23 | " * enjoys extensive success as a \"unified platform\" (data, ML, streaming, SQL)\n", 24 | "* leverages mainly Scala, with some other language wrappers (most notably Python and SQL)\n", 25 | "* is Apache Licensed and a top-level Apache Foundation project, with most core contributors at Databricks\n", 26 | "* works well if you have a good mental model of how it works and how to tune/troubleshoot; difficult and/or underperforming otherwise\n", 27 | "* hard-coded data-parallel scheduler pattern\n", 28 | "\n", 29 | "__* Strongest for: SQL, next-gen table formats [Delta Lake, Hudi, Iceberg], ETL, featurization from data lake[house], streaming, docs__\n", 30 | "\n", 31 | "__* Weakest for: integration with custom code, (in)flexible machine learning, \"grokking\" for tuning/troubleshooting__" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "id": "fcf2ea31-8512-4f3b-9df7-309d97c8a7b6", 37 | "metadata": {}, 38 | "source": [ 39 | "## Dask\n", 40 | "\n", 41 | "Popular among scientists and gaining general interest, Dask...\n", 42 | "* was created in 2014 as a pure-Python scheduler for multiprocessing\n", 43 | " * motivation was to extend SciPy/PyData to arbitrarily large datasets\n", 44 | " * and \"invent nothing\" (meaning leverage existing community code/libraries)\n", 45 | "* expanded to include Array, Dataframe, Bag (functional) collections, some ML as well as core scheduling primitives for Python functions\n", 46 | "* supports external integrations, e.g., it is used implicitly by multi-dimensional array library XArray\n", 47 | "* is pure Python (although many related Python libraries delegate to native code or accelerated code, e.g., NumPy, CuPy)\n", 48 | "* is BSD-3-Clause licensed (permissive)\n", 49 | "* has a core group of contributors with natural sciences research backgrounds\n", 50 | " * functions largely on the \"classic volunteer OSS\" model with no corporate control\n", 51 | "* is fairly easy to get started with, especially if you know some Python and PyData tools\n", 52 | "\n", 53 | "__* Strongest for: array computation, scaling custom Python code, realtime visibility into processing (dashboards), \"grokking\" execution__\n", 54 | "\n", 55 | "__* Weakest for: tabular data access, \"off-the-shelf\" scalable machine learning, large-scale (data parallel) shuffle, docs__\n" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "id": "294237c6-ed95-432b-b79d-0b618cfaf647", 61 | "metadata": {}, 62 | "source": [ 63 | "## Ray\n", 64 | "\n", 65 | "The newest of these frameworks and most rapidly evolving, Ray...\n", 66 | "\n", 67 | "* began in 2016-2017 at UC Berkeley featuring\n", 68 | " * flexible, arbitrarily scalable abstraction over general function graphs and actors (think roughly \"distributed OO\")\n", 69 | " * distributed scheduler\n", 70 | "* refactored in 2020-present as a layered platform with\n", 71 | " * \"product-style/off-the-shelf\" solutions to common problems (data movement, scalable ML training including RL, tuning, deployment, and more)\n", 72 | " * integrations to popular external components (e.g., Huggingface)\n", 73 | " * a core layer for scaling custom code designs or building high-level components\n", 74 | "* is mostly Python with a Python API; some code components in C++ and \"plug points\" for alternative language bindings (e.g., Java)\n", 75 | "* 
is Apache licensed, but part of the Linux Foundation; leadership and core contributions from Anyscale\n", 76 | "* presents simple/effective high-level interfaces for common problems\n", 77 | " * at the lower levels it's a bit simpler than Spark but a bit more complex than Dask (YMMV)\n", 78 | " \n", 79 | "__* Strongest for: scaling with minimal work: ML, DL, RL, tuning, deployment, etc.; API design focused on users; easy integration, docs__\n", 80 | "\n", 81 | "__* Weakest for: stable patterns/APIs due to fast evolution, updated design; early days for new APIs (e.g., AIR)__\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "id": "3560fbbc-e6fe-4cfb-aaac-47019198b466", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [] 91 | } 92 | ], 93 | "metadata": { 94 | "kernelspec": { 95 | "display_name": "Python 3 (ipykernel)", 96 | "language": "python", 97 | "name": "python3" 98 | }, 99 | "language_info": { 100 | "codemirror_mode": { 101 | "name": "ipython", 102 | "version": 3 103 | }, 104 | "file_extension": ".py", 105 | "mimetype": "text/x-python", 106 | "name": "python", 107 | "nbconvert_exporter": "python", 108 | "pygments_lexer": "ipython3", 109 | "version": "3.9.13" 110 | } 111 | }, 112 | "nbformat": 4, 113 | "nbformat_minor": 5 114 | } 115 | -------------------------------------------------------------------------------- /03-E2E.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "006dbbc6-0b93-4006-bf95-d577ac76f2fa", 6 | "metadata": {}, 7 | "source": [ 8 | "# Narrow end-to-end story (but where are the 'ends' these days?)\n", 9 | "\n", 10 | "Here we'll take a look at\n", 11 | "* loading and reshaping data\n", 12 | "* training a model\n", 13 | "* serving\n", 14 | "\n", 15 | "... using a combination of tools.\n", 16 | "\n", 17 | "__The focus/goal is to share the *flavor* of the APIs and systems, not to focus on solving specific problems__\n", 18 | "\n", 19 | ">We won't cover (but are certainly not neglecting the importance of) upstream activities like data acquisition, discovery, and catalog integration...\n", 20 | "parallel work like experiment tracking, recording dataset provenance and features, archiving artifacts... or key downstream activities like monitoring models in production, drift or bias detection, rollout/rollback of new model versions" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "84e3ed37-cef0-4b1a-9b1a-24c78025d9b6", 26 | "metadata": {}, 27 | "source": [ 28 | "## Loading data\n", 29 | "\n", 30 | "For many use cases, the initial access of the data might be via Spark (or, e.g., Trino) in order to locate tables in __Nessie__ (https://projectnessie.org/) or a __Hive Metastore__ and to assemble/extract with (potentially complex) SQL.\n", 31 | "\n", 32 | "In this example, we'll assume we already know the locations of our data and we'll use Dask to access it.\n", 33 | "\n", 34 | "*By design, we are not going to create a Dask distributed cluster -- we'll use Dask to define some tasks but Ray to run them. 
If this is confusing, we'll have you covered in a couple of minutes.*" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "7220f692-a40f-48fb-9842-1c09a2aa3701", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import dask.dataframe as ddf\n", 45 | "\n", 46 | "df = ddf.read_csv('data/diamonds.csv', dtype={'table':'float64'})\n", 47 | "\n", 48 | "df" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "id": "8e66677b-2784-4dbc-a766-2e04e5cd9ace", 54 | "metadata": {}, 55 | "source": [ 56 | "We can extend the Dask dataframe graph with some common data prep operations" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "id": "00217add-66ab-476e-bf80-c9dbd85a2b70", 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "df2 = df.categorize()\n", 67 | "\n", 68 | "df2" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "68987073-7793-43d8-9664-dbd1f7f7c5fb", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "df3 = ddf.get_dummies(df2)\n", 79 | "\n", 80 | "df3" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "82163ba0-9a83-460a-9db4-d1f72b197e24", 86 | "metadata": {}, 87 | "source": [ 88 | "Ray can schedule (compute) the operations from a Dask task graph. In fact, Ray Data can integrate with lots of other data sources: https://docs.ray.io/en/latest/data/dataset.html#supported-input-formats" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "id": "45e4c4d8-f3bf-4214-b5bc-ae1d6588c81a", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "import ray\n", 99 | "\n", 100 | "ray.init(num_cpus=4)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "id": "6d6dbbe4-dc8b-413a-b934-be6b130ebc6a", 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "ds = ray.data.from_dask(df3)\n", 111 | "\n", 112 | "ds" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "id": "4075e769-23a0-4211-8cd5-ed61ba7528ab", 118 | "metadata": {}, 119 | "source": [ 120 | "We can do *some* data manipulation with Ray Data datasets.\n", 121 | "\n", 122 | "Today, Ray Data is envisioned as \"last-mile preprocessing\" along with assisting tasks that are specific to parallelism (e.g., repartition) or which require special handling in the parallel case (e.g., train/test split)." 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "id": "4fac2212-eb95-45c9-9f3d-c2a1c064e23a", 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "ds1 = ds.drop_columns('Unnamed: 0').repartition(2)\n", 133 | "\n", 134 | "ds1" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "5bbd3b67-7308-42ba-9145-2ea905a5248a", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "ds1.take(1)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "id": "80865925-f2e4-4e05-ac67-3f660621a56e", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "train_dataset, valid_dataset = ds1.train_test_split(test_size=0.2)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "id": "28c40554-ee63-47ef-8751-3bb7da882079", 160 | "metadata": {}, 161 | "source": [ 162 | "We can use the `Trainer` pattern (https://docs.ray.io/en/latest/train/train.html#intro-to-ray-train) -- here with XGBoost, but similarly for deep learning."
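(For comparison -- and as an aside that is not one of this workshop's cells -- a deep-learning analogue of the XGBoost trainer below might look roughly like the following sketch. It assumes Ray AIR 2.x's `TorchTrainer` plus a PyTorch install, which the binder environment here does not include, and the tiny model/training loop is an illustrative placeholder only.)

```python
# Hedged sketch: a TorchTrainer counterpart to the XGBoostTrainer cell below.
# Assumes ray[air] 2.x plus PyTorch; the model and config values are illustrative only.
import torch
from ray.air import session
from ray.air.config import ScalingConfig
from ray.train.torch import TorchTrainer

def train_loop_per_worker(config):
    # A real loop would pull data via session.get_dataset_shard("train");
    # here a random batch just demonstrates the reporting pattern.
    model = torch.nn.Linear(config["num_features"], 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])
    for epoch in range(config["epochs"]):
        batch = torch.randn(32, config["num_features"])
        loss = model(batch).pow(2).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        session.report({"epoch": epoch, "loss": float(loss)})  # surfaces metrics via result.metrics

trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config={"num_features": 8, "lr": 1e-2, "epochs": 3},
    scaling_config=ScalingConfig(num_workers=2, use_gpu=False),
)
# result = trainer.fit()   # same fit()/metrics pattern as the XGBoost example that follows
```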
163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "id": "e612a9fa-937a-4122-b6f5-a3ba20bc43c6", 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "from ray.train.xgboost import XGBoostTrainer\n", 173 | "from ray.air.config import ScalingConfig\n", 174 | "\n", 175 | "scale = ScalingConfig(num_workers=2, use_gpu=False)\n", 176 | "\n", 177 | "trainer = XGBoostTrainer(scaling_config=scale,\n", 178 | " label_column=\"price\",\n", 179 | " num_boost_round=20,\n", 180 | " params={ \"objective\": \"reg:squarederror\", \"eval_metric\": [\"rmse\", \"error\"], },\n", 181 | " datasets={\"train\": train_dataset, \"valid\": valid_dataset},\n", 182 | ")\n", 183 | "\n", 184 | "result = trainer.fit()\n", 185 | "print(result.metrics)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "id": "ede8e918-0c02-4786-a91f-dc6910bd47b7", 191 | "metadata": {}, 192 | "source": [ 193 | "If we had more time and wanted more accuracy, this would be a great point to try out __Ray Tune__ and get the best hyperparams we can: https://docs.ray.io/en/latest/tune/index.html\n", 194 | "\n", 195 | "Instead, we'll move toward serving this model via a low-latency request-response prediction service with __Ray Serve__.\n", 196 | "\n", 197 | "Before creating our service, let's make sure everything's working" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "id": "9693b380-25ef-422e-b63a-d1baea692b70", 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "from ray.train.xgboost import XGBoostPredictor\n", 208 | "\n", 209 | "predictor = XGBoostPredictor.from_checkpoint(result.checkpoint)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "id": "ccaceeb5-ba8a-4b33-9d86-fbd0df810b01", 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "smoke_test = valid_dataset.drop_columns('price')\n", 220 | "\n", 221 | "smoke_test.to_pandas()[:1]" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "id": "1ac9eb26-4c26-47ac-bc79-84c0103a5d2a", 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "predictor.predict(smoke_test.to_pandas()[:1])" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "id": "2ea6fb84-5c9c-4e7c-b428-5343d80aaae0", 237 | "metadata": {}, 238 | "source": [ 239 | "Ok, now we'll create a service with Ray Serve to deploy our model.\n", 240 | "\n", 241 | "We'll serialize our last model checkpoint -- in production we could do something like this or use a model db or other mechanism to find the version we want to deploy." 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "id": "6f0073e8-fa19-4526-88af-4f1ab690df1b", 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "import cloudpickle\n", 252 | "\n", 253 | "checkpoint_serialized = cloudpickle.dumps(result.checkpoint)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "id": "666deca2-0bd9-4b99-9ca2-78d760b6e676", 259 | "metadata": {}, 260 | "source": [ 261 | "At first, it might not be obvious why (or even whether) we want a system as complex as Ray for serving models.\n", 262 | "\n", 263 | "In this demo case, we could probably solve the problem other ways. But when we have multiple services, ensembling of models, conditional flow, autoscaling and heterogeneous hardware ... 
we'll be glad to have a tool designed for just such challenges.\n", 264 | "\n", 265 | "https://docs.ray.io/en/latest/serve/scaling-and-resource-allocation.html#autoscaling" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "id": "d82340e3-eb69-48fc-b7a3-72d10deab217", 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "import pandas as pd\n", 276 | "from starlette.requests import Request\n", 277 | "from typing import Dict\n", 278 | "from ray import serve\n", 279 | "\n", 280 | "@serve.deployment(route_prefix=\"/\", num_replicas=2)\n", 281 | "class DiamondPricerDeployment:\n", 282 | " def __init__(self, checkpoint:bytes):\n", 283 | " self._model = XGBoostPredictor.from_checkpoint(cloudpickle.loads(checkpoint))\n", 284 | "\n", 285 | " async def __call__(self, request: Request) -> Dict:\n", 286 | " data = await request.json()\n", 287 | " return { \"result\" : self._model.predict(pd.read_json(data)).predictions[0] }\n", 288 | "\n", 289 | "serve.run(DiamondPricerDeployment.bind(checkpoint=checkpoint_serialized))" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "id": "40b1d3ab-04fa-468f-8ec6-c0d5ce8d2e07", 295 | "metadata": {}, 296 | "source": [ 297 | "Ok... let's make some predictions!" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "id": "6a690bc6-2c79-4d89-a170-d049f35278a6", 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "sample_row = smoke_test.to_pandas()[:1].copy(True)\n", 308 | "sample_row.carat = 0.8\n", 309 | "sample_row" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "id": "e0590ed7-278d-4930-ad8b-d4fb7727596e", 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "import requests\n", 320 | "\n", 321 | "print(requests.post(\"http://localhost:8000/\", json = sample_row.to_json()).json())" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "id": "851d680d-6358-41c1-aff5-a5365d70d9e1", 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [] 331 | } 332 | ], 333 | "metadata": { 334 | "kernelspec": { 335 | "display_name": "Python 3 (ipykernel)", 336 | "language": "python", 337 | "name": "python3" 338 | }, 339 | "language_info": { 340 | "codemirror_mode": { 341 | "name": "ipython", 342 | "version": 3 343 | }, 344 | "file_extension": ".py", 345 | "mimetype": "text/x-python", 346 | "name": "python", 347 | "nbconvert_exporter": "python", 348 | "pygments_lexer": "ipython3", 349 | "version": "3.9.13" 350 | } 351 | }, 352 | "nbformat": 4, 353 | "nbformat_minor": 5 354 | } 355 | -------------------------------------------------------------------------------- /02-LessQuickTour.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4201fa05-ec56-4cba-9ef4-e300ffd65beb", 6 | "metadata": {}, 7 | "source": [ 8 | "# Three frameworks in 59 minutes (more or less)\n", 9 | "\n", 10 | "Let's take a deeper dive into each of these tools and get into some code and architecture.\n", 11 | "\n", 12 | "For time reasons, and because we are interested in demoing where these tools work well, we'll just look at a few bits of key use cases." 
13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "id": "b9b3b36a-b93b-4b1a-b3b1-2dd99276b434", 18 | "metadata": {}, 19 | "source": [ 20 | "## Apache Spark\n", 21 | "\n", 22 | "__Data access__" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "b6bcc467-67d8-4032-9d14-1d379d411b92", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "import pyspark\n", 33 | "from pyspark.sql import SparkSession\n", 34 | "\n", 35 | "spark = SparkSession.builder.getOrCreate()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "id": "287f1416-6e6d-42df-b26d-33c5cd523d66", 41 | "metadata": {}, 42 | "source": [ 43 | "We can create a Spark dataframe from SQL. A Spark dataframe is really a query, not a dataset ... so it's closer to a VIEW in the RDBMS world." 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "628d1307-42dd-4ef2-aea9-7ade14361def", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "spark.sql('SELECT * FROM parquet.`data/california`')" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "id": "17b92a91-6eaf-40e5-940e-f5b8a1361af2", 59 | "metadata": {}, 60 | "source": [ 61 | "We need to explicitly tell Spark if we want to read or process data" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "id": "a0d80200-d89e-4bea-9e72-afe59ecef3e8", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "spark.sql('SELECT * FROM parquet.`data/california`').show()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "id": "916fb1c1-9352-4cae-98ab-e9dd35be9b40", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "spark.sql('SELECT origin, AVG(delay) as delay FROM parquet.`data/california` GROUP BY origin HAVING count(1) > 500 ORDER BY delay DESC').show()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "id": "0ce6d45d-d141-4a57-8668-277045afb7dd", 87 | "metadata": {}, 88 | "source": [ 89 | "__Data manipulation__" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "id": "cec3596d-277e-4aaf-8bd2-283df5b43719", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "df = spark.read.csv('data/diamonds.csv', inferSchema=True, header=True)\n", 100 | "\n", 101 | "df" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "id": "682da7c6-2a0a-4a0c-aacb-79a1de197016", 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "df.show()" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "id": "28314f33-1313-42a4-9fce-1acf39091771", 117 | "metadata": {}, 118 | "source": [ 119 | "We can manipulate Spark dataframes with the classic PySpark API" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "id": "76524e9e-9630-45c1-8564-773a0ecab6ea", 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "df.drop('_c0').withColumnRenamed('price', 'label')" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "id": "02b64ded-f935-4782-b655-213255ab712f", 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "import pyspark.sql.functions as fn\n", 140 | "\n", 141 | "df.groupby(fn.ceil('carat')).mean('price').orderBy('ceil(carat)').show()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "id": "af7c4ece-c968-47eb-b0ac-b29d8fb6c2c5", 147 | "metadata": {}, 148 | "source": [ 149 | "In recent versions of Spark, we can also use 
the Pandas API (although there are a number of caveats that come with this approach)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": "435fdeda-227c-4696-822b-a690b3ae4876", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "import pyspark.pandas as ps\n", 160 | "\n", 161 | "df.pandas_api()[:5]" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "id": "b8bfccaa-b047-42af-8f8f-a0033ebcff86", 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "psdf = df.pandas_api().drop(columns='_c0').rename(columns={'price':'label'})\n", 172 | "\n", 173 | "psdf[:5]" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "id": "63c85ed3-abfa-4fec-830a-736a3860cbb8", 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "ps.get_dummies(psdf)[:5]" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "id": "c697d799-eee6-44cf-a529-cc0f1a9c63ef", 189 | "metadata": {}, 190 | "source": [ 191 | "__Architecture__" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "id": "15c23b12-0fec-4436-a7aa-87c9f90700d7", 197 | "metadata": {}, 198 | "source": [ 199 | "" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "id": "b0b18b81-36b0-4ba1-bd6a-10849c0105d8", 205 | "metadata": {}, 206 | "source": [ 207 | "---\n", 208 | "\n", 209 | "## Dask\n", 210 | "\n", 211 | "\n", 212 | "__Cluster creation and dashboards__" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "id": "6394dde4-4aa8-43f6-8166-398cffef2e4a", 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "from dask.distributed import Client, LocalCluster\n", 223 | "\n", 224 | "cluster = LocalCluster(n_workers=2, threads_per_worker=1, memory_limit='1GB')\n", 225 | "\n", 226 | "client = Client(cluster)\n", 227 | "\n", 228 | "client" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "id": "56621f2c-014e-4431-9a9a-342ea9fe2dc2", 234 | "metadata": {}, 235 | "source": [ 236 | "* Dashboard\n", 237 | "* Jupyterlab plugin" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "id": "866e4923-6c4d-4a94-8d61-e1c052bbc665", 243 | "metadata": {}, 244 | "source": [ 245 | "__Arrays__" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "id": "3c8acbe5-ad41-49ff-9bfc-888ef781f11b", 251 | "metadata": {}, 252 | "source": [ 253 | "Dask Array is a virtual, lazy large array composed of chunks, each of which will be a NumPy array (in the default configuration) when loaded" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "id": "ea35d1de-a158-4095-985c-136d57d1bf77", 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "import dask.array as da\n", 264 | "\n", 265 | "arr = da.random.random((200, 200), chunks=(50, 40))\n", 266 | "\n", 267 | "arr" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "id": "7403b360-21e4-46f9-9f07-89612eaa6f8a", 273 | "metadata": {}, 274 | "source": [ 275 | "Dask Array aims to implement most of the NumPy API, so we use that API for most operations" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "id": "dae4d0cf-dda2-488e-9e4f-0a738ea59660", 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "arr @ arr.T" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "id": "a242f889-2fdb-4fd4-ac26-bc594dcc54ed", 291 | "metadata": {}, 292 | "source": [ 293 | "Because the 
data structure is virtual, we need to tell Dask explicitly what we want to `.compute()` or write out (e.g., via `.to_zarr()`)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "id": "6b8a058d-17a5-48ad-badf-7c8b622bc9e9", 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "(arr @ arr.T).compute()" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "id": "3fda2220-635e-4e74-84dc-d221a82fc128", 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "da.linalg.svd((arr @ arr.T).rechunk(200, 20)) # returns (u,s,v)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "id": "47f97777-c9a7-43a4-b630-50276798797e", 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "da.linalg.svd((arr @ arr.T).rechunk(200, 20))[1].compute() # singular vals" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "id": "bf2bf4e8-d7a8-48b2-88a4-fcbed887e7a9", 329 | "metadata": {}, 330 | "source": [ 331 | "__Architecture__\n", 332 | "\n", 333 | "" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "id": "2a028d2b-0ec9-4502-a673-44d65a14d858", 339 | "metadata": {}, 340 | "source": [ 341 | "__Parallelizing Python__\n", 342 | "\n", 343 | "Dask has two different APIs for parallelizing Python code. Here we'll look at `delayed`." 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "id": "1b45cd56-24ff-4c94-97e1-e38f68237f65", 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "from dask import delayed\n", 354 | "import numpy as np\n", 355 | "\n", 356 | "@delayed\n", 357 | "def get_data(i):\n", 358 | " return np.array([i, i+1, i+2])\n", 359 | "\n", 360 | "get_data(7)" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "id": "670a6693-f72c-4af5-ad85-e1c27975b80d", 366 | "metadata": {}, 367 | "source": [ 368 | "A Delayed is a proxy object (it can \"handle\" most normal operations/messages and internally records them into a compute graph)."
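As a quick aside (a minimal sketch, not one of the notebook's cells -- `load` and `summarize` are made-up stand-ins for real I/O and aggregation functions), the typical fan-out/fan-in pattern this enables for scaling custom Python loops looks like:

```python
# Minimal sketch assuming only dask is installed; load/summarize are hypothetical stand-ins.
import dask
from dask import delayed

@delayed
def load(i):
    return list(range(i))            # pretend this reads a file or calls a service

@delayed
def summarize(chunk):
    return sum(chunk)

partials = [summarize(load(i)) for i in range(10)]   # builds 20 lazy tasks; nothing runs yet
total = delayed(sum)(partials)                       # fan-in node over the partial results

print(total.compute())               # only now does the scheduler execute the graph
# dask.compute(*partials) would instead evaluate all the partial sums in one pass
```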
369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "id": "6da8bf2e-4eb8-4fe4-9ff4-6eb697f9ee42", 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "get_data(7).compute()" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "id": "8916c674-69a1-4845-8090-4a3b3874c100", 384 | "metadata": {}, 385 | "source": [ 386 | "In its role as root of a compute graph, we can also tell it to `.compute`, cache (`.persist()`), explain (`.dask` or `.visualize()`)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "id": "658b81c6-97cd-440f-b64f-c6fd8e0cd505", 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "some_numbers = get_data(7)\n", 397 | "\n", 398 | "some_more = get_data(100)\n", 399 | "\n", 400 | "total = np.sum(some_numbers) + np.sum(some_more)\n", 401 | "\n", 402 | "total" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "id": "2d29c227-fc85-4796-b046-44a6d1ef0916", 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [ 412 | "total.visualize()" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "id": "2795924b-7c27-4074-9d9b-b725166cc956", 419 | "metadata": {}, 420 | "outputs": [], 421 | "source": [ 422 | "total.compute()" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "id": "ee304e50-997e-4376-aa37-4492ee9d32bd", 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "client.close()\n", 433 | "cluster.close()" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "id": "cbdfbb78-6753-44ff-a2eb-cc5c11d86e0e", 439 | "metadata": {}, 440 | "source": [ 441 | "---\n", 442 | "\n", 443 | "## Ray" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "id": "296f625e-db6f-4939-ad4c-b63dee25c57b", 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "import ray\n", 454 | "\n", 455 | "ray.init(num_cpus=4)" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "id": "24f84810-37c8-48df-8729-6b82a7734e40", 461 | "metadata": {}, 462 | "source": [ 463 | "__Data access__\n", 464 | "\n", 465 | "Ray accesses data from storage or from other systems via Ray Data" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "id": "bd1f41b3-4adc-4443-a465-8407144f171a", 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "dataset = ray.data.read_csv('data/breast_cancer.csv')\n", 476 | "\n", 477 | "dataset.take(1)" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "id": "9874aef9-6fb9-4753-9d5f-d42723e64aa4", 483 | "metadata": {}, 484 | "source": [ 485 | "__Prep and model training__\n", 486 | "\n", 487 | "Ray Data is also capable of some data manipulation (\"last-mile data prep\")" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "id": "706960de-bf96-44fa-b008-7383b1f87ae4", 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "id": "8e76d4b9-b057-4c41-8f57-ba31f2b01e1e", 503 | "metadata": {}, 504 | "source": [ 505 | "Training is done through a standardized `Trainer` interface that allows for tree-based, deep-learning, or other distributed training use cases" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "id": 
"ea071cac-21c4-4b41-b9ee-2692e20f571d", 512 | "metadata": {}, 513 | "outputs": [], 514 | "source": [ 515 | "from ray.train.xgboost import XGBoostTrainer\n", 516 | "from ray.air.config import ScalingConfig\n", 517 | "\n", 518 | "scale = ScalingConfig(num_workers=2, use_gpu=False)\n", 519 | "\n", 520 | "trainer = XGBoostTrainer(\n", 521 | " scaling_config = scale, label_column=\"target\", num_boost_round=20,\n", 522 | " \n", 523 | " params={ \"objective\": \"binary:logistic\", \"eval_metric\": [\"logloss\", \"error\"] }, # XGBoost params\n", 524 | " \n", 525 | " datasets={\"train\": train_dataset, \"valid\": valid_dataset},\n", 526 | ")" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "id": "74785d88-9daf-4ba8-a92f-e76ee648f6dc", 533 | "metadata": {}, 534 | "outputs": [], 535 | "source": [ 536 | "result = trainer.fit()\n", 537 | "print(result.metrics)" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "id": "d546b17b-0c92-45d9-b021-304afa29463a", 543 | "metadata": {}, 544 | "source": [ 545 | "__Architecture__\n", 546 | "\n", 547 | "https://docs.ray.io/en/latest/cluster/key-concepts.html#key-concepts\n", 548 | "\n", 549 | "" 550 | ] 551 | }, 552 | { 553 | "cell_type": "markdown", 554 | "id": "973f6c58-2c01-4123-bf14-d787c26ffbe4", 555 | "metadata": {}, 556 | "source": [ 557 | "__Prediction__\n", 558 | "\n", 559 | "Batch prediction has a dedicated API (a separate API is used for fast/small prediction, which we'll see when we demo Ray Serve)" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "id": "f0dd74af-0afb-4020-b768-9595eae76c40", 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "from ray.train.batch_predictor import BatchPredictor\n", 570 | "from ray.train.xgboost import XGBoostPredictor\n", 571 | "\n", 572 | "batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint, XGBoostPredictor)\n", 573 | "\n", 574 | "demo_records = valid_dataset.drop_columns(['target'])\n", 575 | "\n", 576 | "batch_predictor.predict(demo_records).to_pandas()" 577 | ] 578 | }, 579 | { 580 | "cell_type": "markdown", 581 | "id": "06e6cffa-8d0b-4067-b271-93f2d4863cc6", 582 | "metadata": {}, 583 | "source": [ 584 | "__Reinforcement Learning__\n", 585 | "\n", 586 | "RL is a key use case with extensive design and support in Ray. It's outside our scope for today but definitely check it out.\n", 587 | "\n", 588 | "