├── .gitignore ├── Demo.ipynb ├── README.md └── Slides-Demo.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | metastore_db 2 | derby.log 3 | .ipynb_checkpoints 4 | *.csv 5 | tmp* 6 | *.slides.html 7 | -------------------------------------------------------------------------------- /Demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Apache Toree Demo" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Simple Spark Scala\n", 15 | "\n", 16 | "Test notebook with simple Spark Scala code.\n", 17 | "\n", 18 | "Take numbers 1 to 100, keep the numbers that are even, square them, and keep the first 10." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": { 25 | "attributes": { 26 | "classes": [ 27 | "scala" 28 | ], 29 | "id": "" 30 | }, 31 | "collapsed": false 32 | }, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "Array(4, 16, 36, 64, 100, 144, 196, 256, 324, 400)" 38 | ] 39 | }, 40 | "execution_count": 1, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "sc.parallelize(1 to 100).\n", 47 | " filter(x => x % 2 == 0).\n", 48 | " map(x => x * x).\n", 49 | " take(10)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "Use tab for auto-complete." 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "## Test CSV Library" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "### Useful functions\n", 71 | "\n", 72 | "Define some functions." 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 2, 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "// Grab URL contents\n", 84 | "def getUrl(url:String):String = \n", 85 | " scala.io.Source.fromURL(url).mkString\n", 86 | "\n", 87 | "// Write file\n", 88 | "def fileWrite(path:String,contents:String) = {\n", 89 | " import java.io.{PrintWriter,File}\n", 90 | " val writer = new PrintWriter(new File(path))\n", 91 | " writer.write(contents)\n", 92 | " writer.close\n", 93 | "}" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "### Download Prices\n", 101 | "\n", 102 | "Get the historical stock price of AAPL and save it in AAPL.csv" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 3, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "AAPL.csv\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "val symbol = \"AAPL\"\n", 122 | "val baseUrl = \"http://real-chart.finance.yahoo.com\"\n", 123 | "val url = s\"${baseUrl}/table.csv?s=${symbol}&g=d&ignore=.csv\"\n", 124 | "val csv = getUrl(url)\n", 125 | "val csvFile = s\"${symbol}.csv\"\n", 126 | "fileWrite(csvFile, csv)\n", 127 | "println(csvFile)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "### Highest Prices\n", 135 | "\n", 136 | "Find the days with the highest adjusted close prices." 
137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 4, 142 | "metadata": { 143 | "collapsed": false 144 | }, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "(130.67132,2015-05-22)\n", 151 | "(130.579411,2015-02-23)\n", 152 | "(130.235775,2015-04-27)\n", 153 | "(130.20796,2015-07-20)\n", 154 | "(130.178369,2015-05-27)\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "val stockRdd = sc.textFile(csvFile).\n", 160 | " filter(line => line matches \".*\\\\d.*\").\n", 161 | " map(line => line.split(\",\")).\n", 162 | " map(fields => (fields(6).toDouble,fields(0))).\n", 163 | " sortBy({case (close,date) => close},false)\n", 164 | "\n", 165 | "stockRdd.take(5).foreach(println)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "### Load CSV\n", 173 | "\n", 174 | "Now let's use SQL to analyze the stock instead of directly manipulating records." 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "Load the CSV file as a data frame." 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 5, 187 | "metadata": { 188 | "collapsed": true 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "val df = sqlContext.read.\n", 193 | " format(\"com.databricks.spark.csv\").\n", 194 | " option(\"header\", \"true\").\n", 195 | " option(\"inferSchema\", \"true\").\n", 196 | " load(\"AAPL.csv\")" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "### View Data Frame\n", 204 | "\n", 205 | "What does `df` look like?" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 6, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "+----------+----------+\n", 220 | "| Date| Adj Close|\n", 221 | "+----------+----------+\n", 222 | "|2016-03-18|105.919998|\n", 223 | "|2016-03-17|105.800003|\n", 224 | "|2016-03-16|105.970001|\n", 225 | "|2016-03-15|104.580002|\n", 226 | "|2016-03-14|102.519997|\n", 227 | "|2016-03-11|102.260002|\n", 228 | "|2016-03-10|101.169998|\n", 229 | "|2016-03-09|101.120003|\n", 230 | "|2016-03-08|101.029999|\n", 231 | "|2016-03-07|101.870003|\n", 232 | "|2016-03-04|103.010002|\n", 233 | "|2016-03-03| 101.5|\n", 234 | "|2016-03-02| 100.75|\n", 235 | "|2016-03-01|100.529999|\n", 236 | "|2016-02-29| 96.690002|\n", 237 | "|2016-02-26| 96.910004|\n", 238 | "|2016-02-25| 96.760002|\n", 239 | "|2016-02-24| 96.099998|\n", 240 | "|2016-02-23| 94.690002|\n", 241 | "|2016-02-22| 96.879997|\n", 242 | "+----------+----------+\n", 243 | "only showing top 20 rows\n", 244 | "\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "df.select(\"Date\",\"Adj Close\").show" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "### SQL Queries\n", 257 | "\n", 258 | "Register it as a SQL table." 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 7, 264 | "metadata": { 265 | "collapsed": true 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "df.registerTempTable(\"aapl\")" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "Find out how many rows it has."
277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 8, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "+---------+\n", 291 | "|row_count|\n", 292 | "+---------+\n", 293 | "| 8893|\n", 294 | "+---------+\n", 295 | "\n" 296 | ] 297 | } 298 | ], 299 | "source": [ 300 | "sqlContext.sql(\"SELECT COUNT(1) AS row_count FROM aapl\").show" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "### Highest Prices\n", 308 | "\n", 309 | "Find out what the highest adjusted close was." 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 9, 315 | "metadata": { 316 | "collapsed": false 317 | }, 318 | "outputs": [ 319 | { 320 | "name": "stdout", 321 | "output_type": "stream", 322 | "text": [ 323 | "+---------+\n", 324 | "|max_close|\n", 325 | "+---------+\n", 326 | "|130.67132|\n", 327 | "+---------+\n", 328 | "\n" 329 | ] 330 | } 331 | ], 332 | "source": [ 333 | "sqlContext.sql(\"SELECT MAX(`Adj Close`) AS max_close FROM aapl\").show" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "Find the dates of the 5 highest adjusted close prices." 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 10, 346 | "metadata": { 347 | "attributes": { 348 | "classes": [ 349 | "scala" 350 | ], 351 | "id": "" 352 | }, 353 | "collapsed": false 354 | }, 355 | "outputs": [ 356 | { 357 | "name": "stdout", 358 | "output_type": "stream", 359 | "text": [ 360 | "+----------+----------+\n", 361 | "| Date| Adj Close|\n", 362 | "+----------+----------+\n", 363 | "|2015-05-22| 130.67132|\n", 364 | "|2015-02-23|130.579411|\n", 365 | "|2015-04-27|130.235775|\n", 366 | "|2015-07-20| 130.20796|\n", 367 | "|2015-05-27|130.178369|\n", 368 | "+----------+----------+\n", 369 | "\n" 370 | ] 371 | } 372 | ], 373 | "source": [ 374 | "sqlContext.sql(\"\"\"SELECT Date,`Adj Close` FROM aapl \n", 375 | " ORDER BY `Adj Close` DESC LIMIT 5\"\"\").show" 376 | ] 377 | } 378 | ], 379 | "metadata": { 380 | "kernelspec": { 381 | "display_name": "Toree", 382 | "language": "", 383 | "name": "toree" 384 | }, 385 | "language_info": { 386 | "name": "scala" 387 | } 388 | }, 389 | "nbformat": 4, 390 | "nbformat_minor": 0 391 | } 392 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apache Toree Quickstart 2 | 3 | ## Presentation 4 | 5 | Presentation accompanying this tutorial: 6 | 7 | 8 | 9 | ## Preamble 10 | 11 | All of the following commands should be executed in a new terminal 12 | window. You do not need to modify your `.profile` or `.bashrc` or any 13 | other configuration file. 14 | 15 | ## Install Spark 16 | 17 | Install Spark from <https://spark.apache.org/downloads.html>. 18 | 19 | Make sure you download the pre-built *binaries* for Hadoop 2.6 and 20 | later, and not the sources. (By default the download link will point 21 | to the sources and not the binaries.) 22 | 23 | ## Install Pip, Jupyter, Toree 24 | 25 | Install pip if you don't already have it. 26 | 27 | sudo easy_install pip 28 | 29 | If you already have pip, make sure it is updated. 30 | 31 | sudo pip install --upgrade pip 32 | 33 | Install Jupyter and Toree.
34 | 35 | sudo pip install jupyter 36 | sudo pip install toree 37 | 38 | ## Configure 39 | 40 | Set `SPARK_HOME` to point to the directory where you downloaded and 41 | expanded the Spark binaries. Instead of 1.6.1 you might have a 42 | different version number. 43 | 44 | SPARK_HOME=$HOME/Downloads/spark-1.6.1-bin-hadoop2.6 45 | 46 | Configure Toree. 47 | 48 | jupyter toree install \ 49 | --spark_home=$SPARK_HOME 50 | 51 | ## Start 52 | 53 | Start notebook. 54 | 55 | jupyter notebook 56 | 57 | Point browser to <http://localhost:8888>. 58 | 59 | Then open a new notebook using *New > Toree*. 60 | 61 | ## Test 62 | 63 | Test notebook with simple Spark Scala code. 64 | 65 | ```scala 66 | sc.parallelize(1 to 100). 67 | filter(x => x % 2 == 0). 68 | map(x => x * x). 69 | take(10) 70 | ``` 71 | 72 | Use tab for auto-complete. 73 | 74 | # 3rd Party Libraries 75 | 76 | ## Configure 77 | 78 | List all packages you will use. 79 | 80 | SPARK_PKGS=$(cat << END | xargs echo | sed 's/ /,/g' 81 | com.databricks:spark-csv_2.10:1.4.0 82 | com.databricks:spark-avro_2.10:2.0.1 83 | END) 84 | 85 | Define `SPARK_OPTS` and `SPARK_HOME`. 86 | 87 | SPARK_OPTS="--packages=$SPARK_PKGS" 88 | SPARK_HOME=$HOME/Downloads/spark-1.6.1-bin-hadoop2.6 89 | 90 | Configure Toree to use these packages. 91 | 92 | jupyter toree install \ 93 | --spark_home=$SPARK_HOME \ 94 | --spark_opts=$SPARK_OPTS 95 | 96 | ## Start 97 | 98 | Start notebook. 99 | 100 | jupyter notebook 101 | 102 | Point browser to <http://localhost:8888>. 103 | 104 | Then open a new notebook using *New > Toree*. 105 | 106 | ## Troubleshooting 107 | 108 | If you run into issues downloading dependencies, wipe out `~/.m2` and 109 | `~/.ivy2`. Spark uses Ivy, which sometimes corrupts these folders. 110 | This is a likely source of dependency errors. 111 | 112 | ## Test CSV Library 113 | 114 | Now go to the notebook [Demo.ipynb](Demo.ipynb) and test the code there. 115 | 116 | # Going Public 117 | 118 | ## Publishing GitHub Notebooks 119 | 120 | How can I share my notebook with other people? 121 | 122 | - Go to <https://gist.github.com> 123 | - Create a gist with the extension `.ipynb` 124 | - Copy the notebook to the clipboard: `cat NOTEBOOK.ipynb | pbcopy` 125 | - Paste the clipboard contents into the gist you just created 126 | - Copy the Gist ID 127 | - Go to <https://nbviewer.jupyter.org> 128 | - Paste the Gist ID there 129 | - Share the link NBViewer gives you 130 | 131 | ## Creating Slide Shows 132 | 133 | How can I create a slide show from a notebook? 134 | 135 | - Create a Toree notebook 136 | - Click on *View > Cell Toolbar > Slideshow* 137 | - `jupyter nbconvert NOTEBOOK.ipynb --to slides --post serve` 138 | - Open browser at <http://127.0.0.1:8000> 139 | 140 | ## Slide Show Demo 141 | 142 | To view the slides demo: 143 | 144 | jupyter nbconvert Slides-Demo.ipynb --to slides --post serve 145 | 146 | Open browser at <http://127.0.0.1:8000>. 147 | -------------------------------------------------------------------------------- /Slides-Demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Toree Demo" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "fragment" 19 | } 20 | }, 21 | "source": [ 22 | "Let's run some Spark code."
23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": { 29 | "collapsed": false, 30 | "slideshow": { 31 | "slide_type": "fragment" 32 | } 33 | }, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "Array(1, 4, 9, 16, 25, 36, 49, 64, 81, 100)" 39 | ] 40 | }, 41 | "execution_count": 1, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "sc.parallelize(1 to 100).map(x => x*x).take(10)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "slideshow": { 54 | "slide_type": "fragment" 55 | } 56 | }, 57 | "source": [ 58 | "What if we want to see more of the result?" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 2, 64 | "metadata": { 65 | "collapsed": false, 66 | "slideshow": { 67 | "slide_type": "fragment" 68 | } 69 | }, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "Array(1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361, 400)" 75 | ] 76 | }, 77 | "execution_count": 2, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "sc.parallelize(1 to 100).map(x => x*x).take(20)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "slideshow": { 90 | "slide_type": "slide" 91 | } 92 | }, 93 | "source": [ 94 | "# Strings" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 3, 100 | "metadata": { 101 | "collapsed": false, 102 | "slideshow": { 103 | "slide_type": "subslide" 104 | } 105 | }, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "(hello,3)\n", 111 | "(two,1)\n", 112 | "(one,1)\n", 113 | "(world,1)" 114 | ] 115 | }, 116 | "execution_count": 3, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "val lines = Array(\"hello world\", \"hello one\", \"hello two\")\n", 123 | "sc.parallelize(lines).\n", 124 | " flatMap(line => line.split(\"\\\\W+\")).\n", 125 | " map(word => (word,1)).\n", 126 | " reduceByKey(_+_).\n", 127 | " sortBy({case (word,count) => count},false).\n", 128 | " collect.\n", 129 | " mkString(\"\\n\")" 130 | ] 131 | } 132 | ], 133 | "metadata": { 134 | "celltoolbar": "Slideshow", 135 | "kernelspec": { 136 | "display_name": "Toree", 137 | "language": "", 138 | "name": "toree" 139 | }, 140 | "language_info": { 141 | "name": "scala" 142 | } 143 | }, 144 | "nbformat": 4, 145 | "nbformat_minor": 0 146 | } 147 | --------------------------------------------------------------------------------