├── .gitignore ├── Demo.ipynb ├── README.md └── Slides-Demo.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | metastore_db 2 | derby.log 3 | .ipynb_checkpoints 4 | *.csv 5 | tmp* 6 | *.slides.html 7 | -------------------------------------------------------------------------------- /Demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Apache Toree Demo" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Simple Spark Scala\n", 15 | "\n", 16 | "Test notebook with simple Spark Scala code.\n", 17 | "\n", 18 | "Take numbers 1 to 100, keep the numbers that are even, square them, and keep the first 10." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": { 25 | "attributes": { 26 | "classes": [ 27 | "scala" 28 | ], 29 | "id": "" 30 | }, 31 | "collapsed": false 32 | }, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "Array(4, 16, 36, 64, 100, 144, 196, 256, 324, 400)" 38 | ] 39 | }, 40 | "execution_count": 1, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "sc.parallelize(1 to 100).\n", 47 | " filter(x => x % 2 == 0).\n", 48 | " map(x => x * x).\n", 49 | " take(10)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "Use tab for auto-complete." 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "## Test CSV Library" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "### Useful functions\n", 71 | "\n", 72 | "Define some functions." 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 2, 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "// Grab URL contents\n", 84 | "def getUrl(url:String):String = \n", 85 | " scala.io.Source.fromURL(url).mkString\n", 86 | "\n", 87 | "// Write file\n", 88 | "def fileWrite(path:String,contents:String) = {\n", 89 | " import java.io.{PrintWriter,File}\n", 90 | " val writer = new PrintWriter(new File(path))\n", 91 | " writer.write(contents)\n", 92 | " writer.close\n", 93 | "}" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "### Download Prices\n", 101 | "\n", 102 | "Get the historical stock price of AAPL and save it in AAPL.csv" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 3, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "AAPL.csv\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "val symbol = \"AAPL\"\n", 122 | "val baseUrl = \"http://real-chart.finance.yahoo.com\"\n", 123 | "val url = s\"${baseUrl}/table.csv?s=${symbol}&g=d&ignore=.csv\"\n", 124 | "val csv = getUrl(url)\n", 125 | "val csvFile = s\"${symbol}.csv\"\n", 126 | "fileWrite(csvFile, csv)\n", 127 | "println(csvFile)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "### Highest Prices\n", 135 | "\n", 136 | "Find the days with the highest adjusted close prices." 
137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 4, 142 | "metadata": { 143 | "collapsed": false 144 | }, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "(130.67132,2015-05-22)\n", 151 | "(130.579411,2015-02-23)\n", 152 | "(130.235775,2015-04-27)\n", 153 | "(130.20796,2015-07-20)\n", 154 | "(130.178369,2015-05-27)\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "val stockRdd = sc.textFile(csvFile).\n", 160 | " filter(line => line matches \".*\\\\d.*\").\n", 161 | " map(line => line.split(\",\")).\n", 162 | " map(fields => (fields(6).toDouble,fields(0))).\n", 163 | " sortBy({case (close,date) => close},false)\n", 164 | "\n", 165 | "stockRdd.take(5).foreach(println)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "### Load CSV\n", 173 | "\n", 174 | "Now let's use SQL to analyze the stock instead of directly manipulating records." 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "Load the CSV file as a data frame." 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 5, 187 | "metadata": { 188 | "collapsed": true 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "val df = sqlContext.read.\n", 193 | " format(\"com.databricks.spark.csv\").\n", 194 | " option(\"header\", \"true\").\n", 195 | " option(\"inferSchema\", \"true\").\n", 196 | " load(\"AAPL.csv\")" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "### View Data Frame\n", 204 | "\n", 205 | "What does `df` look like?" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 6, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "+----------+----------+\n", 220 | "| Date| Adj Close|\n", 221 | "+----------+----------+\n", 222 | "|2016-03-18|105.919998|\n", 223 | "|2016-03-17|105.800003|\n", 224 | "|2016-03-16|105.970001|\n", 225 | "|2016-03-15|104.580002|\n", 226 | "|2016-03-14|102.519997|\n", 227 | "|2016-03-11|102.260002|\n", 228 | "|2016-03-10|101.169998|\n", 229 | "|2016-03-09|101.120003|\n", 230 | "|2016-03-08|101.029999|\n", 231 | "|2016-03-07|101.870003|\n", 232 | "|2016-03-04|103.010002|\n", 233 | "|2016-03-03| 101.5|\n", 234 | "|2016-03-02| 100.75|\n", 235 | "|2016-03-01|100.529999|\n", 236 | "|2016-02-29| 96.690002|\n", 237 | "|2016-02-26| 96.910004|\n", 238 | "|2016-02-25| 96.760002|\n", 239 | "|2016-02-24| 96.099998|\n", 240 | "|2016-02-23| 94.690002|\n", 241 | "|2016-02-22| 96.879997|\n", 242 | "+----------+----------+\n", 243 | "only showing top 20 rows\n", 244 | "\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "df.select(\"Date\",\"Adj Close\").show" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "### SQL Queries\n", 257 | "\n", 258 | "Register it as a SQL table." 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 7, 264 | "metadata": { 265 | "collapsed": true 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "df.registerTempTable(\"aapl\")" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "Find out how many rows it has."
277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 8, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "+---------+\n", 291 | "|row_count|\n", 292 | "+---------+\n", 293 | "| 8893|\n", 294 | "+---------+\n", 295 | "\n" 296 | ] 297 | } 298 | ], 299 | "source": [ 300 | "sqlContext.sql(\"SELECT COUNT(1) AS row_count FROM aapl\").show" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "### Highest Prices\n", 308 | "\n", 309 | "Find out what the highest adjusted close was." 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 9, 315 | "metadata": { 316 | "collapsed": false 317 | }, 318 | "outputs": [ 319 | { 320 | "name": "stdout", 321 | "output_type": "stream", 322 | "text": [ 323 | "+---------+\n", 324 | "|max_close|\n", 325 | "+---------+\n", 326 | "|130.67132|\n", 327 | "+---------+\n", 328 | "\n" 329 | ] 330 | } 331 | ], 332 | "source": [ 333 | "sqlContext.sql(\"SELECT MAX(`Adj Close`) AS max_close FROM aapl\").show" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "Find the dates of the 5 highest adjusted close prices." 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 10, 346 | "metadata": { 347 | "attributes": { 348 | "classes": [ 349 | "scala" 350 | ], 351 | "id": "" 352 | }, 353 | "collapsed": false 354 | }, 355 | "outputs": [ 356 | { 357 | "name": "stdout", 358 | "output_type": "stream", 359 | "text": [ 360 | "+----------+----------+\n", 361 | "| Date| Adj Close|\n", 362 | "+----------+----------+\n", 363 | "|2015-05-22| 130.67132|\n", 364 | "|2015-02-23|130.579411|\n", 365 | "|2015-04-27|130.235775|\n", 366 | "|2015-07-20| 130.20796|\n", 367 | "|2015-05-27|130.178369|\n", 368 | "+----------+----------+\n", 369 | "\n" 370 | ] 371 | } 372 | ], 373 | "source": [ 374 | "sqlContext.sql(\"\"\"SELECT Date,`Adj Close` FROM aapl \n", 375 | " ORDER BY `Adj Close` DESC LIMIT 5\"\"\").show" 376 | ] 377 | } 378 | ], 379 | "metadata": { 380 | "kernelspec": { 381 | "display_name": "Toree", 382 | "language": "", 383 | "name": "toree" 384 | }, 385 | "language_info": { 386 | "name": "scala" 387 | } 388 | }, 389 | "nbformat": 4, 390 | "nbformat_minor": 0 391 | } 392 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apache Toree Quickstart 2 | 3 | ## Presentation 4 | 5 | Presentation accompanying this tutorial: 6 | 7 | 8 | 9 | ## Preamble 10 | 11 | All of the following commands should be executed in a new terminal 12 | window. You do not need to modify your `.profile` or `.bashrc` or any 13 | other configuration file. 14 | 15 | ## Install Spark 16 | 17 | Install Spark from <https://spark.apache.org/downloads.html>. 18 | 19 | Make sure you download the pre-built *binaries* for Hadoop 2.6 and 20 | later, and not the sources. (By default the download link will point 21 | to the sources and not the binaries.) 22 | 23 | ## Install Pip, Jupyter, Toree 24 | 25 | Install pip if you don't already have it. 26 | 27 | sudo easy_install pip 28 | 29 | If you already have pip, make sure it is updated. 30 | 31 | sudo pip install --upgrade pip 32 | 33 | Install Jupyter and Toree.
34 | 35 | sudo pip install jupyter 36 | sudo pip install toree 37 | 38 | ## Configure 39 | 40 | Set `SPARK_HOME` to point to the directory where you downloaded and 41 | expanded the Spark binaries. Instead of 1.6.1 you might have a 42 | different version number. 43 | 44 | SPARK_HOME=$HOME/Downloads/spark-1.6.1-bin-hadoop2.6 45 | 46 | Configure Toree. 47 | 48 | jupyter toree install \ 49 | --spark_home=$SPARK_HOME 50 | 51 | ## Start 52 | 53 | Start notebook. 54 | 55 | jupyter notebook 56 | 57 | Point browser to <http://localhost:8888>. 58 | 59 | Then open a new notebook using *New > Toree*. 60 | 61 | ## Test 62 | 63 | Test notebook with simple Spark Scala code. 64 | 65 | ```scala 66 | sc.parallelize(1 to 100). 67 | filter(x => x % 2 == 0). 68 | map(x => x * x). 69 | take(10) 70 | ``` 71 | 72 | Use tab for auto-complete. 73 | 74 | # 3rd Party Libraries 75 | 76 | ## Configure 77 | 78 | List all packages you will use. 79 | 80 | SPARK_PKGS=$(cat << END | xargs echo | sed 's/ /,/g' 81 | com.databricks:spark-csv_2.10:1.4.0 82 | com.databricks:spark-avro_2.10:2.0.1 83 | END) 84 | 85 | Define `SPARK_OPTS` and `SPARK_HOME`. 86 | 87 | SPARK_OPTS="--packages=$SPARK_PKGS" 88 | SPARK_HOME=$HOME/Downloads/spark-1.6.1-bin-hadoop2.6 89 | 90 | Configure Toree to use these packages. 91 | 92 | jupyter toree install \ 93 | --spark_home=$SPARK_HOME \ 94 | --spark_opts=$SPARK_OPTS 95 | 96 | ## Start 97 | 98 | Start notebook. 99 | 100 | jupyter notebook 101 | 102 | Point browser to <http://localhost:8888>. 103 | 104 | Then open a new notebook using *New > Toree*. 105 | 106 | ## Troubleshooting 107 | 108 | If you run into issues downloading dependencies, wipe out `~/.m2` and 109 | `~/.ivy2`. Spark uses Ivy, which sometimes corrupts these folders. 110 | This is a likely source of dependency errors. 111 | 112 | ## Test CSV Library 113 | 114 | Now go to the notebook [Demo.ipynb](Demo.ipynb) and test the code there. 115 | 116 | # Going Public 117 | 118 | ## Publishing GitHub Notebooks 119 | 120 | How can I share my notebook with other people? 121 | 122 | - Go to <https://gist.github.com> 123 | - Create a gist with the extension `.ipynb` 124 | - Copy the notebook to the clipboard: `cat NOTEBOOK.ipynb | pbcopy` 125 | - Paste the clipboard contents into the gist you just created 126 | - Copy the Gist ID 127 | - Go to <https://nbviewer.jupyter.org> 128 | - Paste the Gist ID there 129 | - Share the link NBViewer gives you 130 | 131 | ## Creating Slide Shows 132 | 133 | How can I create a slide show from a notebook? 134 | 135 | - Create a Toree notebook 136 | - Click on *View > Cell Toolbar > Slideshow* 137 | - `jupyter nbconvert NOTEBOOK.ipynb --to slides --post serve` 138 | - Open browser at <http://127.0.0.1:8000> 139 | 140 | ## Slide Show Demo 141 | 142 | To view the slides demo: 143 | 144 | jupyter nbconvert Slides-Demo.ipynb --to slides --post serve 145 | 146 | Open browser at <http://127.0.0.1:8000>. 147 | -------------------------------------------------------------------------------- /Slides-Demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Toree Demo" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "fragment" 19 | } 20 | }, 21 | "source": [ 22 | "Let's run some Spark code."
23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": { 29 | "collapsed": false, 30 | "slideshow": { 31 | "slide_type": "fragment" 32 | } 33 | }, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "Array(1, 4, 9, 16, 25, 36, 49, 64, 81, 100)" 39 | ] 40 | }, 41 | "execution_count": 1, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "sc.parallelize(1 to 100).map(x => x*x).take(10)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "slideshow": { 54 | "slide_type": "fragment" 55 | } 56 | }, 57 | "source": [ 58 | "What if we want to see more of the result?" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 2, 64 | "metadata": { 65 | "collapsed": false, 66 | "slideshow": { 67 | "slide_type": "fragment" 68 | } 69 | }, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "Array(1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361, 400)" 75 | ] 76 | }, 77 | "execution_count": 2, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "sc.parallelize(1 to 100).map(x => x*x).take(20)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "slideshow": { 90 | "slide_type": "slide" 91 | } 92 | }, 93 | "source": [ 94 | "# Strings" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 3, 100 | "metadata": { 101 | "collapsed": false, 102 | "slideshow": { 103 | "slide_type": "subslide" 104 | } 105 | }, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "(hello,3)\n", 111 | "(two,1)\n", 112 | "(one,1)\n", 113 | "(world,1)" 114 | ] 115 | }, 116 | "execution_count": 3, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "val lines = Array(\"hello world\", \"hello one\", \"hello two\")\n", 123 | "sc.parallelize(lines).\n", 124 | " flatMap(line => line.split(\"\\\\W+\")).\n", 125 | " map(word => (word,1)).\n", 126 | " reduceByKey(_+_).\n", 127 | " sortBy({case (word,count) => count},false).\n", 128 | " collect.\n", 129 | " mkString(\"\\n\")" 130 | ] 131 | } 132 | ], 133 | "metadata": { 134 | "celltoolbar": "Slideshow", 135 | "kernelspec": { 136 | "display_name": "Toree", 137 | "language": "", 138 | "name": "toree" 139 | }, 140 | "language_info": { 141 | "name": "scala" 142 | } 143 | }, 144 | "nbformat": 4, 145 | "nbformat_minor": 0 146 | } 147 | --------------------------------------------------------------------------------