├── doc └── passions.parquet │ ├── _SUCCESS │ ├── ._SUCCESS.crc │ ├── part-r-00000-fbbd8cdb-1324-4ab8-b452-1e6ff8e55714.parquet │ └── .part-r-00000-fbbd8cdb-1324-4ab8-b452-1e6ff8e55714.parquet.crc ├── .gitignore ├── LICENSE ├── ob-spark-shell.el └── README.org /doc/passions.parquet/_SUCCESS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc/passions.parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled 2 | *.elc 3 | 4 | # Packaging 5 | .cask 6 | -------------------------------------------------------------------------------- /doc/passions.parquet/part-r-00000-fbbd8cdb-1324-4ab8-b452-1e6ff8e55714.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerrypnz/ob-spark-shell/master/doc/passions.parquet/part-r-00000-fbbd8cdb-1324-4ab8-b452-1e6ff8e55714.parquet -------------------------------------------------------------------------------- /doc/passions.parquet/.part-r-00000-fbbd8cdb-1324-4ab8-b452-1e6ff8e55714.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerrypnz/ob-spark-shell/master/doc/passions.parquet/.part-r-00000-fbbd8cdb-1324-4ab8-b452-1e6ff8e55714.parquet.crc -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Pepijn Looije 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /ob-spark-shell.el: -------------------------------------------------------------------------------- 1 | ;;; ob-spark-shell --- Org mode Babel backend for spark-shell (Scala) 2 | ;;; Commentary: 3 | ;;; Code: 4 | 5 | (require 'dash) 6 | (require 'ob) 7 | (require 's) 8 | (require 'scala-mode) 9 | 10 | (defcustom ob-spark-shell-program 11 | "spark-shell" 12 | "Path to the spark-shell program." 13 | :type 'string 14 | :group 'ob-spark-shell) 15 | 16 | (defcustom ob-spark-shell-cli-args 17 | '(("--conf" . "spark.ui.showConsoleProgress=false")) 18 | "CLI arguments." 19 | :type '(alist :key-type string :value-type string) 20 | :group 'ob-spark-shell) 21 | 22 | (defcustom ob-spark-shell-termination-string 23 | "--end of output--" 24 | "Signals the end of output." 25 | :type :string 26 | :group 'ob-spark-shell) 27 | 28 | (defun ob-spark-shell--initiate-session (&optional session) 29 | "If there is not a current comit buffer in SESSION then create it. 30 | Return the initialized session." 31 | (unless (string= session "none") 32 | (let ((session-buffer (get-buffer session)) 33 | (cli-args (-mapcat (lambda (arg) (list (car arg) (cdr arg))) 34 | ob-spark-shell-cli-args))) 35 | (if (org-babel-comint-buffer-livep session-buffer) 36 | session-buffer 37 | (save-window-excursion 38 | (let ((new-buffer (apply 'make-comint-in-buffer 39 | session 40 | nil 41 | ob-spark-shell-program 42 | nil 43 | cli-args))) 44 | (switch-to-buffer new-buffer) 45 | new-buffer)))))) 46 | 47 | (defun ob-spark-shell--output (result) 48 | "Manipulate the RESULT to present the value in a pretty way." 49 | (let ((lines (cddr (butlast result 3)))) 50 | (let ((begin (substring (car lines) 0 2))) 51 | (if (equal begin "+-") 52 | (cdr (mapcar (lambda (line) 53 | (if (equal (substring line 0 1) "+") 54 | 'hline 55 | (cdr (s-split "[|]" line)))) 56 | lines)) 57 | (s-join "" lines))))) 58 | 59 | (defun ob-spark-shell--var-to-scala (var) 60 | "Manipulate the VAR so Scala understands it." 61 | (let ((contents (cdr var))) 62 | (s-concat "val " 63 | (s-join " = " 64 | (list (symbol-name (car contents)) 65 | (format "\"%s\"" (cdr contents)))) 66 | "\n"))) 67 | 68 | (defun ob-spark-shell--session-name (params) 69 | "Make sure that PARAMS include a value for :session." 70 | (let ((param (cdr (assoc :session params)))) 71 | (if (string= param "none") 72 | (error "Ob-spark-shell currently only supports evaluation using a session. 73 | Make sure your src block has a :session param") 74 | param))) 75 | 76 | (defun org-babel-execute:spark-shell (body params) 77 | "Execute BODY, a block of Scala code, in a spark-shell with org-babel. 78 | This function is called by `org-babel-execute-src-block'. 79 | Arguments are supplied through PARAMS." 80 | (let ((vars (org-babel-get-header (org-babel-process-params params) :var)) 81 | (result-type (cdr (assoc :result-type params))) 82 | (full-body (org-babel-expand-body:generic body params)) 83 | (session (org-babel-prep-session:spark-shell (ob-spark-shell--session-name params) params)) 84 | (full-terminator (s-concat "\n" ob-spark-shell-termination-string "\n\n"))) 85 | (dolist (var vars) 86 | (org-babel-comint-with-output 87 | (session "\n\n" t full-body) 88 | (insert (ob-spark-shell--var-to-scala var)) 89 | (comint-send-input nil t))) 90 | (let ((tempfile (org-babel-temp-file "spark-shell-vars-" ".scala"))) 91 | (with-temp-file tempfile 92 | (insert full-body (s-concat "\nprintln(\"" ob-spark-shell-termination-string "\")"))) 93 | (ob-spark-shell--output (org-babel-comint-with-output 94 | (session full-terminator t full-body) 95 | (insert ":load " tempfile) 96 | (comint-send-input nil t)))))) 97 | 98 | (defun org-babel-prep-session:spark-shell (session params) 99 | "Prepare SESSION according to the header arguments specified in PARAMS." 100 | (ob-spark-shell--initiate-session session)) 101 | 102 | (define-derived-mode spark-shell-mode scala-mode "SparkShell") 103 | 104 | (provide 'ob-spark-shell) 105 | ;;; ob-spark-shell ends here 106 | -------------------------------------------------------------------------------- /README.org: -------------------------------------------------------------------------------- 1 | * ob-spark-shell 2 | 3 | A Scala spark-shell backend for [[http://orgmode.org][Org-mode]]'s [[http://orgmode.org/worg/org-contrib/babel/][Babel]]. 4 | 5 | ** Background 6 | 7 | The only way I currently use this in my workflow is by creating case classes for tests when writing new Spark jobs. First, I load the parquet file and print the schema (which is really fast). Second, I create a case class for my unit test based on the printed schema. Finally, I =df.as[T].show(1)= (T is the new case class), which shows the dataframe or throws an exception when the case class doesn't conform the production schema. 8 | 9 | To be honest, for the time being I'm using IntelliJ to write the tests and jobs. But having the Org-mode document available to quickly inspect production data gives the development of new ETL jobs a speed boost. I'm looking forward to using this to run ad-hoc jobs on an external cluster, too. I do not recommend using this on a cluster, yet. 10 | 11 | At this moment, it is not obvious how to connect to an external cluster. With some CLI arguments (see options), however, you can connect to any kind of cluster using the =--master= argument. Additionally, I once ran a remote spark-shell through an SSH tunnel (which consequently ran spark-shell through Docker). This is possible by supplying =ssh= as the program instead of =spark-shell= and supplying a bunch of arguments. I will add easy access to this functionality in the future, once the project is more stable. 12 | 13 | ** Example 14 | 15 | *How does it work? Please check out the [[https://github.com/pepijn/ob-spark-shell/raw/master/README.org][raw version of this README]] to see the underlying Org-mode code.* 16 | 17 | In the following example, we will first create a dataframe ourselves with two people in it. After that, we read some data from a parquet file. To end the example, we join the two dataframes. 18 | 19 | *** Creating our own dataframe 20 | 21 | Create a dataframe with people using a Scala case class. 22 | 23 | #+BEGIN_SRC spark-shell :session example :exports both 24 | case class Person(name: String, age: Int) 25 | 26 | val people = Seq( 27 | Person("Fred", 23), 28 | Person("Sally", 42)) 29 | 30 | val peopleDf = spark.createDataFrame(people) 31 | #+END_SRC 32 | 33 | #+RESULTS: 34 | #+begin_example 35 | defined class Person 36 | people: Seq[Person] = List(Person(Fred,23), Person(Sally,42)) 37 | peopleDf: org.apache.spark.sql.DataFrame = [name: string, age: int] 38 | #+end_example 39 | 40 | How many people are in this dataframe? 41 | 42 | #+BEGIN_SRC spark-shell :session example :exports both 43 | peopleDf.count 44 | #+END_SRC 45 | 46 | #+RESULTS: 47 | : res39: Long = 2 48 | 49 | =ob-spark-shell= recognizes Spark table outputs and returns them as Org-mode tables. 50 | 51 | #+BEGIN_SRC spark-shell :session example :exports both 52 | peopleDf.show 53 | #+END_SRC 54 | 55 | #+RESULTS: 56 | | name | age | 57 | |-------+-----| 58 | | Fred | 23 | 59 | | Sally | 42 | 60 | |-------+-----| 61 | 62 | *** Reading parquet file 63 | 64 | We store the location of a parquet file in the =passions-parquet= variable, below. 65 | 66 | #+NAME: passions-parquet 67 | : doc/passions.parquet 68 | 69 | In the following code block, we expose the =passions-parquet= variable (as =passions_parquet=) to the spark-shell and then we load the parquet file and count the results. 70 | 71 | #+BEGIN_SRC spark-shell :session example :exports both :var passions_parquet=passions-parquet 72 | val passionsDf = spark.read.load(passions_parquet) 73 | passionsDf.count 74 | #+END_SRC 75 | 76 | #+RESULTS: 77 | #+begin_example 78 | passionsDf: org.apache.spark.sql.DataFrame = [person: string, thing: string] 79 | res43: Long = 2 80 | #+end_example 81 | 82 | What is in the dataframe? 83 | 84 | #+BEGIN_SRC spark-shell :session example :exports both :var passions_parquet=passions-parquet 85 | passionsDf.show 86 | #+END_SRC 87 | 88 | #+RESULTS: 89 | | person | thing | 90 | |--------+-----------| 91 | | Sally | Ice cream | 92 | | Fred | Pizza | 93 | |--------+-----------| 94 | 95 | *** Joining them 96 | 97 | To finish this example, let's join the two dataframes that we made so far. 98 | 99 | #+BEGIN_SRC spark-shell :session example :exports both 100 | import org.apache.spark.sql.functions._ 101 | val df = peopleDf. 102 | join(passionsDf, $"name" === $"person"). 103 | select($"person", $"age", $"thing".as("likes")) 104 | 105 | #+END_SRC 106 | 107 | #+RESULTS: 108 | #+begin_example 109 | import org.apache.spark.sql.functions._ 110 | df: org.apache.spark.sql.DataFrame = [person: string, age: int ... 1 more field] 111 | #+end_example 112 | 113 | Now we have everything in one place :-) 114 | 115 | #+BEGIN_SRC spark-shell :session example :exports both 116 | df.show 117 | #+END_SRC 118 | 119 | #+RESULTS: 120 | | person | age | likes | 121 | |--------+-----+-----------| 122 | | Fred | 23 | Pizza | 123 | | Sally | 42 | Ice cream | 124 | |--------+-----+-----------| 125 | 126 | 127 | ** Options 128 | 129 | =ob-spark-shell-program=: specify the path of your =spark-shell= program. 130 | 131 | =ob-spark-shell-cli-args=: add your own CLI args. Disables the progress bar by 132 | default (=spark.ui.showConsoleProgress=false=). 133 | 134 | ** Limitations 135 | 136 | - Lacks var type reflection; all vars are Scala strings. 137 | 138 | ** Acknowledgements 139 | 140 | Built at [[https://www.nubank.com.br][Nubank]]. 141 | 142 | Some code taken from: 143 | 144 | - =ob-ipython=: https://github.com/gregsexton/ob-ipython/ 145 | - =cexl=: https://github.com/krisajenkins/cexl 146 | 147 | #+BEGIN_SRC spark-shell :session example :exports none 148 | // Create parquet file 149 | case class Passion(person: String, thing: String) 150 | 151 | val passions = Seq( 152 | Passion("Fred", "Pizza"), 153 | Passion("Sally", "Ice cream")) 154 | 155 | val df = spark.createDataFrame(passions) 156 | df.repartition(1).write. 157 | option("compression", "none"). 158 | parquet("doc/passions.parquet") 159 | #+END_SRC 160 | 161 | #+RESULTS: 162 | #+begin_example 163 | defined class Passion 164 | passions: Seq[Passion] = List(Passion(Fred,Pizza), Passion(Sally,Ice cream)) 165 | df: org.apache.spark.sql.DataFrame = [person: string, thing: string] 166 | #+end_example 167 | --------------------------------------------------------------------------------