├── doc
    └── passions.parquet
    │   ├── _SUCCESS
    │   ├── ._SUCCESS.crc
    │   ├── part-r-00000-fbbd8cdb-1324-4ab8-b452-1e6ff8e55714.parquet
    │   └── .part-r-00000-fbbd8cdb-1324-4ab8-b452-1e6ff8e55714.parquet.crc
├── .gitignore
├── LICENSE
├── ob-spark-shell.el
└── README.org


/doc/passions.parquet/_SUCCESS:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/doc/passions.parquet/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc    


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled
2 | *.elc
3 | 
4 | # Packaging
5 | .cask
6 | 


--------------------------------------------------------------------------------
/doc/passions.parquet/part-r-00000-fbbd8cdb-1324-4ab8-b452-1e6ff8e55714.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerrypnz/ob-spark-shell/master/doc/passions.parquet/part-r-00000-fbbd8cdb-1324-4ab8-b452-1e6ff8e55714.parquet


--------------------------------------------------------------------------------
/doc/passions.parquet/.part-r-00000-fbbd8cdb-1324-4ab8-b452-1e6ff8e55714.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerrypnz/ob-spark-shell/master/doc/passions.parquet/.part-r-00000-fbbd8cdb-1324-4ab8-b452-1e6ff8e55714.parquet.crc


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2016 Pepijn Looije
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/ob-spark-shell.el:
--------------------------------------------------------------------------------
  1 | ;;; ob-spark-shell --- Org mode Babel backend for spark-shell (Scala)
  2 | ;;; Commentary:
  3 | ;;; Code:
  4 | 
  5 | (require 'dash)
  6 | (require 'ob)
  7 | (require 's)
  8 | (require 'scala-mode)
  9 | 
 10 | (defcustom ob-spark-shell-program
 11 |   "spark-shell"
 12 |   "Path to the spark-shell program."
 13 |   :type 'string
 14 |   :group 'ob-spark-shell)
 15 | 
 16 | (defcustom ob-spark-shell-cli-args
 17 |   '(("--conf" . "spark.ui.showConsoleProgress=false"))
 18 |   "CLI arguments."
 19 |   :type '(alist :key-type string :value-type string)
 20 |   :group 'ob-spark-shell)
 21 | 
 22 | (defcustom ob-spark-shell-termination-string
 23 |   "--end of output--"
 24 |   "Signals the end of output."
 25 |   :type :string
 26 |   :group 'ob-spark-shell)
 27 | 
 28 | (defun ob-spark-shell--initiate-session (&optional session)
 29 |   "If there is not a current comit buffer in SESSION then create it.
 30 | Return the initialized session."
 31 |   (unless (string= session "none")
 32 |     (let ((session-buffer (get-buffer session))
 33 |           (cli-args (-mapcat (lambda (arg) (list (car arg) (cdr arg)))
 34 |                              ob-spark-shell-cli-args)))
 35 |       (if (org-babel-comint-buffer-livep session-buffer)
 36 |           session-buffer
 37 |         (save-window-excursion
 38 |           (let ((new-buffer (apply 'make-comint-in-buffer
 39 |                                    session
 40 |                                    nil
 41 |                                    ob-spark-shell-program
 42 |                                    nil
 43 |                                    cli-args)))
 44 |             (switch-to-buffer new-buffer)
 45 |             new-buffer))))))
 46 | 
 47 | (defun ob-spark-shell--output (result)
 48 |   "Manipulate the RESULT to present the value in a pretty way."
 49 |   (let ((lines (cddr (butlast result 3))))
 50 |     (let ((begin (substring (car lines) 0 2)))
 51 |       (if (equal begin "+-")
 52 |           (cdr (mapcar (lambda (line)
 53 |                          (if (equal (substring line 0 1) "+")
 54 |                              'hline
 55 |                            (cdr (s-split "[|]" line))))
 56 |                        lines))
 57 |         (s-join "" lines)))))
 58 | 
 59 | (defun ob-spark-shell--var-to-scala (var)
 60 |   "Manipulate the VAR so Scala understands it."
 61 |   (let ((contents (cdr var)))
 62 |     (s-concat "val "
 63 |               (s-join " = "
 64 |                       (list (symbol-name (car contents))
 65 |                             (format "\"%s\"" (cdr contents))))
 66 |               "\n")))
 67 | 
 68 | (defun ob-spark-shell--session-name (params)
 69 |   "Make sure that PARAMS include a value for :session."
 70 |   (let ((param (cdr (assoc :session params))))
 71 |     (if (string= param "none")
 72 |         (error "Ob-spark-shell currently only supports evaluation using a session.
 73 | Make sure your src block has a :session param")
 74 |       param)))
 75 | 
 76 | (defun org-babel-execute:spark-shell (body params)
 77 |   "Execute BODY, a block of Scala code, in a spark-shell with org-babel.
 78 | This function is called by `org-babel-execute-src-block'.
 79 | Arguments are supplied through PARAMS."
 80 |   (let ((vars (org-babel-get-header (org-babel-process-params params) :var))
 81 |         (result-type (cdr (assoc :result-type params)))
 82 |         (full-body (org-babel-expand-body:generic body params))
 83 |         (session (org-babel-prep-session:spark-shell (ob-spark-shell--session-name params) params))
 84 |         (full-terminator (s-concat "\n" ob-spark-shell-termination-string "\n\n")))
 85 |     (dolist (var vars)
 86 |       (org-babel-comint-with-output
 87 |           (session "\n\n" t full-body)
 88 |         (insert (ob-spark-shell--var-to-scala var))
 89 |         (comint-send-input nil t)))
 90 |     (let ((tempfile (org-babel-temp-file "spark-shell-vars-" ".scala")))
 91 |       (with-temp-file tempfile
 92 |         (insert full-body (s-concat "\nprintln(\"" ob-spark-shell-termination-string "\")")))
 93 |       (ob-spark-shell--output (org-babel-comint-with-output
 94 |                                   (session full-terminator t full-body)
 95 |                                 (insert ":load " tempfile)
 96 |                                 (comint-send-input nil t))))))
 97 | 
 98 | (defun org-babel-prep-session:spark-shell (session params)
 99 |   "Prepare SESSION according to the header arguments specified in PARAMS."
100 |   (ob-spark-shell--initiate-session session))
101 | 
102 | (define-derived-mode spark-shell-mode scala-mode "SparkShell")
103 | 
104 | (provide 'ob-spark-shell)
105 | ;;; ob-spark-shell ends here
106 | 


--------------------------------------------------------------------------------
/README.org:
--------------------------------------------------------------------------------
  1 | * ob-spark-shell
  2 | 
  3 | A Scala spark-shell backend for [[http://orgmode.org][Org-mode]]'s [[http://orgmode.org/worg/org-contrib/babel/][Babel]].
  4 | 
  5 | ** Background
  6 | 
  7 | The only way I currently use this in my workflow is by creating case classes for tests when writing new Spark jobs. First, I load the parquet file and print the schema (which is really fast). Second, I create a case class for my unit test based on the printed schema. Finally, I =df.as[T].show(1)= (T is the new case class), which shows the dataframe or throws an exception when the case class doesn't conform the production schema.
  8 | 
  9 | To be honest, for the time being I'm using IntelliJ to write the tests and jobs. But having the Org-mode document available to quickly inspect production data gives the development of new ETL jobs a speed boost. I'm looking forward to using this to run ad-hoc jobs on an external cluster, too. I do not recommend using this on a cluster, yet.
 10 | 
 11 | At this moment, it is not obvious how to connect to an external cluster. With some CLI arguments (see options), however, you can connect to any kind of cluster using the =--master= argument. Additionally, I once ran a remote spark-shell through an SSH tunnel (which consequently ran spark-shell through Docker). This is possible by supplying =ssh= as the program instead of =spark-shell= and supplying a bunch of arguments. I will add easy access to this functionality in the future, once the project is more stable.
 12 | 
 13 | ** Example
 14 | 
 15 | *How does it work? Please check out the [[https://github.com/pepijn/ob-spark-shell/raw/master/README.org][raw version of this README]] to see the underlying Org-mode code.*
 16 | 
 17 | In the following example, we will first create a dataframe ourselves with two people in it. After that, we read some data from a parquet file. To end the example, we join the two dataframes.
 18 | 
 19 | *** Creating our own dataframe
 20 | 
 21 | Create a dataframe with people using a Scala case class.
 22 | 
 23 | #+BEGIN_SRC spark-shell :session example :exports both
 24 | case class Person(name: String, age: Int)
 25 | 
 26 | val people = Seq(
 27 |   Person("Fred", 23),
 28 |   Person("Sally", 42))
 29 | 
 30 | val peopleDf = spark.createDataFrame(people)
 31 | #+END_SRC
 32 | 
 33 | #+RESULTS:
 34 | #+begin_example
 35 | defined class Person
 36 | people: Seq[Person] = List(Person(Fred,23), Person(Sally,42))
 37 | peopleDf: org.apache.spark.sql.DataFrame = [name: string, age: int]
 38 | #+end_example
 39 | 
 40 | How many people are in this dataframe?
 41 | 
 42 | #+BEGIN_SRC spark-shell :session example :exports both
 43 | peopleDf.count
 44 | #+END_SRC
 45 | 
 46 | #+RESULTS:
 47 | : res39: Long = 2
 48 | 
 49 | =ob-spark-shell= recognizes Spark table outputs and returns them as Org-mode tables.
 50 | 
 51 | #+BEGIN_SRC spark-shell :session example :exports both
 52 | peopleDf.show
 53 | #+END_SRC
 54 | 
 55 | #+RESULTS:
 56 | | name  | age |
 57 | |-------+-----|
 58 | | Fred  |  23 |
 59 | | Sally |  42 |
 60 | |-------+-----|
 61 | 
 62 | *** Reading parquet file
 63 | 
 64 | We store the location of a parquet file in the =passions-parquet= variable, below.
 65 | 
 66 | #+NAME: passions-parquet
 67 | : doc/passions.parquet
 68 | 
 69 | In the following code block, we expose the =passions-parquet= variable (as =passions_parquet=) to the spark-shell and then we load the parquet file and count the results.
 70 | 
 71 | #+BEGIN_SRC spark-shell :session example :exports both :var passions_parquet=passions-parquet
 72 | val passionsDf = spark.read.load(passions_parquet)
 73 | passionsDf.count
 74 | #+END_SRC
 75 | 
 76 | #+RESULTS:
 77 | #+begin_example
 78 | passionsDf: org.apache.spark.sql.DataFrame = [person: string, thing: string]
 79 | res43: Long = 2
 80 | #+end_example
 81 | 
 82 | What is in the dataframe?
 83 | 
 84 | #+BEGIN_SRC spark-shell :session example :exports both :var passions_parquet=passions-parquet
 85 | passionsDf.show
 86 | #+END_SRC
 87 | 
 88 | #+RESULTS:
 89 | | person | thing     |
 90 | |--------+-----------|
 91 | | Sally  | Ice cream |
 92 | | Fred   | Pizza     |
 93 | |--------+-----------|
 94 | 
 95 | *** Joining them
 96 | 
 97 | To finish this example, let's join the two dataframes that we made so far.
 98 | 
 99 | #+BEGIN_SRC spark-shell :session example :exports both
100 | import org.apache.spark.sql.functions._
101 | val df = peopleDf.
102 |   join(passionsDf, $"name" === $"person").
103 |   select($"person", $"age", $"thing".as("likes"))
104 | 
105 | #+END_SRC
106 | 
107 | #+RESULTS:
108 | #+begin_example
109 | import org.apache.spark.sql.functions._
110 | df: org.apache.spark.sql.DataFrame = [person: string, age: int ... 1 more field]
111 | #+end_example
112 | 
113 | Now we have everything in one place :-)
114 | 
115 | #+BEGIN_SRC spark-shell :session example :exports both
116 | df.show
117 | #+END_SRC
118 | 
119 | #+RESULTS:
120 | | person | age | likes     |
121 | |--------+-----+-----------|
122 | | Fred   |  23 | Pizza     |
123 | | Sally  |  42 | Ice cream |
124 | |--------+-----+-----------|
125 | 
126 | 
127 | ** Options
128 | 
129 | =ob-spark-shell-program=: specify the path of your =spark-shell= program.
130 | 
131 | =ob-spark-shell-cli-args=: add your own CLI args. Disables the progress bar by
132 | default (=spark.ui.showConsoleProgress=false=).
133 | 
134 | ** Limitations
135 | 
136 | - Lacks var type reflection; all vars are Scala strings.
137 | 
138 | ** Acknowledgements
139 | 
140 | Built at [[https://www.nubank.com.br][Nubank]].
141 | 
142 | Some code taken from:
143 | 
144 | - =ob-ipython=: https://github.com/gregsexton/ob-ipython/
145 | - =cexl=: https://github.com/krisajenkins/cexl
146 | 
147 | #+BEGIN_SRC spark-shell :session example :exports none
148 | // Create parquet file
149 | case class Passion(person: String, thing: String)
150 | 
151 | val passions = Seq(
152 |   Passion("Fred", "Pizza"),
153 |   Passion("Sally", "Ice cream"))
154 | 
155 | val df = spark.createDataFrame(passions)
156 | df.repartition(1).write.
157 |   option("compression", "none").
158 |   parquet("doc/passions.parquet")
159 | #+END_SRC
160 | 
161 | #+RESULTS:
162 | #+begin_example
163 | defined class Passion
164 | passions: Seq[Passion] = List(Passion(Fred,Pizza), Passion(Sally,Ice cream))
165 | df: org.apache.spark.sql.DataFrame = [person: string, thing: string]
166 | #+end_example
167 | 


--------------------------------------------------------------------------------