├── README.md
└── jupyter-zeppelin.py


/README.md:
--------------------------------------------------------------------------------
 1 | ## Jupyter/Zeppelin conversion
 2 | 
 3 | This repo has code for converting Zeppelin notebooks to Jupyter's ipynb format.
 4 | 
 5 | To convert a notebook, run:
 6 | 
 7 | ```
 8 | python jupyter-zeppelin.py note.json
 9 | ```
10 | 
11 | This will create a file named using the Zeppelin note's name in the current directory. Alternatively, you can pass an output path:
12 | 
13 | ```
14 | python jupyter-zeppelin.py note.json Example.ipynb
15 | ```
16 | 
17 | ### Supported conventions
18 | 
19 | This converter supports the following Zeppelin conventions:
20 | 
21 | * Code paragraphs are converted to code cells
22 | * `%md` paragraphs are converted to Jupyter markdown cells
23 | * `%html` paragraphs are converted to Jupyter code cells using cell magic `%%html`
24 | * `%sql` paragraphs are converted to Jupyter code cells using cell magic `%%sql`
25 | * Paragraphs with unknown magics are converted to raw cells
26 | * TEXT output is converted to `text/plain` output
27 | * HTML output is converted to `text/html` output; some style and JS may not work in Jupyter
28 | * TABLE output is converted to simple `text/html` tables
29 |   * `%html` table cells are embedded in the table HTML
30 |   * Normal table cells are escaped and then embedded in the table HTML
31 | 


--------------------------------------------------------------------------------
/jupyter-zeppelin.py:
--------------------------------------------------------------------------------
import codecs
import csv
import html
import io
import json
import os, sys
import re

import nbformat

from aws.s3 import S3
try:
    from StringIO import StringIO
except ImportError:
    # Python 3 has no StringIO module; the class lives in io.
    from io import StringIO
 10 | 
 11 | MD = re.compile(r'%md\s')
 12 | SQL = re.compile(r'%sql\s')
 13 | UNKNOWN_MAGIC = re.compile(r'%\w+\s')
 14 | HTML = re.compile(r'%html\s')
 15 | 
 16 | def read_io(path):
 17 |     """Reads the contents of a local or S3 path into a StringIO.
 18 |     """
 19 |     note = StringIO()
 20 |     if path.startswith("s3://"):
 21 |         s3 = S3(env='prod')
 22 |         for line in s3.read(path):
 23 |             note.write(line)
 24 |             note.write("\n")
 25 |     else:
 26 |         with open(path) as local:
 27 |             for line in local.readlines():
 28 |                 note.write(line)
 29 | 
 30 |     note.seek(0)
 31 | 
 32 |     return note
 33 | 
 34 | def table_cell_to_html(cell):
 35 |     """Formats a cell from a Zeppelin TABLE as HTML.
 36 |     """
 37 |     if HTML.match(cell):
 38 |         # the contents is already HTML
 39 |         return cell
 40 |     else:
 41 |         return html.escape(cell)
 42 | 
 43 | def table_to_html(tsv):
 44 |     """Formats the tab-separated content of a Zeppelin TABLE as HTML.
 45 |     """
 46 |     io = StringIO.StringIO(tsv)
 47 |     reader = csv.reader(io, delimiter="\t")
 48 |     fields = reader.next()
 49 |     column_headers = "".join([ "<th>" + name + "</th>" for name in fields ])
 50 |     lines = [
 51 |             "<table>",
 52 |             "<tr>{column_headers}</tr>".format(column_headers=column_headers)
 53 |         ]
 54 |     for row in reader:
 55 |         lines.append("<tr>" + "".join([ "<td>" + table_cell_to_html(cell) + "</td>" for cell in row ]) + "</tr>")
 56 |     lines.append("</table>")
 57 |     return "\n".join(lines)
 58 | 
 59 | 
 60 | def convert_json(zeppelin_json):
 61 |     """Converts a Zeppelin note from JSON to a Jupyter NotebookNode.
 62 |     """
 63 |     return convert_parsed(json.load(zeppelin_json))
 64 | 
 65 | def convert_parsed(zeppelin_note):
 66 |     """Converts a Zeppelin note from parsed JSON to a Jupyter NotebookNode.
 67 |     """
 68 |     notebook_name = zeppelin_note['name']
 69 | 
 70 |     cells = []
 71 |     index = 0
 72 |     for paragraph in zeppelin_note['paragraphs']:
 73 |         code = paragraph.get('text')
 74 |         if not code:
 75 |             continue
 76 | 
 77 |         code = code.lstrip()
 78 | 
 79 |         cell = {}
 80 | 
 81 |         if MD.match(code):
 82 |             cell['cell_type'] = 'markdown'
 83 |             cell['metadata'] = {}
 84 |             cell['source'] = code.lstrip('%md').lstrip("\n") # remove '%md'
 85 |         elif SQL.match(code) or HTML.match(code):
 86 |             cell['cell_type'] = 'code'
 87 |             cell['execution_count'] = index
 88 |             cell['metadata'] = {}
 89 |             cell['outputs'] = []
 90 |             cell['source'] = '%' + code # add % to convert to cell magic
 91 |         elif UNKNOWN_MAGIC.match(code):
 92 |             # use raw cells for unknown magic
 93 |             cell['cell_type'] = 'raw'
 94 |             cell['metadata'] = {'format': 'text/plain'}
 95 |             cell['source'] = code
 96 |         else:
 97 |             cell['cell_type'] = 'code'
 98 |             cell['execution_count'] = index
 99 |             cell['metadata'] = {'autoscroll': 'auto'}
100 |             cell['outputs'] = []
101 |             cell['source'] = code
102 | 
103 |         cells.append(cell)
104 | 
105 |         result = paragraph.get('result')
106 |         if cell['cell_type'] == 'code' and result:
107 |             if result['code'] == 'SUCCESS':
108 |                 result_type = result.get('type')
109 |                 output_by_mime_type = {}
110 |                 if result_type == 'TEXT':
111 |                     output_by_mime_type['text/plain'] = result['msg']
112 |                 elif result_type == 'HTML':
113 |                     output_by_mime_type['text/html'] = result['msg']
114 |                 elif result_type == 'TABLE':
115 |                     output_by_mime_type['text/html'] = table_to_html(result['msg'])
116 | 
117 |                 cell['outputs'] = [{
118 |                     'output_type': 'execute_result',
119 |                     'metadata': {},
120 |                     'execution_count': index,
121 |                     'data': output_by_mime_type
122 |                 }]
123 | 
124 |         index += 1
125 | 
126 |     notebook = nbformat.from_dict({
127 |         "metadata": {
128 |             "kernelspec": {
129 |                 "display_name": "Spark 2.0.0 - Scala 2.11",
130 |                 "language": "scala",
131 |                 "name": "spark2-scala"
132 |             },
133 |             "language_info": {
134 |                 "codemirror_mode": "text/x-scala",
135 |                 "file_extension": ".scala",
136 |                 "mimetype": "text/x-scala",
137 |                 "name": "scala",
138 |                 "pygments_lexer": "scala",
139 |                 "version": "2.11.8"
140 |             }
141 |         },
142 |         "nbformat": 4,
143 |         "nbformat_minor": 2,
144 |         "cells" : cells,
145 |     })
146 | 
147 |     return (notebook_name, notebook)
148 | 
def write_notebook(notebook_name, notebook, path=None):
    """Writes a NotebookNode to a file and returns the file name used.

    If path is given it is used as-is (an existing file is overwritten).
    If path is None, the output path is built from the notebook name:
    '<name>.ipynb', or, when that already exists, the first free
    '<name> (N).ipynb' for N in 1..1000.

    Raises RuntimeError when '<name>.ipynb' and all 1000 numbered variants
    already exist, so an existing notebook is never silently overwritten.
    """
    filename = path
    if not filename:
        filename = notebook_name + '.ipynb'
        if os.path.exists(filename):
            for i in range(1, 1001):
                filename = notebook_name + ' (' + str(i) + ').ipynb'
                if not os.path.exists(filename):
                    break
            else:
                # Loop ran out of candidates without finding a free name.
                raise RuntimeError('Cannot write %s: versions 1-1000 already exist.' % (notebook_name,))

    with codecs.open(filename, 'w', encoding='UTF-8') as out:
        nbformat.write(notebook, out)

    return filename
169 | 
if __name__ == '__main__':
    # Command line entry point:
    #   python jupyter-zeppelin.py NOTE_PATH [OUTPUT_PATH]
    # NOTE_PATH is always the first argument; OUTPUT_PATH is optional and,
    # when omitted, write_notebook derives the file name from the note name.
    num_args = len(sys.argv)

    if num_args not in (2, 3):
        sys.stderr.write('usage: %s NOTE_PATH [OUTPUT_PATH]\n' % (sys.argv[0],))
        sys.exit(1)

    zeppelin_note_path = sys.argv[1]
    target_path = sys.argv[2] if num_args == 3 else None

    name, content = convert_json(read_io(zeppelin_note_path))
    write_notebook(name, content, target_path)
185 | 
186 | 


--------------------------------------------------------------------------------