├── README.md
└── jupyter-zeppelin.py
/README.md:
--------------------------------------------------------------------------------
1 | ## Jupyter/Zeppelin conversion
2 |
3 | This repo has code for converting Zeppelin notebooks to Jupyter's ipynb format.
4 |
5 | To convert a notebook, run:
6 |
7 | ```
8 | python jupyter-zeppelin.py note.json
9 | ```
10 |
11 | This will create a file named using the Zeppelin note's name in the current directory. Alternatively, you can pass an output path:
12 |
13 | ```
14 | python jupyter-zeppelin.py note.json Example.ipynb
15 | ```
16 |
17 | ### Supported conventions
18 |
19 | This converter supports the following Zeppelin conventions:
20 |
21 | * Code paragraphs are converted to code cells
22 | * `%md` paragraphs are converted to Jupyter markdown cells
23 | * `%html` paragraphs are converted to Jupyter code cells using cell magic `%%html`
24 | * `%sql` paragraphs are converted to Jupyter code cells using cell magic `%%sql`
25 | * Paragraphs with unknown magics are converted to raw cells
26 | * TEXT output is converted to `text/plain` output
27 | * HTML output is converted to `text/html` output; some style and JS may not work in Jupyter
28 | * TABLE output is converted to simple `text/html` tables
29 |   * `%html` table cells are embedded in the table HTML
30 |   * Normal table cells are escaped and then embedded in the table HTML
31 |
--------------------------------------------------------------------------------
/jupyter-zeppelin.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | import re
3 | import csv
4 | import json
5 | import html
6 | import nbformat
7 | import codecs
8 | from aws.s3 import S3
9 | from StringIO import StringIO
10 |
# Patterns for the interpreter directive ("magic") at the start of a Zeppelin
# paragraph or table cell.  Each one requires a single whitespace character
# directly after the directive name and is applied with .match(), i.e. it is
# anchored at the beginning of the text.
HTML = re.compile(r'%html\s')
MD = re.compile(r'%md\s')
SQL = re.compile(r'%sql\s')
# Any other "%name" directive made of word characters only.
UNKNOWN_MAGIC = re.compile(r'%\w+\s')
15 |
def read_io(path):
    """Load the text at a local or S3 path into an in-memory buffer.

    Paths starting with ``s3://`` are read through an ``S3`` client created
    with ``env='prod'``; anything else is opened as a local file.

    Returns a StringIO positioned at offset 0.
    """
    buf = StringIO()

    if not path.startswith("s3://"):
        # local file: copy it over line by line, unchanged
        with open(path) as handle:
            buf.writelines(handle.readlines())
    else:
        client = S3(env='prod')
        for row in client.read(path):
            # a newline is written after every item the S3 reader yields
            buf.writelines((row, "\n"))

    # rewind so the caller reads from the beginning
    buf.seek(0)
    return buf
33 |
def table_cell_to_html(cell):
    """Formats a cell from a Zeppelin TABLE as HTML.

    A cell that starts with the ``%html`` directive already contains markup.
    The directive itself (plus the one whitespace character the HTML pattern
    matches after it) is removed, so that the literal text ``%html`` does not
    show up inside the rendered table, and the rest is returned untouched.

    Every other cell is treated as plain text and HTML-escaped.
    """
    directive = HTML.match(cell)
    if directive:
        # keep only what follows the "%html<whitespace>" prefix
        return cell[directive.end():]
    return html.escape(cell)
42 |
def table_to_html(tsv):
    """Formats the tab-separated content of a Zeppelin TABLE as HTML.

    The first row of ``tsv`` supplies the column headers (``<th>``); each
    following row becomes one ``<tr>`` of ``<td>`` cells.  Header names and
    cells are both passed through table_cell_to_html.  Empty input produces
    an empty ``<table>`` element.

    Returns the table markup as a single newline-joined string.
    """
    # StringIO is imported as a class at the top of the file, so it is
    # called directly; csv does the tab splitting.
    reader = csv.reader(StringIO(tsv), delimiter="\t")

    lines = ["<table>"]

    # next() with a default works on Python 2 and 3 and tolerates empty input
    fields = next(reader, None)
    if fields is not None:
        column_headers = "".join(
            "<th>" + table_cell_to_html(name) + "</th>" for name in fields
        )
        lines.append("<tr>{column_headers}</tr>".format(column_headers=column_headers))

    for row in reader:
        cells = "".join("<td>" + table_cell_to_html(cell) + "</td>" for cell in row)
        lines.append("<tr>" + cells + "</tr>")

    lines.append("</table>")
    return "\n".join(lines)
58 |
59 |
def convert_json(zeppelin_json):
    """Parse a Zeppelin note from a JSON stream and convert it.

    ``zeppelin_json`` is handed to json.load, so it must be a readable
    file-like object.  The parsed note is converted by convert_parsed and
    that function's result is returned as-is.
    """
    parsed_note = json.load(zeppelin_json)
    return convert_parsed(parsed_note)
64 |
def _result_to_outputs(result, index):
    """Builds the Jupyter ``outputs`` list for one Zeppelin paragraph result.

    Only results whose ``code`` is ``SUCCESS`` produce an output; anything
    else (missing result, error, missing ``code``) yields an empty list.
    """
    if not result or result.get('code') != 'SUCCESS':
        return []

    result_type = result.get('type')
    message = result.get('msg')

    output_by_mime_type = {}
    if message is not None:
        if result_type == 'TEXT':
            output_by_mime_type['text/plain'] = message
        elif result_type == 'HTML':
            output_by_mime_type['text/html'] = message
        elif result_type == 'TABLE':
            output_by_mime_type['text/html'] = table_to_html(message)

    # Unrecognised result types still produce an output entry, with empty data.
    return [{
        'output_type': 'execute_result',
        'metadata': {},
        'execution_count': index,
        'data': output_by_mime_type
    }]


def convert_parsed(zeppelin_note):
    """Converts a Zeppelin note from parsed JSON to a Jupyter NotebookNode.

    Paragraphs without text are skipped.  For the rest, the leading magic
    decides the cell type:

    * ``%md``            -> markdown cell (directive removed)
    * ``%sql``/``%html`` -> code cell using the ``%%sql``/``%%html`` cell magic
    * any other magic    -> raw cell
    * no magic           -> code cell

    Successful TEXT, HTML and TABLE results of code cells are attached as
    ``execute_result`` outputs.

    Reads ``zeppelin_note['name']`` and ``zeppelin_note['paragraphs']``; a
    KeyError is raised if either is missing.

    Returns a ``(notebook_name, notebook)`` tuple.  The notebook metadata is
    hard-coded to a Spark/Scala kernel.
    """
    notebook_name = zeppelin_note['name']

    cells = []
    index = 0
    for paragraph in zeppelin_note['paragraphs']:
        code = paragraph.get('text')
        if not code:
            continue

        code = code.lstrip()

        if MD.match(code):
            cell = {
                'cell_type': 'markdown',
                'metadata': {},
                # Slice the directive off rather than lstrip('%md'), which
                # strips a *set of characters* and not a prefix.
                'source': code[len('%md'):].lstrip("\n"),
            }
        elif SQL.match(code) or HTML.match(code):
            cell = {
                'cell_type': 'code',
                'execution_count': index,
                'metadata': {},
                'outputs': [],
                'source': '%' + code,  # add % to convert to cell magic
            }
        elif UNKNOWN_MAGIC.match(code):
            # use raw cells for unknown magic
            cell = {
                'cell_type': 'raw',
                'metadata': {'format': 'text/plain'},
                'source': code,
            }
        else:
            cell = {
                'cell_type': 'code',
                'execution_count': index,
                'metadata': {'autoscroll': 'auto'},
                'outputs': [],
                'source': code,
            }

        cells.append(cell)

        if cell['cell_type'] == 'code':
            cell['outputs'] = _result_to_outputs(paragraph.get('result'), index)

        # NOTE(review): the counter is advanced once per converted paragraph
        # (markdown and raw cells included) - confirm this is the intended
        # numbering for execution_count.
        index += 1

    notebook = nbformat.from_dict({
        "metadata": {
            "kernelspec": {
                "display_name": "Spark 2.0.0 - Scala 2.11",
                "language": "scala",
                "name": "spark2-scala"
            },
            "language_info": {
                "codemirror_mode": "text/x-scala",
                "file_extension": ".scala",
                "mimetype": "text/x-scala",
                "name": "scala",
                "pygments_lexer": "scala",
                "version": "2.11.8"
            }
        },
        "nbformat": 4,
        "nbformat_minor": 2,
        "cells": cells,
    })

    return (notebook_name, notebook)
148 |
def write_notebook(notebook_name, notebook, path=None):
    """Writes a NotebookNode to a file and returns the filename used.

    If path is given it is used as-is (an existing file is overwritten).

    If path is None, the output path is derived from the notebook name in
    the current directory: ``<name>.ipynb``, or, when that already exists,
    the first unused ``<name> (N).ipynb`` for N from 1 to 1000.

    Raises RuntimeError when the plain name and all 1000 numbered variants
    already exist, instead of overwriting one of them.
    """
    filename = path
    if not filename:
        filename = notebook_name + '.ipynb'
        if os.path.exists(filename):
            for i in range(1, 1001):
                filename = notebook_name + ' (' + str(i) + ').ipynb'
                if not os.path.exists(filename):
                    break
            else:
                # the loop ran out of candidates without finding a free name
                raise RuntimeError('Cannot write %s: versions 1-1000 already exist.' % (notebook_name,))

    with codecs.open(filename, 'w', encoding='UTF-8') as out:
        nbformat.write(notebook, out)

    return filename
169 |
def main(argv=None):
    """Command-line entry point: convert one Zeppelin note to an ipynb file.

    Usage: ``jupyter-zeppelin.py note.json [output.ipynb]``

    ``argv`` defaults to ``sys.argv``.  The first argument is the Zeppelin
    note to read; the optional second argument is the output path.  Without
    it, write_notebook picks a name based on the note's name.

    Returns the process exit status: 0 on success, 2 when the arguments are
    not usable (a usage line is written to stderr in that case).
    """
    if argv is None:
        argv = sys.argv
    num_args = len(argv)

    zeppelin_note_path = None
    target_path = None
    if num_args in (2, 3):
        # the note path is always the first argument, with or without a target
        zeppelin_note_path = argv[1]
    if num_args == 3:
        target_path = argv[2]

    if not zeppelin_note_path:
        program = argv[0] if argv else 'jupyter-zeppelin.py'
        sys.stderr.write('usage: %s note.json [output.ipynb]\n' % (program,))
        return 2

    name, content = convert_json(read_io(zeppelin_note_path))
    write_notebook(name, content, target_path)
    return 0


if __name__ == '__main__':
    sys.exit(main())
185 |
186 |
--------------------------------------------------------------------------------