"""Jupyter/Zeppelin conversion.

This repo has code for converting Zeppelin notebooks to Jupyter's ipynb format.

To convert a notebook, run:

    python jupyter-zeppelin.py note.json

This will create a file named using the Zeppelin note's name in the current
directory. Alternatively, you can pass an output path:

    python jupyter-zeppelin.py note.json Example.ipynb

Supported conventions
---------------------

This converter supports the following Zeppelin conventions:

* Code paragraphs are converted to code cells
* `%md` paragraphs are converted to Jupyter markdown cells
* `%html` paragraphs are converted to Jupyter code cells using cell magic `%%html`
* `%sql` paragraphs are converted to Jupyter code cells using cell magic `%%sql`
* Paragraphs with unknown magics are converted to raw cells
* TEXT output is converted to `text/plain` output
* HTML output is converted to `text/html` output; some style and JS may not
  work in Jupyter
* TABLE output is converted to simple `text/html` tables
  * `%html` table cells are embedded in the table HTML
  * Normal table cells are escaped and then embedded in the table HTML
"""

import codecs
import csv
import html
import io
import json
import os
import re
import sys

try:
    from StringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3

# NOTE: `aws.s3` and `nbformat` are imported inside the functions that need
# them, so the pure formatting helpers work without those packages installed.

MD = re.compile(r'%md\s')
SQL = re.compile(r'%sql\s')
UNKNOWN_MAGIC = re.compile(r'%\w+\s')
HTML = re.compile(r'%html\s')

# Highest " (N)" suffix tried when the default output file already exists.
MAX_VERSIONS = 1000


def read_io(path):
    """Reads the contents of a local or S3 path into a StringIO.

    Paths starting with ``s3://`` are read through the project's ``S3``
    client; anything else is opened as a local UTF-8 text file.

    Returns a StringIO positioned at the start of the content.
    """
    note = StringIO()
    if path.startswith("s3://"):
        # Imported here so local conversions do not require the S3 client.
        from aws.s3 import S3
        s3 = S3(env='prod')
        # presumably s3.read() yields lines without their trailing newline,
        # which is why one is re-added after each line -- verify against S3.
        for line in s3.read(path):
            note.write(line)
            note.write("\n")
    else:
        # Read as UTF-8 explicitly instead of relying on the locale default.
        with io.open(path, encoding='utf-8') as local:
            note.write(local.read())

    note.seek(0)

    return note


def table_cell_to_html(cell):
    """Formats a cell from a Zeppelin TABLE as HTML.

    A cell starting with the ``%html`` magic is already HTML: the magic
    prefix is removed and the rest is returned untouched. Any other cell is
    HTML-escaped.
    """
    magic = HTML.match(cell)
    if magic:
        # Drop the '%html' marker so it is not rendered as literal text.
        return cell[magic.end():]
    return html.escape(cell)


def table_to_html(tsv):
    """Formats the tab-separated content of a Zeppelin TABLE as HTML.

    The first row of ``tsv`` becomes the header row (``<th>`` cells) and each
    following row becomes a ``<tr>`` of ``<td>`` cells. Header and body cells
    both go through ``table_cell_to_html``. Empty input yields a table with
    an empty header row.
    """
    reader = csv.reader(StringIO(tsv), delimiter="\t")
    # Default to no columns so an empty message does not raise StopIteration.
    fields = next(reader, [])
    column_headers = "".join(
        "<th>" + table_cell_to_html(name) + "</th>" for name in fields
    )
    lines = [
        "<table>",
        "<tr>{column_headers}</tr>".format(column_headers=column_headers),
    ]
    for row in reader:
        cells = "".join(
            "<td>" + table_cell_to_html(cell) + "</td>" for cell in row
        )
        lines.append("<tr>" + cells + "</tr>")
    lines.append("</table>")
    return "\n".join(lines)


def convert_json(zeppelin_json):
    """Converts a Zeppelin note from JSON to a Jupyter NotebookNode.

    ``zeppelin_json`` is a file-like object holding the note's JSON text.
    Returns the ``(notebook_name, notebook)`` tuple from ``convert_parsed``.
    """
    return convert_parsed(json.load(zeppelin_json))


def _paragraph_to_cell(code, index):
    """Builds the Jupyter cell dict for one paragraph's left-stripped text."""
    if MD.match(code):
        return {
            'cell_type': 'markdown',
            'metadata': {},
            # MD.match guarantees the text starts with '%md' + whitespace, so
            # slicing removes exactly the magic; then drop leading newlines.
            'source': code[len('%md'):].lstrip("\n"),
        }
    if SQL.match(code) or HTML.match(code):
        return {
            'cell_type': 'code',
            'execution_count': index,
            'metadata': {},
            'outputs': [],
            'source': '%' + code,  # add % to convert to cell magic
        }
    if UNKNOWN_MAGIC.match(code):
        # use raw cells for unknown magic
        return {
            'cell_type': 'raw',
            'metadata': {'format': 'text/plain'},
            'source': code,
        }
    return {
        'cell_type': 'code',
        'execution_count': index,
        'metadata': {'autoscroll': 'auto'},
        'outputs': [],
        'source': code,
    }


def _result_to_outputs(result, index):
    """Builds the outputs list for a code cell from a paragraph result."""
    if not result or result.get('code') != 'SUCCESS':
        return []

    result_type = result.get('type')
    message = result.get('msg') or ''
    output_by_mime_type = {}
    if result_type == 'TEXT':
        output_by_mime_type['text/plain'] = message
    elif result_type == 'HTML':
        output_by_mime_type['text/html'] = message
    elif result_type == 'TABLE':
        output_by_mime_type['text/html'] = table_to_html(message)

    return [{
        'output_type': 'execute_result',
        'metadata': {},
        'execution_count': index,
        'data': output_by_mime_type,
    }]


def convert_parsed(zeppelin_note):
    """Converts a Zeppelin note from parsed JSON to a Jupyter NotebookNode.

    ``zeppelin_note`` must contain ``'name'`` and ``'paragraphs'``.
    Paragraphs without text are skipped. The position of each remaining
    paragraph is used as the execution count of code cells.

    Returns a ``(notebook_name, notebook)`` tuple.
    """
    import nbformat

    notebook_name = zeppelin_note['name']

    cells = []
    index = 0
    for paragraph in zeppelin_note['paragraphs']:
        code = paragraph.get('text')
        if not code:
            continue

        cell = _paragraph_to_cell(code.lstrip(), index)
        if cell['cell_type'] == 'code':
            cell['outputs'] = _result_to_outputs(paragraph.get('result'), index)
        cells.append(cell)

        index += 1

    notebook = nbformat.from_dict({
        "metadata": {
            "kernelspec": {
                "display_name": "Spark 2.0.0 - Scala 2.11",
                "language": "scala",
                "name": "spark2-scala"
            },
            "language_info": {
                "codemirror_mode": "text/x-scala",
                "file_extension": ".scala",
                "mimetype": "text/x-scala",
                "name": "scala",
                "pygments_lexer": "scala",
                "version": "2.11.8"
            }
        },
        "nbformat": 4,
        "nbformat_minor": 2,
        "cells": cells,
    })

    return (notebook_name, notebook)


def _available_filename(notebook_name):
    """Returns the first unused '<name>.ipynb' / '<name> (N).ipynb' path."""
    filename = notebook_name + '.ipynb'
    if not os.path.exists(filename):
        return filename
    for i in range(1, MAX_VERSIONS + 1):
        filename = notebook_name + ' (' + str(i) + ').ipynb'
        if not os.path.exists(filename):
            return filename
    # Every candidate is taken; refuse rather than overwrite the last one.
    raise RuntimeError('Cannot write %s: versions 1-1000 already exist.' % (notebook_name,))


def write_notebook(notebook_name, notebook, path=None):
    """Writes a NotebookNode to a file created from the notebook name.

    If path is None, the output path will be created from the notebook name
    in the current directory; when that file already exists a " (N)" suffix
    is appended, with N from 1 to 1000.

    Returns the path that was written. Raises RuntimeError when no path is
    given and all numbered versions already exist.
    """
    import nbformat

    filename = path or _available_filename(notebook_name)

    with codecs.open(filename, 'w', encoding='UTF-8') as out:
        nbformat.write(notebook, out)

    return filename


def main(argv=None):
    """Command-line entry point: ``note.json [output.ipynb]``.

    ``argv`` defaults to ``sys.argv``. Exits with a usage message unless one
    or two arguments follow the program name.
    """
    argv = sys.argv if argv is None else argv
    num_args = len(argv)
    if num_args not in (2, 3):
        sys.exit('Usage: python jupyter-zeppelin.py <note.json> [<output.ipynb>]')

    zeppelin_note_path = argv[1]
    # The output path is optional; None lets write_notebook pick a name.
    target_path = argv[2] if num_args == 3 else None

    name, content = convert_json(read_io(zeppelin_note_path))
    write_notebook(name, content, target_path)


if __name__ == '__main__':
    main()