├── tests
│   ├── __init__.py
│   └── test_sparksql.py
├── setup.cfg
├── .gitignore
├── screenshots
│   └── example.png
├── sparksql_magic
│   ├── __init__.py
│   └── sparksql.py
├── Pipfile
├── Makefile
├── setup.py
├── README.md
├── LICENSE
└── Pipfile.lock

/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | python-tag = py36
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .vscode/
3 | .idea/
4 | __pycache__/
5 | build/
6 | dist/
7 | *.egg-info/
8 | 
--------------------------------------------------------------------------------
/screenshots/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cryeo/sparksql-magic/HEAD/screenshots/example.png
--------------------------------------------------------------------------------
/sparksql_magic/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.0.3'
2 | 
3 | from .sparksql import SparkSql
4 | 
5 | 
6 | def load_ipython_extension(ipython):
7 |     ipython.register_magics(SparkSql)
8 | 
--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | name = "pypi"
3 | url = "https://pypi.org/simple"
4 | verify_ssl = true
5 | 
6 | [dev-packages]
7 | pytest = "==4.5.0"
8 | 
9 | [packages]
10 | sparksql-magic = {editable = true,path = "."}
11 | 
12 | [requires]
13 | python_version = "3.6"
14 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: prepare
2 | prepare:
3 | 	pipenv install --dev
4 | 
5 | .PHONY: test
6 | test: prepare
7 | 	pipenv run pytest
8 | 
9 | .PHONY: clean
10 | clean:
11 | 	rm -rf sparksql_magic.egg-info build dist
12 | 
13 | .PHONY: dist
14 | dist: clean
15 | 	python setup.py sdist bdist_wheel
16 | 
17 | .PHONY: upload
18 | upload: dist
19 | 	twine upload dist/*
20 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 | 
3 | import sparksql_magic
4 | 
5 | setup(
6 |     name='sparksql-magic',
7 |     version=sparksql_magic.__version__,
8 |     description='Spark SQL magic command for Jupyter notebooks',
9 |     long_description=open('README.md', 'r', encoding='utf-8').read(),
10 |     long_description_content_type='text/markdown',
11 |     author='Chaerim Yeo',
12 |     author_email='yeochaerim@gmail.com',
13 |     url='https://github.com/cryeo/sparksql-magic',
14 |     license='MIT License',
15 |     install_requires=['pyspark>=2.3.0', 'ipython>=7.4.0'],
16 |     packages=find_packages(exclude=('tests', 'docs')),
17 |     python_requires='>=3.6',
18 |     classifiers=[
19 |         'Development Status :: 2 - Pre-Alpha',
20 |         'License :: OSI Approved :: MIT License',
21 |         'Programming Language :: Python',
22 |         'Programming Language :: Python :: 3',
23 |         'Programming Language :: Python :: 3.6',
24 |         'Topic :: Software Development :: Libraries :: Python Modules',
25 |     ],
26 | )
27 | 
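The Pipfile, Makefile, and setup.py above define the development workflow. A typical contributor session — assuming `pipenv` and `twine` are available on the PATH — might look like:

```
make prepare   # pipenv install --dev: create the virtualenv and install pytest
make test      # run the test suite inside the pipenv environment
make dist      # remove old build artifacts, then build sdist and wheel
make upload    # publish dist/* to PyPI via twine
```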
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # sparksql-magic
2 | 
3 | Spark SQL magic command for Jupyter notebooks.
4 | 
5 | ![Example](screenshots/example.png)
6 | 
7 | ## Prerequisites
8 | - Python >= 3.6
9 | - PySpark >= 2.3.0
10 | - IPython >= 7.4.0
11 | 
12 | ## Install
13 | ```
14 | pip install sparksql-magic
15 | ```
16 | 
17 | ## Usage
18 | 
19 | ### Load
20 | ```
21 | %load_ext sparksql_magic
22 | ```
23 | 
24 | ### Config
25 | ```
26 | %config SparkSql.limit=<INT>
27 | ```
28 | 
29 | |Option|Default|Description|
30 | |---|---|---|
31 | |`SparkSql.limit`|20|The maximum number of rows to display|
32 | 
33 | ### Parameter
34 | ```
35 | %%sparksql [-c|--cache] [-e|--eager] [-v|--view VIEW] [-l|--limit LIMIT] [variable]
36 | <query>
37 | ```
38 | 
39 | |Parameter|Description|
40 | |---|---|
41 | |`-c` `--cache`|Cache dataframe|
42 | |`-e` `--eager`|Cache dataframe with eager load|
43 | |`-v VIEW` `--view VIEW`|Create or replace temporary view|
44 | |`-l LIMIT` `--limit LIMIT`|The maximum number of rows to display (Default: `SparkSql.limit`)|
45 | |`variable`|Capture dataframe in a local variable|
46 | 
47 | 
48 | 
49 | 
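To make the parameter table concrete, here is a hypothetical IPython session; the table name `access_log` and the `month` variable are invented for illustration:

```
In [1]: %load_ext sparksql_magic

In [2]: %config SparkSql.limit=10

In [3]: month = 5

In [4]: %%sparksql -c -v log_view df
   ...: SELECT event, count(*) AS cnt
   ...: FROM access_log
   ...: WHERE month = '{month}'
   ...: GROUP BY event
```

The result is cached, registered as the temporary view `log_view`, captured in the local variable `df`, and rendered as an HTML table of at most 10 rows; `{month}` is substituted with `5` from the notebook namespace.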
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2019 Chaerim Yeo
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/tests/test_sparksql.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from pyspark.sql import Row
3 | from unittest import mock
4 | from sparksql_magic.sparksql import bind_variables, get_results, make_tag
5 | 
6 | 
7 | @pytest.fixture
8 | def df():
9 |     return mock.MagicMock(
10 |         columns=['col1', 'col2', 'col3'],
11 |         take=lambda n: [Row(col1=i, col2=str(i), col3=None) for i in range(n)]
12 |     )
13 | 
14 | 
15 | @pytest.fixture
16 | def user_ns():
17 |     return {
18 |         'month': 5,
19 |         'day': '10',
20 |     }
21 | 
22 | 
23 | def test_get_results(df):
24 |     assert get_results(df, 0) == (['col1', 'col2', 'col3'], [['0', '0', 'null']])
25 |     assert get_results(df, 1) == (['col1', 'col2', 'col3'], [['0', '0', 'null'], ['1', '1', 'null']])
26 | 
27 | 
28 | def test_bind_variables(user_ns):
29 |     assert bind_variables('SELECT * FROM table', user_ns) == 'SELECT * FROM table'
30 |     assert bind_variables("SELECT * FROM table WHERE month='{month}'", user_ns) == "SELECT * FROM table WHERE month='5'"
31 |     assert bind_variables("SELECT * FROM table WHERE month='{month}' AND day='{day}'", user_ns) == "SELECT * FROM table WHERE month='5' AND day='10'"
32 | 
33 |     with pytest.raises(NameError):
34 |         bind_variables("SELECT * FROM table WHERE hour='{hour}'", user_ns)
35 | 
36 | 
37 | def test_make_tag():
38 |     assert make_tag('td') == '<td></td>'
39 |     assert make_tag('td', 'body') == '<td>body</td>'
40 |     assert make_tag('td', 'body', style='font-weight: bold') == '<td style="font-weight: bold">body</td>'
41 | 
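One detail worth spelling out in `test_get_results`: `get_results` fetches `limit + 1` rows so that callers can detect truncation, which is why a limit of 0 still yields one row. A minimal sketch of the same pattern with plain lists:

```
rows = ['r0', 'r1', 'r2']
take = lambda n: rows[:n]         # stands in for DataFrame.take
limit = 1
fetched = take(limit + 1)         # -> ['r0', 'r1']
truncated = len(fetched) > limit  # True: more rows exist beyond the limit
```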
--------------------------------------------------------------------------------
/sparksql_magic/sparksql.py:
--------------------------------------------------------------------------------
1 | import re
2 | from html import escape
3 | 
4 | from IPython.core.display import HTML
5 | from IPython.core.magic import Magics, cell_magic, magics_class, needs_local_scope
6 | from IPython.core.magic_arguments import argument, magic_arguments, parse_argstring
7 | from pyspark.sql import SparkSession
8 | from traitlets import Int
9 | 
10 | # Matches {name} placeholders that bind_variables() fills in from the user namespace.
11 | BIND_VARIABLE_PATTERN = re.compile(r'{([A-Za-z0-9_]+)}')
12 | 
13 | 
14 | @magics_class
15 | class SparkSql(Magics):
16 |     limit = Int(20, config=True, help='The maximum number of rows to display')
17 | 
18 |     @needs_local_scope
19 |     @cell_magic
20 |     @magic_arguments()
21 |     @argument('variable', nargs='?', type=str, help='Capture dataframe in a local variable')
22 |     @argument('-c', '--cache', action='store_true', help='Cache dataframe')
23 |     @argument('-e', '--eager', action='store_true', help='Cache dataframe with eager load')
24 |     @argument('-v', '--view', type=str, help='Create or replace temporary view')
25 |     @argument('-l', '--limit', type=int, help='The maximum number of rows to display')
26 |     def sparksql(self, line='', cell='', local_ns=None):
27 |         if local_ns is None:
28 |             local_ns = {}
29 | 
30 |         user_ns = self.shell.user_ns.copy()
31 |         user_ns.update(local_ns)
32 | 
33 |         args = parse_argstring(self.sparksql, line)
34 | 
35 |         spark = get_instantiated_spark_session()
36 | 
37 |         if spark is None:
38 |             print('Active Spark session not found; create a SparkSession before running %%sparksql')
39 |             return
40 | 
41 |         df = spark.sql(bind_variables(cell, user_ns))
42 |         if args.cache or args.eager:
43 |             print('cache dataframe with %s load' % ('eager' if args.eager else 'lazy'))
44 |             df = df.cache()
45 |             if args.eager:
46 |                 df.count()
47 |         if args.view:
48 |             print('create temporary view `%s`' % args.view)
49 |             df.createOrReplaceTempView(args.view)
50 |         if args.variable:
51 |             print('capture dataframe to local variable `%s`' % args.variable)
52 |             self.shell.user_ns.update({args.variable: df})
53 | 
54 |         limit = args.limit or self.limit
55 |         header, contents = get_results(df, limit)
56 |         if len(contents) > limit:
57 |             print('only showing top %d row(s)' % limit)
58 | 
59 |         html = make_tag('tr',
60 |                         ''.join(map(lambda x: make_tag('td', escape(x), style='font-weight: bold'), header)),
61 |                         style='border-bottom: 1px solid')
62 |         for row in contents[:limit]:
63 |             html += make_tag('tr', ''.join(map(lambda x: make_tag('td', escape(x)), row)))
64 | 
65 |         return HTML(make_tag('table', html))
66 | 
67 | 
68 | def bind_variables(query, user_ns):
69 |     def fetch_variable(match):
70 |         variable = match.group(1)
71 |         if variable not in user_ns:
72 |             raise NameError('variable `%s` is not defined' % variable)
73 |         return str(user_ns[variable])
74 | 
75 |     return re.sub(BIND_VARIABLE_PATTERN, fetch_variable, query)
76 | 
77 | 
78 | def get_results(df, limit):
79 |     def convert_value(value):
80 |         if value is None:
81 |             return 'null'
82 |         return str(value)
83 | 
84 |     header = df.columns
85 |     # Fetch one row beyond the limit so the caller can tell whether the result was truncated.
86 |     contents = list(map(lambda row: list(map(convert_value, row)), df.take(limit + 1)))
87 | 
88 |     return header, contents
89 | 
90 | 
91 | def make_tag(tag_name, body='', **kwargs):
92 |     attributes = ' '.join(map(lambda x: '%s="%s"' % x, kwargs.items()))
93 |     if attributes:
94 |         return '<%s %s>%s</%s>' % (tag_name, attributes, body, tag_name)
95 |     else:
96 |         return '<%s>%s</%s>' % (tag_name, body, tag_name)
97 | 
98 | 
99 | def get_instantiated_spark_session():
100 |     return SparkSession._instantiatedSession
101 | 
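A quick, hypothetical REPL check of the two helpers defined above (expected output shown inline), consistent with the assertions in tests/test_sparksql.py:

```
>>> from sparksql_magic.sparksql import bind_variables, make_tag
>>> bind_variables("SELECT * FROM t WHERE month='{month}'", {'month': 5})
"SELECT * FROM t WHERE month='5'"
>>> make_tag('td', 'header', style='font-weight: bold')
'<td style="font-weight: bold">header</td>'
```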
"sha256:2c6bcd9545c7d6440951b12b44d373479bf18123a401a52025cf98563fbd826c" 59 | ], 60 | "version": "==0.13.3" 61 | }, 62 | "parso": { 63 | "hashes": [ 64 | "sha256:17cc2d7a945eb42c3569d4564cdf49bde221bc2b552af3eca9c1aad517dcdd33", 65 | "sha256:2e9574cb12e7112a87253e14e2c380ce312060269d04bd018478a3c92ea9a376" 66 | ], 67 | "version": "==0.4.0" 68 | }, 69 | "pexpect": { 70 | "hashes": [ 71 | "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1", 72 | "sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb" 73 | ], 74 | "markers": "sys_platform != 'win32'", 75 | "version": "==4.7.0" 76 | }, 77 | "pickleshare": { 78 | "hashes": [ 79 | "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca", 80 | "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56" 81 | ], 82 | "version": "==0.7.5" 83 | }, 84 | "prompt-toolkit": { 85 | "hashes": [ 86 | "sha256:11adf3389a996a6d45cc277580d0d53e8a5afd281d0c9ec71b28e6f121463780", 87 | "sha256:2519ad1d8038fd5fc8e770362237ad0364d16a7650fb5724af6997ed5515e3c1", 88 | "sha256:977c6583ae813a37dc1c2e1b715892461fcbdaa57f6fc62f33a528c4886c8f55" 89 | ], 90 | "version": "==2.0.9" 91 | }, 92 | "ptyprocess": { 93 | "hashes": [ 94 | "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0", 95 | "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f" 96 | ], 97 | "version": "==0.6.0" 98 | }, 99 | "py4j": { 100 | "hashes": [ 101 | "sha256:721189616b3a7d28212dfb2e7c6a1dd5147b03105f1fc37ff2432acd0e863fa5", 102 | "sha256:a950fe7de1bfd247a0a4dddb9118f332d22a89e01e0699135ea8038c15ee1293" 103 | ], 104 | "version": "==0.10.7" 105 | }, 106 | "pygments": { 107 | "hashes": [ 108 | "sha256:31cba6ffb739f099a85e243eff8cb717089fdd3c7300767d9fc34cb8e1b065f5", 109 | "sha256:5ad302949b3c98dd73f8d9fcdc7e9cb592f120e32a18e23efd7f3dc51194472b" 110 | ], 111 | "version": "==2.4.0" 112 | }, 113 | "pyspark": { 114 | "hashes": [ 115 | "sha256:6839718ce9f779e81153d8a14a843a5c4b2d5e6574f3c916aec241022d717cb2" 116 | ], 117 | "version": "==2.4.3" 118 | }, 119 | "six": { 120 | "hashes": [ 121 | "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", 122 | "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" 123 | ], 124 | "version": "==1.12.0" 125 | }, 126 | "sparksql-magic": { 127 | "editable": true, 128 | "path": "." 
129 |         },
130 |         "traitlets": {
131 |             "hashes": [
132 |                 "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
133 |                 "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9"
134 |             ],
135 |             "version": "==4.3.2"
136 |         },
137 |         "wcwidth": {
138 |             "hashes": [
139 |                 "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
140 |                 "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
141 |             ],
142 |             "version": "==0.1.7"
143 |         }
144 |     },
145 |     "develop": {
146 |         "atomicwrites": {
147 |             "hashes": [
148 |                 "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
149 |                 "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
150 |             ],
151 |             "version": "==1.3.0"
152 |         },
153 |         "attrs": {
154 |             "hashes": [
155 |                 "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79",
156 |                 "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399"
157 |             ],
158 |             "version": "==19.1.0"
159 |         },
160 |         "more-itertools": {
161 |             "hashes": [
162 |                 "sha256:2112d2ca570bb7c3e53ea1a35cd5df42bb0fd10c45f0fb97178679c3c03d64c7",
163 |                 "sha256:c3e4748ba1aad8dba30a4886b0b1a2004f9a863837b8654e7059eebf727afa5a"
164 |             ],
165 |             "markers": "python_version > '2.7'",
166 |             "version": "==7.0.0"
167 |         },
168 |         "pluggy": {
169 |             "hashes": [
170 |                 "sha256:25a1bc1d148c9a640211872b4ff859878d422bccb59c9965e04eed468a0aa180",
171 |                 "sha256:964cedd2b27c492fbf0b7f58b3284a09cf7f99b0f715941fb24a439b3af1bd1a"
172 |             ],
173 |             "version": "==0.11.0"
174 |         },
175 |         "py": {
176 |             "hashes": [
177 |                 "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa",
178 |                 "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53"
179 |             ],
180 |             "version": "==1.8.0"
181 |         },
182 |         "pytest": {
183 |             "hashes": [
184 |                 "sha256:1a8aa4fa958f8f451ac5441f3ac130d9fc86ea38780dd2715e6d5c5882700b24",
185 |                 "sha256:b8bf138592384bd4e87338cb0f256bf5f615398a649d4bd83915f0e4047a5ca6"
186 |             ],
187 |             "index": "pypi",
188 |             "version": "==4.5.0"
189 |         },
190 |         "six": {
191 |             "hashes": [
192 |                 "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
193 |                 "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
194 |             ],
195 |             "version": "==1.12.0"
196 |         },
197 |         "wcwidth": {
198 |             "hashes": [
199 |                 "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
200 |                 "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
201 |             ],
202 |             "version": "==0.1.7"
203 |         }
204 |     }
205 | }
206 | 
--------------------------------------------------------------------------------