├── .gitignore ├── LICENSE ├── README.md ├── hiveql ├── __init__.py ├── __main__.py ├── constants.py ├── install.py ├── kernel.py ├── main.py ├── resources │ ├── custom.css │ ├── logo-32x32.png │ └── logo-64x64.png └── tool_sql.py ├── requirements.txt ├── setup.py └── tests └── test_sql.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 
99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | 107 | .idea -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 EDS-APHP 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HiveQL Kernel 2 | 3 | ### Requirements 4 | 5 | If you are going to connect using kerberos: 6 | 7 | ``` 8 | sudo apt-get install python3-dev libsasl2-dev libsasl2-2 libsasl2-modules-gssapi-mit 9 | ``` 10 | 11 | ### Installation 12 | 13 | To install the kernel: 14 | 15 | ``` 16 | pip install --upgrade hiveqlKernel 17 | jupyter hiveql install --user 18 | ``` 19 | 20 | ### Connection configuration 21 | 22 | Two methods are available to connect to a Hive server: 23 | 24 | * Directly inside the notebook 25 | * Using a configuration file 26 | 27 | If the configuration file is present, every time you run a new HiveQL kernel it uses it, else you must configure your connection inside the notebook. The configuration in the notebook overwrites the one in the configuration file if present. 28 | 29 | #### Configure directly in the notebook cells 30 | 31 | Inside a Notebook cell, copy&paste this, change the configuration to match your needs, and run it. 32 | 33 | ``` 34 | $$ url=hive://@:/ 35 | $$ connect_args={"auth": "KERBEROS", "kerberos_service_name": "hive", "configuration": {"tez.queue.name": "myqueue"}} 36 | $$ pool_size=5 37 | $$ max_overflow=10 38 | ``` 39 | 40 | These args are passed to sqlalchemy, who registered pyHive as the 'hive' SQL back-end. 41 | See [github.com/dropbox/PyHive](https://github.com/dropbox/PyHive/#sqlalchemy). 42 | 43 | #### Configure using a configuration file 44 | 45 | The HiveQL kernel is looking for the configuration file at `~/.hiveql_kernel.json` by default. You can specify another path using `HIVE_KERNEL_CONF_FILE`.
46 | 47 | The contents must be like this (in json format): 48 | 49 | ``` 50 | { "url": "hive://@:/", "connect_args" : { "auth": "KERBEROS", "kerberos_service_name":"hive", "configuration": {"tez.queue.name": "myqueue"}}, "pool_size": 5, "max_overflow": 10, "default_limit": 20, "display_mode": "be" } 51 | ``` 52 | 53 | 54 | ### Usage 55 | 56 | Inside a HiveQL kernel you can type HiveQL directly in the cells and it displays a HTML table with the results. 57 | 58 | You also have other options, like changing the default display limit (=20) like this : 59 | 60 | ``` 61 | $$ default_limit=50 62 | ``` 63 | 64 | Some hive functions are extended. They allow to filter with some patterns. 65 | 66 | ``` 67 | SHOW TABLES 68 | SHOW DATABASES 69 | ``` 70 | 71 | 72 | ### Run tests 73 | 74 | ``` 75 | python -m pytest 76 | ``` 77 | 78 | 79 | Have fun! 80 | -------------------------------------------------------------------------------- /hiveql/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/HiveQLKernel/6a5ec42fa4c97b1c7d83d678e8a6ae7c9ead790c/hiveql/__init__.py -------------------------------------------------------------------------------- /hiveql/__main__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from ipykernel.kernelapp import IPKernelApp 4 | 5 | 6 | class HiveQLKernelApp(IPKernelApp): 7 | """ 8 | The main kernel application, inheriting from the ipykernel 9 | """ 10 | from .kernel import HiveQLKernel 11 | from .install import HiveqlKernelInstall, HiveqlKernelRemove 12 | kernel_class = HiveQLKernel 13 | 14 | # We override subcommands to add our own install & remove commands 15 | subcommands = { 16 | 'install': (HiveqlKernelInstall, 17 | HiveqlKernelInstall.description.splitlines()[0]), 18 | 'remove': (HiveqlKernelRemove, 19 | HiveqlKernelRemove.description.splitlines()[0]), 20 | } 21 | 22 | 23 | 24 | def 
main(): 25 | HiveQLKernelApp.launch_instance() 26 | 27 | if __name__ == '__main__': 28 | main() 29 | -------------------------------------------------------------------------------- /hiveql/constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | __version__ = '1.0.20' 3 | 4 | KERNEL_NAME = 'hiveql' 5 | LANGUAGE = 'hiveql' 6 | DISPLAY_NAME = 'HiveQL' 7 | 8 | DEFAULT_TEXT_LANG = ['en'] 9 | 10 | CONFIG_FILE = os.environ.get("HIVE_KERNEL_CONF_FILE", "~/.hiveql_kernel.json") 11 | -------------------------------------------------------------------------------- /hiveql/install.py: -------------------------------------------------------------------------------- 1 | import io 2 | import json 3 | import os 4 | import pkgutil 5 | import sys 6 | from tempfile import TemporaryDirectory 7 | 8 | from IPython.utils.path import ensure_dir_exists 9 | from jupyter_client.kernelspecapp import InstallKernelSpec, RemoveKernelSpec 10 | from jupyter_core.paths import jupyter_config_dir 11 | from traitlets import Unicode 12 | 13 | from .constants import __version__, KERNEL_NAME, DISPLAY_NAME, LANGUAGE 14 | 15 | PY3 = sys.version_info[0] == 3 16 | if PY3: 17 | unicode = str 18 | 19 | MODULEDIR = os.path.dirname(__file__) 20 | PKGNAME = os.path.basename(MODULEDIR) 21 | 22 | # The kernel specfile 23 | kernel_json = { 24 | "argv": [sys.executable, 25 | "-m", PKGNAME, 26 | "-f", "{connection_file}"], 27 | "display_name": DISPLAY_NAME, 28 | "language": LANGUAGE, 29 | "name": KERNEL_NAME 30 | } 31 | 32 | 33 | def css_frame_prefix(name): 34 | '''Define the comment prefix used in custom css to frame kernel CSS''' 35 | return u'/* @{{KERNEL}} {} '.format(name) 36 | 37 | 38 | def copyresource(resource, filename, destdir): 39 | """ 40 | Copy a resource file to a destination 41 | """ 42 | data = pkgutil.get_data(resource, os.path.join('resources', filename)) 43 | # log.info( "Installing %s", os.path.join(destdir,filename) ) 44 | with 
open(os.path.join(destdir, filename), 'wb') as fp: 45 | fp.write(data) 46 | 47 | 48 | def remove_custom_css(destdir, resource=PKGNAME): 49 | """ 50 | Remove the kernel CSS from custom.css 51 | """ 52 | 53 | # Remove the inclusion in the main CSS 54 | if not os.path.isdir(destdir): 55 | return False 56 | custom = os.path.join(destdir, 'custom.css') 57 | copy = True 58 | found = False 59 | prefix = css_frame_prefix(resource) 60 | with io.open(custom + '-new', 'wt') as fout: 61 | with io.open(custom) as fin: 62 | for line in fin: 63 | if line.startswith(prefix + 'START'): 64 | copy = False 65 | found = True 66 | elif line.startswith(prefix + 'END'): 67 | copy = True 68 | elif copy: 69 | fout.write(line) 70 | 71 | if found: 72 | os.rename(custom + '-new', custom) 73 | else: 74 | os.unlink(custom + '-new') 75 | 76 | return found 77 | 78 | 79 | def install_custom_css(destdir, resource=PKGNAME): 80 | """ 81 | Add the kernel CSS to custom.css 82 | """ 83 | ensure_dir_exists(destdir) 84 | custom = os.path.join(destdir, 'custom.css') 85 | prefix = css_frame_prefix(resource) 86 | 87 | # Check if custom.css already includes it. 
If so, let's remove it first 88 | exists = False 89 | if os.path.exists(custom): 90 | with io.open(custom) as f: 91 | for line in f: 92 | if line.find(prefix) >= 0: 93 | exists = True 94 | break 95 | if exists: 96 | remove_custom_css(destdir, resource) 97 | 98 | # Fetch the CSS file 99 | cssfile = 'custom.css' 100 | data = pkgutil.get_data(resource, os.path.join('resources', cssfile)) 101 | # get_data() delivers encoded data, str (Python2) or bytes (Python3) 102 | 103 | # Add the CSS at the beginning of custom.css 104 | # io.open uses unicode strings (unicode in Python2, str in Python3) 105 | with io.open(custom + '-new', 'wt', encoding='utf-8') as fout: 106 | fout.write(u'{}START ======================== */\n'.format(prefix)) 107 | fout.write(data.decode('utf-8')) 108 | fout.write(u'{}END ======================== */\n'.format(prefix)) 109 | if os.path.exists(custom): 110 | with io.open(custom, 'rt', encoding='utf-8') as fin: 111 | for line in fin: 112 | fout.write(unicode(line)) 113 | os.rename(custom + '-new', custom) 114 | 115 | 116 | def install_kernel_resources(destdir, resource=PKGNAME, files=None): 117 | """ 118 | Copy the resource files to the kernelspec folder. 119 | """ 120 | if files is None: 121 | files = ['logo-64x64.png', 'logo-32x32.png'] 122 | for filename in files: 123 | try: 124 | copyresource(resource, filename, destdir) 125 | except Exception as e: 126 | sys.stderr.write(str(e)) 127 | 128 | 129 | class HiveqlKernelInstall(InstallKernelSpec): 130 | """ 131 | The kernel installation class 132 | """ 133 | 134 | version = __version__ 135 | kernel_name = KERNEL_NAME 136 | description = '''Install the HiveQL Jupyter Kernel. 
137 | Either as a system kernel or for a concrete user''' 138 | 139 | logdir = Unicode(os.environ.get('LOGDIR', ''), 140 | config=True, 141 | help="""Default directory to use for the logfile.""" 142 | ) 143 | aliases = {'logdir': 'HiveQLKernelInstall.logdir'} 144 | 145 | def parse_command_line(self, argv): 146 | """ 147 | Skip parent method and go for its ancestor 148 | (because parent method requires an extra argument: the kernel to install) 149 | """ 150 | super(InstallKernelSpec, self).parse_command_line(argv) 151 | 152 | def start(self): 153 | if self.user and self.prefix: 154 | self.exit("Can't specify both user and prefix. Please choose one or the other.") 155 | 156 | self.log.info('Installing HiveQL kernel') 157 | with TemporaryDirectory() as td: 158 | os.chmod(td, 0o755) # Starts off as 700, not user readable 159 | # Add kernel spec 160 | if len(self.logdir): 161 | kernel_json['env'] = {'LOGDIR_DEFAULT': self.logdir} 162 | with open(os.path.join(td, 'kernel.json'), 'w') as f: 163 | json.dump(kernel_json, f, sort_keys=True) 164 | # Add resources 165 | install_kernel_resources(td, resource=PKGNAME) 166 | # Install JSON kernel specification + resources 167 | self.log.info('Installing kernel spec') 168 | self.sourcedir = td 169 | install_dir = self.kernel_spec_manager.install_kernel_spec( 170 | td, 171 | kernel_name=self.kernel_name, 172 | user=self.user, 173 | prefix=self.prefix, 174 | replace=self.replace, 175 | ) 176 | self.log.info("Installed into %s", install_dir) 177 | 178 | # install_kernel( self.kernel_spec_manager ) 179 | # self.create_kernel_json( install_dir ) 180 | 181 | # Install the custom css 182 | self.log.info('Installing CSS') 183 | if self.user: 184 | # Use the ~/.jupyter/custom dir 185 | import jupyter_core 186 | destd = os.path.join(jupyter_config_dir(), 'custom') 187 | else: 188 | # Use the system custom dir 189 | import notebook 190 | destd = os.path.join(notebook.DEFAULT_STATIC_FILES_PATH, 'custom') 191 | 192 | self.log.info('Installing 
CSS into %s', destd) 193 | install_custom_css(destd) 194 | 195 | 196 | # -------------------------------------------------------------------------- 197 | 198 | 199 | class HiveqlKernelRemove(RemoveKernelSpec): 200 | """ 201 | The kernel uninstallation class 202 | """ 203 | 204 | spec_names = [KERNEL_NAME] 205 | description = '''Remove the HiveQL Jupyter Kernel''' 206 | 207 | def parse_command_line(self, argv): 208 | """ 209 | Skip parent method and go for its ancestor 210 | (because parent method requires an extra argument: the kernel to remove) 211 | """ 212 | super(RemoveKernelSpec, self).parse_command_line(argv) 213 | 214 | def start(self): 215 | # Call parent (this time the real parent) to remove the kernelspec dir 216 | super(HiveqlKernelRemove, self).start() 217 | 218 | # Remove the installed custom CSS 219 | # Try the ~/.jupyter/custom dir & the system custom dir 220 | self.log.info('Removing CSS') 221 | import jupyter_core 222 | import notebook 223 | cssd = (os.path.join(jupyter_config_dir(), 'custom'), 224 | os.path.join(notebook.DEFAULT_STATIC_FILES_PATH, 'custom')) 225 | for destd in cssd: 226 | if remove_custom_css(destd): 227 | self.log.info('Removed CSS from %s', destd) 228 | -------------------------------------------------------------------------------- /hiveql/kernel.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import traceback 4 | import re 5 | import os.path 6 | 7 | from ipykernel.kernelbase import Kernel 8 | from sqlalchemy.exc import OperationalError, ResourceClosedError 9 | 10 | from .constants import __version__, KERNEL_NAME, CONFIG_FILE 11 | 12 | from sqlalchemy import * 13 | import pandas as pd 14 | from .tool_sql import * 15 | from sqlalchemy import event 16 | 17 | 18 | import time 19 | 20 | logger = logging.getLogger(__name__) 21 | logger.setLevel(logging.DEBUG) 22 | 23 | 24 | class KernelSyntaxError(Exception): 25 | pass 26 | 27 | 28 | error_con_not_created = 
"""Connection not initialized! 29 | Please specify your pyHive configuration like this : 30 | 31 | ------------- 32 | $$ url=hive://@:/ 33 | $$ connect_args={"auth": "KERBEROS","kerberos_service_name": "hive"} 34 | $$ pool_size=5 35 | $$ max_overflow=10 36 | 37 | YOUR SQL REQUEST HERE IF ANY 38 | ------------- 39 | 40 | -> if you want to update the current connection, just type it again with another configuration 41 | -> $$ are mandatory characters that specify that this line is a configuration for this kernel 42 | 43 | Other parameters are available such as : 44 | 45 | $$ default_limit=50 # -> without this parameter, default_limit is set to 20 46 | $$ display_mode=be # -> this will display a table with the beginning (b) and end (e) of the SQL response (options are: b, e and be) 47 | 48 | """ 49 | 50 | 51 | # DOCUMENTATION here: https://ipython.readthedocs.io/en/stable/development/wrapperkernels.html 52 | class ConnectionNotCreated(Exception): 53 | def __init__(self): 54 | Exception.__init__(self, error_con_not_created) 55 | 56 | 57 | class HiveQLKernel(Kernel): 58 | implementation = KERNEL_NAME 59 | implementation_version = __version__ 60 | banner = 'HiveQL REPL' 61 | language = "hiveql" 62 | language_info = { 63 | 'name': 'hive', 64 | 'codemirror_mode': "sql", 65 | 'pygments_lexer': 'postgresql', 66 | 'mimetype': 'text/x-hive', 67 | 'file_extension': '.hiveql', 68 | } 69 | last_conn = None 70 | params = { 71 | "default_limit": 20, 72 | "display_mode": "be" 73 | } 74 | conf = None 75 | conf_file = os.path.expanduser(CONFIG_FILE) 76 | if os.path.isfile(conf_file): 77 | with open(conf_file, mode='r') as file_hanlde: 78 | conf = json.load(file_hanlde) 79 | 80 | def __init__(self, **kwargs): 81 | conf_file = os.path.expanduser(CONFIG_FILE) 82 | if os.path.isfile(conf_file): 83 | with open(conf_file, mode='r') as file_hanlde: 84 | self.conf = json.load(file_hanlde) 85 | pyhiveconf, sql_req = self.parse_code(code="") 86 | 87 | self.create_conn(**pyhiveconf) 88 | 89 | 
#if self.last_conn is None: 90 | # raise ConnectionNotCreated() 91 | 92 | Kernel.__init__(self, **kwargs) 93 | 94 | def send_exception(self, e): 95 | if type(e) in [ConnectionNotCreated]: 96 | tb = "" 97 | else: 98 | tb = "\n" + traceback.format_exc() 99 | return self.send_error(tb) 100 | 101 | def send_error(self, contents): 102 | self.send_response(self.iopub_socket, 'stream', { 103 | 'name': 'stderr', 104 | 'text': str(contents) 105 | }) 106 | return { 107 | 'status': 'error', 108 | 'execution_count': self.execution_count, 109 | 'payload': [], 110 | 'user_expressions': {} 111 | } 112 | 113 | def send_info(self, contents): 114 | self.send_response(self.iopub_socket, 'stream', { 115 | 'name': 'stdout', 116 | 'text': str(contents) 117 | }) 118 | 119 | def create_conn(self, url, **kwargs): 120 | #self.send_info("create_engine('" + url + "', " + ', '.join( 121 | # [str(k) + '=' + (str(v) if type(v) == str else json.dumps(v)) for k, v in kwargs.items()]) + ")\n") 122 | self.last_conn = create_engine(url,**kwargs) 123 | self.last_conn.connect() 124 | #self.send_info("Connection established to database!\n") 125 | 126 | def reconfigure(self, params): 127 | if 'default_limit' in params: 128 | try: 129 | self.params['default_limit'] = int(params['default_limit']) 130 | # self.send_info("Set display limit to {}\n".format(self.params['default_limit'])) 131 | except ValueError as e: 132 | self.send_exception(e) 133 | if 'display_mode' in params: 134 | v = params['display_mode'] 135 | if type(v) == str and v in ['b', 'e', 'be']: 136 | self.params['display_mode'] = v 137 | else: 138 | self.send_error("Invalid display_mode, options are b, e and be.") 139 | 140 | def parse_code(self, code): 141 | req = code.strip() 142 | 143 | headers = {} 144 | sql_req = "" 145 | beginning = True 146 | for l in req.split('\n'): 147 | l = l.strip() 148 | if l.startswith("$$"): 149 | if beginning: 150 | k, v = l.replace("$", "").split("=") 151 | k, v = k.strip(), v.strip() 152 | if 
v.startswith('{'): 153 | v = json.loads(v) 154 | else: 155 | try: 156 | v = int(v) 157 | except ValueError: 158 | pass 159 | headers[k] = v 160 | else: 161 | raise KernelSyntaxError("Headers starting with %% must be at the beginning of your request.") 162 | else: 163 | beginning = False 164 | sql_req += '\n' + l 165 | 166 | if self.last_conn is None and not headers and self.conf is not None: 167 | headers = self.conf # if cells doesn't contain $$ and connection is None, overriding headers with conf data 168 | 169 | sql_req = sql_req.strip() 170 | if sql_req.endswith(';'): 171 | sql_req = sql_req[:-1] + "\n" # the last newline let add the limit without being commented by a last comment 172 | 173 | a = ['default_limit', 'display_mode'] 174 | params, pyhiveconf = {k: v for k, v in headers.items() if k in a}, {k: v for k, v in headers.items() if k not in a} 175 | 176 | self.reconfigure(params) 177 | 178 | return pyhiveconf, sql_req 179 | 180 | def format_time(self, start, end): 181 | hours, rem = divmod(end-start, 3600) 182 | minutes, seconds = divmod(rem, 60) 183 | return "{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds) 184 | 185 | def do_execute(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False): 186 | try: 187 | pyhiveconf, sql_req = self.parse_code(code) 188 | 189 | if 'url' in pyhiveconf: 190 | self.create_conn(**pyhiveconf) 191 | 192 | if self.last_conn is None: 193 | raise ConnectionNotCreated() 194 | 195 | # If code empty 196 | if not sql_req: 197 | return { 198 | 'status': 'ok', 199 | 'execution_count': self.execution_count, 200 | 'payload': [], 201 | 'user_expressions': {} 202 | } 203 | pd.set_option('display.max_colwidth', -1) 204 | sql_req = sql_remove_comment(sql_req) 205 | 206 | for query_raw in sql_explode(sql_req): 207 | query = sql_rewrite(query_raw, self.params['default_limit']) 208 | logger.info("Running the following HiveQL query: {}".format(query)) 209 | start = time.time() 210 | result = 
self.last_conn.execute(query.strip()) 211 | res = result.cursor.fetch_logs() 212 | if len(res) > 0: 213 | raise Exception("\n".join(res)) 214 | end = time.time() 215 | elapsed_time = self.format_time(start, end) 216 | if result is not None and result.returns_rows is True: 217 | df = pd.DataFrame(result.fetchall(), columns=result.keys(), dtype="object") 218 | if sql_is_show(query) or sql_is_describe(query): # allow limiting show tables/databases and describe table with a pattern 219 | pattern = extract_pattern(query_raw) 220 | if sql_is_describe(query): 221 | # hive has "col_name" spark has "col_name" 222 | df = df[df.col_name.str.contains(pattern)] 223 | if sql_is_show_tables(query): 224 | # hive has "tab_name" spark has "tableName" 225 | if "tab_name" in df.columns: 226 | df = df[df.tab_name.str.contains(pattern)] 227 | else: 228 | df = df[df.tableName.str.contains(pattern)] 229 | if sql_is_show_databases(query): 230 | # hive has "database_name" spark has "databaseName" 231 | if "database_name" in df.columns: 232 | df = df[df.database_name.str.contains(pattern)] 233 | else: 234 | df = df[df.databaseName.str.contains(pattern)] 235 | html = df_to_html(df) 236 | self.send_info("Elapsed Time: {} !\n".format(elapsed_time)) 237 | self.send_response(self.iopub_socket, 'display_data', { 238 | 'data': { 239 | "text/html": html, 240 | }, 241 | "metadata": { 242 | "image/png": { 243 | "width": 640, 244 | "height": 480, 245 | }, 246 | } 247 | }) 248 | else: 249 | if sql_is_use(query): 250 | self.send_info("Database changed successfully in {} !\n".format(elapsed_time)) 251 | elif sql_is_create(query): 252 | self.send_info("Table created successfully in {} !\n".format(elapsed_time)) 253 | elif sql_is_drop(query): 254 | self.send_info("Table dropped successfully in {} !\n".format(elapsed_time)) 255 | elif sql_is_set_variable(query): 256 | self.send_info("Variable set successfully in {} !\n".format(elapsed_time)) 257 | else: 258 | self.send_info("Query executed successfully in {} 
!\n".format(elapsed_time)) 259 | return { 260 | 'status': 'ok', 261 | 'execution_count': self.execution_count, 262 | 'payload': [], 263 | 'user_expressions': {} 264 | } 265 | except OperationalError as oe: 266 | return self.send_error(refactor(oe)) 267 | except ResourceClosedError as rce: 268 | return self.send_error(rce) 269 | except NotAllowedQueriesError as e: 270 | return self.send_error("only 'select', 'with', 'set property=value', 'create table x.y stored as orc' 'drop table', 'use database', 'show databases', 'show tables', 'describe myTable' statements are allowed") 271 | except Exception as e: 272 | return self.send_exception(e) 273 | 274 | 275 | def df_to_html(df): 276 | #for column in df: 277 | # if df[column].dtype == 'object': 278 | # df[column] = df[column].apply(lambda x: x.replace("\n","
")) 279 | return df.fillna('NULL').astype(str).to_html(notebook=True) 280 | 281 | 282 | def refactor(oe): 283 | error_string = "error_code: {}\nsql_state: {}\nerror_message: {}".format(oe.orig.args[0].status.errorCode, 284 | oe.orig.args[0].status.sqlState, 285 | oe.orig.args[0].status.errorMessage) 286 | return error_string 287 | 288 | def do_shutdown(self, restart): 289 | """Cleanup the created source code files and executables when shutting down the kernel""" 290 | self.last_conn.disconnect() 291 | if restart: 292 | self.last_conn.connect() 293 | 294 | -------------------------------------------------------------------------------- /hiveql/main.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from ipykernel.kernelapp import IPKernelApp 4 | 5 | from .kernel import HiveQLKernel 6 | 7 | 8 | logging.basicConfig(level=logging.DEBUG) 9 | 10 | IPKernelApp.launch_instance(kernel_class=HiveQLKernel) -------------------------------------------------------------------------------- /hiveql/resources/custom.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/HiveQLKernel/6a5ec42fa4c97b1c7d83d678e8a6ae7c9ead790c/hiveql/resources/custom.css -------------------------------------------------------------------------------- /hiveql/resources/logo-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/HiveQLKernel/6a5ec42fa4c97b1c7d83d678e8a6ae7c9ead790c/hiveql/resources/logo-32x32.png -------------------------------------------------------------------------------- /hiveql/resources/logo-64x64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/HiveQLKernel/6a5ec42fa4c97b1c7d83d678e8a6ae7c9ead790c/hiveql/resources/logo-64x64.png 
-------------------------------------------------------------------------------- /hiveql/tool_sql.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | class MultipleQueriesError(Exception): 4 | pass 5 | 6 | class NotAllowedQueriesError(Exception): 7 | pass 8 | 9 | def sql_extract_limit( sql_str): 10 | pattern = re.compile("limit\\s+(\\d+)\\s*;?$", re.I) 11 | res = pattern.search(sql_str) 12 | if res: 13 | return int(res.group(1)) 14 | else: 15 | return 0 16 | 17 | def sql_incrust_limit( sql_str, default_limit): 18 | if sql_extract_limit(sql_str) > 0: 19 | pattern = re.compile("([\\s\\S]*)?(limit\\s*\\d*\\s?;?)+$", re.I)# replace any existing limit with the default limit 20 | res = pattern.sub("\\1 limit " + str(default_limit), sql_str) 21 | return res 22 | else: 23 | pattern = re.compile("([^;]+)[\\s\\S]*$", re.I)# replace any existing limit with the default limit 24 | res = pattern.sub("\\1 limit " + str(default_limit), sql_str) 25 | return res 26 | 27 | 28 | def sql_rewrite( sql_str, default_limit): 29 | sql_str = sql_remove_comment(sql_str) 30 | if sql_is_count(sql_str):# no limit for count 31 | return sql_str 32 | if sql_is_selection(sql_str): #the query is a selection 33 | if sql_extract_limit(sql_str) > default_limit or sql_extract_limit(sql_str) ==0:# the limit is not set or to high 34 | return sql_incrust_limit(sql_str, default_limit) # force the default limit 35 | if sql_is_show(sql_str) or sql_is_describe(sql_str): #the query is a show 36 | pattern = re.compile("(\\w+)\\s+(\\w+)([\\s\\S]*)$", re.I)# replace any existing limit with the default limit 37 | res = pattern.sub("\\1 \\2", sql_str) 38 | return res 39 | return sql_str 40 | 41 | def extract_pattern(sql_str): 42 | sql_str = sql_remove_comment(sql_str) 43 | pattern = re.compile("(\\w+)\\s+(\\w+)([\\s\\S]*)$", re.I) 44 | res = pattern.sub("\\3", sql_str) 45 | return res.strip() 46 | 47 | def sql_is_selection(sql_str): 48 | sql_str = 
sql_remove_comment(sql_str) 49 | return sql_is_with(sql_str) or re.search(r'^\s*select', sql_str, re.I) 50 | 51 | def sql_is_with(sql_str): 52 | sql_str = sql_remove_comment(sql_str) 53 | return re.search(r'^\s*with', sql_str, re.I) 54 | 55 | def sql_is_count(sql_str): 56 | sql_str = sql_remove_comment(sql_str) 57 | return re.search(r'^\s*select\s+count\(.\)\s+from', sql_str, re.I) 58 | 59 | def sql_is_create(sql_str): 60 | sql_str = sql_remove_comment(sql_str) 61 | return re.search(r'^\s*create\s+table\s+.*stored\s+as\s+orc\s+as', sql_str, re.I) 62 | 63 | def sql_is_drop(sql_str): 64 | sql_str = sql_remove_comment(sql_str) 65 | return re.search(r'^\s*drop\s+table', sql_str, re.I) 66 | 67 | def sql_is_describe(sql_str): 68 | sql_str = sql_remove_comment(sql_str) 69 | return re.search(r'^\s*describe\s+', sql_str, re.I) 70 | 71 | def sql_is_show(sql_str): 72 | sql_str = sql_remove_comment(sql_str) 73 | return sql_is_show_tables(sql_str) or sql_is_show_databases(sql_str) 74 | 75 | def sql_is_show_tables(sql_str): 76 | sql_str = sql_remove_comment(sql_str) 77 | return re.search(r'^\s*show\s+tables', sql_str, re.I) 78 | 79 | def sql_is_show_databases(sql_str): 80 | sql_str = sql_remove_comment(sql_str) 81 | return re.search(r'^\s*show\s+databases', sql_str, re.I) 82 | 83 | def sql_is_use(sql_str): 84 | sql_str = sql_remove_comment(sql_str) 85 | return re.search(r'^\s*use\s+', sql_str, re.I) 86 | 87 | def sql_is_set_variable(sql_str): 88 | sql_str = sql_remove_comment(sql_str) 89 | return re.search(r'^\s*set\s+\w+.*=\w+', sql_str, re.I) 90 | 91 | def sql_is_set(sql_str): 92 | sql_str = sql_remove_comment(sql_str) 93 | return re.search(r'^\s*set\s*$', sql_str, re.I) or re.search(r'^\s*set\s+\w+.*$', sql_str, re.I) 94 | 95 | def sql_is_explain(sql_str): 96 | sql_str = sql_remove_comment(sql_str) 97 | return re.search(r'^\s*explain\s+', sql_str, re.I) 98 | 99 | def sql_is_add(sql_str): 100 | sql_str = sql_remove_comment(sql_str) 101 | return re.search(r'^\s*add\s+', 
sql_str, re.I) 102 | 103 | def sql_remove_comment(sql_str): 104 | res = re.sub("--.*\n","", sql_str, re.MULTILINE) 105 | return res 106 | 107 | def sql_explode(sql_str): 108 | tmp = [] 109 | sql_str = sql_remove_comment(sql_str) 110 | for sql in sql_str.split(";"): 111 | if sql.strip() != "": 112 | tmp.append(sql.strip()) 113 | return tmp 114 | 115 | def sql_validate(sql_str): 116 | if sql_is_set(sql_str) or sql_is_add(sql_str) or sql_is_drop(sql_str) or sql_is_create(sql_str) or sql_is_describe(sql_str) or sql_is_show(sql_str) or sql_is_use(sql_str) or sql_is_set_variable(sql_str) or sql_is_selection(sql_str) or sql_is_explain(sql_str): 117 | pass 118 | else: 119 | raise NotAllowedQueriesError() 120 | 121 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ipykernel==4.* 2 | jupyter_client==5.* 3 | jupyter_core==4.* 4 | pandas>=0.23.0 5 | pyhive==0.6.* 6 | sasl==0.2.* 7 | thrift_sasl==0.3.* 8 | pytest==4.2.* 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from os import path 3 | 4 | 5 | here = path.abspath(path.dirname(__file__)) 6 | 7 | 8 | with open(path.join(here, 'requirements.txt')) as f: 9 | requirements = f.read().splitlines() 10 | 11 | with open(path.join(here, 'README.md')) as f: 12 | long_description = f.read() 13 | 14 | 15 | from hiveql.constants import __version__, KERNEL_NAME, DISPLAY_NAME 16 | 17 | setup( 18 | name=KERNEL_NAME + "Kernel", 19 | version=__version__, 20 | description=DISPLAY_NAME + ' Kernel', 21 | long_description=long_description, 22 | url='https://github.com/EDS-APHP/HiveQLKernel', 23 | author='APHP - EDS', 24 | license='MIT', 25 | keywords='Hive HiveQL PyHive Kernel Ipykernel', 26 | packages=find_packages(), 27 | 
# --------------------------------------------------------------------------------
# /setup.py (continued): remainder of the setup() call, preserved as comments
# --------------------------------------------------------------------------------
#     install_requires=requirements,
#     entry_points={
#         'console_scripts':
#             ['jupyter-hiveql = {}.__main__:main'.format(KERNEL_NAME)],
#     },
#     include_package_data=False,  # otherwise package_data is not used
#     package_data={
#         KERNEL_NAME: ['resources/logo-*.png', 'resources/*.css'],
#     },
# )
# --------------------------------------------------------------------------------
# /tests/test_sql.py:
# --------------------------------------------------------------------------------
from hiveql.tool_sql import *
import logging

def test_validate():
    """sql_validate returns None (no exception) for every allowed statement kind."""
    SQL = "select * from toto; "
    assert sql_validate(SQL) is None
    SQL = "drop table toto; "
    assert sql_validate(SQL) is None
    SQL = "with t as (select * from tata) select * from t; "
    assert sql_validate(SQL) is None
    SQL = "show databases"
    assert sql_validate(SQL) is None
    SQL = "describe extended toto"
    assert sql_validate(SQL) is None
    SQL = "describe a.toto"
    assert sql_validate(SQL) is None
    SQL = "show databases mydatabase"
    assert sql_validate(SQL) is None
    SQL = "SET hive.enforce.sorting = true"
    assert sql_validate(SQL) is None
    SQL = "SET hive.enforce.sorting = true;select * from t"
    assert sql_validate(SQL) is None

def test_extract_limit():
    """sql_extract_limit reads the outermost LIMIT, or 0 when absent."""
    SQL = "select * from t limit 200;"
    assert sql_extract_limit(SQL) == 200
    SQL = "select * from (select * from b limit 20000) t limit 1000"
    assert sql_extract_limit(SQL) == 1000
    SQL = "select * from (select * from b limit 20000) t "
    assert sql_extract_limit(SQL) == 0

def test_incrust_limit():
    """sql_incrust_limit forces the default limit onto the outer query."""
    DEFAULT_LIMIT = 100
    SQL = "select * from t limit 200;"
    assert sql_incrust_limit(SQL, DEFAULT_LIMIT) == "select * from t limit 100"
    SQL = "select * from t ;"
    assert sql_incrust_limit(SQL, DEFAULT_LIMIT) == "select * from t limit 100"
    SQL = "select * from t "
    assert sql_incrust_limit(SQL, DEFAULT_LIMIT) == SQL + " limit 100"
    SQL = "select * from (select * from t limit 100 as t) "
    assert sql_incrust_limit(SQL, DEFAULT_LIMIT) == SQL + " limit 100"
    SQL = "with t as (select * from t limit 100) select * from t"
    assert sql_incrust_limit(SQL, DEFAULT_LIMIT) == SQL + " limit 100"

def test_is_selection():
    """SELECT and WITH...SELECT are selections; leading comments are ignored."""
    SQL = "select * from t limit 200;"
    assert sql_is_selection(SQL)
    SQL = " witH a as (select * from t) select * from a limit 200;"
    assert sql_is_selection(SQL)
    SQL = "create * from t limit 200;"
    assert sql_is_selection(SQL) is None
    SQL = "--hello world\n--second\nselect * from toto; "
    assert sql_is_selection(SQL)
    SQL = "--hello world\n--second\nselect * from toto; "
    assert sql_validate(SQL) is None

def test_is_create():
    """Only CTAS stored as ORC counts as a create."""
    SQL = "create table toto stored as orc as select * from t;"
    assert sql_is_create(SQL)
    SQL = " create table toto as select * from t;"
    assert sql_is_create(SQL) is None

def test_is_drop():
    """BUG FIX: this was a duplicate `def test_is_create`, which shadowed the
    real CREATE test above so it never ran under pytest; it exercises
    sql_is_drop and is now named accordingly."""
    SQL = " drop table toto stored as orc as select * from t;"
    assert sql_is_drop(SQL)
    SQL = " drip table toto stored as orc as select * from t;"
    assert sql_is_drop(SQL) is None

def test_is_show():
    SQL = " show databases;"
    assert sql_is_show(SQL)
    SQL = "show databases toto"
    assert sql_is_show(SQL)

def test_is_describe():
    SQL = "describe a.toto"
    assert sql_is_describe(SQL)
    SQL = "show my databases"
    assert sql_is_describe(SQL) is None

def test_rewrite():
    """sql_rewrite caps limits, trims SHOW arguments, and leaves counts alone."""
    DEFAULT_LIMIT = 100
    SQL = "select * from t limit 200;"
    assert sql_rewrite(SQL, DEFAULT_LIMIT) == "select * from t limit 100"
    SQL = "show databases cse"
    assert sql_rewrite(SQL, DEFAULT_LIMIT) == "show databases"
    SQL = "show tables cse"
    assert sql_rewrite(SQL, DEFAULT_LIMIT) == "show tables"
    SQL = "select count(1) from t"
    assert sql_rewrite(SQL, DEFAULT_LIMIT) == "select count(1) from t"
    SQL = "select count(*) from t"
    assert sql_rewrite(SQL, DEFAULT_LIMIT) == "select count(*) from t"

def test_explain():
    SQL = "explain select * from toto"
    assert sql_is_explain(SQL)

def test_ctas():
    """CTAS statements are passed through sql_rewrite unchanged."""
    SQL = "create table toto stored as orc as select * from tata"
    assert sql_rewrite(SQL, 100) == SQL

def test_remove_comment():
    SQL = "--hello world\n--second\nselect * from toto; "
    assert sql_remove_comment(SQL) == "select * from toto; "

def test_add():
    SQL = "add jar;"
    assert sql_is_add(SQL) is not None
    SQL = "ADD jar "
    assert sql_is_add(SQL)

def test_explode_sql():
    """sql_explode splits on ';' and drops empty segments."""
    SQL = "select * from t; "
    assert sql_explode(SQL) == ["select * from t"]
    SQL = "select * from t; use db"
    assert sql_explode(SQL) == ["select * from t", "use db"]