├── .gitignore ├── LICENSE ├── README.md ├── hiveql ├── __init__.py ├── __main__.py ├── constants.py ├── install.py ├── kernel.py ├── main.py ├── resources │ ├── custom.css │ ├── logo-32x32.png │ └── logo-64x64.png └── tool_sql.py ├── requirements.txt ├── setup.py └── tests └── test_sql.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 
99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | 107 | .idea -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 EDS-APHP 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HiveQL Kernel 2 | 3 | ### Requirements 4 | 5 | If you are going to connect using kerberos: 6 | 7 | ``` 8 | sudo apt-get install python3-dev libsasl2-dev libsasl2-2 libsasl2-modules-gssapi-mit 9 | ``` 10 | 11 | ### Installation 12 | 13 | To install the kernel: 14 | 15 | ``` 16 | pip install --upgrade hiveqlKernel 17 | jupyter hiveql install --user 18 | ``` 19 | 20 | ### Connection configuration 21 | 22 | Two methods are available to connect to a Hive server: 23 | 24 | * Directly inside the notebook 25 | * Using a configuration file 26 | 27 | If the configuration file is present, every time you run a new HiveQL kernel it uses it, else you must configure your connection inside the notebook. The configuration in the notebook overwrites the one in the configuration file if present. 28 | 29 | #### Configure directly in the notebook cells 30 | 31 | Inside a Notebook cell, copy&paste this, change the configuration to match your needs, and run it. 32 | 33 | ``` 34 | $$ url=hive://@:/ 35 | $$ connect_args={"auth": "KERBEROS", "kerberos_service_name": "hive", "configuration": {"tez.queue.name": "myqueue"}} 36 | $$ pool_size=5 37 | $$ max_overflow=10 38 | ``` 39 | 40 | These args are passed to sqlalchemy, who registered pyHive as the 'hive' SQL back-end. 41 | See [github.com/dropbox/PyHive](https://github.com/dropbox/PyHive/#sqlalchemy). 42 | 43 | #### Configure using a configuration file 44 | 45 | The HiveQL kernel is looking for the configuration file at `~/.hiveql_kernel.json` by default. You can specify another path using `HIVE_KERNEL_CONF_FILE`.
46 | 47 | The contents must be like this (in json format): 48 | 49 | ``` 50 | { "url": "hive://@:/", "connect_args" : { "auth": "KERBEROS", "kerberos_service_name":"hive", "configuration": {"tez.queue.name": "myqueue"}}, "pool_size": 5, "max_overflow": 10, "default_limit": 20, "display_mode": "be" } 51 | ``` 52 | 53 | 54 | ### Usage 55 | 56 | Inside a HiveQL kernel you can type HiveQL directly in the cells and it displays a HTML table with the results. 57 | 58 | You also have other options, like changing the default display limit (=20) like this : 59 | 60 | ``` 61 | $$ default_limit=50 62 | ``` 63 | 64 | Some hive functions are extended. They allow to filter with some patterns. 65 | 66 | ``` 67 | SHOW TABLES 68 | SHOW DATABASES 69 | ``` 70 | 71 | 72 | ### Run tests 73 | 74 | ``` 75 | python -m pytest 76 | ``` 77 | 78 | 79 | Have fun! 80 | -------------------------------------------------------------------------------- /hiveql/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/HiveQLKernel/6a5ec42fa4c97b1c7d83d678e8a6ae7c9ead790c/hiveql/__init__.py -------------------------------------------------------------------------------- /hiveql/__main__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from ipykernel.kernelapp import IPKernelApp 4 | 5 | 6 | class HiveQLKernelApp(IPKernelApp): 7 | """ 8 | The main kernel application, inheriting from the ipykernel 9 | """ 10 | from .kernel import HiveQLKernel 11 | from .install import HiveqlKernelInstall, HiveqlKernelRemove 12 | kernel_class = HiveQLKernel 13 | 14 | # We override subcommands to add our own install & remove commands 15 | subcommands = { 16 | 'install': (HiveqlKernelInstall, 17 | HiveqlKernelInstall.description.splitlines()[0]), 18 | 'remove': (HiveqlKernelRemove, 19 | HiveqlKernelRemove.description.splitlines()[0]), 20 | } 21 | 22 | 23 | 24 | def 
main(): 25 | HiveQLKernelApp.launch_instance() 26 | 27 | if __name__ == '__main__': 28 | main() 29 | -------------------------------------------------------------------------------- /hiveql/constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | __version__ = '1.0.20' 3 | 4 | KERNEL_NAME = 'hiveql' 5 | LANGUAGE = 'hiveql' 6 | DISPLAY_NAME = 'HiveQL' 7 | 8 | DEFAULT_TEXT_LANG = ['en'] 9 | 10 | CONFIG_FILE = os.environ.get("HIVE_KERNEL_CONF_FILE", "~/.hiveql_kernel.json") 11 | -------------------------------------------------------------------------------- /hiveql/install.py: -------------------------------------------------------------------------------- 1 | import io 2 | import json 3 | import os 4 | import pkgutil 5 | import sys 6 | from tempfile import TemporaryDirectory 7 | 8 | from IPython.utils.path import ensure_dir_exists 9 | from jupyter_client.kernelspecapp import InstallKernelSpec, RemoveKernelSpec 10 | from jupyter_core.paths import jupyter_config_dir 11 | from traitlets import Unicode 12 | 13 | from .constants import __version__, KERNEL_NAME, DISPLAY_NAME, LANGUAGE 14 | 15 | PY3 = sys.version_info[0] == 3 16 | if PY3: 17 | unicode = str 18 | 19 | MODULEDIR = os.path.dirname(__file__) 20 | PKGNAME = os.path.basename(MODULEDIR) 21 | 22 | # The kernel specfile 23 | kernel_json = { 24 | "argv": [sys.executable, 25 | "-m", PKGNAME, 26 | "-f", "{connection_file}"], 27 | "display_name": DISPLAY_NAME, 28 | "language": LANGUAGE, 29 | "name": KERNEL_NAME 30 | } 31 | 32 | 33 | def css_frame_prefix(name): 34 | '''Define the comment prefix used in custom css to frame kernel CSS''' 35 | return u'/* @{{KERNEL}} {} '.format(name) 36 | 37 | 38 | def copyresource(resource, filename, destdir): 39 | """ 40 | Copy a resource file to a destination 41 | """ 42 | data = pkgutil.get_data(resource, os.path.join('resources', filename)) 43 | # log.info( "Installing %s", os.path.join(destdir,filename) ) 44 | with 
open(os.path.join(destdir, filename), 'wb') as fp: 45 | fp.write(data) 46 | 47 | 48 | def remove_custom_css(destdir, resource=PKGNAME): 49 | """ 50 | Remove the kernel CSS from custom.css 51 | """ 52 | 53 | # Remove the inclusion in the main CSS 54 | if not os.path.isdir(destdir): 55 | return False 56 | custom = os.path.join(destdir, 'custom.css') 57 | copy = True 58 | found = False 59 | prefix = css_frame_prefix(resource) 60 | with io.open(custom + '-new', 'wt') as fout: 61 | with io.open(custom) as fin: 62 | for line in fin: 63 | if line.startswith(prefix + 'START'): 64 | copy = False 65 | found = True 66 | elif line.startswith(prefix + 'END'): 67 | copy = True 68 | elif copy: 69 | fout.write(line) 70 | 71 | if found: 72 | os.rename(custom + '-new', custom) 73 | else: 74 | os.unlink(custom + '-new') 75 | 76 | return found 77 | 78 | 79 | def install_custom_css(destdir, resource=PKGNAME): 80 | """ 81 | Add the kernel CSS to custom.css 82 | """ 83 | ensure_dir_exists(destdir) 84 | custom = os.path.join(destdir, 'custom.css') 85 | prefix = css_frame_prefix(resource) 86 | 87 | # Check if custom.css already includes it. 
If so, let's remove it first 88 | exists = False 89 | if os.path.exists(custom): 90 | with io.open(custom) as f: 91 | for line in f: 92 | if line.find(prefix) >= 0: 93 | exists = True 94 | break 95 | if exists: 96 | remove_custom_css(destdir, resource) 97 | 98 | # Fetch the CSS file 99 | cssfile = 'custom.css' 100 | data = pkgutil.get_data(resource, os.path.join('resources', cssfile)) 101 | # get_data() delivers encoded data, str (Python2) or bytes (Python3) 102 | 103 | # Add the CSS at the beginning of custom.css 104 | # io.open uses unicode strings (unicode in Python2, str in Python3) 105 | with io.open(custom + '-new', 'wt', encoding='utf-8') as fout: 106 | fout.write(u'{}START ======================== */\n'.format(prefix)) 107 | fout.write(data.decode('utf-8')) 108 | fout.write(u'{}END ======================== */\n'.format(prefix)) 109 | if os.path.exists(custom): 110 | with io.open(custom, 'rt', encoding='utf-8') as fin: 111 | for line in fin: 112 | fout.write(unicode(line)) 113 | os.rename(custom + '-new', custom) 114 | 115 | 116 | def install_kernel_resources(destdir, resource=PKGNAME, files=None): 117 | """ 118 | Copy the resource files to the kernelspec folder. 119 | """ 120 | if files is None: 121 | files = ['logo-64x64.png', 'logo-32x32.png'] 122 | for filename in files: 123 | try: 124 | copyresource(resource, filename, destdir) 125 | except Exception as e: 126 | sys.stderr.write(str(e)) 127 | 128 | 129 | class HiveqlKernelInstall(InstallKernelSpec): 130 | """ 131 | The kernel installation class 132 | """ 133 | 134 | version = __version__ 135 | kernel_name = KERNEL_NAME 136 | description = '''Install the HiveQL Jupyter Kernel. 
137 | Either as a system kernel or for a concrete user''' 138 | 139 | logdir = Unicode(os.environ.get('LOGDIR', ''), 140 | config=True, 141 | help="""Default directory to use for the logfile.""" 142 | ) 143 | aliases = {'logdir': 'HiveQLKernelInstall.logdir'} 144 | 145 | def parse_command_line(self, argv): 146 | """ 147 | Skip parent method and go for its ancestor 148 | (because parent method requires an extra argument: the kernel to install) 149 | """ 150 | super(InstallKernelSpec, self).parse_command_line(argv) 151 | 152 | def start(self): 153 | if self.user and self.prefix: 154 | self.exit("Can't specify both user and prefix. Please choose one or the other.") 155 | 156 | self.log.info('Installing HiveQL kernel') 157 | with TemporaryDirectory() as td: 158 | os.chmod(td, 0o755) # Starts off as 700, not user readable 159 | # Add kernel spec 160 | if len(self.logdir): 161 | kernel_json['env'] = {'LOGDIR_DEFAULT': self.logdir} 162 | with open(os.path.join(td, 'kernel.json'), 'w') as f: 163 | json.dump(kernel_json, f, sort_keys=True) 164 | # Add resources 165 | install_kernel_resources(td, resource=PKGNAME) 166 | # Install JSON kernel specification + resources 167 | self.log.info('Installing kernel spec') 168 | self.sourcedir = td 169 | install_dir = self.kernel_spec_manager.install_kernel_spec( 170 | td, 171 | kernel_name=self.kernel_name, 172 | user=self.user, 173 | prefix=self.prefix, 174 | replace=self.replace, 175 | ) 176 | self.log.info("Installed into %s", install_dir) 177 | 178 | # install_kernel( self.kernel_spec_manager ) 179 | # self.create_kernel_json( install_dir ) 180 | 181 | # Install the custom css 182 | self.log.info('Installing CSS') 183 | if self.user: 184 | # Use the ~/.jupyter/custom dir 185 | import jupyter_core 186 | destd = os.path.join(jupyter_config_dir(), 'custom') 187 | else: 188 | # Use the system custom dir 189 | import notebook 190 | destd = os.path.join(notebook.DEFAULT_STATIC_FILES_PATH, 'custom') 191 | 192 | self.log.info('Installing 
CSS into %s', destd) 193 | install_custom_css(destd) 194 | 195 | 196 | # -------------------------------------------------------------------------- 197 | 198 | 199 | class HiveqlKernelRemove(RemoveKernelSpec): 200 | """ 201 | The kernel uninstallation class 202 | """ 203 | 204 | spec_names = [KERNEL_NAME] 205 | description = '''Remove the HiveQL Jupyter Kernel''' 206 | 207 | def parse_command_line(self, argv): 208 | """ 209 | Skip parent method and go for its ancestor 210 | (because parent method requires an extra argument: the kernel to remove) 211 | """ 212 | super(RemoveKernelSpec, self).parse_command_line(argv) 213 | 214 | def start(self): 215 | # Call parent (this time the real parent) to remove the kernelspec dir 216 | super(HiveqlKernelRemove, self).start() 217 | 218 | # Remove the installed custom CSS 219 | # Try the ~/.jupyter/custom dir & the system custom dir 220 | self.log.info('Removing CSS') 221 | import jupyter_core 222 | import notebook 223 | cssd = (os.path.join(jupyter_config_dir(), 'custom'), 224 | os.path.join(notebook.DEFAULT_STATIC_FILES_PATH, 'custom')) 225 | for destd in cssd: 226 | if remove_custom_css(destd): 227 | self.log.info('Removed CSS from %s', destd) 228 | -------------------------------------------------------------------------------- /hiveql/kernel.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import traceback 4 | import re 5 | import os.path 6 | 7 | from ipykernel.kernelbase import Kernel 8 | from sqlalchemy.exc import OperationalError, ResourceClosedError 9 | 10 | from .constants import __version__, KERNEL_NAME, CONFIG_FILE 11 | 12 | from sqlalchemy import * 13 | import pandas as pd 14 | from .tool_sql import * 15 | from sqlalchemy import event 16 | 17 | 18 | import time 19 | 20 | logger = logging.getLogger(__name__) 21 | logger.setLevel(logging.DEBUG) 22 | 23 | 24 | class KernelSyntaxError(Exception): 25 | pass 26 | 27 | 28 | error_con_not_created = 
"""Connection not initialized! 29 | Please specify your pyHive configuration like this : 30 | 31 | ------------- 32 | $$ url=hive://@:/ 33 | $$ connect_args={"auth": "KERBEROS","kerberos_service_name": "hive"} 34 | $$ pool_size=5 35 | $$ max_overflow=10 36 | 37 | YOUR SQL REQUEST HERE IF ANY 38 | ------------- 39 | 40 | -> if you want to update the current connection, just type it again with another configuration 41 | -> $$ are mandatory characters that specify that this line is a configuration for this kernel 42 | 43 | Other parameters are available such as : 44 | 45 | $$ default_limit=50 # -> without this parameter, default_limit is set to 20 46 | $$ display_mode=be # -> this will display a table with the beginning (b) and end (e) of the SQL response (options are: b, e and be) 47 | 48 | """ 49 | 50 | 51 | # DOCUMENTATION here: https://ipython.readthedocs.io/en/stable/development/wrapperkernels.html 52 | class ConnectionNotCreated(Exception): 53 | def __init__(self): 54 | Exception.__init__(self, error_con_not_created) 55 | 56 | 57 | class HiveQLKernel(Kernel): 58 | implementation = KERNEL_NAME 59 | implementation_version = __version__ 60 | banner = 'HiveQL REPL' 61 | language = "hiveql" 62 | language_info = { 63 | 'name': 'hive', 64 | 'codemirror_mode': "sql", 65 | 'pygments_lexer': 'postgresql', 66 | 'mimetype': 'text/x-hive', 67 | 'file_extension': '.hiveql', 68 | } 69 | last_conn = None 70 | params = { 71 | "default_limit": 20, 72 | "display_mode": "be" 73 | } 74 | conf = None 75 | conf_file = os.path.expanduser(CONFIG_FILE) 76 | if os.path.isfile(conf_file): 77 | with open(conf_file, mode='r') as file_hanlde: 78 | conf = json.load(file_hanlde) 79 | 80 | def __init__(self, **kwargs): 81 | conf_file = os.path.expanduser(CONFIG_FILE) 82 | if os.path.isfile(conf_file): 83 | with open(conf_file, mode='r') as file_hanlde: 84 | self.conf = json.load(file_hanlde) 85 | pyhiveconf, sql_req = self.parse_code(code="") 86 | 87 | self.create_conn(**pyhiveconf) 88 | 89 | 
#if self.last_conn is None: 90 | # raise ConnectionNotCreated() 91 | 92 | Kernel.__init__(self, **kwargs) 93 | 94 | def send_exception(self, e): 95 | if type(e) in [ConnectionNotCreated]: 96 | tb = "" 97 | else: 98 | tb = "\n" + traceback.format_exc() 99 | return self.send_error(tb) 100 | 101 | def send_error(self, contents): 102 | self.send_response(self.iopub_socket, 'stream', { 103 | 'name': 'stderr', 104 | 'text': str(contents) 105 | }) 106 | return { 107 | 'status': 'error', 108 | 'execution_count': self.execution_count, 109 | 'payload': [], 110 | 'user_expressions': {} 111 | } 112 | 113 | def send_info(self, contents): 114 | self.send_response(self.iopub_socket, 'stream', { 115 | 'name': 'stdout', 116 | 'text': str(contents) 117 | }) 118 | 119 | def create_conn(self, url, **kwargs): 120 | #self.send_info("create_engine('" + url + "', " + ', '.join( 121 | # [str(k) + '=' + (str(v) if type(v) == str else json.dumps(v)) for k, v in kwargs.items()]) + ")\n") 122 | self.last_conn = create_engine(url,**kwargs) 123 | self.last_conn.connect() 124 | #self.send_info("Connection established to database!\n") 125 | 126 | def reconfigure(self, params): 127 | if 'default_limit' in params: 128 | try: 129 | self.params['default_limit'] = int(params['default_limit']) 130 | # self.send_info("Set display limit to {}\n".format(self.params['default_limit'])) 131 | except ValueError as e: 132 | self.send_exception(e) 133 | if 'display_mode' in params: 134 | v = params['display_mode'] 135 | if type(v) == str and v in ['b', 'e', 'be']: 136 | self.params['display_mode'] = v 137 | else: 138 | self.send_error("Invalid display_mode, options are b, e and be.") 139 | 140 | def parse_code(self, code): 141 | req = code.strip() 142 | 143 | headers = {} 144 | sql_req = "" 145 | beginning = True 146 | for l in req.split('\n'): 147 | l = l.strip() 148 | if l.startswith("$$"): 149 | if beginning: 150 | k, v = l.replace("$", "").split("=") 151 | k, v = k.strip(), v.strip() 152 | if 
v.startswith('{'): 153 | v = json.loads(v) 154 | else: 155 | try: 156 | v = int(v) 157 | except ValueError: 158 | pass 159 | headers[k] = v 160 | else: 161 | raise KernelSyntaxError("Headers starting with %% must be at the beginning of your request.") 162 | else: 163 | beginning = False 164 | sql_req += '\n' + l 165 | 166 | if self.last_conn is None and not headers and self.conf is not None: 167 | headers = self.conf # if cells doesn't contain $$ and connection is None, overriding headers with conf data 168 | 169 | sql_req = sql_req.strip() 170 | if sql_req.endswith(';'): 171 | sql_req = sql_req[:-1] + "\n" # the last newline let add the limit without being commented by a last comment 172 | 173 | a = ['default_limit', 'display_mode'] 174 | params, pyhiveconf = {k: v for k, v in headers.items() if k in a}, {k: v for k, v in headers.items() if k not in a} 175 | 176 | self.reconfigure(params) 177 | 178 | return pyhiveconf, sql_req 179 | 180 | def format_time(self, start, end): 181 | hours, rem = divmod(end-start, 3600) 182 | minutes, seconds = divmod(rem, 60) 183 | return "{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds) 184 | 185 | def do_execute(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False): 186 | try: 187 | pyhiveconf, sql_req = self.parse_code(code) 188 | 189 | if 'url' in pyhiveconf: 190 | self.create_conn(**pyhiveconf) 191 | 192 | if self.last_conn is None: 193 | raise ConnectionNotCreated() 194 | 195 | # If code empty 196 | if not sql_req: 197 | return { 198 | 'status': 'ok', 199 | 'execution_count': self.execution_count, 200 | 'payload': [], 201 | 'user_expressions': {} 202 | } 203 | pd.set_option('display.max_colwidth', -1) 204 | sql_req = sql_remove_comment(sql_req) 205 | 206 | for query_raw in sql_explode(sql_req): 207 | query = sql_rewrite(query_raw, self.params['default_limit']) 208 | logger.info("Running the following HiveQL query: {}".format(query)) 209 | start = time.time() 210 | result = 
self.last_conn.execute(query.strip()) 211 | res = result.cursor.fetch_logs() 212 | if len(res) > 0: 213 | raise Exception("\n".join(res)) 214 | end = time.time() 215 | elapsed_time = self.format_time(start, end) 216 | if result is not None and result.returns_rows is True: 217 | df = pd.DataFrame(result.fetchall(), columns=result.keys(), dtype="object") 218 | if sql_is_show(query) or sql_is_describe(query): # allow limiting show tables/databases and describe table with a pattern 219 | pattern = extract_pattern(query_raw) 220 | if sql_is_describe(query): 221 | # hive has "col_name" spark has "col_name" 222 | df = df[df.col_name.str.contains(pattern)] 223 | if sql_is_show_tables(query): 224 | # hive has "tab_name" spark has "tableName" 225 | if "tab_name" in df.columns: 226 | df = df[df.tab_name.str.contains(pattern)] 227 | else: 228 | df = df[df.tableName.str.contains(pattern)] 229 | if sql_is_show_databases(query): 230 | # hive has "database_name" spark has "databaseName" 231 | if "database_name" in df.columns: 232 | df = df[df.database_name.str.contains(pattern)] 233 | else: 234 | df = df[df.databaseName.str.contains(pattern)] 235 | html = df_to_html(df) 236 | self.send_info("Elapsed Time: {} !\n".format(elapsed_time)) 237 | self.send_response(self.iopub_socket, 'display_data', { 238 | 'data': { 239 | "text/html": html, 240 | }, 241 | "metadata": { 242 | "image/png": { 243 | "width": 640, 244 | "height": 480, 245 | }, 246 | } 247 | }) 248 | else: 249 | if sql_is_use(query): 250 | self.send_info("Database changed successfully in {} !\n".format(elapsed_time)) 251 | elif sql_is_create(query): 252 | self.send_info("Table created successfully in {} !\n".format(elapsed_time)) 253 | elif sql_is_drop(query): 254 | self.send_info("Table dropped successfully in {} !\n".format(elapsed_time)) 255 | elif sql_is_set_variable(query): 256 | self.send_info("Variable set successfully in {} !\n".format(elapsed_time)) 257 | else: 258 | self.send_info("Query executed successfully in {} 
!\n".format(elapsed_time)) 259 | return { 260 | 'status': 'ok', 261 | 'execution_count': self.execution_count, 262 | 'payload': [], 263 | 'user_expressions': {} 264 | } 265 | except OperationalError as oe: 266 | return self.send_error(refactor(oe)) 267 | except ResourceClosedError as rce: 268 | return self.send_error(rce) 269 | except NotAllowedQueriesError as e: 270 | return self.send_error("only 'select', 'with', 'set property=value', 'create table x.y stored as orc' 'drop table', 'use database', 'show databases', 'show tables', 'describe myTable' statements are allowed") 271 | except Exception as e: 272 | return self.send_exception(e) 273 | 274 | 275 | def df_to_html(df): 276 | #for column in df: 277 | # if df[column].dtype == 'object': 278 | # df[column] = df[column].apply(lambda x: x.replace("\n","
")) 279 | return df.fillna('NULL').astype(str).to_html(notebook=True) 280 | 281 | 282 | def refactor(oe): 283 | error_string = "error_code: {}\nsql_state: {}\nerror_message: {}".format(oe.orig.args[0].status.errorCode, 284 | oe.orig.args[0].status.sqlState, 285 | oe.orig.args[0].status.errorMessage) 286 | return error_string 287 | 288 | def do_shutdown(self, restart): 289 | """Cleanup the created source code files and executables when shutting down the kernel""" 290 | self.last_conn.disconnect() 291 | if restart: 292 | self.last_conn.connect() 293 | 294 | -------------------------------------------------------------------------------- /hiveql/main.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from ipykernel.kernelapp import IPKernelApp 4 | 5 | from .kernel import HiveQLKernel 6 | 7 | 8 | logging.basicConfig(level=logging.DEBUG) 9 | 10 | IPKernelApp.launch_instance(kernel_class=HiveQLKernel) -------------------------------------------------------------------------------- /hiveql/resources/custom.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/HiveQLKernel/6a5ec42fa4c97b1c7d83d678e8a6ae7c9ead790c/hiveql/resources/custom.css -------------------------------------------------------------------------------- /hiveql/resources/logo-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/HiveQLKernel/6a5ec42fa4c97b1c7d83d678e8a6ae7c9ead790c/hiveql/resources/logo-32x32.png -------------------------------------------------------------------------------- /hiveql/resources/logo-64x64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/HiveQLKernel/6a5ec42fa4c97b1c7d83d678e8a6ae7c9ead790c/hiveql/resources/logo-64x64.png 
-------------------------------------------------------------------------------- /hiveql/tool_sql.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | class MultipleQueriesError(Exception): 4 | pass 5 | 6 | class NotAllowedQueriesError(Exception): 7 | pass 8 | 9 | def sql_extract_limit( sql_str): 10 | pattern = re.compile("limit\\s+(\\d+)\\s*;?$", re.I) 11 | res = pattern.search(sql_str) 12 | if res: 13 | return int(res.group(1)) 14 | else: 15 | return 0 16 | 17 | def sql_incrust_limit( sql_str, default_limit): 18 | if sql_extract_limit(sql_str) > 0: 19 | pattern = re.compile("([\\s\\S]*)?(limit\\s*\\d*\\s?;?)+$", re.I)# replace any existing limit with the default limit 20 | res = pattern.sub("\\1 limit " + str(default_limit), sql_str) 21 | return res 22 | else: 23 | pattern = re.compile("([^;]+)[\\s\\S]*$", re.I)# replace any existing limit with the default limit 24 | res = pattern.sub("\\1 limit " + str(default_limit), sql_str) 25 | return res 26 | 27 | 28 | def sql_rewrite( sql_str, default_limit): 29 | sql_str = sql_remove_comment(sql_str) 30 | if sql_is_count(sql_str):# no limit for count 31 | return sql_str 32 | if sql_is_selection(sql_str): #the query is a selection 33 | if sql_extract_limit(sql_str) > default_limit or sql_extract_limit(sql_str) ==0:# the limit is not set or to high 34 | return sql_incrust_limit(sql_str, default_limit) # force the default limit 35 | if sql_is_show(sql_str) or sql_is_describe(sql_str): #the query is a show 36 | pattern = re.compile("(\\w+)\\s+(\\w+)([\\s\\S]*)$", re.I)# replace any existing limit with the default limit 37 | res = pattern.sub("\\1 \\2", sql_str) 38 | return res 39 | return sql_str 40 | 41 | def extract_pattern(sql_str): 42 | sql_str = sql_remove_comment(sql_str) 43 | pattern = re.compile("(\\w+)\\s+(\\w+)([\\s\\S]*)$", re.I) 44 | res = pattern.sub("\\3", sql_str) 45 | return res.strip() 46 | 47 | def sql_is_selection(sql_str): 48 | sql_str = 
sql_remove_comment(sql_str) 49 | return sql_is_with(sql_str) or re.search(r'^\s*select', sql_str, re.I) 50 | 51 | def sql_is_with(sql_str): 52 | sql_str = sql_remove_comment(sql_str) 53 | return re.search(r'^\s*with', sql_str, re.I) 54 | 55 | def sql_is_count(sql_str): 56 | sql_str = sql_remove_comment(sql_str) 57 | return re.search(r'^\s*select\s+count\(.\)\s+from', sql_str, re.I) 58 | 59 | def sql_is_create(sql_str): 60 | sql_str = sql_remove_comment(sql_str) 61 | return re.search(r'^\s*create\s+table\s+.*stored\s+as\s+orc\s+as', sql_str, re.I) 62 | 63 | def sql_is_drop(sql_str): 64 | sql_str = sql_remove_comment(sql_str) 65 | return re.search(r'^\s*drop\s+table', sql_str, re.I) 66 | 67 | def sql_is_describe(sql_str): 68 | sql_str = sql_remove_comment(sql_str) 69 | return re.search(r'^\s*describe\s+', sql_str, re.I) 70 | 71 | def sql_is_show(sql_str): 72 | sql_str = sql_remove_comment(sql_str) 73 | return sql_is_show_tables(sql_str) or sql_is_show_databases(sql_str) 74 | 75 | def sql_is_show_tables(sql_str): 76 | sql_str = sql_remove_comment(sql_str) 77 | return re.search(r'^\s*show\s+tables', sql_str, re.I) 78 | 79 | def sql_is_show_databases(sql_str): 80 | sql_str = sql_remove_comment(sql_str) 81 | return re.search(r'^\s*show\s+databases', sql_str, re.I) 82 | 83 | def sql_is_use(sql_str): 84 | sql_str = sql_remove_comment(sql_str) 85 | return re.search(r'^\s*use\s+', sql_str, re.I) 86 | 87 | def sql_is_set_variable(sql_str): 88 | sql_str = sql_remove_comment(sql_str) 89 | return re.search(r'^\s*set\s+\w+.*=\w+', sql_str, re.I) 90 | 91 | def sql_is_set(sql_str): 92 | sql_str = sql_remove_comment(sql_str) 93 | return re.search(r'^\s*set\s*$', sql_str, re.I) or re.search(r'^\s*set\s+\w+.*$', sql_str, re.I) 94 | 95 | def sql_is_explain(sql_str): 96 | sql_str = sql_remove_comment(sql_str) 97 | return re.search(r'^\s*explain\s+', sql_str, re.I) 98 | 99 | def sql_is_add(sql_str): 100 | sql_str = sql_remove_comment(sql_str) 101 | return re.search(r'^\s*add\s+', 
sql_str, re.I) 102 | 103 | def sql_remove_comment(sql_str): 104 | res = re.sub("--.*\n","", sql_str, re.MULTILINE) 105 | return res 106 | 107 | def sql_explode(sql_str): 108 | tmp = [] 109 | sql_str = sql_remove_comment(sql_str) 110 | for sql in sql_str.split(";"): 111 | if sql.strip() != "": 112 | tmp.append(sql.strip()) 113 | return tmp 114 | 115 | def sql_validate(sql_str): 116 | if sql_is_set(sql_str) or sql_is_add(sql_str) or sql_is_drop(sql_str) or sql_is_create(sql_str) or sql_is_describe(sql_str) or sql_is_show(sql_str) or sql_is_use(sql_str) or sql_is_set_variable(sql_str) or sql_is_selection(sql_str) or sql_is_explain(sql_str): 117 | pass 118 | else: 119 | raise NotAllowedQueriesError() 120 | 121 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ipykernel==4.* 2 | jupyter_client==5.* 3 | jupyter_core==4.* 4 | pandas>=0.23.0 5 | pyhive==0.6.* 6 | sasl==0.2.* 7 | thrift_sasl==0.3.* 8 | pytest==4.2.* 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from os import path 3 | 4 | 5 | here = path.abspath(path.dirname(__file__)) 6 | 7 | 8 | with open(path.join(here, 'requirements.txt')) as f: 9 | requirements = f.read().splitlines() 10 | 11 | with open(path.join(here, 'README.md')) as f: 12 | long_description = f.read() 13 | 14 | 15 | from hiveql.constants import __version__, KERNEL_NAME, DISPLAY_NAME 16 | 17 | setup( 18 | name=KERNEL_NAME + "Kernel", 19 | version=__version__, 20 | description=DISPLAY_NAME + ' Kernel', 21 | long_description=long_description, 22 | url='https://github.com/EDS-APHP/HiveQLKernel', 23 | author='APHP - EDS', 24 | license='MIT', 25 | keywords='Hive HiveQL PyHive Kernel Ipykernel', 26 | packages=find_packages(), 27 | 
# --------------------------------------------------------------------------------
# /setup.py (continued): remainder of the setup() call, preserved as comments
# --------------------------------------------------------------------------------
#     install_requires=requirements,
#     entry_points={
#         'console_scripts':
#             ['jupyter-hiveql = {}.__main__:main'.format(KERNEL_NAME)],
#     },
#     include_package_data=False,  # otherwise package_data is not used
#     package_data={
#         KERNEL_NAME: ['resources/logo-*.png', 'resources/*.css'],
#     },
# )
# --------------------------------------------------------------------------------
# /tests/test_sql.py:
# --------------------------------------------------------------------------------
from hiveql.tool_sql import *
import logging

def test_validate():
    """sql_validate returns None (no exception) for every allowed statement kind."""
    SQL = "select * from toto; "
    assert sql_validate(SQL) is None
    SQL = "drop table toto; "
    assert sql_validate(SQL) is None
    SQL = "with t as (select * from tata) select * from t; "
    assert sql_validate(SQL) is None
    SQL = "show databases"
    assert sql_validate(SQL) is None
    SQL = "describe extended toto"
    assert sql_validate(SQL) is None
    SQL = "describe a.toto"
    assert sql_validate(SQL) is None
    SQL = "show databases mydatabase"
    assert sql_validate(SQL) is None
    SQL = "SET hive.enforce.sorting = true"
    assert sql_validate(SQL) is None
    SQL = "SET hive.enforce.sorting = true;select * from t"
    assert sql_validate(SQL) is None

def test_extract_limit():
    """sql_extract_limit reads the outermost LIMIT, or 0 when absent."""
    SQL = "select * from t limit 200;"
    assert sql_extract_limit(SQL) == 200
    SQL = "select * from (select * from b limit 20000) t limit 1000"
    assert sql_extract_limit(SQL) == 1000
    SQL = "select * from (select * from b limit 20000) t "
    assert sql_extract_limit(SQL) == 0

def test_incrust_limit():
    """sql_incrust_limit forces the default limit onto the outer query."""
    DEFAULT_LIMIT = 100
    SQL = "select * from t limit 200;"
    assert sql_incrust_limit(SQL, DEFAULT_LIMIT) == "select * from t limit 100"
    SQL = "select * from t ;"
    assert sql_incrust_limit(SQL, DEFAULT_LIMIT) == "select * from t limit 100"
    SQL = "select * from t "
    assert sql_incrust_limit(SQL, DEFAULT_LIMIT) == SQL + " limit 100"
    SQL = "select * from (select * from t limit 100 as t) "
    assert sql_incrust_limit(SQL, DEFAULT_LIMIT) == SQL + " limit 100"
    SQL = "with t as (select * from t limit 100) select * from t"
    assert sql_incrust_limit(SQL, DEFAULT_LIMIT) == SQL + " limit 100"

def test_is_selection():
    """SELECT and WITH...SELECT are selections; leading comments are ignored."""
    SQL = "select * from t limit 200;"
    assert sql_is_selection(SQL)
    SQL = " witH a as (select * from t) select * from a limit 200;"
    assert sql_is_selection(SQL)
    SQL = "create * from t limit 200;"
    assert sql_is_selection(SQL) is None
    SQL = "--hello world\n--second\nselect * from toto; "
    assert sql_is_selection(SQL)
    SQL = "--hello world\n--second\nselect * from toto; "
    assert sql_validate(SQL) is None

def test_is_create():
    """Only CTAS stored as ORC counts as a create."""
    SQL = "create table toto stored as orc as select * from t;"
    assert sql_is_create(SQL)
    SQL = " create table toto as select * from t;"
    assert sql_is_create(SQL) is None

def test_is_drop():
    """BUG FIX: this was a duplicate `def test_is_create`, which shadowed the
    real CREATE test above so it never ran under pytest; it exercises
    sql_is_drop and is now named accordingly."""
    SQL = " drop table toto stored as orc as select * from t;"
    assert sql_is_drop(SQL)
    SQL = " drip table toto stored as orc as select * from t;"
    assert sql_is_drop(SQL) is None

def test_is_show():
    SQL = " show databases;"
    assert sql_is_show(SQL)
    SQL = "show databases toto"
    assert sql_is_show(SQL)

def test_is_describe():
    SQL = "describe a.toto"
    assert sql_is_describe(SQL)
    SQL = "show my databases"
    assert sql_is_describe(SQL) is None

def test_rewrite():
    """sql_rewrite caps limits, trims SHOW arguments, and leaves counts alone."""
    DEFAULT_LIMIT = 100
    SQL = "select * from t limit 200;"
    assert sql_rewrite(SQL, DEFAULT_LIMIT) == "select * from t limit 100"
    SQL = "show databases cse"
    assert sql_rewrite(SQL, DEFAULT_LIMIT) == "show databases"
    SQL = "show tables cse"
    assert sql_rewrite(SQL, DEFAULT_LIMIT) == "show tables"
    SQL = "select count(1) from t"
    assert sql_rewrite(SQL, DEFAULT_LIMIT) == "select count(1) from t"
    SQL = "select count(*) from t"
    assert sql_rewrite(SQL, DEFAULT_LIMIT) == "select count(*) from t"

def test_explain():
    SQL = "explain select * from toto"
    assert sql_is_explain(SQL)

def test_ctas():
    """CTAS statements are passed through sql_rewrite unchanged."""
    SQL = "create table toto stored as orc as select * from tata"
    assert sql_rewrite(SQL, 100) == SQL

def test_remove_comment():
    SQL = "--hello world\n--second\nselect * from toto; "
    assert sql_remove_comment(SQL) == "select * from toto; "

def test_add():
    SQL = "add jar;"
    assert sql_is_add(SQL) is not None
    SQL = "ADD jar "
    assert sql_is_add(SQL)

def test_explode_sql():
    """sql_explode splits on ';' and drops empty segments."""
    SQL = "select * from t; "
    assert sql_explode(SQL) == ["select * from t"]
    SQL = "select * from t; use db"
    assert sql_explode(SQL) == ["select * from t", "use db"]