├── LICENSE
├── README.md
├── mozstorage_logparser.py
└── grokexplain.py
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2012, Mozilla Foundation
2 | 
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 | 
7 |     http://www.apache.org/licenses/LICENSE-2.0
8 | 
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # What is it?
2 | 
3 | It takes SQLite explanations and visualizes them. If you have performance data
4 | in the form of I/O counts and opcode execution counts, it can also show those.
5 | 
6 | See [this blog post](http://www.visophyte.org/blog/2010/04/06/performance-annotated-sqlite-explaination-visualizations-using-systemtap/).
7 | 
8 | # What ingredients are required?
9 | 
10 | You need pygraphviz... and graphviz!
11 | 
12 | # How do I make it go?
13 | 
14 | It now parses the command line and has stopped being dumb. Basically you just
15 | point it at a ".json" file of the right type or a ".txt" file that is the
16 | output of performing an "EXPLAIN SELECT" using the sqlite3 binary built with
17 | "--enable-debug" and it does its thing. If you are not fancy enough to build
18 | sqlite with debug, the code still has some leftover logic that is capable of
19 | parsing a schema dump, but you will need to turn that back on yourself or
20 | ask nicely.
21 | 
22 | Invoke the script with "--help" to get more details.
23 | 
24 | Commands you might run to get that nougaty information include:
25 | 
26 |     $ sqlite3-cvs global-messages-db.sqlite 'explain SELECT * FROM messages INNER JOIN messagesText ON messages.id = messagesText.rowid WHERE id IN (SELECT docid FROM messagesText WHERE subject MATCH "sutherland") AND deleted = 0 AND folderID IS NOT NULL AND messageKey IS NOT NULL ORDER BY date DESC LIMIT 100;' > /tmp/explained.txt
27 | 
28 |     $ sqlite3-cvs global-messages-db.sqlite '.schema messages%' > /tmp/schemainfo.txt
--------------------------------------------------------------------------------
/mozstorage_logparser.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | 
3 | # Gecko's Toolkit Storage module logs to the mozStorage NSPR log module.
4 | # This file parses and understands the log results. The primary goal is to
5 | # be able to run Firefox with the logging enabled and end up with a pile of
6 | # useful information.
7 | #
8 | # To produce such a log, set the following environment variables when
9 | # running Firefox, for example by pasting this at the front of a shell
10 | # command line:
11 | #   NSPR_LOG_MODULES=mozStorage:5,timestamp NSPR_LOG_FILE=/tmp/mozStorage.log
12 | #
13 | # Core goals:
14 | # - Be able to filter results to connections based on filename. (We don't have
15 | #   path to go on right now; we should probably enhance mozStorage's logging
16 | #   to be profile-relative or something like that.)
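#   (For example, using the --db-path option this script defines below,
#     python mozstorage_logparser.py --db-path places.sqlite /tmp/mozStorage.log
#   keeps only connection sessions whose filename matches places.sqlite.)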
17 | # - Be able to easily extract the list of statements used (with values) to
18 | #   generate EXPLAIN and grokexplain.py output for them in a bulk-ish fashion.
19 | # - Be able to produce some profile-ish performance statistics from the logs.
20 | #
21 | # ## Implementation Overview ##
22 | #
23 | # Log lines are parsed into a simple normalized dictionary-style representation
24 | # from their human readable form. Higher level processing is then done on
25 | # those, but always keeping those dicts around.
26 | #
27 | # All entries include the following keys:
28 | # - ts (long): timestamp, JS-style; millis since epoch
29 | # - tid (str): thread id
30 | # - type: the entry type
31 | #
32 | # Currently commented out, but you can put it back:
33 | # - raw: the raw string payload of the message
34 | #
35 | # We define the following named types of these entries:
36 | # - open: connection opened. { filename, conn }
37 | # - close: connection closed. { filename, conn }
38 | # - init: statement initialized. { async, sql, conn, stmt } where sql has the
39 | #         parameter placeholders intact.
40 | # - initAsync: the sqlite3_stmt created for an async mozStorage statement.
41 | #         { sql, conn, stmt, native }
42 | # - exec: statement executed (sqlite3_trace). { sql, conn } where sql has the
43 | #         parameter placeholders replaced with the values.
44 | # - profile: statement execution completed (sqlite3_profile) { sql, conn,
45 | #         durationMS }.
46 | # - reset: statement reset and bindings cleared. Only present in DEBUG builds.
47 | #          { sql, conn, stmt } where sql has the parameter placeholders
48 | #          intact.
49 | # - finalize: statement finalized. { sql, gc, conn, stmt }
50 | #
51 | #
52 | # ## Meta ##
53 | # This script is written in Python because I had some existing log parsing
54 | # logic from my Thunderbird days available and because both grokexplain.py
55 | # (from this project, grok-sqlite-explain) and the amazing
56 | # https://github.com/laysakura/SQLiteDbVisualizer tool were written in Python.
57 | #
58 | # My plan is that, like SQLiteDbVisualizer, we use JSON as an interchange
59 | # format, supporting JSON dumping at multiple levels of abstraction, and that
60 | # any UIs will just consume the JSON.
61 | 
62 | import calendar, time
63 | from collections import OrderedDict
64 | from datetime import datetime
65 | import os, os.path
66 | import re
67 | import subprocess
68 | 
69 | import grokexplain
70 | 
71 | import json
72 | import optparse
73 | 
74 | VERBOSE = False
75 | PARANOIA = False
76 | 
77 | def coalesce_indented_lines(lineGen):
78 |     '''Consume a line-providing iterator, coalescing lines that were wrapped
79 |     with indentation, just like many storage users may do when trying to make
80 |     their SQL pretty. This isn't super-smart; lazy log jerks will break us.
81 |     '''
82 |     accum_line = None
83 |     for line in lineGen:
84 |         is_indented = line.startswith(' ') or line.startswith(')')
85 |         if accum_line is not None:
86 |             if not is_indented:
87 |                 yield accum_line
88 |                 accum_line = line
89 |             else:
90 |                 accum_line += line
91 |         else:
92 |             accum_line = line
93 |     if accum_line is not None:
94 |         yield accum_line
95 | 
96 | # example:
97 | # 2015-01-15 21:24:23.942870 UTC
98 | NSPR_LOG_TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S.%f %Z'
99 | def parse_nspr_log_timestamp(s):
100 |     '''Parse the NSPR log timestamp to a JS-style milliseconds-since-epoch'''
101 |     # Ugh. Although I fear that ugh may also be partially due to Stockholm
102 |     # syndrome from JS's time representation.
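    # Worked example: '2015-01-15 21:24:23.942870 UTC' -> utctimetuple()
    # gives 1421357063 seconds since the epoch, so with the microsecond
    # field scaled down to millis the result is 1421357063942.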
103 |     dt = datetime.strptime(s, NSPR_LOG_TIMESTAMP_FORMAT)
104 |     return calendar.timegm(dt.utctimetuple()) * 1000 + dt.microsecond // 1000
105 | 
106 | def unwrap_nspr_log_lines(lineGen):
107 |     '''Consume NSPR timestamped log lines, generating a tuple of (JS-style
108 |     timestamp (long), thread id (str), the message payload)'''
109 |     for line in lineGen:
110 |         tsStr = line[:30]
111 |         if not tsStr.endswith('UTC'):
112 |             if VERBOSE:
113 |                 print 'Line with bad timestamp:', line
114 |             continue
115 |         ts = parse_nspr_log_timestamp(tsStr)
116 | 
117 |         idxThreadStart = line.find('[', 32)
118 |         idxThreadEnd = line.find(']:', idxThreadStart + 1)
119 |         if idxThreadStart == -1:
120 |             if VERBOSE:
121 |                 print 'Line with bad thread id:', line
122 |             continue
123 |         tid = line[idxThreadStart+1:idxThreadEnd]
124 | 
125 |         # +3 gets us to the "D/whatever", +5 gets us to the "whatever"
126 |         level = line[idxThreadEnd+3:idxThreadEnd+4]
127 |         idxModuleSpace = line.find(' ', idxThreadEnd+5)
128 |         module = line[idxThreadEnd+5:idxModuleSpace]
129 |         msg = line[idxModuleSpace+1:].rstrip()
130 | 
131 |         #print repr((ts, tid, level, module, msg))
132 |         yield ts, tid, level, module, msg
133 | 
134 | 
135 | # Opening connection to 'places.sqlite' (7f39931861a0)
136 | RE_OPEN = re.compile("^Opening connection to '(.+)' \((?:0x)?([0-9a-fA-F]+)\)$")
137 | # Closing connection to 'cookies.sqlite'
138 | RE_CLOSE = re.compile("^Closing connection to '(.+)' \((?:0x)?([0-9a-fA-F]+)\)$")
139 | # Initialized statement 'SOME SQL WITH PARAMETER PLACEHOLDERS' (0x7f398ea28b20)
140 | # NOTE! the pointer is the statement pointer, not the connection id!
141 | RE_INIT = re.compile("^(Initialized|Inited async) statement '(.+)' \(conn 0x([0-9a-fA-F]+) stmt 0x([0-9a-fA-F]+)\)$",
142 |                      re.DOTALL)
143 | # sqlite3_trace on 7f399f719fe0 for 'SOME SQL WITH PARAMETERS FILLED'
144 | RE_EXEC = re.compile("^sqlite3_trace on conn 0x([0-9a-fA-F]+) for '(.+)'$",
145 |                      re.DOTALL)
146 | RE_PROFILE = re.compile(
147 |     "^sqlite3_profile on conn 0x([0-9a-fA-F]+) duration (\d+) for '(.+)'$",
148 |     re.DOTALL)
149 | # Resetting statement: 'SOME SQL WITH PARAMETER PLACEHOLDERS'
150 | RE_RESET = re.compile("^Resetting statement: '(.+)' " +
151 |                       "\(conn 0x([0-9a-fA-F]+) stmt 0x([0-9a-fA-F]+)\)$",
152 |                       re.DOTALL)
153 | # Finalizing statement 'SOME SQL W/PLACEHOLDERS'
154 | # Finalizing statement 'SOME SQL W/PLACEHOLDERS' during garbage-collection
155 | # Auto-finalizing SQL statement '...' (conn %p stmt %p)
156 | RE_FINALIZE = re.compile(
157 |     "^(?:Finalizing|Auto-finalizing) (async )?statement '(.+)'" +
158 |     "( during garbage-collection)? " +
159 |     "\(conn 0x([0-9a-fA-F]+) stmt 0x([0-9a-fA-F]+)\)$",
160 |     re.DOTALL)
161 | 
162 | # Native async statement (conn %p stm %p) initialized '...' as %p
163 | RE_NATIVE_ASYNC = re.compile(
164 |     "^Native async statement \(conn 0x([0-9a-fA-F]+) stmt 0x([0-9a-fA-F]+)\)" +
165 |     " initialized '(.+)' as 0x([0-9a-fA-F]+)$",
166 |     re.DOTALL)
167 | 
168 | RE_TRACE_STMT = re.compile(
169 |     "^TRACE_STMT on ([0-9a-fA-F]+): '(.+)'$",
170 |     re.DOTALL)
171 | RE_TRACE_TIME = re.compile(
172 |     "^TRACE_TIME on ([0-9a-fA-F]+): (.+)ms$",
173 |     re.DOTALL)
174 | 
175 | # !!! I haven't implemented parsing for these yet (not seeing them in usage):
176 | # Vacuum failed with error: %d '%s'. Database was: '%s'
177 | # Compilation failure: '...' could not be compiled due to an error: ...
178 | # Cloned statement (conn %p stmt %p) to async native %p
179 | 
180 | class StorageLogParser(object):
181 |     '''
182 |     Generator style low-level parser.
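    Each log line becomes one OrderedDict; e.g. the RE_OPEN example above
    would yield something shaped like:
      {'ts': 1421357063942, 'tid': '...', 'type': 'open',
       'filename': 'places.sqlite', 'conn': '7f39931861a0'}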
183 |     '''
184 |     def parse(self, f):
185 |         for ts, tid, level, module, msg in unwrap_nspr_log_lines(coalesce_indented_lines(f)):
186 |             firstWord = msg[:msg.find(' ')]
187 |             d = OrderedDict()
188 |             d['ts'] = ts
189 |             d['tid'] = tid
190 |             if firstWord == 'Opening':
191 |                 m = RE_OPEN.match(msg)
192 |                 if not m:
193 |                     d['type'] = 'bad'
194 |                     if VERBOSE:
195 |                         print 'Sad open msg:', msg
196 |                 else:
197 |                     d['type'] = 'open'
198 |                     d['filename'] = m.group(1)
199 |                     d['conn'] = m.group(2)
200 |             elif firstWord == 'Closing':
201 |                 m = RE_CLOSE.match(msg)
202 |                 if not m:
203 |                     d['type'] = 'bad'
204 |                     if VERBOSE:
205 |                         print 'Sad close msg:', msg
206 |                 else:
207 |                     d['type'] = 'close'
208 |                     d['filename'] = m.group(1)
209 |                     d['conn'] = m.group(2)
210 |             elif firstWord == 'Initialized' or firstWord == 'Inited':
211 |                 m = RE_INIT.match(msg)
212 |                 if not m:
213 |                     d['type'] = 'bad'
214 |                     if VERBOSE:
215 |                         print 'Sad init msg:', msg
216 |                 else:
217 |                     d['type'] = 'init'
218 |                     d['async'] = m.group(1) != 'Initialized'
219 |                     d['sql'] = m.group(2)
220 |                     d['conn'] = m.group(3)
221 |                     d['stmt'] = m.group(4)
222 |             elif firstWord == 'Native':
223 |                 m = RE_NATIVE_ASYNC.match(msg)
224 |                 if not m:
225 |                     d['type'] = 'bad'
226 |                     if VERBOSE:
227 |                         print 'Sad async native init msg:', msg
228 |                 else:
229 |                     d['type'] = 'initAsync'
230 |                     d['sql'] = m.group(3)
231 |                     d['conn'] = m.group(1)
232 |                     d['stmt'] = m.group(2)
233 |                     d['native'] = m.group(4)
234 |             # Think this is now TRACE_STMT, XXX remove
235 |             elif firstWord == 'sqlite3_trace':
236 |                 m = RE_EXEC.match(msg)
237 |                 if not m:
238 |                     d['type'] = 'bad'
239 |                     if VERBOSE:
240 |                         print 'Sad exec msg:', msg
241 |                 else:
242 |                     d['type'] = 'exec'
243 |                     d['sql'] = m.group(2)
244 |                     d['conn'] = m.group(1)
245 |             # Think this is now TRACE_TIME, XXX remove
246 |             elif firstWord == 'sqlite3_profile':
247 |                 m = RE_PROFILE.match(msg)
248 |                 if not m:
249 |                     d['type'] = 'bad'
250 |                     if VERBOSE:
251 |                         print 'Sad profile msg:', msg
252 |                 else:
253 |                     d['type'] = 'profile'
254 |                     d['sql'] = m.group(3)
255 |                     d['conn'] = m.group(1)
256 |                     d['durationMS'] = float(m.group(2)) / 1000000.0
257 |             elif firstWord == 'Resetting':
258 |                 m = RE_RESET.match(msg)
259 |                 if not m:
260 |                     d['type'] = 'bad'
261 |                     if VERBOSE:
262 |                         print 'Sad reset msg:', msg
263 |                 else:
264 |                     d['type'] = 'reset'
265 |                     d['sql'] = m.group(1)
266 |                     d['conn'] = m.group(2)
267 |                     d['stmt'] = m.group(3)
268 |             elif firstWord == 'Finalizing' or firstWord == 'Auto-finalizing':
269 |                 m = RE_FINALIZE.match(msg)
270 |                 if not m:
271 |                     d['type'] = 'bad'
272 |                     if VERBOSE:
273 |                         print 'Sad finalize msg:', repr(msg)
274 |                 else:
275 |                     d['type'] = 'finalize'
276 |                     d['conn'] = m.group(4)
277 |                     d['stmt'] = m.group(5)
278 |                     d['sql'] = m.group(2)
279 |                     d['gc'] = m.group(3) and True or False
280 |             elif firstWord == 'TRACE_STMT':
281 |                 m = RE_TRACE_STMT.match(msg)
282 |                 if not m:
283 |                     d['type'] = 'bad'
284 |                     if VERBOSE:
285 |                         print 'Sad TRACE_STMT msg:', repr(msg)
286 |                 else:
287 |                     d['type'] = 'exec'
288 |                     d['conn'] = m.group(1)
289 |                     d['sql'] = m.group(2)
290 |             elif firstWord == 'TRACE_TIME':
291 |                 m = RE_TRACE_TIME.match(msg)
292 |                 if not m:
293 |                     d['type'] = 'bad'
294 |                     if VERBOSE:
295 |                         print 'Sad TRACE_TIME msg:', repr(msg)
296 |                 else:
297 |                     d['type'] = 'profile'
298 |                     d['conn'] = m.group(1)
299 |                     d['durationMS'] = float(m.group(2))
300 |             else:
301 |                 d['type'] = 'unknown'
302 |                 if VERBOSE:
303 |                     print 'Weird mozStorage line', msg
304 |             #print d
305 |             yield d
306 | 
307 | RE_PARAM = re.compile('[:?]')
308 | def normalize_param_sql(sql):
309 |     '''
310 |     Given parameterized SQL, create a normalized-ish version that's suitable
311 |     for matching against populated SQL using startswith.
312 |     '''
313 |     m = RE_PARAM.search(sql)
314 |     if m:
315 |         return sql[:m.start(0)]
316 |     return sql
317 | 
318 | class StorageLogChewer(object):
319 |     def __init__(self):
320 |         # all connection sessions
321 |         self.conn_sessions = []
322 | 
323 |     def filter_conn_sessions_by_path(self, filter_path):
324 |         filter_path = os.path.normcase(os.path.abspath(filter_path))
325 |         def checkit(cinfo):
326 |             normed = os.path.normcase(os.path.abspath(cinfo['filename']))
327 |             return normed == filter_path
328 |         return filter(checkit, self.conn_sessions)
329 | 
330 |     def chew(self, parseGen):
331 |         '''
332 |         Consume a StorageLogParser's generator, aggregating information by
333 |         connection pointer at the top level and summarizing executed statements
334 |         within that.
335 | 
336 |         The most useful thing we do is (within a connection), we aggregate
337 |         statements based on their unbound SQL. We then hang the specific bound
338 |         SQL executions off of that. We can use the bound SQL for EXPLAINing
339 |         without going crazy explaining a ton of things. (Noting that if ANALYZE
340 |         is used so the optimizer has more to go on, it may be appropriate to
341 |         EXPLAIN everything and de-duplicate its output.)
342 | 
343 |         NOTE: Our "unique" values are pointers, which can inherently be reused
344 |         once their lifecycle is over.
345 |         '''
346 |         # Maps live/active connection id's (the hex pointer strings) to our
347 |         # connection info aggregate. (see make_conn_info)
348 |         conns_by_id = {}
349 |         # Map connection ids to a list of presumed actively executing
350 |         # statements. Statements get added by 'exec' and removed by 'profile'.
351 |         # We use a list because virtual tables (and maybe other things?) mean
352 |         # that we can potentially have multiple SQL statements "in flight" on
353 |         # a connection at the same time.
354 |         active_statements_by_conn = {}
355 | 
356 |         def find_active_statement_by_param_sql(actives, param_sql):
357 |             norm_sql = normalize_param_sql(param_sql)
358 |             # scan from the most recent exec backwards, including index 0
359 |             for i in xrange(len(actives) - 1, -1, -1):
360 |                 active = actives[i]
361 |                 populated_sql = active['sql']
362 |                 #print 'CHECK', populated_sql.startswith(norm_sql), '!!!', norm_sql, '|||', populated_sql
363 |                 if populated_sql.startswith(norm_sql):
364 |                     return actives.pop(i)
365 |             return None
366 | 
367 |         def make_conn_info(connId, filename, d):
368 |             '''helper that exists because we have to infer memory dbs'''
369 |             cinfo = conns_by_id[connId] = OrderedDict()
370 |             cinfo['filename'] = filename
371 |             cinfo['id'] = connId
372 |             # all logged entries for this conn.
373 |             cinfo['entries'] = []
374 |             cinfo['execs'] = []
375 |             cinfo['uniqueExecs'] = {}
376 |             self.conn_sessions.append(cinfo)
377 |             return cinfo
378 | 
379 |         for d in parseGen:
380 |             dtype = d['type']
381 | 
382 |             connId = d.get('conn')
383 |             if connId:
384 |                 cinfo = conns_by_id.get(connId)
385 |                 if cinfo is None:
386 |                     cinfo = make_conn_info(connId, d.get('filename', '(inferred)'), d)
387 |                 cinfo['entries'].append(d)
388 |             else:
389 |                 connId = None
390 |                 cinfo = None
391 | 
392 |             if dtype == 'open':
393 |                 # Nothing interesting to do for open since we're already
394 |                 # implicitly doing everything.
395 |                 pass
396 |             elif dtype == 'close':
397 |                 # It's no longer alive after this! (Although it wouldn't
398 |                 # be surprising for complicated threading/life-cycle things
399 |                 # to result in some spurious re-inferring of a live connection.)
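                # Dropping the id here matters because of the pointer-reuse
                # NOTE above: a later open that reuses this pointer value
                # should start a fresh session rather than extending this one.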
400 |                 del conns_by_id[connId]
401 |             elif dtype == 'init':
402 |                 # The init pointer is describing the pointer of the statement
403 |                 # right now, so this is entirely useless. We can improve things
404 |                 # by fixing the mozStorage logging.
405 |                 pass
406 |             elif dtype == 'exec':
407 |                 # exec/profile are keyed only by connection
408 |                 active_statements = active_statements_by_conn.get(connId)
409 |                 if active_statements is None:
410 |                     active_statements = active_statements_by_conn[connId] = []
411 |                 active_statements.append(d)
412 |                 # avoid unbounded growth of this tracking
413 |                 if len(active_statements) > 10:
414 |                     active_statements.pop(0)
415 |             elif dtype == 'profile':
416 |                 param_sql = d['sql']
417 |                 active_statements = active_statements_by_conn.get(connId)
418 |                 if active_statements is None:
419 |                     if VERBOSE:
420 |                         print 'Weird profile mismatch on conn', connId
421 |                     continue
422 |                 active = find_active_statement_by_param_sql(active_statements,
423 |                                                             param_sql)
424 |                 if active is None:
425 |                     # this is potentially quite likely, so no logging unless
426 |                     # we want extreme logging
427 |                     if PARANOIA:
428 |                         print 'profile without active', d
429 |                     continue
430 | 
431 |                 # update the exec with info on this
432 |                 execs = cinfo['execs']
433 |                 execInfo = OrderedDict()
434 |                 execInfo['unboundSQL'] = param_sql
435 |                 execInfo['boundSQL'] = active['sql']
436 |                 execInfo['startTS'] = active['ts']
437 |                 execInfo['endTS'] = d['ts']
438 |                 execInfo['durationMS'] = d['durationMS']
439 |                 execs.append(execInfo)
440 |                 # and the unique execs (this is what we use for EXPLAIN, mainly)
441 |                 uniqueExecs = cinfo['uniqueExecs']
442 |                 uniqueExec = uniqueExecs.get(param_sql)
443 |                 if uniqueExec is None:
444 |                     uniqueExec = uniqueExecs[param_sql] = OrderedDict()
445 |                     uniqueExec['unboundSQL'] = param_sql
446 |                     uniqueExec['execs'] = []
447 | 
448 |                 uniqueExecInst = OrderedDict()
449 |                 uniqueExecInst['boundSQL'] = active['sql']
450 |                 uniqueExecInst['startTS'] = active['ts']
451 |                 uniqueExecInst['endTS'] = d['ts']
452 |                 uniqueExecInst['durationMS'] = d['durationMS']
453 | 
454 |                 uniqueExec['execs'].append(uniqueExecInst)
455 | 
456 | def dump_db_schema(sqlite_path, db_path):
457 |     args = [
458 |         sqlite_path,
459 |         db_path,
460 |         '.schema'
461 |     ]
462 |     pope = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
463 |                             stderr=subprocess.PIPE, shell=False)
464 |     pope.stdin.close()
465 |     lines = []
466 |     for line in pope.stdout:
467 |         lines.append(line)
468 |     stderr = pope.stderr.read()
469 |     return lines
470 | 
471 | def run_explain(sqlite_path, db_path, sql):
472 |     sanitized_sql = sql.replace('\n', ' ')
473 |     args = [
474 |         sqlite_path,
475 |         '-separator', '<|>',
476 |         db_path,
477 |         'EXPLAIN ' + sanitized_sql,
478 |     ]
479 |     pope = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
480 |                             stderr=subprocess.PIPE, shell=False)
481 |     pope.stdin.close()
482 |     rows = []
483 |     for line in pope.stdout:
484 |         rows.append(line.rstrip().split('<|>'))
485 |     stderr = pope.stderr.read()
486 |     return rows
487 | 
488 | class ExplainProcessor(object):
489 |     '''
490 |     Process connection sessions to generate EXPLAIN output and graphs.
491 |     '''
492 |     def __init__(self, conn_sessions, out_dir, db_path, sqlite_path):
493 |         self.conn_sessions = conn_sessions
494 |         self.out_dir = out_dir
495 |         self.db_path = db_path
496 |         self.sqlite_path = sqlite_path
497 | 
498 |         self.next_id = 1
499 |         self.explanations = []
500 |         self.already_explained = set()
501 | 
502 |     def processSession(self, cinfo, sg):
503 |         explanations = self.explanations
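        # de-duplicate on unbound SQL so each statement shape is EXPLAINed once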
504 |         already_explained = self.already_explained
505 |         for uexec in cinfo['uniqueExecs'].itervalues():
506 |             unbound_sql = uexec['unboundSQL']
507 |             if unbound_sql in already_explained:
508 |                 continue
509 |             already_explained.add(unbound_sql)
510 | 
511 |             first_exec = uexec['execs'][0]
512 |             first_bound_sql = first_exec['boundSQL']
513 |             rows = run_explain(self.sqlite_path, self.db_path, first_bound_sql)
514 | 
515 |             use_id = self.next_id
516 |             self.next_id += 1
517 |             explain_info = OrderedDict()
518 |             explain_info['id'] = use_id
519 |             explain_info['unboundSQL'] = unbound_sql
520 |             explain_info['usedBoundSQL'] = first_bound_sql
521 |             explain_info['rows'] = rows
522 |             explanations.append(explain_info)
523 | 
524 |             eg = grokexplain.ExplainGrokker()
525 |             eg.parseExplainStringRows(rows, sg)
526 |             eg.performFlow()
527 | 
528 |             # it's possible this was a super-boring thing, in which case we
529 |             # could skip the output steps below; for now we always emit them.
530 | 
531 |             filename_prefix = 'blocks-%d' % (use_id,)
532 |             grokexplain.output_blocks(eg, first_bound_sql, self.out_dir,
533 |                                       filename_prefix)
534 |             explain_info['dot'] = filename_prefix + '.dot'
535 |             explain_info['png'] = filename_prefix + '.png'
536 | 
537 | 
538 | 
539 |     def processAll(self):
540 |         # Since we have the database available, we can automatically dump the
541 |         # schema.
542 |         sg = grokexplain.SchemaGrokker()
543 |         sg.grok(dump_db_schema(self.sqlite_path, self.db_path))
544 | 
545 |         if VERBOSE: print repr(sg.tables)
546 | 
547 |         for session in self.conn_sessions:
548 |             self.processSession(session, sg)
549 | 
550 | 
551 | class CmdLine(object):
552 |     usage = '''usage: %prog [options] mozStorage_nspr.log
553 | 
554 | Process mozStorage NSPR log output.
555 | '''
556 | 
557 |     def buildParser(self):
558 |         parser = optparse.OptionParser(usage=self.usage)
559 | 
560 |         parser.add_option('-v', '--verbose',
561 |                           action='store_true', dest='verbose', default=False,
562 |                           help='Output a lot of info about what we are doing.')
563 | 
564 |         parser.add_option('--db-path',
565 |                           dest='db_path', default=None,
566 |                           help=('Path to the database we care about from this '+
567 |                                 'log; we will filter connections to this name '+
568 |                                 'as well.'))
569 | 
570 |         # This is a sucky way to handle actions, but until we have more stuff
571 |         # to do, it works. If omitted we're just going to dump the structured
572 |         # JSON output to stdout.
573 |         parser.add_option('--explain',
574 |                           action='store_true', dest='do_explain',
575 |                           default=False,
576 |                           help='Automatically EXPLAIN unique SQL statements')
577 | 
578 |         parser.add_option('-o', '--output-dir',
579 |                           dest='out_dir', default='/tmp/explained',
580 |                           help='Directory to output results in.')
581 | 
582 |         parser.add_option('--sqlite',
583 |                           dest='sqlite', default=None,
584 |                           help='SQLite executable to use, preferably debug')
585 | 
586 |         return parser
587 | 
588 |     def run(self):
589 |         global VERBOSE
590 | 
591 |         parser = self.buildParser()
592 |         options, args = parser.parse_args()
593 | 
594 |         VERBOSE = options.verbose
595 | 
596 |         # create the output directory if it doesn't already exist
597 |         if options.out_dir:
598 |             if not os.path.exists(options.out_dir):
599 |                 os.mkdir(options.out_dir)
600 | 
601 |         if options.sqlite:
602 |             sqlite_path = options.sqlite
603 |         else:
604 |             # How the author rolls...
605 |             sqlite_path = os.path.expanduser('~/bin/sqlite3')
606 |             if not os.path.exists(sqlite_path):
607 |                 # How suckers roll... 
(system SQLite almost certainly isn't a 608 | # debug build, unfortunately) 609 | sqlite_path = '/usr/bin/sqlite3' 610 | 611 | db_path = options.db_path 612 | if db_path: 613 | db_path = os.path.abspath(db_path) 614 | 615 | for filename in args: 616 | parser = StorageLogParser() 617 | chewer = StorageLogChewer() 618 | f = open(filename, 'rt') 619 | chewer.chew(parser.parse(f)) 620 | f.close() 621 | 622 | if options.db_path: 623 | conn_sessions = chewer.filter_conn_sessions_by_path(db_path) 624 | else: 625 | conn_sessions = chewer.conn_sessions 626 | 627 | if options.do_explain: 628 | eproc = ExplainProcessor(conn_sessions, options.out_dir, 629 | db_path, sqlite_path) 630 | eproc.processAll() 631 | explained_path = os.path.join(options.out_dir, 'explained.json') 632 | explained_file = open(explained_path, 'wt') 633 | json.dump(eproc.explanations, explained_file, indent=2) 634 | explained_file.close() 635 | else: 636 | print json.dumps(conn_sessions, indent=2) 637 | 638 | 639 | if __name__ == '__main__': 640 | cmdline = CmdLine() 641 | cmdline.run() 642 | -------------------------------------------------------------------------------- /grokexplain.py: -------------------------------------------------------------------------------- 1 | # Attempt to understand what is happening in a SQLite EXPLAIN-ation and 2 | # build a useful control-flow diagram. (We can also do some data-flow 3 | # stuff, but it never really turned out to be too useful.) 4 | # 5 | # Info on the opcodes and what not can be found at: 6 | # http://www.sqlite.org/opcode.html 7 | # 8 | # Andrew Sutherland 9 | 10 | import pygraphviz 11 | import cStringIO as StringIO 12 | import os, os.path, textwrap 13 | import optparse 14 | from cgi import escape as escapeHTML 15 | import subprocess 16 | 17 | class TableSchema(object): 18 | ''' 19 | Table meta-data; the name of a table and its columns. 20 | ''' 21 | def __init__(self, name, colnames): 22 | self.name = name 23 | self.columns = colnames 24 | 25 | 26 | class SchemaGrokker(object): 27 | ''' 28 | Parses a schema dump like "sqlite3 DATABASE .schema" outputs for info 29 | about tables and virtual tables. 30 | 31 | Huh, so, it seems like we don't actually use this at the current time. 32 | I think this was relevant originally, but once I discovered that the debug 33 | build would include basically the same info in the comment column, it 34 | became significantly less important. 
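    As a worked example (hypothetical schema line), grok() turns
      CREATE TABLE messages (id INTEGER PRIMARY KEY, folderID INTEGER);
    into a TableSchema named 'messages' with columns ['id', 'folderID'].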
35 | ''' 36 | def __init__(self): 37 | self.tables = {} 38 | self.virtualTables = {} 39 | 40 | def grok(self, file_or_lines): 41 | for line in file_or_lines: 42 | if line.startswith('CREATE TABLE'): 43 | name = line.split(' ')[2] 44 | # Not sure what the rationale was for this, but this is no good 45 | #if '_' in name: 46 | # # HACK virtual table fallout detection 47 | # continue 48 | 49 | insideParens = line[line.find('(')+1:line.rfind(';')-1] 50 | columnNames = [] 51 | for coldef in insideParens.split(', '): 52 | columnNames.append(coldef.split()[0]) 53 | table = TableSchema(name, columnNames) 54 | self.tables[name] = table 55 | if line.startswith('CREATE VIRTUAL TABLE'): 56 | name = line.split(' ')[3] 57 | if '_' in name: 58 | # HACK virtual table fallout detection 59 | continue 60 | 61 | insideParens = line[line.find('(')+1:line.rfind(';')-1] 62 | columnNames = [] 63 | for coldef in insideParens.split(', ')[1:]: 64 | columnNames.append(coldef.split()[0]) 65 | columnNames.append('everything?') 66 | columnNames.append('docid') 67 | table = TableSchema(name, columnNames) 68 | self.virtualTables[name] = table 69 | 70 | 71 | class Table(object): 72 | def __init__(self, **kwargs): 73 | self.name = kwargs.pop('name') 74 | self.columns = kwargs.pop('columns', 0) 75 | # meh, this should probably be a single mode 76 | self.ephemeral = kwargs.pop('ephemeral', False) 77 | self.virtual = kwargs.pop('virtual', False) 78 | self.pseudo = kwargs.pop('pseudo', False) 79 | self.openedAt = kwargs.pop('openedAt', None) 80 | self.closedAt = kwargs.pop('closedAt', None) 81 | self.schema = kwargs.pop('schema', None) 82 | 83 | if self.schema: 84 | self.name = self.schema.name 85 | 86 | # a table is just a table 87 | self.on = None 88 | 89 | def __str__(self): 90 | return '%s, %d columns' % ( 91 | self.name, 92 | self.columns) 93 | 94 | class Index(object): 95 | def __init__(self, **kwargs): 96 | self.on = kwargs.pop('table', None) 97 | self.name = kwargs.pop('name') 98 | self.columns = kwargs.pop('columns') 99 | self.openedAt = kwargs.pop('openedAt', None) 100 | self.closedAt = kwargs.pop('closedAt', None) 101 | self.schema = kwargs.pop('schema', None) 102 | 103 | def __str__(self): 104 | return 'Index on [%s], %d columns' % (self.on, self.columns,) 105 | 106 | class Cursor(object): 107 | def __init__(self, **kwargs): 108 | self.handle = kwargs.pop('handle') 109 | # the goal of id is to be unique for the entire body. of course, 110 | # we can't guarantee this, so we just copy it over. but hopefully 111 | # we keep the concept and its usage distinct... 
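        # (also note: vtable stand-ins get string handles like "vtab:*:*" via
        # _getVtable below, so a handle is not always an opcode cursor number)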
112 | self.id = self.handle 113 | self.on = kwargs.pop('on') 114 | self.writable = kwargs.pop('writable', True) 115 | self.openedAt = kwargs.pop('openedAt', None) 116 | self.closedAt = kwargs.pop('closedAt', None) 117 | 118 | self.writesAffectedBy = set() 119 | self.seeksAffectedBy = set() 120 | 121 | def __str__(self): 122 | return 'Cursor %d on %s' % (self.handle, self.on,) 123 | 124 | class RegStates(object): 125 | def __init__(self, copyFrom=None): 126 | #: maps register number to a set of cursors that have impacted this reg 127 | self.regCursorImpacts = {} 128 | #: maps register number to a set of values that this register may 129 | #: contain 130 | self.regValues = {} 131 | if copyFrom: 132 | for reg, cursors in copyFrom.regCursorImpacts.items(): 133 | self.regCursorImpacts[reg] = cursors.copy() 134 | if VERBOSE: 135 | print ' copying cursors', reg, cursors 136 | for reg, values in copyFrom.regValues.items(): 137 | if VERBOSE: 138 | print ' copying values', reg, values 139 | self.regValues[reg] = values.copy() 140 | 141 | def getRegCursorImpacts(self, reg): 142 | # I think there is defaultdict stuff we could use now... 143 | if reg in self.regCursorImpacts: 144 | return self.regCursorImpacts[reg] 145 | else: 146 | return set() 147 | 148 | def setRegCursorImpacts(self, reg, cursors): 149 | self.regCursorImpacts[reg] = cursors.copy() 150 | 151 | def addRegValue(self, reg, value): 152 | if reg in self.regValues: 153 | values = self.regValues[reg] 154 | else: 155 | values = self.regValues[reg] = set() 156 | alreadyPresent = value in values 157 | values.add(value) 158 | return not alreadyPresent 159 | 160 | def setRegValue(self, reg, value): 161 | self.regValues[reg] = set([value]) 162 | 163 | def getRegValues(self, reg): 164 | if reg in self.regValues: 165 | return self.regValues[reg] 166 | else: 167 | return set() 168 | 169 | def checkDelta(self, other): 170 | # -- cursor impact 171 | # update regs the other guy also has 172 | for reg, cursors in self.regCursorImpacts.items(): 173 | if reg in other.regCursorImpacts: 174 | # difference if they don't match 175 | if cursors != other.regCursorImpacts[reg]: 176 | return True 177 | else: # difference if he didn't have it 178 | return True 179 | # different if he has something we don't have 180 | for reg, cursors in other.regCursorImpacts.items(): 181 | if not reg in self.regCursorImpacts: 182 | return True 183 | 184 | # -- values 185 | for reg, values in self.regValues.items(): 186 | if reg in other.regValues: 187 | # diff if they don't match 188 | if values != other.regValues[reg]: 189 | return True 190 | else: # difference if he didn't have it 191 | return True 192 | # different if he has something we don't have 193 | for reg, values in other.regValues.items(): 194 | if not reg in self.regValues: 195 | return True 196 | 197 | def copy(self): 198 | return RegStates(self) 199 | 200 | def update(self, other): 201 | ''' 202 | Update the states of our registers with the states of the registers in 203 | the other RegStates object. 204 | ''' 205 | # -- cursor impact 206 | # update regs the other guy also has 207 | for reg, cursors in self.regCursorImpacts.items(): 208 | # this sorta defeats our getitem magic... 
209 | if reg in other.regCursorImpacts: 210 | cursors.update(other.regCursorImpacts[reg]) 211 | # copy regs we don't have (but the other guy does) 212 | for reg, cursors in other.regCursorImpacts.items(): 213 | if not reg in self.regCursorImpacts: 214 | self.regCursorImpacts[reg] = cursors.copy() 215 | 216 | # -- values 217 | for reg, values in self.regValues.items(): 218 | # this sorta defeats our getitem magic... 219 | if reg in other.regValues: 220 | values.update(other.regValues[reg]) 221 | # copy regs we don't have (but the other guy does) 222 | for reg, values in other.regValues.items(): 223 | if not reg in self.regValues: 224 | self.regValues[reg] = values.copy() 225 | 226 | def __str__(self): 227 | return '\n cursor impacts: %s\n values: %s' % ( 228 | repr(self.regCursorImpacts), repr(self.regValues)) 229 | 230 | def graphStr(self): 231 | s = '' 232 | for reg, values in self.regValues.items(): 233 | s += ' r%s: %s' % (reg, values) 234 | 235 | if HAVE_COUNTS: 236 | s = "%s" % (s,) 237 | else: 238 | s = "%s" % (s,) 239 | return s 240 | 241 | class BasicBlock(object): 242 | def __init__(self, ops): 243 | self.ops = ops 244 | self.inRegs = RegStates() 245 | self.outRegs = RegStates() 246 | self.done = False 247 | 248 | # establish a back-link for us lazy-types 249 | for op in self.ops: 250 | op.block = self 251 | 252 | @property 253 | def id(self): 254 | return self.ops[0].addr 255 | 256 | @property 257 | def lastAddr(self): 258 | return self.ops[-1].addr 259 | 260 | @property 261 | def comeFrom(self): 262 | return self.ops[0].comeFrom 263 | 264 | @property 265 | def goTo(self): 266 | lastOp = self.ops[-1] 267 | # if we have explicit goto's, use them 268 | if lastOp.goTo: 269 | return lastOp.goTo 270 | # otherwise assume we should flow to the next guy if we're non-terminal 271 | if lastOp.terminate: 272 | return [] 273 | else: 274 | return [self.ops[-1].addr + 1] 275 | 276 | def __str__(self): 277 | return 'Block id: %d last: %d comeFrom: %s goTo: %s' % ( 278 | self.id, self.lastAddr, self.comeFrom, self.goTo) 279 | 280 | class GenericOpInfo(object): 281 | ''' 282 | Simple op meta-info. 283 | ''' 284 | def __init__(self, addr, name, params, comment): 285 | self.addr = addr 286 | self.name = name 287 | self.params = params 288 | self.comment = comment 289 | 290 | #: opcode addresses that may jump/flow to this opcode 291 | self.comeFrom = [] 292 | #: opcode addresses that we may transfer control to (jump or next) 293 | self.goTo = [] 294 | #: database handle-y things created by this opcode 295 | self.births = [] 296 | #: database handle-y things closed by this opcode 297 | self.kills = [] 298 | 299 | #: register numbers that we (may) read from 300 | self.regReads = [] 301 | #: register numbers that we (may) write to 302 | self.regWrites = [] 303 | #: the immediate value this opcode uses (if it uses one). 304 | #: We can't represent some immediates, so this may just be true 305 | self.usesImmediate = None 306 | #: the Cursor this opcode uses for read purposes, if any 307 | self.usesCursor = None 308 | #: the Cursor this opcode uses for write purposes, if any. 
implies usesCursor is set too.
309 |         self.writesCursor = None
310 |         #: the Cursor this opcode performs a seek operation on
311 |         self.seeksCursor = None
312 |         #: the column numbers (on the cursor) this op accesses
313 |         self.usesColumns = None
314 |         #: does this opcode terminate program execution
315 |         self.terminate = False
316 |         #: does this opcode engage in dynamic jumps (and as such we should hint
317 |         #: to the basic block engine to create a block)
318 |         self.dynamicGoTo = False
319 |         #: used by Gosub to track the register it writes to
320 |         self.dynamicWritePC = None
321 | 
322 |         #: (potentially normalized) number of times opcode invoked
323 |         self.invocCount = None
324 |         #: (potentially normalized) number of btree pages accessed
325 |         self.pageCount = None
326 | 
327 |         self.affectedByCursors = set()
328 | 
329 |     def ensureJumpTargets(self, addrs, adjustment=1):
330 |         '''
331 |         Make sure this opcode knows that it can jump to the given addresses
332 |         (probably with an adjustment). Returns the set of targets that were
333 |         unknown.
334 |         '''
335 |         unknown = set()
336 |         for addr in addrs:
337 |             realAddr = addr + adjustment
338 |             if realAddr not in self.goTo:
339 |                 self.goTo.append(realAddr)
340 |                 unknown.add(realAddr)
341 |         return unknown
342 | 
343 |     HIGHLIGHT_OPS = {'Yield': '#ff00ff',
344 |                      'Gosub': '#ff00ff',
345 |                      'Return': '#ff00ff'}
346 |     INVOC_THRESHOLDS = [0, 1, 2, 10, 100, 1000]
347 |     PAGE_THRESHOLDS = [0, 1, 2, 8, 64, 256]
348 |     def graphStr(self, schemaInfo):
349 |         def bar(count, thresholds):
350 |             if count == thresholds[0]:
351 |                 return "||||| "
352 |             elif count == thresholds[1]:
353 |                 return ("|" +
354 |                         "|||| ")
355 |             elif count == thresholds[2]:
356 |                 return ("|" +
357 |                         "|||| ")
358 |             elif count <= thresholds[3]:
359 |                 return ("||" +
360 |                         "||| ")
361 |             elif count <= thresholds[4]:
362 |                 return ("|||" +
363 |                         "|| ")
364 |             elif count <= thresholds[5]:
365 |                 return ("||||" +
366 |                         "| ")
367 |             else:
368 |                 return "||||| "
369 | 
370 |         if HAVE_COUNTS:
371 |             s = bar(self.invocCount or 0, self.INVOC_THRESHOLDS)
372 |             s += bar(self.pageCount or 0, self.PAGE_THRESHOLDS)
373 |         else:
374 |             s = ''
375 |         # (the <font> wrappers below are graphviz HTML-label markup, restored
376 |         # to soak up the color arguments the format strings were dropping)
377 |         if self.usesCursor:
378 |             s += '<font color="%s">%d %s [%s]</font>' % (
379 |                 self.usesCursor.color, self.addr, self.name,
380 |                 self.usesCursor.handle)
381 |         elif self.name in self.HIGHLIGHT_OPS:
382 |             s += '%d <font color="%s">%s</font>' % (
383 |                 self.addr, self.HIGHLIGHT_OPS[self.name],
384 |                 escapeHTML(self.name))
385 |         else:
386 |             s += '%d %s' % (self.addr, escapeHTML(self.name))
387 | 
388 |         if self.affectedByCursors:
389 |             cursors = list(self.affectedByCursors)
390 |             cursors.sort(lambda a, b: a.handle - b.handle)
391 |             cursorStrings = []
392 |             for cursor in cursors:
393 |                 if cursor == self.usesCursor:
394 |                     continue
395 |                 cursorStrings.append(
396 |                     '<font color="%s">%s</font>' % (
397 |                         cursor.color, cursor.handle))
398 |             if cursorStrings:
399 |                 s += ' (' + ' '.join(cursorStrings) + ')'
400 | 
401 |         if self.usesCursor and self.births:
402 |             if self.usesCursor in self.births and self.usesCursor.on:
403 |                 s += ' %s' % (escapeHTML(self.usesCursor.on.name),)
404 | 
405 |         if self.usesImmediate is not None:
406 |             s += ' imm %s' % (self.usesImmediate)
407 | 
408 |         if self.usesColumns:
409 |             schema = self.usesCursor.on.schema
410 |             if schema:
411 |                 colNames = []
412 |                 for colNum in self.usesColumns:
413 |                     colNames.append(escapeHTML(schema.columns[colNum]))
414 |                 s += ' col %s' % (', '.join(colNames))
415 | 
416 |         if self.regReads:
417 |             s += ' using r%s' % (', r'.join(map(str, self.regReads)),)
418 |         if self.regWrites:
419 |             s += ' to r%s' % (', r'.join(map(str, self.regWrites)),)
420 | 
421 |         if self.comment:
422 |             s += " %s" % (escapeHTML(self.comment),
423 |                           )
424 | 
425 |         if HAVE_COUNTS:
426 |             s = ("%s" +
427 |                  " %d%d") % (
428 |                 s, self.invocCount or 0, self.pageCount or 0)
429 |         else:
430 |             s = "%s" % (s,)
431 |         return s
432 | 
433 |     def dump(self):
434 |         if self.comeFrom:
435 |             print ' ', self.comeFrom, '---->'
436 |         print '%d %s' % (self.addr, self.name),
437 |         print ' reads: %s writes: %s' % (self.regReads, self.regWrites)
438 |         if self.goTo:
439 |             print '    ---->', self.goTo
440 | 
441 | class ExplainGrokker(object):
442 |     def __init__(self):
443 |         self.ephemeralTables = []
444 |         self.virtualTables = []
445 |         #: maps "vtab:*:*" strings to Table instances...
446 |         self.vtableObjs = {}
447 |         self.realTables = []
448 |         self.realTablesByName = {}
449 |         self.pseudoTables = []
450 | 
451 |         self.allTables = []
452 | 
453 |         self.indices = []
454 |         self.realIndicesByName = {}
455 |         self.cursors = []
456 |         self.code = []
457 |         self.cursorByHandle = {}
458 | 
459 |         self.resultRowOps = []
460 | 
461 |     def _newEphemeralTable(self, **kwargs):
462 |         table = Table(ephemeral=True, openedAt=self.op, **kwargs)
463 |         self.ephemeralTables.append(table)
464 |         self.allTables.append(table)
465 |         self.op.births.append(table)
466 |         return table
467 | 
468 |     def _newVirtualTable(self, vtabkey, **kwargs):
469 |         table = Table(virtual=True, openedAt=self.op, **kwargs)
470 |         self.vtableObjs[vtabkey] = table
471 |         self.virtualTables.append(table)
472 |         self.allTables.append(table)
473 |         self.op.births.append(table)
474 |         return table
475 | 
476 |     def _newRealTable(self, nameAndInfo, **kwargs):
477 |         rootIdb, name = nameAndInfo.split('; ')
478 |         if name in self.realTablesByName:
479 |             table = self.realTablesByName[name]
480 |         else:
481 |             table = Table(name=name, openedAt=self.op, **kwargs)
482 |             self.realTables.append(table)
483 |             self.realTablesByName[name] = table
484 |             self.allTables.append(table)
485 |             self.op.births.append(table)
486 |         return table
487 | 
488 |     def _newPseudoTable(self, **kwargs):
489 |         table = Table(pseudo=True, openedAt=self.op, **kwargs)
490 |         self.pseudoTables.append(table)
491 |         self.allTables.append(table)
492 |         self.op.births.append(table)
493 |         return table
494 | 
495 |     def _parseKeyinfo(self, indexDetails):
496 |         # see displayP4 in the source
497 |         # right now we just care about the first arg, nField, which is the
498 |         # "number of key columns in the index"
499 |         keyparts = indexDetails[2:-1].split(',')
500 |         numColumns = int(keyparts[0])
501 |         return numColumns
502 | 
503 |     def _newIndexOn(self, indexDetails, nameAndInfo=None, **kwargs):
504 |         # indexDetails is of the form: "keyinfo(%d,"...
505 |         numColumns = self._parseKeyinfo(indexDetails)
506 |         name = (nameAndInfo.split('; ')[1] if nameAndInfo is not None
507 |                 else kwargs.pop('name'))  # the ephemeral-index case
508 |         if name in self.realIndicesByName:
509 |             index = self.realIndicesByName[name]
510 |         else:
511 |             index = Index(columns=numColumns, openedAt=self.op, name=name, **kwargs)
512 |             self.indices.append(index)
513 |             self.realIndicesByName[name] = index
514 |             self.op.births.append(index)
515 |         return index
516 | 
517 |     def _newCursor(self, handle, thing, **kwargs):
518 |         if handle in self.cursorByHandle:
519 |             # R/W change is okay
520 |             cursor = self.cursorByHandle[handle]
521 |             if cursor.on != thing:
522 |                 raise Exception('ERROR! Cursor handle collision!')
523 |         else:
524 |             cursor = Cursor(handle=handle, on=thing, openedAt=self.op, **kwargs)
525 |             self.cursors.append(cursor)
526 |             self.cursorByHandle[handle] = cursor
527 |             self.op.births.append(cursor)
528 |         self.op.usesCursor = cursor
529 |         self.op.affectedByCursors.add(cursor)
530 |         return cursor
531 | 
532 |     def _getCursor(self, handle, write=False, seek=False):
533 |         cursor = self.cursorByHandle[handle]
534 |         self.op.usesCursor = cursor
535 |         self.op.writesCursor = write
536 |         self.op.seeksCursor = seek
537 |         self.op.affectedByCursors.add(cursor)
538 |         return cursor
539 | 
540 |     def _getVtable(self, vtabkey, write=False):
541 |         '''
542 |         Given a P4-resident "vtab:*:*" key, find or create a cursor-ish stand-in.
543 |         '''
544 |         if vtabkey in self.cursorByHandle:
545 |             cursor = self.cursorByHandle[vtabkey]
546 |         else:
547 |             cursor = Cursor(handle=vtabkey, on=None)
548 |             self.cursors.append(cursor)
549 |             self.cursorByHandle[vtabkey] = cursor
550 | 
551 |         self.op.usesCursor = cursor
552 |         self.op.writesCursor = write
553 |         self.op.affectedByCursors.add(cursor)
554 |         return cursor
555 | 
556 | 
557 |     def _killThing(self, thing):
558 |         self.op.kills.append(thing)
559 |         thing.closedAt = self.op
560 | 
561 |         if thing.on:
562 |             self._killThing(thing.on)
563 | 
564 |     def _killCursor(self, handle):
565 |         if handle not in self.cursorByHandle:
566 |             print 'Warning; tried to close a non-open cursor; might be our bad'
567 |             return
568 |         cursor = self._getCursor(handle)
569 |         self._killThing(cursor)
570 | 
571 |     def _op_OpenCommon(self, params, writable):
572 |         # if P4 is a keyinfo, then it's an index and the comment is the name of
573 |         # the index.
574 |         # if P4 is not a keyinfo, then it's the number of columns in the table
575 |         # and the comment is the name of the table.
576 |         cursorNum = params[0]
577 | 
578 |         if isinstance(params[3], basestring):
579 |             indexDetails = params[3]
580 |             cursorOn = self._newIndexOn(indexDetails,
581 |                                         self.op.comment)
582 |         else:
583 |             cursorOn = self._newRealTable(self.op.comment,
584 |                                           columns=params[3])
585 | 
586 |         self._newCursor(cursorNum, cursorOn, writable=writable)
587 | 
588 |     def _op_OpenRead(self, params):
589 |         self._op_OpenCommon(params, False)
590 | 
591 |     def _op_OpenWrite(self, params):
592 |         self._op_OpenCommon(params, True)
593 | 
594 |     def _op_OpenPseudo(self, params):
595 |         # a pseudo-table is a 'fake table with a single row of data'
596 |         # cursor is P1
597 |         # the pseudo-table is stored into a blob in P2
598 |         # number of fields/columns is P3
599 |         # XXX our Column opcode might benefit from being aware that it's dealing
600 |         # with a cursor to a pseudo table so that it can translate its actions
601 |         # into actions on the underlying register too. but we no longer really
602 |         # care about that so much these days.
603 |         cursorNum = params[0]
604 |         table = self._newPseudoTable(
605 |             name=("pseudo%d" % (cursorNum,)),
606 |             columns=params[2])
607 |         self._newCursor(cursorNum, table)
608 |         self.op.regWrites.append(params[1])
609 | 
610 |     def _op_VOpen(self, params):
611 |         # p1: cursor number
612 |         # p4: vtable structure
613 |         cursorNum = params[0]
614 |         table = self._newVirtualTable(params[3],
615 |                                       name=("virtual%d" % (cursorNum,)))
616 |         self._newCursor(cursorNum, table)
617 | 
618 |     def _op_OpenEphemeral(self, params):
619 |         cursorNum = params[0]
620 |         numColumns = params[1]
621 |         indexDetails = params[3]
622 |         table = self._newEphemeralTable(
623 |             name=("ephemeral%d" % (cursorNum,)),
624 |             columns=numColumns)
625 |         if indexDetails:
626 |             cursorOn = self._newIndexOn(indexDetails,
627 |                                         table=table,
628 |                                         name="eindex%d" % (cursorNum,))
629 |         else:
630 |             cursorOn = table
631 |         self._newCursor(cursorNum, cursorOn)
632 | 
633 |     def _op_SorterOpen(self, params):
634 |         # per docs, this is just like OpenEphemeral but it's for "large tables
635 |         # using an external merge-sort algorithm".
636 |         pass
637 |     def _op_SorterInsert(self, params):
638 |         # same as IdxInsert
639 |         pass
640 |     def _op_SorterSort(self, params):
641 |         # same as Sort
642 |         pass
643 |     def _op_SorterData(self, params):
644 |         # Its own thing
645 |         pass
646 |     def _op_SorterNext(self, params):
647 |         # advance read cursor to next sorted element
648 |         pass
649 | 
650 | 
651 | 
652 |     def _op_Permute(self, params):
653 |         # it just prints "intarray" at least for non-debug. not very helpful!
654 |         pass
655 | 
656 |     def _op_Compare(self, params):
657 |         # P1..(P1+P3-1)
658 |         self.op.regReads.extend([params[0] + x for x in range(params[2])])
659 |         # P2..(P2+P3-1)
660 |         self.op.regReads.extend([params[1] + x for x in range(params[2])])
661 |         # uh, we don't use this yet.
662 |         self._parseKeyinfo(params[3])
663 |         # we contaminate the jump decision...
664 |         self.op.regWrites.append('for_jump')
665 | 
666 |     def _condJump(self, regs, target):
667 |         if regs:
668 |             if isinstance(regs, list):
669 |                 self.op.regReads.extend(regs)
670 |             else:
671 |                 self.op.regReads.append(regs)
672 |         self.op.goTo.append(self.op.addr + 1)
673 |         self.op.goTo.append(target)
674 |         self.op.usesImmediate = target
675 | 
676 |     def _jump(self, target):
677 |         self.op.goTo.append(target)
678 |         self.op.usesImmediate = target
679 | 
680 |     def _op_Goto(self, params):
681 |         self._jump(params[1])
682 | 
683 |     def _op_Init(self, params):
684 |         # says to jump to P2 if P2 is not zero
685 |         if (params[1]):
686 |             self._jump(params[1])
687 | 
688 |     def _op_Jump(self, params):
689 |         # we base our decision on the result of the last compare
690 |         self.op.regReads.append('for_jump')
691 |         self._jump(params[0])
692 |         self._jump(params[1])
693 |         self._jump(params[2])
694 |         self.op.usesImmediate = None # too many for now... XXX
695 | 
696 |     def _op_Gosub(self, params):
697 |         self.op.regWrites.append(params[0])
698 |         self.op.dynamicWritePC = params[0]
699 |         self._jump(params[1])
700 |         if NO_YIELDS:
701 |             self.op.goTo.append(self.op.addr + 1)
702 | 
703 |     # def _op_InitCoroutine(self, params):
704 |     #     pass
705 | 
706 |     # def _op_EndCoroutine(self, params):
707 |     #     pass
708 | 
709 |     def _op_Yield(self, params):
710 |         self.op.regReads.append(params[0])
711 |         self.op.regWrites.append(params[0])
712 |         if not NO_YIELDS:
713 |             self.op.dynamicWritePC = params[0]
714 |         # we won't know where our goTo goes to until dataflow analysis, nor
715 |         # where we would 'come from' to the next opcode. 
But we do know that 716 | # after us is a basic block break, so let's hint that. 717 | self.op.dynamicGoTo = params[0] 718 | # do not arbitrarily flow to the next dude! 719 | self.op.terminate = True 720 | 721 | def _op_Return(self, params): 722 | # just like for Yield, we have no idea where we are going until 723 | # dataflow. 724 | self.op.regReads.append(params[0]) 725 | self.op.dynamicGoTo = params[0] 726 | 727 | def _op_NullRow(self, params): 728 | # moves us to a no-op row 729 | self._getCursor(params[0], False, True) 730 | 731 | def _op_Seek(self, params): 732 | self._getCursor(params[0], False, True) 733 | self.op.regReads.append(params[1]) 734 | 735 | def _op_SeekCommon(self, params, comparison): 736 | cursor = self._getCursor(params[0], False, True) 737 | if isinstance(cursor.on, Table): 738 | self.op.regReads.append(params[2]) 739 | else: 740 | for x in range(params[3]): 741 | self.op.regReads.append(params[2] + x) 742 | if params[1]: 743 | self._condJump(None, params[1]) 744 | 745 | def _op_SeekLT(self, params): 746 | self._op_SeekCommon(params, '<') 747 | def _op_SeekLE(self, params): 748 | self._op_SeekCommon(params, '<=') 749 | def _op_SeekGE(self, params): 750 | self._op_SeekCommon(params, '>=') 751 | def _op_SeekGT(self, params): 752 | self._op_SeekCommon(params, '>') 753 | 754 | def _op_IdxCommon(self, params, comparison): 755 | self._getCursor(params[0]) 756 | indexKey_regs = [params[2] + x for x in range(params[3])] 757 | self._condJump(indexKey_regs, params[1]) 758 | 759 | def _op_IdxLT(self, params): 760 | self._op_IdxCommon(params, '<') 761 | def _op_IdxLE(self, params): 762 | self._op_IdxCommon(params, '<=') 763 | def _op_IdxGE(self, params): 764 | self._op_IdxCommon(params, '>=') 765 | def _op_IdxGT(self, params): 766 | self._op_IdxCommon(params, '>') 767 | 768 | def _op_IdxRowid(self, params): 769 | self._getCursor(params[0]) 770 | self.op.regWrites.append(params[1]) 771 | def _op_Rowid(self, params): 772 | self._op_IdxRowid(params) 773 | def _op_NewRowid(self, params): 774 | self._getCursor(params[0]) 775 | self.op.regReads.append(params[2]) 776 | self.op.regWrites.extend(params[1:3]) 777 | 778 | def _op_RowSetAdd(self, params): 779 | # reg[p2] => reg[p1] 780 | self.op.regReads.append(params[1]) 781 | self.op.regWrites.append(params[0]) 782 | 783 | def _op_RowSetRead(self, params): 784 | # extract from reg[p1] => reg[p3], or conditional jump to p2 785 | self._condJump(params[0], params[1]) 786 | self.op.regWrites.append(params[2]) 787 | 788 | def _op_NotExists(self, params): 789 | self._getCursor(params[0], False, True) 790 | self._condJump(params[2], params[1]) 791 | 792 | def _op_Found(self, params): 793 | self._getCursor(params[0], False, True) 794 | self._condJump(params[2], params[1]) 795 | def _op_NotFound(self, params): 796 | self._op_Found(params) 797 | 798 | def _op_VBegin(self, params): 799 | # This is used in conjunction with VUpdate without any VOpen. Although 800 | # there is no real cursor that results from this, the usage is akin to a 801 | # cursor... 
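        # The P4 "vtab:*:*" string doubles as the handle here, so a following
        # VUpdate on the same vtable finds this same stand-in in cursorByHandle.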
802 |         vtcursor = self._getVtable(params[3])
803 |         self.op.births.append(vtcursor)
804 |         vtcursor.openedAt = self.op
805 | 
806 |     def _op_VUpdate(self, params):
807 |         # performs virtual table INSERT and/or DELETE
808 |         # p1 is whether we should update the last_insert_rowid value
809 |         # p2 is the number of arguments
810 |         # p3 is the register index of the first argument (of which we have p2
811 |         #    arguments)
812 |         # p4 is the vtable we are operating on ("vtab:*:*")
813 |         self.op.regReads.extend([params[2] + x for x in range(params[1])])
814 |         self._getVtable(params[3], True)
815 | 
816 |     def _op_VFilter(self, params):
817 |         self._getCursor(params[0], False, True)
818 |         # +1 is actually argc, which we can't see without a bit'o'legwork
819 |         # TODOMAYBE: fancy legwork if we can statically know the argc
820 |         self.op.regReads.extend([params[2], params[2]+1,
821 |                                  # however, we do know it must be >= 1
822 |                                  params[2] + 2])
823 |         self._condJump(None, params[1])
824 | 
825 |     def _op_VNext(self, params):
826 |         self._getCursor(params[0], False, True)
827 |         self._condJump(None, params[1])
828 |     def _op_Next(self, params):
829 |         self._op_VNext(params)
830 |     def _op_Prev(self, params):
831 |         self._op_VNext(params)
832 | 
833 |     def _op_Last(self, params):
834 |         self._getCursor(params[0], False, True)
835 |         if params[1]:
836 |             self._condJump(None, params[1])
837 |     def _op_Rewind(self, params):
838 |         self._op_Last(params)
839 |     _op_Sort = _op_Rewind
840 | 
841 |     def _op_Column(self, params):
842 |         self._getCursor(params[0])
843 |         self.op.usesColumns = [params[1]]
844 |         self.op.regWrites.append(params[2])
845 |     def _op_VColumn(self, params):
846 |         self._op_Column(params)
847 | 
848 |     def _op_Affinity(self, params):
849 |         '''
850 |         This sets the data type of a bunch of registers...
851 |         Treating this as a nop since it doesn't really do anything.
852 |         '''
853 |         pass
854 | 
855 |     def _op_MakeRecord(self, params):
856 |         self.op.regReads.extend([params[0] + x for x in range(params[1])])
857 |         # writes to reg p3
858 |         self.op.regWrites.append(params[2])
859 | 
860 |     def _op_ResultRow(self, params):
861 |         self.op.regReads.extend([params[0] + x for x in range(params[1])])
862 |         self.resultRowOps.append(self.op)
863 | 
864 |     def _op_AggStep(self, params):
865 |         # reads are taken from P2 onwards, P5 is the number of args
866 |         self.op.regReads.extend([params[1] + x for x in range(int(params[4]))])
867 |         # P3 is the accumulator so writes go there
868 |         self.op.regWrites = [params[2]]
869 |         # p4 is the function def, ex: count(1), currently ignored
870 |         pass
871 | 
872 |     def _op_AggFinal(self, params):
873 |         # So although P2 is the number of args, the docs say that this function
874 |         # really only cares about the accumulator
875 |         self.op.regReads = [params[0]]
876 |         # P4 is the function def, ex: count(1), currently ignored
877 | 
878 |     def _op_Insert(self, params):
879 |         # P4 is the table...
880 |         self._getCursor(params[0], True)
881 |         self.op.regReads.extend([params[1], params[2]])
882 | 
883 |     def _op_IdxInsert(self, params):
884 |         self._getCursor(params[0], True)
885 |         self.op.regReads.append(params[1])
886 | 
887 |     def _op_Delete(self, params):
888 |         # a delete is a write... 
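        # (per the SQLite opcode docs the cursor is already positioned on the
        # row being deleted, so there are no register reads to record)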
889 |         self._getCursor(params[0], True)
890 | 
891 |     def _op_IdxDelete(self, params):
892 |         # delete from cursor P1
893 |         # index key is packed into p2...p2+p3-1
894 |         self._getCursor(params[0], True)
895 |         self.op.regReads.extend([params[1] + x for x in range(params[2])])
896 | 
897 |     def _op_Sequence(self, params):
898 |         self._getCursor(params[0])
899 |         self.op.regWrites.append(params[1])
900 | 
901 |     def _op_Close(self, params):
902 |         self._killCursor(params[0])
903 | 
904 |     def _op_IsNull(self, params):
905 |         if params[2]:
906 |             regs = [params[0] + x for x in range(params[2])]
907 |         else:
908 |             regs = params[0]
909 |         self._condJump(regs, params[1])
910 | 
911 |     def _op_NotNull(self, params):
912 |         self._condJump([params[0]], params[1])
913 | 
914 |     def _op_MustBeInt(self, params):
915 |         self.op.regReads.append(params[0])
916 |         self.op.goTo.append(self.op.addr + 1)
917 |         if params[1]:
918 |             self.op.goTo.append(params[1])
919 |         else:
920 |             # you know what? we don't care about exceptions. screw them.
921 |             #self.op.goTo.append('SQLITE_MISMATCH')
922 |             pass
923 | 
924 |     def _op_If(self, params):
925 |         self._condJump([params[0]], params[1])
926 |     def _op_IfNot(self, params):
927 |         self._condJump([params[0]], params[1])
928 |     def _op_IfPos(self, params):
929 |         self._condJump([params[0]], params[1])
930 | 
931 |     def _op_Eq(self, params):
932 |         self._condJump([params[0], params[2]], params[1])
933 |     def _op_Ne(self, params):
934 |         self._condJump([params[0], params[2]], params[1])
935 |     def _op_Lt(self, params):
936 |         self._condJump([params[0], params[2]], params[1])
937 |     def _op_Le(self, params):
938 |         self._condJump([params[0], params[2]], params[1])
939 |     def _op_Gt(self, params):
940 |         self._condJump([params[0], params[2]], params[1])
941 |     def _op_Ge(self, params):
942 |         self._condJump([params[0], params[2]], params[1])
943 | 
944 |     def _op_IfZero(self, params):
945 |         self._condJump([params[0]], params[1])
946 | 
947 |     def _op_Variable(self, params):
948 |         # bound parameters P1..P1+P3-1 transferred to regs P2...P2+P3-1
949 |         # when P3==1, P4 should hold the name. 
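        # Illustration (hypothetical row): "Variable 1 2 1 :folderID" moves
        # the value bound to parameter 1 into register 2, named ':folderID'.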
    def _op_Variable(self, params):
        # bound parameters P1..P1+P3-1 are transferred to regs P2..P2+P3-1;
        # when P3==1, P4 should hold the name.
        self.op.regWrites.extend([params[1] + x for x in range(params[2])])
        if params[2] == 1 and params[3]:
            # just put the binding name in the comment; it visually works
            self.op.comment = params[3]

    def _op_Move(self, params):
        self.op.regReads.extend([params[0] + x for x in range(params[2])])
        self.op.regWrites.extend([params[1] + x for x in range(params[2])])

    def _op_Copy(self, params, shallow=False):
        self.op.regReads.append(params[0])
        self.op.regWrites.append(params[1])

    def _op_SCopy(self, params):
        self._op_Copy(params, True)

    def _op_AddImm(self, params):
        self.op.regReads.append(params[0])
        self.op.regWrites.append(params[0])
        self.op.usesImmediate = params[1]

    def _op_String(self, params):
        self.op.regWrites.append(params[1])
        self.op.usesImmediate = params[3]

    def _op_String8(self, params):
        self._op_String(params)

    def _op_Integer(self, params):
        self.op.regWrites.append(params[1])
        self.op.usesImmediate = params[0]

    def _op_Int64(self, params):
        self.op.regWrites.append(params[1])
        self.op.usesImmediate = params[3]

    def _op_Real(self, params):
        self.op.regWrites.append(params[1])
        self.op.usesImmediate = params[3]

    def _op_Blob(self, params):
        self.op.regWrites.append(params[1])
        self.op.usesImmediate = '(blob)'

    def _op_Null(self, params):
        # writes NULL to regs P2..P3 (or just P2 when P3 <= P2)
        self.op.regWrites.append(params[1])
        if params[2] > params[1]:
            for x in xrange(params[1] + 1, params[2] + 1):
                self.op.regWrites.append(x)

    def _op_Halt(self, params):
        self.op.terminate = True

    def _op_AutoCommit(self, params):
        # Used by things like BEGIN IMMEDIATE and COMMIT.  Technically this is
        # supposed to cause the VM to halt, but it still seems to always be
        # followed by a Halt anyways.
        # Leaving as a 'pass' rather than a terminate since we always see the
        # Halt and I'd hate for there to be a fall-through nuance.
        pass

    def _op_HaltIfNull(self, params):
        # halts if reg[P3] is null.
        self.op.regReads.append(params[2])

    def _mathCommon(self, params):
        # inputs P1 and P2, output P3
        self.op.regReads.extend([params[0], params[1]])
        self.op.regWrites.append(params[2])
    def _op_Add(self, params):
        self._mathCommon(params)
    def _op_Multiply(self, params):
        self._mathCommon(params)
    def _op_Subtract(self, params):
        self._mathCommon(params)
    def _op_Divide(self, params):
        self._mathCommon(params)
    def _op_Remainder(self, params):
        self._mathCommon(params)

    def _op_Function(self, params):
        # P1 is a bitmask indicating which args are constant (when set)
        self.op.regReads.append(params[0])
        # args are P2..P2+P5-1
        self.op.regReads.extend(
            [params[1] + x for x in range(int(params[4]))])
        # result stored in P3
        self.op.regWrites.append(params[2])
        # P4 is the function pointer, which ends up a nice string for us;
        # transfer the function pointer string to the comment.
        self.op.comment = params[3]
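    # Worked example for _op_Function (row values made up): "Function" with
    # P1=1, P2=5, P3=9, P4=lower(1), P5=2 reads the argument registers 5..6
    # (P2..P2+P5-1), writes the result to register 9 (P3), and copies
    # "lower(1)" into the comment.  Note the handler above also records P1
    # itself as a register read.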
    def _op_VerifyCookie(self, params):
        # schema delta check. no one cares.
        pass

    def _op_Noop(self, params):
        # used as a jump target, apparently
        pass

    def _op_TableLock(self, params):
        # Table locks are boring.
        pass

    def _op_Transaction(self, params):
        # Transactions are currently boring.
        pass

    def _op_Explain(self, params):
        # P4 is the EXPLAIN QUERY PLAN output for what's going on.  Clobber it
        # to be our comment!
        self.op.comment = params[3]

    def _op_Expire(self, params):
        # ExecuteSimpleSQL and friends use this, I think.  And so I don't
        # think we care.
        pass

    def parseExplainTextFile(self, file_or_lines, schemaInfo=None):
        '''
        Process the text-formatted results of an EXPLAIN query like what you
        would get from invoking "sqlite3 DATABASE 'EXPLAIN ...'".
        '''
        # the contents aren't long enough to merit a generator
        rows = []
        for line in file_or_lines:
            bits = line.split('|')
            rows.append(bits)

        self.parseExplainStringRows(rows, schemaInfo)
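    # A sample of the pipe-delimited rows this expects, from a hypothetical
    # EXPLAIN (columns are addr|opcode|p1|p2|p3|p4|p5|comment):
    #
    #   0|Trace|0|0|0||00|
    #   1|Goto|0|12|0||00|
    #   2|OpenRead|0|2|0|3|00|messages
    #
    # And a hypothetical invocation:
    #   eg = ExplainGrokker()
    #   eg.parseExplainTextFile(open('/tmp/explained.txt'))
    #   eg.performFlow()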
    def parseExplainStringRows(self, rows, schemaInfo=None):
        '''
        Process the somewhat pre-chewed output of a sqlite EXPLAIN command.
        In this case, we're expecting our caller to have directly driven the
        invocation and to have chosen the delimiter and split on it for us.
        Or maybe even to have directly used a python sqlite lib or something.
        '''
        self.schemaInfo = schemaInfo or SchemaGrokker()

        def chewParams(params):
            params[0] = int(params[0])
            params[1] = int(params[1])
            params[2] = int(params[2])
            if params[3] == '':
                pass
            elif params[3] == 'NULL':
                params[3] = None
            elif params[3].isdigit():
                params[3] = int(params[3])
            return params

        for bits in rows:
            if VERBOSE:
                print 'Parsing', bits
            addr = int(bits[0])
            opcode = bits[1]
            params = chewParams(bits[2:7])
            comment = bits[7].strip()

            # opcode renaming compensation...
            if opcode.startswith('Move') and len(opcode) > 4:
                opcode = opcode.replace('Move', 'Seek')

            self.op = GenericOpInfo(addr, opcode, params, comment)
            self.code.append(self.op)

            handler = getattr(self, "_op_" + opcode, None)
            if handler:
                handler(params)
            else:
                print 'Ignoring opcode', opcode


    def parseJsonOpcodesList(self, opcodeList, counts=None, pages=None):
        '''
        Process a python list where each entry in the list is itself a list
        representing a row of EXPLAIN output.  The columns are then:
        [addr (int), opcode (string), p1 (int), p2 (int), p3 (int),
         p4 (string), p5 (int), comment (string or None)].

        In a SQLite build with -DDEBUG, the comment will be the name of the
        column where appropriate.  This eliminates the need to have schema
        metadata available.

        @param counts A list of invocation counts (index is opcode addr)
        @param pages A list of btree page access counts (index is opcode addr)
        '''
        self.schemaInfo = SchemaGrokker()

        for row in opcodeList:
            addr = row[0]
            opcode = row[1]
            params = row[2:7] # p1 - p5
            comment = row[7] and str(row[7]) # the graphviz code hates unicode.

            if params[3] is None:
                params[3] = ''
            elif params[3].isdigit():
                params[3] = int(params[3])
            else:
                # JSON returns unicode but graphviz hates it.
                params[3] = str(params[3])

            # opcode renaming compensation...
            if opcode.startswith('Move') and len(opcode) > 4:
                opcode = opcode.replace('Move', 'Seek')

            self.op = GenericOpInfo(addr, opcode, params, comment)
            self.code.append(self.op)
            # if we have a count for this opcode, set it
            if counts and len(counts) > addr:
                self.op.invocCount = counts[addr]
            if pages and len(pages) > addr:
                self.op.pageCount = pages[addr]

            handler = getattr(self, "_op_" + opcode, None)
            if handler:
                handler(params)
            else:
                print 'Ignoring opcode', opcode
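    # A hypothetical call, with rows shaped like EXPLAIN output and optional
    # per-address stats (all values made up):
    #   eg = ExplainGrokker()
    #   eg.parseJsonOpcodesList(
    #       [[0, "Trace", 0, 0, 0, "", 0, None],
    #        [1, "OpenRead", 0, 2, 0, "3", 0, "messages"]],
    #       counts=[1, 1], pages=[0, 2])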
    def performFlow(self):
        needReflow = True
        while needReflow:
            print '--- dataflow pass happening ---'
            self.figureBasicBlocks()
            needReflow = self.dataFlow()

    def figureBasicBlocks(self):
        # build comeFrom links
        for op in self.code:
            for addr in op.goTo:
                # ignore exception magic
                if isinstance(addr, basestring):
                    continue
                if VERBOSE:
                    print 'at op', op.addr, 'goTo', addr
                targ_op = self.code[addr]
                if op.addr not in targ_op.comeFrom:
                    targ_op.comeFrom.append(op.addr)
                elif VERBOSE:
                    print 'not adding?'

        self.basicBlocks = {}
        self.basicBlocksByEnd = {}

        # build the blocks
        def make_block(ops):
            if ops:
                block = BasicBlock(ops)
                if VERBOSE:
                    print 'new', block
                self.basicBlocks[block.id] = block
                self.basicBlocksByEnd[block.lastAddr] = block

        block_ops = []
        for op in self.code:
            # if we come from somewhere, then we start a new block (and create
            # a basic block for any opcodes queued in block_ops)
            if op.comeFrom:
                make_block(block_ops)
                block_ops = []
            block_ops.append(op)
            # if this op jumps places, then close out this set of block_ops
            # into a basic block.
            if op.goTo or op.dynamicGoTo is not False:
                make_block(block_ops)
                block_ops = []

        if block_ops:
            make_block(block_ops)
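    # Sketch of the splitting rule, with made-up addresses: if op 1 can jump
    # to 4 and op 3 jumps to 5, then ops 4 and 5 get comeFrom links, and the
    # loop above emits blocks [0-1], [2-3], [4], [5]; a block ends at any op
    # that jumps, and a new one begins at any jump target.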
" 1258 | def diagBasicBlocks(self, outpath, sqlStr): 1259 | ''' 1260 | Diagram the basic blocks, putting the sqlStr up in the header. 1261 | ''' 1262 | self.colorCursors() 1263 | 1264 | 1265 | g = pygraphviz.AGraph(directed=True, strict=False) 1266 | for block in self.basicBlocks.values(): 1267 | ltext = ("< " + self.BB_TABLE_HEADER + 1268 | (DEBUG and (block.inRegs.graphStr()) or '') + 1269 | ''.join( 1270 | [op.graphStr(self.schemaInfo) for op in block.ops]) + 1271 | (DEBUG and (block.outRegs.graphStr()) or '') + 1272 | self.BB_TABLE_FOOTER + 1273 | " >") 1274 | g.add_node(block.id, label=str(ltext)) 1275 | 1276 | for block in self.basicBlocks.values(): 1277 | for addr in block.goTo: 1278 | if isinstance(addr, basestring): 1279 | continue 1280 | target_block = self.basicBlocks[addr] 1281 | attrs = {} 1282 | if target_block.id == block.lastAddr + 1: 1283 | attrs['color'] = 'gray' 1284 | g.add_edge(block.id, target_block.id, **attrs) 1285 | 1286 | # wrap and de-unicode (which our graphviz lib hates) 1287 | g.graph_attr['label'] = '\\n'.join(textwrap.wrap(str(sqlStr), 80)) 1288 | g.graph_attr['labelloc'] = 't' # top 1289 | g.node_attr['shape'] = 'box' 1290 | g.node_attr['fontsize'] = '8' 1291 | 1292 | self._graphvizRenderHelper(g, outpath) 1293 | 1294 | def dataFlow(self): 1295 | todo = [self.basicBlocks[0]] 1296 | 1297 | # dynamic jumps can require the CFG to be rebuilt, although we try and 1298 | # avoid that where possible. This variable tracks this, which requires 1299 | # us to re-build our CFG and re-process things. Thankfully all our 1300 | # results that have already been computed should still be accurate, we 1301 | # just might need to flow things even further. 1302 | cfgInvalid = False 1303 | 1304 | def goToBlocks(block): 1305 | for addr in block.goTo: 1306 | yield self.basicBlocks[addr] 1307 | 1308 | def comeFromBlocks(block): 1309 | for addr in block.comeFrom: 1310 | if not addr in self.basicBlocksByEnd: 1311 | print 'PROBLEM! no such comeFrom on addr', addr, 'for block:' 1312 | print block 1313 | yield self.basicBlocksByEnd[addr] 1314 | 1315 | def originBlocksDone(block): 1316 | for addr in block.comeForm: 1317 | if not self.basicBlocksByEnd[addr].done: 1318 | return False 1319 | return True 1320 | 1321 | def ensureJumpTargets(op, addrs, adjustment=1): 1322 | # This returns the (adjusted) addresses of jumps that are new to 1323 | # us. We need to check to make sure there is actually a basic 1324 | # block that starts there. If not, we need to mark the CFG 1325 | # invalid and requiring a re-processing. 1326 | # Also, we need to fix up comeFrom 1327 | if VERBOSE: 1328 | print 'adding dynamic jumps to', addrs, 'from', op 1329 | unknownRealAddrs = op.ensureJumpTargets(addrs, adjustment) 1330 | for unknownRealAddr in unknownRealAddrs: 1331 | if unknownRealAddr in self.basicBlocks: 1332 | other = self.basicBlocks[unknownRealAddr] 1333 | if not op.addr in other.comeFrom: 1334 | other.comeFrom.append(op.addr) 1335 | else: 1336 | print '!' * 80 1337 | print 'WARNING, CFG split at', unknownRealAddr, 'required' 1338 | print 'Jerky opcode is', op 1339 | print '!' 
    def diagBasicBlocks(self, outpath, sqlStr):
        '''
        Diagram the basic blocks, putting the sqlStr up in the header.
        '''
        self.colorCursors()

        g = pygraphviz.AGraph(directed=True, strict=False)
        for block in self.basicBlocks.values():
            ltext = ("<< " + self.BB_TABLE_HEADER +
                     (DEBUG and (block.inRegs.graphStr()) or '') +
                     ''.join(
                         [op.graphStr(self.schemaInfo) for op in block.ops]) +
                     (DEBUG and (block.outRegs.graphStr()) or '') +
                     self.BB_TABLE_FOOTER +
                     " >>")
            g.add_node(block.id, label=str(ltext))

        for block in self.basicBlocks.values():
            for addr in block.goTo:
                if isinstance(addr, basestring):
                    continue
                target_block = self.basicBlocks[addr]
                attrs = {}
                if target_block.id == block.lastAddr + 1:
                    attrs['color'] = 'gray'
                g.add_edge(block.id, target_block.id, **attrs)

        # wrap and de-unicode (which our graphviz lib hates)
        g.graph_attr['label'] = '\\n'.join(textwrap.wrap(str(sqlStr), 80))
        g.graph_attr['labelloc'] = 't' # top
        g.node_attr['shape'] = 'box'
        g.node_attr['fontsize'] = '8'

        self._graphvizRenderHelper(g, outpath)

    def dataFlow(self):
        todo = [self.basicBlocks[0]]

        # Dynamic jumps can require the CFG to be rebuilt, although we try to
        # avoid that where possible.  This flag tracks whether that happened;
        # when set, we need to re-build our CFG and re-process.  Thankfully
        # all the results that have already been computed should still be
        # accurate; we just might need to flow things even further.  (It's a
        # one-element list so the nested function below can mutate it.)
        cfgInvalid = [False]

        def goToBlocks(block):
            for addr in block.goTo:
                yield self.basicBlocks[addr]

        def comeFromBlocks(block):
            for addr in block.comeFrom:
                if addr not in self.basicBlocksByEnd:
                    print 'PROBLEM! no such comeFrom on addr', addr, 'for block:'
                    print block
                    continue
                yield self.basicBlocksByEnd[addr]

        def originBlocksDone(block):
            for addr in block.comeFrom:
                if not self.basicBlocksByEnd[addr].done:
                    return False
            return True

        def ensureJumpTargets(op, addrs, adjustment=1):
            # This returns the (adjusted) addresses of jumps that are new to
            # us.  We need to check that there is actually a basic block that
            # starts at each one.  If not, we need to mark the CFG invalid,
            # requiring a re-processing.
            # Also, we need to fix up comeFrom.
            if VERBOSE:
                print 'adding dynamic jumps to', addrs, 'from', op
            unknownRealAddrs = op.ensureJumpTargets(addrs, adjustment)
            for unknownRealAddr in unknownRealAddrs:
                if unknownRealAddr in self.basicBlocks:
                    other = self.basicBlocks[unknownRealAddr]
                    if op.addr not in other.comeFrom:
                        other.comeFrom.append(op.addr)
                else:
                    print '!' * 80
                    print 'WARNING, CFG split at', unknownRealAddr, 'required'
                    print 'Jerky opcode is', op
                    print '!' * 80
                    cfgInvalid[0] = True
            return unknownRealAddrs

        def flowBlock(block):
            changes = False

            for parent in comeFromBlocks(block):
                block.inRegs.update(parent.outRegs)
            if VERBOSE:
                print 'inRegs', block.inRegs
            curRegs = block.inRegs.copy()
            if VERBOSE:
                print 'curRegs', curRegs

            for op in block.ops:
                # affect the operation for its input regs
                for reg in op.regReads:
                    op.affectedByCursors.update(curRegs.getRegCursorImpacts(reg))

                if op.writesCursor: # implies usesCursor
                    op.usesCursor.writesAffectedBy.update(op.affectedByCursors)
                if op.seeksCursor:
                    op.usesCursor.seeksAffectedBy.update(op.affectedByCursors)

                # affect the output registers
                for reg in op.regWrites:
                    if curRegs.getRegCursorImpacts(reg) != op.affectedByCursors:
                        if VERBOSE:
                            print 'change', reg, curRegs.getRegCursorImpacts(reg), op.affectedByCursors
                        curRegs.setRegCursorImpacts(reg, op.affectedByCursors)
                        #changes = True

                # stuff for Gosub, Yield, Return
                if op.dynamicGoTo:
                    if ensureJumpTargets(op, curRegs.getRegValues(op.dynamicGoTo)):
                        # jump target changes are mutations of the CFG and
                        # require change processing!
                        changes = True
                if op.dynamicWritePC:
                    curRegs.setRegValue(op.dynamicWritePC, op.addr)

            if block.outRegs.checkDelta(curRegs):
                changes = True
            block.outRegs = curRegs
            if VERBOSE:
                print 'outRegs', block.outRegs

            return changes

        while todo:
            block = todo.pop()
            if VERBOSE:
                print '................'
                print 'processing block', block
            # if a change happened, the block is not done, and its children
            # are not done (and need to be processed)
            if flowBlock(block):
                block.done = False
                if VERBOSE:
                    print 'Changes on', block
                for child in goToBlocks(block):
                    child.done = False
                    if child not in todo:
                        todo.insert(0, child)
            # no changes, so mark us done but schedule our children if they
            # are not done.
            else:
                block.done = True
                for child in goToBlocks(block):
                    if not child.done:
                        if child not in todo:
                            todo.insert(0, child)

        return cfgInvalid[0]
    def diagDataFlow(self, outpath):
        self.colorCursors()

        g = pygraphviz.AGraph(directed=True)
        for cursor in self.cursors:
            label = '<<<font color="%s">%s</font>>>' % (
                cursor.color, cursor)
            g.add_node(cursor.id, label=str(label))
        for cursor in self.cursors:
            for originCursor in cursor.writesAffectedBy:
                g.add_edge(originCursor.id, cursor.id)
            for originCursor in cursor.seeksAffectedBy:
                g.add_edge(originCursor.id, cursor.id, color="#cccccc")

        for result_op in self.resultRowOps:
            for cursor in result_op.affectedByCursors:
                g.add_edge(cursor.id, "Results")

        g.node_attr['fontsize'] = '10'
        self._graphvizRenderHelper(g, outpath)


    def dump(self):
        print 'Code:'
        for op in self.code:
            op.dump()

        print 'Tables:'
        for table in self.allTables:
            print ' ', table

class VdbeStats(object):
    '''
    Process output from sqlite-perf.stp which emits a data structure with the
    following form:
    {
      "stats": [
        {"sql": "SQL STRING, possibly truncated",
         "counts": [1,1,1,5,5,2,1],
         "pages": [0,0,1,4]
        },
        ...
      ]
    }
    '''

    #: The minimum per-opcode invocation count for an instance's stats to be
    #: deemed significant.
    SIGNIFICANT_THRESHOLD = 64

    def __init__(self):
        self.sqlToInstances = {}
        self.sqlToSignificant = {}

    def parseJsonStats(self, path):
        import json
        f = open(path, 'r')
        obj = json.load(f)
        f.close()

        for row in obj["stats"]:
            sql = row["sql"]
            if sql in self.sqlToInstances:
                instances = self.sqlToInstances[sql]
            else:
                instances = self.sqlToInstances[sql] = []
            instances.append(row)

    def hasStats(self, sql):
        return sql in self.sqlToInstances

    def hasSignificantStats(self, sql):
        '''
        Scan all instances for a count >= the significance threshold.
        We save off the first significant guy we find.
        '''
        if sql not in self.sqlToInstances:
            return False

        SIG_THRESH = self.SIGNIFICANT_THRESHOLD
        for row in self.sqlToInstances[sql]:
            for count in row["counts"]:
                if count >= SIG_THRESH:
                    self.sqlToSignificant[sql] = row
                    return True

        return False

    def getRepresentativeCounts(self, sql):
        '''
        Pick or synthesize a representative count set.  Currently that means
        just picking the first (significant) one...

        Only call this if hasStats/hasSignificantStats returned true for sql.
        '''
        if sql in self.sqlToSignificant:
            return self.sqlToSignificant[sql]

        return self.sqlToInstances[sql][0]
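# A hypothetical pairing with the grokker (paths and variables made up):
#   stats = VdbeStats()
#   stats.parseJsonStats('/tmp/vdbe-stats.json')
#   if stats.hasSignificantStats(sql):
#       row = stats.getRepresentativeCounts(sql)
#       eg.parseJsonOpcodesList(ops, counts=row["counts"], pages=row["pages"])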
## Horrible hacky globals
# These propagate commands from the command-line to the logic above where
# I didn't want to have to type self a lot, apparently.  Now that we're
# also letting ExplainGrokker be used from other modules, these need to be
# defined here.  We're defining them to the defaults that make sense in this
# context, but this should all be cleaned up.

# Yield-aware dataflow analysis.  This turned out to be overkill and not
# really useful.  Note this is a horrible hacky way to control this, but at
# least it's hooked up to the command line mechanism!
NO_YIELDS = True
# No need to be chatty, especially since we don't own stdout by default.
VERBOSE = False
DEBUG = False
# Caller probably doesn't have stats... This is most likely to need cleanup.
HAVE_COUNTS = False

def output_blocks(explainGrokker, sql, out_dir, filename_prefix,
                  make_png=True):
    block_path = os.path.join(out_dir, filename_prefix + '.dot')
    explainGrokker.diagBasicBlocks(block_path, sql)

    if make_png:
        png_path = os.path.join(out_dir, filename_prefix + '.png')
        # assumes graphviz's "dot" binary is on the PATH
        subprocess.check_call(["dot", "-Tpng",
                               "-o", png_path,
                               block_path])


class CmdLine(object):
    usage = '''usage: %prog [options] explained.json/explained.txt

We process SQLite EXPLAIN output in order to visualize it using graphviz.
We require that your SQLite was built with -DDEBUG because then we get
useful schema meta-data from the comment column and we really enjoy that.
'''

    def buildParser(self):
        parser = optparse.OptionParser(usage=self.usage)

        parser.add_option('-d', '--debug',
                          action='store_true', dest='debug', default=False,
                          help='Dump registers at block entry/exit')
        parser.add_option('-y', '--yields',
                          action='store_true', dest='yields', default=False,
                          help='Process yields for control/dataflow analysis.')
        parser.add_option('-v', '--verbose',
                          action='store_true', dest='verbose', default=False,
                          help='Output a lot of info about what we are doing.')

        parser.add_option('--vdbe-stats',
                          dest='statsfile', default=None,
                          help='JSON vdbe stats file to process')

        # decision-making on what to output
        parser.add_option('-a', '--all',
                          action='store_true', dest='all', default=False,
                          help=('Output data for everything, not just ' +
                                'the concerning data points.'))
        parser.add_option('-m', '--match',
                          dest='match_substring', default=None,
                          help=('Substring to match in queries to determine ' +
                                'whether to output for a query.'))

        parser.add_option('-o', '--output-dir',
                          dest='out_dir', default=None,
                          help='Directory to output results in.')
        parser.add_option('--dataflow',
                          action='store_true', dest='dataflow', default=False,
                          help='Output dataflow overview.')

        parser.add_option('--no-png',
                          action='store_false', dest='make_pngs', default=True,
                          help='Do not automatically create png files.')

        return parser
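    # A typical invocation might look like this (paths made up):
    #   python grokexplain.py -o /tmp/out explained.txt
    #   python grokexplain.py --vdbe-stats /tmp/stats.json -o /tmp/out explained.json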
    def run(self):
        global DEBUG, NO_YIELDS, VERBOSE, HAVE_COUNTS

        parser = self.buildParser()
        options, args = parser.parse_args()

        DEBUG = options.debug
        NO_YIELDS = not options.yields
        VERBOSE = options.verbose

        # create the output directory if it doesn't already exist
        if options.out_dir:
            if not os.path.exists(options.out_dir):
                os.mkdir(options.out_dir)

        # load the VDBE stats if available
        if options.statsfile:
            vdbestats = VdbeStats()
            vdbestats.parseJsonStats(options.statsfile)
            HAVE_COUNTS = True
        else:
            vdbestats = None
            HAVE_COUNTS = False

        for filename in args:
            if filename.endswith('.json'):
                import json
                f = open(filename, 'r')
                obj = json.load(f)
                f.close()

                for iQuery, query in enumerate(obj["queries"]):
                    sql = query["sql"]
                    # figure out whether this 'matches' the display rules
                    if options.all:
                        matches = True
                    elif options.match_substring:
                        matches = options.match_substring.lower() in sql.lower()
                    elif vdbestats:
                        matches = vdbestats.hasSignificantStats(sql)
                    else:
                        matches = False

                    if not matches:
                        continue

                    if vdbestats:
                        row = vdbestats.getRepresentativeCounts(sql)
                        counts = row["counts"]
                        pages = row["pages"]
                    else:
                        counts = None
                        pages = None

                    print 'PROCESSING', sql
                    eg = ExplainGrokker()
                    eg.parseJsonOpcodesList(query["operations"],
                                            counts=counts, pages=pages)
                    eg.performFlow()

                    if options.out_dir:
                        output_blocks(eg, sql, options.out_dir,
                                      '%d-blocks' % (iQuery,),
                                      make_png=options.make_pngs)

                    if options.out_dir and options.dataflow:
                        flowpath = os.path.join(options.out_dir,
                                                '%d-flow.dot' % (iQuery,))
                        eg.diagDataFlow(flowpath)
            elif filename.endswith('.txt'):
                eg = ExplainGrokker()
                f = open(filename, 'rt')
                # XXX hook up external schema output parse again?
                eg.parseExplainTextFile(f)
                f.close()
                eg.performFlow()

                basename = os.path.splitext(os.path.basename(filename))[0]

                if options.out_dir:
                    output_blocks(eg, "...", options.out_dir,
                                  '%s-blocks' % (basename,),
                                  make_png=options.make_pngs)

                if options.out_dir and options.dataflow:
                    flowpath = os.path.join(options.out_dir,
                                            '%s-flow.dot' % (basename,))
                    eg.diagDataFlow(flowpath)
            else:
                print 'Input filename needs to end with .json or .txt!'



if __name__ == '__main__':
    cmdline = CmdLine()
    cmdline.run()
--------------------------------------------------------------------------------