├── MANIFEST.in
├── pgtweaklib
│   ├── meta.py
│   ├── tree.py
│   └── __init__.py
├── bin
│   └── pgtweak
├── setup.py
├── LICENSE
├── README.rst
└── test
    ├── testnode.py
    └── testpgtweaklib.py

/MANIFEST.in:
--------------------------------------------------------------------------------
include README.rst
include LICENSE
include INSTALL
--------------------------------------------------------------------------------
/pgtweaklib/meta.py:
--------------------------------------------------------------------------------
# Copyright (c) 2010 Matt Harrison

__version__ = '0.1'
__author__ = 'Matt Harrison'
__email__ = 'matthewharrison at gmail.com'
--------------------------------------------------------------------------------
/bin/pgtweak:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Copyright (c) 2010 Matt Harrison

import sys

import pgtweaklib

if __name__ == '__main__':
    try:
        sys.exit(pgtweaklib.main(sys.argv))
    except Exception as e:
        sys.stderr.write('%s\n' % str(e))
        sys.exit(1)
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
# Copyright (c) 2010 Matt Harrison
from distutils.core import setup
#from setuptools import setup

from pgtweaklib import meta

setup(name='PgTweak',
      version=meta.__version__,
      author=meta.__author__,
      description='FILL IN',
      scripts=['bin/pgtweak'],
      package_dir={'pgtweaklib': 'pgtweaklib'},
      packages=['pgtweaklib'],
      )
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2010 Matt Harrison

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
PgTweak
=======

A tool to run queries against different postgres settings.
Normally one would scour the postgres logs for slow queries by hand;
pgtweak extracts them and re-runs them under the setting combinations
you specify.
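
A typical end-to-end run looks like this (the log path and the
database name below are placeholders, not defaults)::

    pgtweak --extract-queries --log-file some/pg/log/file > extracted.sql
    pgtweak --gen-config > conf.json
    pgtweak --query-file extracted.sql --analyze-queries \
        --dburl postgres://postgres@localhost/mydb \
        --config-file conf.json > results.json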

Author: Matt Harrison (matthewharrison@gmail.com)
LICENSE: MIT

Scouring logs for queries
-------------------------

Run the following to pull queries from log files::

    pgtweak --extract-queries --log-file some/pg/log/file > extracted.sql

Getting a query file
--------------------

One can go through ``extracted.sql`` to find a query to examine, or
hand-write queries in a text file.

Generating a sample config file
-------------------------------

A config file is used to indicate what settings to tweak. It is a
JSON file that looks like this::

    [
      [
        "work_mem",
        [
          "1MB",
          "512MB"
        ]
      ],
      [
        "maintenance_work_mem",
        [
          "1MB",
          "512MB"
        ]
      ]
    ]

That sample is generated by::

    pgtweak --gen-config

You can add more settings and possible values for settings. Remember,
though, that every combination of settings is tested: the number of
runs is the product of the number of values for each setting. Two
settings with two values each mean four runs; giving one of them a
third value makes six. (A setting with a single value is simply
included in every run.)

Testing settings
----------------

Run::

    pgtweak --query-file simple.sql --analyze-queries --dburl postgres://postgres@localhost/testetl --config-file conf.json > results.json

Note that ``dburl`` is a SQLAlchemy-style connection string like
``postgres://user:password@host:port/database``.

The ``results.json`` file contains a JSON list with the query, time to
run, results, and settings for each run.

Settings and explanations
=========================

Explain settings (TODO)

http://www.varlena.com/GeneralBits/Tidbits/annotated_conf_e.html

random_page_cost
----------------

http://archives.postgresql.org/pgsql-performance/2004-02/msg00287.php

General settings
----------------

http://archives.postgresql.org/pgsql-hackers/2008-11/msg00919.php


Enabling Logging
================

Need to turn on Postgres logging in ``postgresql.conf``::

    log_destination = stderr
    logging_collector = on
    log_filename = 'pgsql-%Y-%m-%d.log'
    log_min_error_statement = 'error'
    log_connections = false
    log_line_prefix = '<%t>'
    log_statement = all
    log_directory = '/var/log/pglog'

* Make sure ``log_directory`` exists and has correct permissions

* Restart postgres
--------------------------------------------------------------------------------
/pgtweaklib/tree.py:
--------------------------------------------------------------------------------
import re

NODE_TYPES = {
    'Append': 1,
    'Hash': 1,
    'Hash Join': 1,
    'HashAggregate': 1,
    'Index Scan': 1,
    'Nested Loop': 1,
    'Result': 1,
    'Seq Scan': 1,
    'Sort': 1,
    }

class Node(object):
    def __init__(self, name, parent):
        self.parent = parent
        self.name = name
        self.children = []
        self.data = []
        self.table = None # for Seq Scan
        self.index = None # for Index Scan
        self.column = None # for Index Scan
        self.estimated = None
        self.actual = None
        self.level = None # spaces before content used for parent detection

    def __str__(self):
        return ' '*self.level + self.name

class NodeData(object):
    def __init__(self, name, value):
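        # simple name/value holder for annotation lines like "Filter: ..."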
        self.name = name
        self.value = value

class EstimatedData(object):
    def __init__(self, start, end, rows, width):
        self.start = start
        self.end = end
        self.rows = rows
        self.width = width

def get_estimated(line):
    """
    >>> e = get_estimated(' (cost=2747.32..2747.32 rows=2 width=282) ')
    >>> e.start
    '2747.32'
    >>> e.end
    '2747.32'
    >>> e.rows
    '2'
    >>> e.width
    '282'
    """
    start_idx = line.index('cost=')
    end_idx = line.index(')')
    line = line[start_idx:end_idx]
    split = line.split(' ')
    cost = split[0]
    dot_dot_idx = cost.index('..')
    start = cost[cost.index('=')+1:dot_dot_idx]
    end = cost[dot_dot_idx+2:]

    rows = split[1].split('=')[-1]

    width = split[2].split('=')[-1]
    return EstimatedData(start, end, rows, width)

class ActualData(object):
    def __init__(self, start, end, rows, loops):
        self.start = start
        self.end = end
        self.rows = rows
        self.loops = loops

def get_actual(line):
    start_idx = line.index('time=')
    end_idx = line.index(')', start_idx)
    line = line[start_idx:end_idx]
    split = line.split(' ')
    cost = split[0]
    dot_dot_idx = cost.index('..')
    start = cost[cost.index('=')+1:dot_dot_idx]
    end = cost[dot_dot_idx+2:]

    rows = split[1].split('=')[-1]

    loops = split[2].split('=')[-1]
    return ActualData(start, end, rows, loops)


def build_tree(results):
    """
    Given EXPLAIN ANALYZE results return the root node

    """
    root = None
    if "\n" in results:
        results = results.split("\n")
    parent = None
    last_node = None
    for line in results:
        if 'QUERY PLAN' in line:
            continue
        elif line.startswith('--'):
            continue
        elif 'cost=' in line and 'actual time=' in line:
            stripped = line.lstrip()

            leading_space = len(line) - len(stripped)
            if last_node and leading_space > last_node.level:
                parent = last_node
            while parent and leading_space <= parent.level:
                parent = parent.parent

            node = parse_node_line(line, parent, leading_space)

            if parent is None:
                root = node
                parent = node
            last_node = node
        else:
            data = parse_data_line(line)
            if last_node is not None:
                # annotation lines (Filter:, Sort Key:, ...) describe the
                # node directly above them, not that node's parent
                last_node.data.append(data)
    return root

def parse_data_line(line):
    line = line.strip()
    colon_idx = line.find(':')
    name = line[:colon_idx]
    data = line[colon_idx+1:]
    return NodeData(name, data)

def get_name(line):
    first_paren = line.index('(')
    name = line[:first_paren].rstrip()

    arrow = '-> '
    if name.startswith(arrow):
        name = name[len(arrow):]

    other = None
    if name.startswith('Seq Scan'):
        seq_re = re.compile(r'(\s+\w+)* on (?P<table>\w+)')
        other = seq_re.search(name).group('table')
        name = 'Seq Scan'
    elif name.startswith('Index Scan'):
        idx_re = re.compile(r'Index Scan using (?P<index>\w+) on (?P<col>\w+)')
        idx = idx_re.search(name).group('index')
        col = idx_re.search(name).group('col')
        other = (idx, col)
        name = 'Index Scan'

    assert name in NODE_TYPES
    return name, other

def parse_node_line(line, parent, depth=None):
    stripped = line.lstrip()
    leading_space = len(line) - len(stripped)
    name, other = get_name(stripped)

    estimated = get_estimated(stripped)
    actual = get_actual(stripped)
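    # assemble the node, attach the parsed stats, and register it with its parent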
    node = Node(name, parent)
    node.estimated = estimated
    node.actual = actual
    node.level = leading_space
    if name == 'Seq Scan':
        node.table = other
    elif name == 'Index Scan':
        node.index = other[0]
        node.column = other[1]
    if parent:
        parent.children.append(node)
    return node


if __name__ == '__main__':
    import doctest
    doctest.testmod()
--------------------------------------------------------------------------------
/test/testnode.py:
--------------------------------------------------------------------------------
import unittest

from pgtweaklib import tree


BASIC = """ QUERY PLAN
------------------------------------------------------------------------------------
 Result (cost=0.00..0.01 rows=1 width=0) (actual time=0.003..0.004 rows=1 loops=1)
 Total runtime: 0.025 ms
(2 rows)"""

STAR = """ QUERY PLAN
----------------------------------------------------------------------------------------------------
 Sort (cost=2747.32..2747.32 rows=2 width=282) (actual time=774.035..774.087 rows=64 loops=1)
   Sort Key: product_dim.product, product_dim.subcategory_name, product_dim.category_num, product_dim.upc_description, product_dim.department_name, product_dim.subcategory_num, product_dim.discontinued_date, product_dim.manufacturer_code, product_dim.create_date, product_dim.size, product_dim.department_num, product_dim.primary_upc, product_dim.category_name, promo_dim.sales_type
   Sort Method: quicksort Memory: 58kB
   -> HashAggregate (cost=2747.24..2747.31 rows=2 width=282) (actual time=773.503..773.658 rows=64 loops=1)
         -> Nested Loop (cost=19.68..2747.10 rows=2 width=282) (actual time=0.866..762.048 rows=1538 loops=1)
               -> Hash Join (cost=19.68..2006.22 rows=219 width=73) (actual time=0.282..448.024 rows=57142 loops=1)
                     Hash Cond: (public.week_sales_fact.promo_key = promo_dim.promo_key)
                     -> Hash Join (cost=17.53..1998.67 rows=857 width=74) (actual time=0.133..318.213 rows=57142 loops=1)
                           Hash Cond: (public.week_sales_fact.week_key = week_dim.week_key)
                           -> Append (cost=0.00..1758.28 rows=57144 width=78) (actual time=0.008..189.884 rows=57142 loops=1)
                                 -> Seq Scan on week_sales_fact (cost=0.00..16.15 rows=2 width=154) (actual time=0.001..0.001 rows=0 loops=1)
                                       Filter: ((week_key >= 390) AND (week_key <= 393))
                                 -> Seq Scan on week_sales_fact_390 week_sales_fact (cost=0.00..428.90 rows=14060 width=78) (actual time=0.004..23.368 rows=14060 loops=1)
                                       Filter: ((week_key >= 390) AND (week_key <= 393))
                                 -> Seq Scan on week_sales_fact_391 week_sales_fact (cost=0.00..429.44 rows=14096 width=78) (actual time=0.009..24.123 rows=14096 loops=1)
                                       Filter: ((week_key >= 390) AND (week_key <= 393))
                                 -> Seq Scan on week_sales_fact_392 week_sales_fact (cost=0.00..443.19 rows=14546 width=78) (actual time=0.009..24.163 rows=14546 loops=1)
                                       Filter: ((week_key >= 390) AND (week_key <= 393))
                                 -> Seq Scan on week_sales_fact_393 week_sales_fact (cost=0.00..440.60 rows=14440 width=78) (actual time=0.010..24.430 rows=14440 loops=1)
                                       Filter: ((week_key >= 390) AND (week_key <= 393))
                           -> Hash (cost=17.49..17.49 rows=3 width=4) (actual time=0.115..0.115 rows=4 loops=1)
                                 -> Seq Scan on week_dim (cost=0.00..17.49 rows=3 width=4) (actual time=0.039..0.108 rows=4 loops=1)
                                       Filter: ((week_end >= '2009-10-20 00:00:00'::timestamp without time zone) AND (week_end <= '2009-11-10 00:00:00'::timestamp without time zone))
                     -> Hash (cost=1.51..1.51 rows=51 width=7) (actual time=0.140..0.140 rows=51 loops=1)
                           -> Seq Scan on promo_dim (cost=0.00..1.51 rows=51 width=7) (actual time=0.006..0.070 rows=51 loops=1)
               -> Index Scan using product_dim_pkey on product_dim (cost=0.00..3.37 rows=1 width=217) (actual time=0.003..0.003 rows=0 loops=57142)
                     Index Cond: (product_dim.product_key = public.week_sales_fact.product_key)
                     Filter: (((product_dim.category)::text = ' 18 SOFT DRINKS'::text) AND ((product_dim.subcategory)::text = ' 18 1 18-CSD SUGAR SOFT DRINKS'::text) AND ((product_dim.department)::text = ' 1 GROCERY'::text))
 Total runtime: 774.359 ms
(29 rows)
"""

class TestNode(unittest.TestCase):
    def test_basic(self):
        root = tree.build_tree(BASIC)
        self.assertEquals(root.name, "Result")

    def test_star(self):
        root = tree.build_tree(STAR)
        self.assertEquals(root.name, 'Sort')
        hash_agg = root.children[0]
        self.assertEquals(len(root.children), 1)
        self.assertEquals(hash_agg.name, 'HashAggregate')
        self.assertEquals([x.name for x in hash_agg.children], ['Nested Loop'])
        parent = hash_agg.children[0]
        for names in [['Hash Join', 'Index Scan'], ['Hash Join', 'Hash'],
                      ['Append', 'Hash'],
                      ['Seq Scan', 'Seq Scan', 'Seq Scan', 'Seq Scan', 'Seq Scan']]:
            self.assertEquals([x.name for x in parent.children], names)
            parent = parent.children[0]


if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/test/testpgtweaklib.py:
--------------------------------------------------------------------------------
# Copyright (c) 2010 Matt Harrison

import unittest

import pgtweaklib

class TestPgtweaklib(unittest.TestCase):
    def test_needs_mem_boost(self):
        results = """GroupAggregate (cost=42856.20..45104.53 rows=32119 width=247) (actual time=32606.422..44289.146 rows=9869 loops=1)
   -> Sort (cost=42856.20..42936.50 rows=32119 width=247) (actual time=32605.346..43664.767 rows=162941 loops=1)
         Sort Key: nov_14_08ag_wk_vendor_fact.department, nov_14_08ag_wk_vendor_fact.department_name, nov_14_08ag_wk_vendor_fact.department_num, nov_14_08ag_wk_vendor_fact.category, nov_14_08ag_wk_vendor_fact.category_name, nov_14_08ag_wk_vendor_fact.category_num, nov_14_08ag_wk_vendor_fact.subcategory, nov_14_08ag_wk_vendor_fact.subcategory_name, nov_14_08ag_wk_vendor_fact.subcategory_num, nov_14_08ag_wk_vendor_fact.product, nov_14_08ag_wk_vendor_fact.primary_upc, nov_14_08ag_wk_vendor_fact.upc_description, nov_14_08ag_wk_vendor_fact.size, nov_14_08ag_wk_vendor_fact.create_date, nov_14_08ag_wk_vendor_fact.discontinued_date
         Sort Method: external merge Disk: 43768kB
         -> Nested Loop (cost=0.00..36717.41 rows=32119 width=247) (actual time=3.505..755.673 rows=162941 loops=1)
"""
        self.assertEquals(pgtweaklib.suggestions(results),
                          'up work_mem to at least 44MB')

    def test_needs_index(self):
        """
        The following results should suggest adding an index
        """
        results = """ QUERY PLAN
----------------------------------------------------------------------------------------------------
 Sort (cost=2747.32..2747.32 rows=2 width=282) (actual time=774.035..774.087 rows=64 loops=1)
   Sort Key: product_dim.product, product_dim.subcategory_name, product_dim.category_num, product_dim.upc_description, product_dim.department_name, product_dim.subcategory_num, product_dim.discontinued_date, product_dim.manufacturer_code, product_dim.create_date, product_dim.size, product_dim.department_num, product_dim.primary_upc, product_dim.category_name, promo_dim.sales_type
   Sort Method: quicksort Memory: 58kB
   -> HashAggregate (cost=2747.24..2747.31 rows=2 width=282) (actual time=773.503..773.658 rows=64 loops=1)
         -> Nested Loop (cost=19.68..2747.10 rows=2 width=282) (actual time=0.866..762.048 rows=1538 loops=1)
               -> Hash Join (cost=19.68..2006.22 rows=219 width=73) (actual time=0.282..448.024 rows=57142 loops=1)
                     Hash Cond: (public.week_sales_fact.promo_key = promo_dim.promo_key)
                     -> Hash Join (cost=17.53..1998.67 rows=857 width=74) (actual time=0.133..318.213 rows=57142 loops=1)
                           Hash Cond: (public.week_sales_fact.week_key = week_dim.week_key)
                           -> Append (cost=0.00..1758.28 rows=57144 width=78) (actual time=0.008..189.884 rows=57142 loops=1)
                                 -> Seq Scan on week_sales_fact (cost=0.00..16.15 rows=2 width=154) (actual time=0.001..0.001 rows=0 loops=1)
                                       Filter: ((week_key >= 390) AND (week_key <= 393))
                                 -> Seq Scan on week_sales_fact_390 week_sales_fact (cost=0.00..428.90 rows=14060 width=78) (actual time=0.004..23.368 rows=14060 loops=1)
                                       Filter: ((week_key >= 390) AND (week_key <= 393))
                                 -> Seq Scan on week_sales_fact_391 week_sales_fact (cost=0.00..429.44 rows=14096 width=78) (actual time=0.009..24.123 rows=14096 loops=1)
                                       Filter: ((week_key >= 390) AND (week_key <= 393))
                                 -> Seq Scan on week_sales_fact_392 week_sales_fact (cost=0.00..443.19 rows=14546 width=78) (actual time=0.009..24.163 rows=14546 loops=1)
                                       Filter: ((week_key >= 390) AND (week_key <= 393))
                                 -> Seq Scan on week_sales_fact_393 week_sales_fact (cost=0.00..440.60 rows=14440 width=78) (actual time=0.010..24.430 rows=14440 loops=1)
                                       Filter: ((week_key >= 390) AND (week_key <= 393))
                           -> Hash (cost=17.49..17.49 rows=3 width=4) (actual time=0.115..0.115 rows=4 loops=1)
                                 -> Seq Scan on week_dim (cost=0.00..17.49 rows=3 width=4) (actual time=0.039..0.108 rows=4 loops=1)
                                       Filter: ((week_end >= '2009-10-20 00:00:00'::timestamp without time zone) AND (week_end <= '2009-11-10 00:00:00'::timestamp without time zone))
                     -> Hash (cost=1.51..1.51 rows=51 width=7) (actual time=0.140..0.140 rows=51 loops=1)
                           -> Seq Scan on promo_dim (cost=0.00..1.51 rows=51 width=7) (actual time=0.006..0.070 rows=51 loops=1)
               -> Index Scan using product_dim_pkey on product_dim (cost=0.00..3.37 rows=1 width=217) (actual time=0.003..0.003 rows=0 loops=57142)
                     Index Cond: (product_dim.product_key = public.week_sales_fact.product_key)
                     Filter: (((product_dim.category)::text = ' 18 SOFT DRINKS'::text) AND ((product_dim.subcategory)::text = ' 18 1 18-CSD SUGAR SOFT DRINKS'::text) AND ((product_dim.department)::text = ' 1 GROCERY'::text))
 Total runtime: 774.359 ms
(29 rows)
"""
        query = """
EXPLAIN ANALYZE SELECT TIMESTAMP E'11/10/09' AS week_end, product_dim.category AS category, product_dim.category_num AS category_num, product_dim.category_name AS category_name, product_dim.product AS product, product_dim.primary_upc AS primary_upc, product_dim.upc_description AS upc_description, product_dim.size AS size, product_dim.create_date AS create_date, product_dim.discontinued_date AS discontinued_date, product_dim.manufacturer_code AS manufacturer_code, product_dim.subcategory_name AS subcategory_name, product_dim.subcategory AS subcategory, product_dim.subcategory_num AS subcategory_num, product_dim.subcategory_name AS subcategory_name, product_dim.category_num AS category_num, product_dim.upc_description AS upc_description, product_dim.department_name AS department_name, product_dim.subcategory_num AS subcategory_num, product_dim.discontinued_date AS discontinued_date, product_dim.manufacturer_code AS manufacturer_code, product_dim.department AS department, product_dim.department_num AS department_num, product_dim.department_name AS department_name, product_dim.create_date AS create_date, product_dim.size AS size, product_dim.department_num AS department_num, product_dim.primary_upc AS primary_upc, product_dim.category_name AS category_name, promo_dim.sales_type AS sales_type, sum(week_sales_fact.sales) AS sales, sum(week_sales_fact.sales_with_cost) AS sales_with_cost, sum(week_sales_fact.units) AS units, sum(week_sales_fact.units_with_cost) AS units_with_cost, sum(week_sales_fact.cost_with_cost) AS cost_with_cost, sum(week_sales_fact.dealamt) AS dealamt, sum(week_sales_fact.purbillback) AS purbillback, sum(week_sales_fact.scanbillback) AS scanbillback, sum(week_sales_fact.rebateamt) AS rebateamt, sum(week_sales_fact.regprice) AS regprice
FROM product_dim, promo_dim, week_sales_fact, week_dim
WHERE product_dim.department IN (E' 1 GROCERY') AND product_dim.category IN (E' 18 SOFT DRINKS') AND product_dim.subcategory IN (E' 18 1 18-CSD SUGAR SOFT DRINKS') AND week_dim.week_end BETWEEN E'10/20/09' AND E'11/10/09' AND week_sales_fact.product_key = product_dim.product_key AND week_sales_fact.promo_key = promo_dim.promo_key AND week_sales_fact.week_key = week_dim.week_key AND week_sales_fact.week_key BETWEEN 390 AND 393 GROUP BY product_dim.category, product_dim.category_num, product_dim.category_name, product_dim.product, product_dim.primary_upc, product_dim.upc_description, product_dim.size, product_dim.create_date, product_dim.discontinued_date, product_dim.manufacturer_code, product_dim.subcategory_name, product_dim.subcategory, product_dim.subcategory_num, product_dim.subcategory_name, product_dim.category_num, product_dim.upc_description, product_dim.department_name, product_dim.subcategory_num, product_dim.discontinued_date, product_dim.manufacturer_code, product_dim.department, product_dim.department_num, product_dim.department_name, product_dim.create_date, product_dim.size, product_dim.department_num, product_dim.primary_upc, product_dim.category_name, promo_dim.sales_type ORDER BY TIMESTAMP E'11/10/09', category, product, subcategory_name, subcategory, category_num, upc_description, department_name, subcategory_num, discontinued_date, manufacturer_code, department, create_date, size, department_num, primary_upc, category_name, sales_type ;
"""

        self.assertEquals(pgtweaklib.suggestions(results), 'add index to product_dim')


if __name__ == '__main__':
    unittest.main()
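
# NOTE: the ``pgtweaklib.suggestions`` helper exercised above is not
# implemented yet; these tests pin down its intended behaviour.  A minimal
# sketch for the work_mem case (hypothetical -- the parsing and the
# round-up below are assumptions, not part of pgtweaklib) could look like:
#
#     def suggestions(results):
#         for line in results.splitlines():
#             if 'external merge Disk:' in line:
#                 kb = int(line.rsplit(':', 1)[1].strip().rstrip('kB'))
#                 return 'up work_mem to at least %dMB' % (kb // 1000 + 1)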
--------------------------------------------------------------------------------
/pgtweaklib/__init__.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
PgTweak
=======

A tool to run queries against different postgres settings.
Normally one would scour the postgres logs for slow queries.

Scouring logs for queries
-------------------------

Run the following to pull queries from log files::

    pgtweak --extract-queries --log-file some/pg/log/file > extracted.sql

Getting a query file
--------------------

One can go through extracted.sql to find a query to examine, or
hand-write queries in a text file.

Generating a sample config file
-------------------------------

A config file is used to indicate what settings to tweak. It is a
JSON file that looks like this::

    [
      [
        "work_mem",
        [
          "1MB",
          "512MB"
        ]
      ],
      [
        "maintenance_work_mem",
        [
          "1MB",
          "512MB"
        ]
      ]
    ]

That sample is generated by::

    pgtweak --gen-config

You can add more settings and possible values for settings. Remember,
though, that every combination of settings is tested: the number of
runs is the product of the number of values for each setting. Two
settings with two values each mean four runs; giving one of them a
third value makes six. (A setting with a single value is simply
included in every run.)


Testing settings
----------------

Run::

    pgtweak --query-file simple.sql --analyze-queries --dburl postgres://postgres@localhost/testetl --config-file conf.json > results.json

Note that ``dburl`` is a SQLAlchemy-style connection string (ie
postgres://user:password@host:port/database).

The results.json file contains a JSON list with the query, time to
run, results, and settings for that query.
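
For example (the values here are illustrative, but the keys match what
``Analyzer.run_config`` records for every settings combination)::

    [
      {
        "query": "EXPLAIN ANALYZE SELECT ...",
        "settings": {"work_mem": "1MB", "maintenance_work_mem": "1MB"},
        "time": 0.42,
        "results": ["Result (cost=...) (actual time=...)", "..."]
      }
    ]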

Settings and explanations
=========================

TODO

Enabling Logging
================

Need to turn on Postgres logging in ``postgresql.conf``::

    log_destination = stderr
    logging_collector = on
    log_filename = 'pgsql-%Y-%m-%d.log'
    log_min_error_statement = 'error'
    log_connections = false
    log_line_prefix = '<%t>'
    log_statement = all
    log_directory = '/var/log/pglog'

* Make sure log_directory exists and has correct permissions
* Restart postgres

"""

# Copyright (c) 2010 Matt Harrison
from itertools import islice, repeat, izip, cycle
import json
import optparse
import re
import sys
import time

import psycopg2

import meta

STATEMENT = 'statement:'
SEP = '-----\n'

def process_dburl(dburl):
    """
    Convert sqlalchemy style urls to psycopg2 friendly params

    >>> process_dburl('postgres://matt:password@localhost:9999/testdb')
    ('localhost', 'testdb', 'matt', 'password', '9999')
    >>> process_dburl('postgres://matt/testdb')
    (None, 'testdb', 'matt', None, None)
    >>> process_dburl('postgres://matt')
    (None, '', 'matt', None, None)
    >>> process_dburl('postgres://matt:password')
    (None, '', 'matt', 'password', None)
    >>> process_dburl('postgres://matt@localhost')
    ('localhost', '', 'matt', None, None)
    >>> process_dburl('postgres://matt@localhost:9999')
    ('localhost', '', 'matt', None, '9999')
    """
    regex = re.compile(r'^postgres://(?P<name>\w+):?(?P<pass>\w+)?@?(?P<host>\w+)?:?(?P<port>\d\d\d\d)?/?(?P<db>\w*)')

    host = regex.search(dburl).group('host')
    dbname = regex.search(dburl).group('db')
    user = regex.search(dburl).group('name')
    password = regex.search(dburl).group('pass')
    port = regex.search(dburl).group('port')

    return host, dbname, user, password, port

def get_conn_string(dburl):
    """
    >>> get_conn_string('postgres://matt:password@localhost:9999/testdb')
    'dbname=testdb user=matt password=password host=localhost port=9999'
    """
    host, dbname, user, password, port = process_dburl(dburl)
    conn = []
    for param in ['dbname', 'user', 'password',
                  'host', 'port', 'sslmode']:
        value = locals().get(param, None)
        if value:
            conn.append('%s=%s' % (param, value))
    result = ' '.join(conn)
    return result

class Analyzer(object):
    def __init__(self, query, dburl=None):
        self.dburl = dburl
        self.query = query
        self.host = None
        self.dbname = None
        self.user = None
        self.password = None

    def _process_dburl(self):
        self.host, self.dbname, self.user, self.password, self.port = process_dburl(self.dburl)

    def run_config(self, config, fout):
        results = []
        for settings in config.gen_settings():
            setting_dict = dict(settings)
            with TweakContext(self.dburl, setting_dict) as tweaker:
                lines = list(x[0] for x in tweaker.run(add_explain_analyze(self.query)))
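                # record the settings, wall-clock time, and EXPLAIN output for this run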
                results.append({'query': self.query,
                                'settings': setting_dict,
                                'time': tweaker.elapsed,
                                'results': lines})
        fout.write(json.dumps(results, indent=2))


    def analyze(self):
        conn = psycopg2.connect(get_conn_string(self.dburl))
        cur = conn.cursor()
        query = self.query
        for mod in ['ANALYZE', 'EXPLAIN']:
            if mod not in query:
                query = '%s %s' % (mod, query)

        cur.execute(query)
        results = [x[0] for x in cur.fetchall()]
        cur.close()
        conn.close()
        return results

def add_explain_analyze(query):
    for mod in ['ANALYZE', 'EXPLAIN']:
        if mod not in query:
            query = '%s %s' % (mod, query)
    return query


class Config(object):
    """
    A config is a bunch of setting names with possible values.
    When you call gen_settings it iterates over the combinations of those settings.

    The config file is persisted as a json file.

    >>> c = Config()
    >>> c.add_setting('foo', ['1', '100'])
    >>> c.add_setting('bar', ['2', '200'])
    >>> list(c.gen_settings())
    [[('foo', '1'), ('bar', '2')], [('foo', '1'), ('bar', '200')], [('foo', '100'), ('bar', '2')], [('foo', '100'), ('bar', '200')]]
    """
    def __init__(self):
        self.settings = []

    def add_setting(self, name, param_values):
        self.settings.append((name, param_values))

    def gen_settings(self):
        setting_groups = []
        for key, values in self.settings:
            setting_groups.append([(key, y) for y in values])
        for combo in iterCombinations(*setting_groups):
            yield combo

    def write(self, fout):
        fout.write(json.dumps(self.settings, indent=2))

def load_config(fin):
    c = Config()
    c.settings = json.loads(fin.read())
    return c

def iterCombinations(*iterables, **kwds):
    '''
    >>> list(iterCombinations(range(5), 'abc'))
    [[0, 'a'], [0, 'b'], [0, 'c'], [1, 'a'], [1, 'b'], [1, 'c'], [2, 'a'], [2, 'b'], [2, 'c'], [3, 'a'], [3, 'b'], [3, 'c'], [4, 'a'], [4, 'b'], [4, 'c']]

    From http://code.activestate.com/recipes/501151-generating-combinations-in-blocks/

    Generates the combinations of the given iterables.

    @returns: An iterator over all combinations of the C{iterables}. Each yielded
        combination is a list of size C{len(iterables)} where the i-th element
        is drawn from the i-th iterable. B{Important note:} This list should not
        be modified between iterations; if you need to modify it, copy it to a
        new container.

    @param iterables: One or more arbitrary iterables.
    @param kwds: Currently only 'blocksize' allowed.
    @keyword blocksize: Determines the order of the yielded combinations. By
        default (C{blocksize=1}), the first iterable corresponds to the most outer
        loop (slowest change) and the last iterable corresponds to the most inner
        loop (fastest change).

        For larger blocksize, each iterable is first partitioned into consecutive
        blocks of size C{blocksize} (except perhaps for the last block which may
        be shorter). Then each combination is yielded by first iterating over
        each block combination C{B := (B1,B2,..Bn)} and then yielding each
        combination from B.

        More generally, C{blocksize} can be an iterable so that different
        C{iterables} can be partitioned by different block size. In this case,
        C{blocksize} is repeated as many times as necessary to match
        C{len(iterables)}. For instance::
            iterCombinations(range(4),range(6),range(8),'xyz', blocksize=(2,3))
        partitions C{range(4)} and C{range(8)} with blocksize=2, while C{range(6)}
        and 'xyz' are partitioned with blocksize=3.
    '''
    blocksize = kwds.get('blocksize', 1)
    if isinstance(blocksize, int):
        sizes = repeat(blocksize)
    else:
        sizes = cycle(iter(blocksize))
    block_lists = [list(_iterblocks(it, sz)) for it, sz in izip(iterables, sizes)]
    for block_combo in _iterCombinations(block_lists, [None] * len(iterables)):
        combo = [None] * len(iterables)
        for _ in _iterCombinations(block_combo, combo):
            yield combo


def _iterCombinations(groups, combo_list, index=0):
    """
    >>> list(_iterCombinations('abc', [None, None, None]))
    [['a', 'b', 'c']]

    # note that combo_list persists over time... hence this result
    >>> list(_iterCombinations(['abc', 'xyz'], [None] * 2))
    [['c', 'z'], ['c', 'z'], ['c', 'z'], ['c', 'z'], ['c', 'z'], ['c', 'z'], ['c', 'z'], ['c', 'z'], ['c', 'z']]
    """
    # generate recursively all combinations of groups, updating combo_list
    # *in-place* for each combination.
    if index < len(groups)-1:
        for x in groups[index]:
            combo_list[index] = x
            for foo in _iterCombinations(groups, combo_list, index+1):
                yield combo_list
    else: # optimization to avoid the last level of recursion
        assert index == len(groups)-1
        for x in groups[index]:
            combo_list[index] = x
            yield combo_list


def _iterblocks(iterable, blocksize, factory=tuple):
    """
    >>> list(_iterblocks('abc', 1))
    [('a',), ('b',), ('c',)]
    >>> list(_iterblocks('abc', 2))
    [('a', 'b'), ('c',)]
    >>> list(_iterblocks('abc', 4)) == list(_iterblocks('abc', 3))
    True
    """
    # split the iterable into blocks of blocksize
    iterable = iter(iterable)
    while True:
        block = factory(islice(iterable, blocksize))
        if not block: break
        yield block
        if len(block) < blocksize: break

def analyze_queries(query_lines, dburl, config_file, fout):
    config = load_config(config_file)
    for query in query_lines:
        a = Analyzer(query, dburl)
        a.run_config(config, fout)

def _get_qf_queries(fin, query_separator=SEP):
    """
    Yield one query string at a time from a file of queries
    delimited by ``query_separator``
    """
    lines = []
    for line in fin:
        if line == query_separator:
            if lines:
                yield ''.join(lines)
                lines = []
        else:
            lines.append(line)
    if lines:
        yield ''.join(lines)


def get_log_queries(fin, fout, query_separator=SEP):
    for q_lines in _query_lines(fin):
        for line in q_lines:
            fout.write(line)
        fout.write(query_separator)

def _query_lines(fin):
    """
    Given a query log yield lines of a single query
    """
    in_q = False
    lines = []
    for line in fin:
        if in_q and not line.startswith('\t'):
            in_q = False
            if lines:
                yield lines
                lines = []
        if not in_q and STATEMENT in line:
            in_q = True
            start_idx = line.index(STATEMENT) + len(STATEMENT)
            line = line[start_idx:]
        if in_q:
            lines.append(line)
    if lines:
        yield lines

def get_expected_real_row(lines):
    """
    >>> get_expected_real_row(['Seq Scan on promo_dim (cost=0.00..1.51 rows=51 width=11) (actual time=0.004..0.052 rows=54 loops=1)', 'Total runtime: 0.150 ms'])
    (51, 54)
    """
    ACT = 'actual time='
    regex = re.compile(r'.*\(cost=.* rows=(?P<rows_est>\d*) .*\) \(actual.* rows=(?P<rows_real>\d*) .*\).*')
    for line in lines:
        if ACT in line:
            est = regex.search(line).group('rows_est')
            real = regex.search(line).group('rows_real')
            return int(est), int(real)
    return None


def get_runtime(lines):
    """
    >>> get_runtime(['Seq Scan on promo_dim (cost=0.00..1.51 rows=51 width=11) (actual time=0.004..0.052 rows=51 loops=1)', 'Total runtime: 0.150 ms']) <= 0.15
    True
    """
    TOT = 'Total runtime: '
    for line in lines:
        if line.startswith(TOT):
            assert line.endswith(' ms')
            idx = line.index(TOT) + len(TOT)
            return float(line[idx:-3])
    return None

def run_sql(cur, sql):
    cur.execute(sql)
    try:
        for row in cur.fetchall():
            yield row
    except psycopg2.ProgrammingError as e:
        if 'no results to fetch' in str(e):
            pass
        else:
            raise

class TweakContext(object):
    def __init__(self, dburl, settings_dict):
        self.dburl = dburl
        self.settings_dict = settings_dict
        self.conn = None
        self.cur = None
        self.old_settings = {}
        self.elapsed = None

    def __enter__(self):
        self.conn = psycopg2.connect(get_conn_string(self.dburl))
        self.cur = self.conn.cursor()
        # get old db settings
        for key in self.settings_dict:
            sql = 'SHOW %s' % key
            results = list(run_sql(self.cur, sql))
            self.old_settings[key] = results[0][0]

        # set database settings
        for key, value in self.settings_dict.items():
            sql = "SET %s = '%s'" % (key, value)
            results = list(run_sql(self.cur, sql))

        # return self so ``with ... as tweaker`` binds the context
        return self

    def run(self, query):
        start = time.time()
        for row in run_sql(self.cur, query):
            yield row
        self.elapsed = time.time() - start

    def __exit__(self, *args):
        # restore old db settings
        for key, value in self.old_settings.items():
            sql = "SET %s = '%s'" % (key, value)
            results = list(run_sql(self.cur, sql))

        self.cur.close()
        self.conn.close()



def main(prog_args):
    parser = optparse.OptionParser(version=meta.__version__)
    extract_group = optparse.OptionGroup(parser, 'Extract Queries from psql log')
    extract_group.add_option('--extract-queries', action='store_true', help='extract queries from a log file')
    extract_group.add_option('--log-file', help='specify log file')
    parser.add_option_group(extract_group)

    analyze_group = optparse.OptionGroup(parser, 'Analyze extracted queries')
    analyze_group.add_option('--analyze-queries', action='store_true', help='analyze query file (or stdin)')
    analyze_group.add_option('--query-file', help='specify query file (file with queries delimited by %s or stdin)' % SEP)
    analyze_group.add_option('--dburl', help='sqlalchemy connection string (we do not use sa, just the style of it, ie postgres://postgres:password@localhost:port/dbname)')
    analyze_group.add_option('--config-file', help='specify config file containing parameters for analyzing')
    parser.add_option_group(analyze_group)

    parser.add_option('--gen-config', action='store_true', help='generate a config file with work_mem and maintenance_work_mem')

    parser.add_option('--test', action='store_true', help='run doctests')


    opt, args = parser.parse_args(prog_args[1:])

    if opt.extract_queries:
        if not opt.log_file:
            parser.error('Specify --log-file')
        fin = open(opt.log_file)
        fout = sys.stdout
        get_log_queries(fin, fout)
        fin.close()

    if opt.analyze_queries:
        if not opt.query_file:
            fin = sys.stdin
        else:
            fin = open(opt.query_file)
        if not opt.config_file:
            parser.error('Specify --config-file (--gen-config to generate sample)')

        analyze_queries(_get_qf_queries(fin), opt.dburl, open(opt.config_file), sys.stdout)

    if opt.gen_config:
        c = Config()
        c.add_setting('work_mem', ['1MB', '512MB'])
        c.add_setting('maintenance_work_mem', ['1MB', '512MB'])
        c.write(sys.stdout)

    if opt.test:
        import doctest
        doctest.testmod()

if __name__ == '__main__':
    sys.exit(main(sys.argv))
--------------------------------------------------------------------------------