├── MANIFEST.in
├── pgtweaklib
│   ├── meta.py
│   ├── tree.py
│   └── __init__.py
├── bin
│   └── pgtweak
├── setup.py
├── LICENSE
├── README.rst
└── test
    ├── testnode.py
    └── testpgtweaklib.py
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 | include LICENSE
3 | include INSTALL
4 |
--------------------------------------------------------------------------------
/pgtweaklib/meta.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2010 Matt Harrison
2 |
3 | __version__ = '0.1'
4 | __author__ = 'Matt Harrison'
5 | __email__ = 'matthewharrison at gmail.com'
6 |
--------------------------------------------------------------------------------
/bin/pgtweak:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright (c) 2010 Matt Harrison
3 |
4 | import sys
5 |
6 | import pgtweaklib
7 |
8 | if __name__ == '__main__':
9 | try:
10 | sys.exit(pgtweaklib.main(sys.argv))
11 |     except Exception as e:
12 |         sys.stderr.write('%s\n' % str(e))
13 | sys.exit(1)
14 |
15 |
--------------------------------------------------------------------------------
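
The script's main extraction mode, ``--extract-queries``, drives
pgtweaklib.get_log_queries over a Postgres log. A minimal sketch of that
step on a hand-made log fragment (the ``<%t>`` prefix and the tab-indented
continuation line follow the logging setup described in README.rst; this
driver is illustrative, not part of the package, and needs psycopg2
importable because pgtweaklib imports it at module load):

    from StringIO import StringIO  # Python 2; use io.StringIO on Python 3

    import pgtweaklib

    LOG = ('<2010-01-01 00:00:00 MST>LOG:  statement: SELECT *\n'
           '\tFROM promo_dim;\n'
           '<2010-01-01 00:00:01 MST>LOG:  duration: 0.150 ms\n')

    out = StringIO()
    pgtweaklib.get_log_queries(StringIO(LOG), out)
    print(out.getvalue())   # the extracted query, then the '-----' separator
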
/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2010 Matt Harrison
2 | from distutils.core import setup
3 | #from setuptools import setup
4 |
5 | from pgtweaklib import meta
6 |
7 | setup(name='PgTweak',
8 | version=meta.__version__,
9 | author=meta.__author__,
10 | description='FILL IN',
11 | scripts=['bin/pgtweak'],
12 | package_dir={'pgtweaklib':'pgtweaklib'},
13 | packages=['pgtweaklib'],
14 | )
15 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2010 Matt Harrison
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | PgTweak
2 | =======
3 |
4 | A tool to run queries against different Postgres settings.
5 | Instead of scouring the Postgres logs by hand, it extracts the logged queries and re-runs them under different setting combinations.
6 |
7 | Author: Matt Harrison (matthewharrison@gmail.com)
8 | LICENSE: MIT
9 |
10 | Scouring logs for queries
11 | -------------------------
12 |
13 | Run the following to pull queries from log files::
14 |
15 | pgtweak --extract-queries --log-file some/pg/log/file > extracted.sql
16 |
17 | Getting a query file
18 | --------------------
19 |
20 | One can go through ``extracted.sql`` to find a query to examine, or
21 | hand-write queries in a text file.
22 |
23 | Generating a sample config file
24 | -------------------------------
25 |
26 | A config file is used to indicate which settings to tweak. It is a
27 | JSON file that looks like this::
28 |
29 | [
30 | [
31 | "work_mem",
32 | [
33 | "1MB",
34 | "512MB"
35 | ]
36 | ],
37 | [
38 | "maintenance_work_mem",
39 | [
40 | "1MB",
41 | "512MB"
42 | ]
43 | ]
44 | ]
45 |
46 | The config above is the sample generated by::
47 |
48 | pgtweak --gen-config
49 |
50 | You can add more settings and possible values for settings. Remember,
51 | though, that every combination of the settings' values is tested, so
52 | each value you add to a setting multiplies the number of runs. (Note
53 | that adding a setting with a single value just includes that value in
54 | every run.)
55 |
56 | Testing settings
57 | ----------------
58 |
59 | Run::
60 |
61 | pgtweak --query-file simple.sql --analyze-queries --dburl postgres://postgres@localhost/testetl --config-file conf.json > results.json
62 |
63 | Note that ``dburl`` is an SQLAlchemy-style connection string like
64 | ``postgres://user:password@host:port/database``.
65 | 
66 | The ``results.json`` file contains a JSON list with the query, its
67 | run time, the results, and the settings used for each run.
68 |
69 | Settings and explanations
70 | =========================
71 |
72 | Explain settings (TODO)
73 |
74 | http://www.varlena.com/GeneralBits/Tidbits/annotated_conf_e.html
75 |
76 | random_page_cost
77 | -----------------
78 |
79 | http://archives.postgresql.org/pgsql-performance/2004-02/msg00287.php
80 |
81 | General settings
82 | -----------------
83 |
84 | http://archives.postgresql.org/pgsql-hackers/2008-11/msg00919.php
85 |
86 |
87 |
88 |
89 | Enabling Logging
90 | ================
91 |
92 | Need to turn on Postgres logging in ``postgresql.conf``::
93 |
94 | log_destination = stderr
95 | logging_collector = on
96 | log_filename = 'pgsql-%Y-%m-%d.log'
97 | log_min_error_statement = 'error'
98 | log_connections = false
99 | log_line_prefix = '<%t>'
100 | log_statement = all
101 | log_directory = '/var/log/pglog'
102 |
103 | * Make sure log_directory exists and has correct permissions
104 |
105 | * Restart postgres
106 |
--------------------------------------------------------------------------------
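
A note on the README's run-count warning: the config is expanded as a
Cartesian product of every setting's values, which is what
Config.gen_settings in pgtweaklib does. A standalone sketch of the same
arithmetic with itertools (plain Python, independent of pgtweak itself):

    from itertools import product

    settings = [('work_mem', ['1MB', '512MB']),
                ('maintenance_work_mem', ['1MB', '512MB'])]

    # one (name, value) pair per setting, combined every possible way
    groups = [[(name, v) for v in values] for name, values in settings]
    combos = list(product(*groups))

    print(len(combos))  # 4 runs: 2 values x 2 values
    for combo in combos:
        print(dict(combo))

Adding a third value to work_mem would make this 3 x 2 = 6 runs.
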
/pgtweaklib/tree.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | NODE_TYPES = {
4 | 'Append':1,
5 | 'Hash':1,
6 | 'Hash Join':1,
7 | 'HashAggregate':1,
8 | 'Index Scan':1,
9 | 'Nested Loop':1,
10 | 'Result':1,
11 | 'Seq Scan':1,
12 | 'Sort':1,
13 |
14 | }
15 |
16 | class Node(object):
17 | def __init__(self, name, parent):
18 | self.parent = parent
19 | self.name = name
20 | self.children = []
21 | self.data = []
22 | self.table = None # for Seq Scan
23 | self.index = None # for Index Scan
24 | self.column = None # for Index Scan
25 | self.estimated = None
26 | self.actual = None
27 | self.level = None # spaces before content used for parent detection
28 |
29 | def __str__(self):
30 | return ' '*self.level + self.name
31 | class NodeData(object):
32 | def __init__(self, name, value):
33 | self.name = name
34 | self.value = value
35 |
36 | class EstimatedData(object):
37 | def __init__(self, start, end, rows, width):
38 | self.start = start
39 | self.end = end
40 | self.rows = rows
41 | self.width = width
42 |
43 | def get_estimated(line):
44 | """
45 | >>> e = get_estimated(' (cost=2747.32..2747.32 rows=2 width=282) ')
46 | >>> e.start
47 | '2747.32'
48 | >>> e.end
49 | '2747.32'
50 | >>> e.rows
51 | '2'
52 | >>> e.width
53 | '282'
54 | """
55 | start_idx = line.index('cost=')
56 | end_idx = line.index(')')
57 | line = line[start_idx:end_idx]
58 | split = line.split(' ')
59 | cost = split[0]
60 | dot_dot_idx = cost.index('..')
61 | start = cost[cost.index('=')+1:dot_dot_idx]
62 | end = cost[dot_dot_idx+2:]
63 |
64 | rows = split[1].split('=')[-1]
65 |
66 | width = split[2].split('=')[-1]
67 | return EstimatedData(start, end, rows, width)
68 |
69 | class ActualData(object):
70 | def __init__(self, start, end, rows, loops):
71 | self.start = start
72 | self.end = end
73 | self.rows = rows
74 | self.loops = loops
75 |
76 | def get_actual(line):
77 | start_idx = line.index('time=')
78 | end_idx = line.index(')', start_idx)
79 | line = line[start_idx:end_idx]
80 | split = line.split(' ')
81 | cost = split[0]
82 | dot_dot_idx = cost.index('..')
83 | start = cost[cost.index('=')+1:dot_dot_idx]
84 | end = cost[dot_dot_idx+2:]
85 |
86 | rows = split[1].split('=')[-1]
87 |
88 | loops = split[2].split('=')[-1]
89 | return ActualData(start, end, rows, loops)
90 |
91 |
92 | def build_tree(results):
93 | """
94 | Given EXPLAIN ANALYZE results return the root node
95 |
96 | """
97 | root = None
98 |     if "\n" in results:  # accept raw EXPLAIN output or a pre-split list of lines
99 | results = results.split("\n")
100 | parent = None
101 | last_node = None
102 | for line in results:
103 | if 'QUERY PLAN' in line:
104 | continue
105 | elif line.startswith('--'):
106 | continue
107 | elif 'cost=' in line and 'actual time=' in line:
108 | stripped = line.lstrip()
109 |
110 | leading_space = len(line) - len(stripped)
111 | if last_node and leading_space > last_node.level:
112 | parent = last_node
113 | while parent and leading_space <= parent.level:
114 | parent = parent.parent
115 |
116 | node = parse_node_line(line, parent, leading_space)
117 |
118 | if parent is None:
119 | root = node
120 | parent = node
121 | last_node = node
122 | else:
123 | data = parse_data_line(line)
124 | parent.data.append(data)
125 | return root
126 |
127 | def parse_data_line(line):
128 | line = line.strip()
129 | colon_idx = line.find(':')
130 | name = line[:colon_idx]
131 | data = line[colon_idx+1:]
132 | return NodeData(name, data)
133 |
134 | def get_name(line):
135 | first_paren = line.index('(')
136 | name = line[:first_paren].rstrip()
137 |
138 | arrow = '-> '
139 | if arrow in name:
140 | name = name[len(arrow):]
141 |
142 | other = None
143 | if name.startswith('Seq Scan'):
144 |         seq_re = re.compile(r'(\s+\w+)* on (?P<table>\w+)')
145 | other = seq_re.search(name).group('table')
146 | name = 'Seq Scan'
147 | elif name.startswith('Index Scan'):
148 |         idx_re = re.compile(r'Index Scan using (?P<index>\w+) on (?P<col>\w+)')
149 | idx = idx_re.search(name).group('index')
150 | col = idx_re.search(name).group('col')
151 | other = (idx, col)
152 | name = 'Index Scan'
153 |
154 | assert name in NODE_TYPES
155 | return name, other
156 |
157 | def parse_node_line(line, parent, depth=None):
158 | stripped = line.lstrip()
159 | leading_space = len(line) - len(stripped)
160 | name, other = get_name(stripped)
161 |
162 | estimated = get_estimated(stripped)
163 | actual = get_actual(stripped)
164 | node = Node(name, parent)
165 | node.estimated = estimated
166 | node.actual = actual
167 | node.level = leading_space
168 | if name == 'Seq Scan':
169 | node.table = other
170 | elif name == 'Index Scan':
171 | node.index = other[0]
172 |         node.column = other[1]
173 | if parent:
174 | parent.children.append(node)
175 | return node
176 |
177 |
178 | if __name__ == '__main__':
179 | import doctest
180 | doctest.testmod()
181 |
--------------------------------------------------------------------------------
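
A usage sketch for the module (hypothetical driver code, not part of the
package): build a tree from EXPLAIN ANALYZE text, then walk it to find
the node with the largest actual end time.

    from pgtweaklib import tree

    PLAN = """ QUERY PLAN
    ------------------------------------------------------------------
     Result  (cost=0.00..0.01 rows=1 width=0) (actual time=0.003..0.004 rows=1 loops=1)
     Total runtime: 0.025 ms
    (2 rows)"""

    def walk(node):
        # depth-first traversal of the plan tree
        yield node
        for child in node.children:
            for n in walk(child):
                yield n

    root = tree.build_tree(PLAN)
    # actual.end is parsed as a string, so convert before comparing
    slowest = max(walk(root), key=lambda n: float(n.actual.end))
    print('%s %s' % (slowest.name, slowest.actual.end))
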
/test/testnode.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from pgtweaklib import tree
3 |
4 |
5 | BASIC = """ QUERY PLAN
6 | ------------------------------------------------------------------------------------
7 | Result (cost=0.00..0.01 rows=1 width=0) (actual time=0.003..0.004 rows=1 loops=1)
8 | Total runtime: 0.025 ms
9 | (2 rows)"""
10 |
11 | STAR = """ QUERY PLAN
12 | -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
13 | Sort (cost=2747.32..2747.32 rows=2 width=282) (actual time=774.035..774.087 rows=64 loops=1)
14 | Sort Key: product_dim.product, product_dim.subcategory_name, product_dim.category_num, product_dim.upc_description, product_dim.department_name, product_dim.subcategory_num, product_dim.discontinued_date, product_dim.manufacturer_code, product_dim.create_date, product_dim.size, product_dim.department_num, product_dim.primary_upc, product_dim.category_name, promo_dim.sales_type
15 | Sort Method: quicksort Memory: 58kB
16 | -> HashAggregate (cost=2747.24..2747.31 rows=2 width=282) (actual time=773.503..773.658 rows=64 loops=1)
17 | -> Nested Loop (cost=19.68..2747.10 rows=2 width=282) (actual time=0.866..762.048 rows=1538 loops=1)
18 | -> Hash Join (cost=19.68..2006.22 rows=219 width=73) (actual time=0.282..448.024 rows=57142 loops=1)
19 | Hash Cond: (public.week_sales_fact.promo_key = promo_dim.promo_key)
20 | -> Hash Join (cost=17.53..1998.67 rows=857 width=74) (actual time=0.133..318.213 rows=57142 loops=1)
21 | Hash Cond: (public.week_sales_fact.week_key = week_dim.week_key)
22 | -> Append (cost=0.00..1758.28 rows=57144 width=78) (actual time=0.008..189.884 rows=57142 loops=1)
23 | -> Seq Scan on week_sales_fact (cost=0.00..16.15 rows=2 width=154) (actual time=0.001..0.001 rows=0 loops=1)
24 | Filter: ((week_key >= 390) AND (week_key <= 393))
25 | -> Seq Scan on week_sales_fact_390 week_sales_fact (cost=0.00..428.90 rows=14060 width=78) (actual time=0.004..23.368 rows=14060 loops=1)
26 | Filter: ((week_key >= 390) AND (week_key <= 393))
27 | -> Seq Scan on week_sales_fact_391 week_sales_fact (cost=0.00..429.44 rows=14096 width=78) (actual time=0.009..24.123 rows=14096 loops=1)
28 | Filter: ((week_key >= 390) AND (week_key <= 393))
29 | -> Seq Scan on week_sales_fact_392 week_sales_fact (cost=0.00..443.19 rows=14546 width=78) (actual time=0.009..24.163 rows=14546 loops=1)
30 | Filter: ((week_key >= 390) AND (week_key <= 393))
31 | -> Seq Scan on week_sales_fact_393 week_sales_fact (cost=0.00..440.60 rows=14440 width=78) (actual time=0.010..24.430 rows=14440 loops=1)
32 | Filter: ((week_key >= 390) AND (week_key <= 393))
33 | -> Hash (cost=17.49..17.49 rows=3 width=4) (actual time=0.115..0.115 rows=4 loops=1)
34 | -> Seq Scan on week_dim (cost=0.00..17.49 rows=3 width=4) (actual time=0.039..0.108 rows=4 loops=1)
35 | Filter: ((week_end >= '2009-10-20 00:00:00'::timestamp without time zone) AND (week_end <= '2009-11-10 00:00:00'::timestamp without time zone))
36 | -> Hash (cost=1.51..1.51 rows=51 width=7) (actual time=0.140..0.140 rows=51 loops=1)
37 | -> Seq Scan on promo_dim (cost=0.00..1.51 rows=51 width=7) (actual time=0.006..0.070 rows=51 loops=1)
38 | -> Index Scan using product_dim_pkey on product_dim (cost=0.00..3.37 rows=1 width=217) (actual time=0.003..0.003 rows=0 loops=57142)
39 | Index Cond: (product_dim.product_key = public.week_sales_fact.product_key)
40 | Filter: (((product_dim.category)::text = ' 18 SOFT DRINKS'::text) AND ((product_dim.subcategory)::text = ' 18 1 18-CSD SUGAR SOFT DRINKS'::text) AND ((product_dim.department)::text = ' 1 GROCERY'::text))
41 | Total runtime: 774.359 ms
42 | (29 rows)
43 | """
44 | class TestNode(unittest.TestCase):
45 | def test_basic(self):
46 | root = tree.build_tree(BASIC)
47 |         self.assertEqual(root.name, "Result")
48 |
49 | def test_star(self):
50 | root = tree.build_tree(STAR)
51 |         self.assertEqual(root.name, 'Sort')
52 |         hash_agg = root.children[0]
53 |         self.assertEqual(len(root.children), 1)
54 |         self.assertEqual(hash_agg.name, 'HashAggregate')
55 |         self.assertEqual([x.name for x in hash_agg.children], ['Nested Loop'])
56 |         parent = hash_agg.children[0]
57 |         # walk down the left spine, checking each level's children
58 |         for names in [['Hash Join', 'Index Scan'], ['Hash Join', 'Hash'],
59 |                       ['Append', 'Hash'], ['Seq Scan'] * 5]:
60 |             self.assertEqual([x.name for x in parent.children], names)
61 |             parent = parent.children[0]
62 | 
63 | if __name__ == '__main__':
64 |     unittest.main()
65 | 
--------------------------------------------------------------------------------
/test/testpgtweaklib.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2010 Matt Harrison
2 |
3 | import unittest
4 |
5 | import pgtweaklib
6 |
7 | class TestPgtweaklib(unittest.TestCase):
8 | def test_needs_mem_boost(self):
9 | results = """GroupAggregate (cost=42856.20..45104.53 rows=32119 width=247) (actual time=32606.422..44289.146 rows=9869 loops=1)
10 | -> Sort (cost=42856.20..42936.50 rows=32119 width=247) (actual time=32605.346..43664.767 rows=162941 loops=1)
11 | Sort Key: nov_14_08ag_wk_vendor_fact.department, nov_14_08ag_wk_vendor_fact.department_name, nov_14_08ag_wk_vendor_fact.department_num, nov_14_08ag_wk_vendor_fact.category, nov_14_08ag_wk_vendor_fact.category_name, nov_14_08ag_wk_vendor_fact.category_num, nov_14_08ag_wk_vendor_fact.subcategory, nov_14_08ag_wk_vendor_fact.subcategory_name, nov_14_08ag_wk_vendor_fact.subcategory_num, nov_14_08ag_wk_vendor_fact.product, nov_14_08ag_wk_vendor_fact.primary_upc, nov_14_08ag_wk_vendor_fact.upc_description, nov_14_08ag_wk_vendor_fact.size, nov_14_08ag_wk_vendor_fact.create_date, nov_14_08ag_wk_vendor_fact.discontinued_date
12 | Sort Method: external merge Disk: 43768kB
13 | -> Nested Loop (cost=0.00..36717.41 rows=32119 width=247) (actual time=3.505..755.673 rows=162941 loops=1)
14 | """
15 |         self.assertEqual(pgtweaklib.suggestions(results), 'up work_mem to at least 44MB')  # suggestions() is not implemented yet (see sketch below)
16 |
17 | def test_needs_index(self):
18 | """
19 | The following results should suggest adding an index
20 | """
21 | results = """ QUERY PLAN
22 | -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
23 | Sort (cost=2747.32..2747.32 rows=2 width=282) (actual time=774.035..774.087 rows=64 loops=1)
24 | Sort Key: product_dim.product, product_dim.subcategory_name, product_dim.category_num, product_dim.upc_description, product_dim.department_name, product_dim.subcategory_num, product_dim.discontinued_date, product_dim.manufacturer_code, product_dim.create_date, product_dim.size, product_dim.department_num, product_dim.primary_upc, product_dim.category_name, promo_dim.sales_type
25 | Sort Method: quicksort Memory: 58kB
26 | -> HashAggregate (cost=2747.24..2747.31 rows=2 width=282) (actual time=773.503..773.658 rows=64 loops=1)
27 | -> Nested Loop (cost=19.68..2747.10 rows=2 width=282) (actual time=0.866..762.048 rows=1538 loops=1)
28 | -> Hash Join (cost=19.68..2006.22 rows=219 width=73) (actual time=0.282..448.024 rows=57142 loops=1)
29 | Hash Cond: (public.week_sales_fact.promo_key = promo_dim.promo_key)
30 | -> Hash Join (cost=17.53..1998.67 rows=857 width=74) (actual time=0.133..318.213 rows=57142 loops=1)
31 | Hash Cond: (public.week_sales_fact.week_key = week_dim.week_key)
32 | -> Append (cost=0.00..1758.28 rows=57144 width=78) (actual time=0.008..189.884 rows=57142 loops=1)
33 | -> Seq Scan on week_sales_fact (cost=0.00..16.15 rows=2 width=154) (actual time=0.001..0.001 rows=0 loops=1)
34 | Filter: ((week_key >= 390) AND (week_key <= 393))
35 | -> Seq Scan on week_sales_fact_390 week_sales_fact (cost=0.00..428.90 rows=14060 width=78) (actual time=0.004..23.368 rows=14060 loops=1)
36 | Filter: ((week_key >= 390) AND (week_key <= 393))
37 | -> Seq Scan on week_sales_fact_391 week_sales_fact (cost=0.00..429.44 rows=14096 width=78) (actual time=0.009..24.123 rows=14096 loops=1)
38 | Filter: ((week_key >= 390) AND (week_key <= 393))
39 | -> Seq Scan on week_sales_fact_392 week_sales_fact (cost=0.00..443.19 rows=14546 width=78) (actual time=0.009..24.163 rows=14546 loops=1)
40 | Filter: ((week_key >= 390) AND (week_key <= 393))
41 | -> Seq Scan on week_sales_fact_393 week_sales_fact (cost=0.00..440.60 rows=14440 width=78) (actual time=0.010..24.430 rows=14440 loops=1)
42 | Filter: ((week_key >= 390) AND (week_key <= 393))
43 | -> Hash (cost=17.49..17.49 rows=3 width=4) (actual time=0.115..0.115 rows=4 loops=1)
44 | -> Seq Scan on week_dim (cost=0.00..17.49 rows=3 width=4) (actual time=0.039..0.108 rows=4 loops=1)
45 | Filter: ((week_end >= '2009-10-20 00:00:00'::timestamp without time zone) AND (week_end <= '2009-11-10 00:00:00'::timestamp without time zone))
46 | -> Hash (cost=1.51..1.51 rows=51 width=7) (actual time=0.140..0.140 rows=51 loops=1)
47 | -> Seq Scan on promo_dim (cost=0.00..1.51 rows=51 width=7) (actual time=0.006..0.070 rows=51 loops=1)
48 | -> Index Scan using product_dim_pkey on product_dim (cost=0.00..3.37 rows=1 width=217) (actual time=0.003..0.003 rows=0 loops=57142)
49 | Index Cond: (product_dim.product_key = public.week_sales_fact.product_key)
50 | Filter: (((product_dim.category)::text = ' 18 SOFT DRINKS'::text) AND ((product_dim.subcategory)::text = ' 18 1 18-CSD SUGAR SOFT DRINKS'::text) AND ((product_dim.department)::text = ' 1 GROCERY'::text))
51 | Total runtime: 774.359 ms
52 | (29 rows)
53 | """
54 | query="""
55 | EXPLAIN ANALYZE SELECT TIMESTAMP E'11/10/09' AS week_end, product_dim.category AS category, product_dim.category_num AS category_num, product_dim.category_name AS category_name, product_dim.product AS product, product_dim.primary_upc AS primary_upc, product_dim.upc_description AS upc_description, product_dim.size AS size, product_dim.create_date AS create_date, product_dim.discontinued_date AS discontinued_date, product_dim.manufacturer_code AS manufacturer_code, product_dim.subcategory_name AS subcategory_name, product_dim.subcategory AS subcategory, product_dim.subcategory_num AS subcategory_num, product_dim.subcategory_name AS subcategory_name, product_dim.category_num AS category_num, product_dim.upc_description AS upc_description, product_dim.department_name AS department_name, product_dim.subcategory_num AS subcategory_num, product_dim.discontinued_date AS discontinued_date, product_dim.manufacturer_code AS manufacturer_code, product_dim.department AS department, product_dim.department_num AS department_num, product_dim.department_name AS department_name, product_dim.create_date AS create_date, product_dim.size AS size, product_dim.department_num AS department_num, product_dim.primary_upc AS primary_upc, product_dim.category_name AS category_name, promo_dim.sales_type AS sales_type, sum(week_sales_fact.sales) AS sales, sum(week_sales_fact.sales_with_cost) AS sales_with_cost, sum(week_sales_fact.units) AS units, sum(week_sales_fact.units_with_cost) AS units_with_cost, sum(week_sales_fact.cost_with_cost) AS cost_with_cost, sum(week_sales_fact.dealamt) AS dealamt, sum(week_sales_fact.purbillback) AS purbillback, sum(week_sales_fact.scanbillback) AS scanbillback, sum(week_sales_fact.rebateamt) AS rebateamt, sum(week_sales_fact.regprice) AS regprice
56 | FROM product_dim, promo_dim, week_sales_fact, week_dim
57 | WHERE product_dim.department IN (E' 1 GROCERY') AND product_dim.category IN (E' 18 SOFT DRINKS') AND product_dim.subcategory IN (E' 18 1 18-CSD SUGAR SOFT DRINKS') AND week_dim.week_end BETWEEN E'10/20/09' AND E'11/10/09' AND week_sales_fact.product_key = product_dim.product_key AND week_sales_fact.promo_key = promo_dim.promo_key AND week_sales_fact.week_key = week_dim.week_key AND week_sales_fact.week_key BETWEEN 390 AND 393 GROUP BY product_dim.category, product_dim.category_num, product_dim.category_name, product_dim.product, product_dim.primary_upc, product_dim.upc_description, product_dim.size, product_dim.create_date, product_dim.discontinued_date, product_dim.manufacturer_code, product_dim.subcategory_name, product_dim.subcategory, product_dim.subcategory_num, product_dim.subcategory_name, product_dim.category_num, product_dim.upc_description, product_dim.department_name, product_dim.subcategory_num, product_dim.discontinued_date, product_dim.manufacturer_code, product_dim.department, product_dim.department_num, product_dim.department_name, product_dim.create_date, product_dim.size, product_dim.department_num, product_dim.primary_upc, product_dim.category_name, promo_dim.sales_type ORDER BY TIMESTAMP E'11/10/09', category, product, subcategory_name, subcategory, category_num, upc_description, department_name, subcategory_num, discontinued_date, manufacturer_code, department, create_date, size, department_num, primary_upc, category_name, sales_type ;
58 | """
59 |
60 |         self.assertEqual(pgtweaklib.suggestions(results), 'add index to product_dim')  # suggestions() is not implemented yet
61 |
62 |
63 | if __name__ == '__main__':
64 | unittest.main()
65 |
--------------------------------------------------------------------------------
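
Both tests exercise a suggestions() function that pgtweaklib does not
define yet. A minimal sketch of the work_mem half, keyed off the
"Sort Method: external merge Disk: NNNNkB" line and rounded up past the
spill size in decimal megabytes so that 43768kB yields the expected
'44MB' (hypothetical; the real heuristic is the author's to choose):

    import re

    DISK_SORT = re.compile(r'Sort Method:\s+external merge\s+Disk:\s+(\d+)kB')

    def suggestions(results):
        # a disk-based sort means work_mem was too small; suggest a
        # value just past the spill size
        m = DISK_SORT.search(results)
        if m:
            mb = int(m.group(1)) // 1000 + 1
            return 'up work_mem to at least %dMB' % mb
        return None
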
/pgtweaklib/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | PgTweak
4 | ========
5 |
6 | A tool to run queries against different Postgres settings.
7 | Instead of scouring the Postgres logs by hand, it extracts the logged queries and re-runs them under different setting combinations.
8 |
9 | Scouring logs for queries
10 | --------------------------
11 |
12 | Run the following to pull queries from log files::
13 |
14 | pgtweak --extract-queries --log-file some/pg/log/file > extracted.sql
15 |
16 | Getting a query file
17 | ---------------------
18 |
19 | One can go through extracted.sql to find a query to examine, or
20 | hand-write queries in a text file.
21 |
22 | Generating a sample config file
23 | --------------------------------
24 |
25 | A config file is used to indicate which settings to tweak. It is a
26 | JSON file that looks like this::
27 |
28 | [
29 | [
30 | "work_mem",
31 | [
32 | "1MB",
33 | "512MB"
34 | ]
35 | ],
36 | [
37 | "maintenance_work_mem",
38 | [
39 | "1MB",
40 | "512MB"
41 | ]
42 | ]
43 | ]
44 |
45 | The config above is the sample generated by::
46 |
47 | pgtweak --gen-config
48 |
49 | You can add more settings and possible values for settings. Remember,
50 | though, that every combination of the settings' values is tested, so
51 | each value you add to a setting multiplies the number of runs. (Note
52 | that adding a setting with a single value just includes that value in
53 | every run.)
54 |
55 |
56 | Testing settings
57 | -----------------
58 |
59 | Run::
60 |
61 | pgtweak --query-file simple.sql --analyze-queries --dburl postgres://postgres@localhost/testetl --config-file conf.json > results.json
62 |
63 | Note that ``dburl`` is an SQLAlchemy-style connection string (i.e.
64 | postgres://user:password@host:port/database).
65 | 
66 | The results.json file contains a JSON list with the query, its run
67 | time, the results, and the settings used for each run.
68 |
69 | Settings and explanations
70 | ==========================
71 |
72 | TODO
73 |
74 | Enabling Logging
75 | =================
76 |
77 | Need to turn on Postgres logging in postgresql.conf::
78 |
79 | log_destination = stderr
80 | logging_collector = on
81 | log_filename = 'pgsql-%Y-%m-%d.log'
82 | log_min_error_statement = 'error'
83 | log_connections = false
84 | log_line_prefix = '<%t>'
85 | log_statement = all
86 | log_directory = '/var/log/pglog'
87 |
88 | * Make sure log_directory exists and has correct permissions
89 | * Restart postgres
90 |
91 | """
92 |
93 | # Copyright (c) 2010 Matt Harrison
94 | from itertools import islice, repeat, izip, cycle
95 | import json
96 | import optparse
97 | import re
98 | import sys
99 | import time
100 |
101 | import psycopg2
102 |
103 | import meta
104 |
105 | STATEMENT = 'statement:'
106 | SEP = '-----\n'
107 |
108 | def process_dburl(dburl):
109 | """
110 | Convert sqlalchemy style urls to psycopg2 friendly params
111 | >>> process_dburl('postgres://matt:password@localhost:9999/testdb')
112 | ('localhost', 'testdb', 'matt', 'password', '9999')
113 | >>> process_dburl('postgres://matt/testdb')
114 | (None, 'testdb', 'matt', None, None)
115 | >>> process_dburl('postgres://matt')
116 | (None, '', 'matt', None, None)
117 | >>> process_dburl('postgres://matt:password')
118 | (None, '', 'matt', 'password', None)
119 | >>> process_dburl('postgres://matt@localhost')
120 | ('localhost', '', 'matt', None, None)
121 | >>> process_dburl('postgres://matt@localhost:9999')
122 | ('localhost', '', 'matt', None, '9999')
123 | """
124 |     regex = re.compile(r'^postgres://(?P<name>\w+):?(?P<pass>\w+)?@?(?P<host>\w+)?:?(?P<port>\d\d\d\d)?/?(?P<db>\w*)')
125 |     match = regex.search(dburl)
126 | 
127 |     host = match.group('host')
128 |     dbname = match.group('db')
129 |     user = match.group('name')
130 |     password = match.group('pass')
131 |     port = match.group('port')
132 | return host, dbname, user, password, port
133 |
134 | def get_conn_string(dburl):
135 | """
136 | >>> get_conn_string('postgres://matt:password@localhost:9999/testdb')
137 | 'dbname=testdb user=matt password=password host=localhost port=9999'
138 | """
139 |     host, dbname, user, password, port = process_dburl(dburl)
140 |     values = dict(dbname=dbname, user=user, password=password,
141 |                   host=host, port=port)
142 |     conn = []
143 |     for param in ['dbname', 'user', 'password', 'host', 'port']:
144 |         value = values.get(param)
145 |         if value:
146 |             conn.append('%s=%s' % (param, value))
147 |     return ' '.join(conn)
148 |
149 | class Analyzer(object):
150 | def __init__(self, query, dburl=None):
151 | self.dburl = dburl
152 | self.query = query
153 | self.host = None
154 | self.dbname = None
155 | self.user = None
156 | self.password = None
157 |
158 |     def _process_dburl(self):
159 |         self.host, self.dbname, self.user, self.password, self.port = process_dburl(self.dburl)
168 |
169 | def run_config(self, config, fout):
170 | results = []
171 | for settings in config.gen_settings():
172 | setting_dict = dict(settings)
173 | with TweakContext(self.dburl, setting_dict) as tweaker:
174 | lines = list(x[0] for x in tweaker.run(add_explain_analyze(self.query)))
175 | results.append({'query':self.query,
176 | 'settings':setting_dict,
177 | 'time':tweaker.elapsed,
178 | 'results':lines})
179 | fout.write(json.dumps(results, indent=2))
180 |
181 |
182 |     def analyze(self):
183 |         conn = psycopg2.connect(get_conn_string(self.dburl))
184 |         cur = conn.cursor()
185 |         query = add_explain_analyze(self.query)
186 |         cur.execute(query)
187 |         results = [x[0] for x in cur.fetchall()]
188 |         cur.close()
189 |         conn.close()
190 |         return results
194 |
195 | def add_explain_analyze(query):
196 | for mod in ['ANALYZE', 'EXPLAIN']:
197 | if mod not in query:
198 | query = '%s %s' %(mod, query)
199 | return query
200 |
201 |
202 | class Config(object):
203 | """
204 |     A config is a bunch of setting names with possible values.
205 |     Calling gen_settings iterates over the combinations of those settings.
206 | 
207 |     The config file is persisted as a JSON file.
208 |
209 | >>> c = Config()
210 | >>> c.add_setting('foo', ['1', '100'])
211 | >>> c.add_setting('bar', ['2', '200'])
212 | >>> list(c.gen_settings())
213 | [[('foo', '1'), ('bar', '2')], [('foo', '1'), ('bar', '200')], [('foo', '100'), ('bar', '2')], [('foo', '100'), ('bar', '200')]]
214 | """
215 | def __init__(self):
216 | self.settings = []
217 |
218 | def add_setting(self, name, param_values):
219 | self.settings.append((name, param_values))
220 |
221 | def gen_settings(self):
222 | setting_groups = []
223 | for key, values in self.settings:
224 | setting_groups.append([(key,y) for y in values])
225 | for combo in iterCombinations(*setting_groups):
226 | yield combo
227 |
228 | def write(self, fout):
229 | fout.write(json.dumps(self.settings, indent=2))
230 |
231 | def load_config(fin):
232 | c = Config()
233 | c.settings = json.loads(fin.read())
234 | return c
235 |
236 | def iterCombinations(*iterables, **kwds):
237 | '''
238 | >>> #list(iterCombinations(*[[('a',1), ('a',2)],[('b', 1), ('b', 2)]]))
239 | >>> #list(iterCombinations([('a',1), ('a',2)],[('b', 1), ('b', 2)]))
240 | >>> list(iterCombinations(range(5), 'abc'))
241 | [[0, 'a'], [0, 'b'], [0, 'c'], [1, 'a'], [1, 'b'], [1, 'c'], [2, 'a'], [2, 'b'], [2, 'c'], [3, 'a'], [3, 'b'], [3, 'c'], [4, 'a'], [4, 'b'], [4, 'c']]
242 | http://code.activestate.com/recipes/501151-generating-combinations-in-blocks/
243 | Generates the combinations of the given iterables.
244 |
245 | @returns: An iterator over all combinations of the C{iterables}. Each yielded
246 | combination is a list of size C{len(iterables)} where the i-th element
247 | is drawn from the i-th iterable. B{Important note:} This list should not
248 | be modified between iterations; if you need to modify it, copy it to a
249 | new container.
250 |
251 | @param iterables: One or more arbitrary iterables.
252 | @param kwds: Currently only 'blocksize' allowed.
253 | @keyword blocksize: Determines the order of the yielded combinations. By
254 | default (C{blocksize=1}), the first iterable corresponds to the most outer
255 | loop (slowest change) and the last iterable corresponds to the most inner
256 | loop (fastest change).
257 |
258 | For larger blocksize, each iterable is first partitioned into consecutive
259 | blocks of size C{blocksize} (except perhaps for the last block which may
260 | be shorter). Then each combination is yielded by first iterating over
261 | each block combination C{B := (B1,B2,..Bn)} and then yielding each
262 | combination from B.
263 |
264 | More generally, C{blocksize} can be an iterable so that different
265 | C{iterables} can be partitioned by different block size. In this case,
266 | C{blocksize} is repeated as many time as necessary to match C{len(iterables)}.
267 | For instance::
268 | iterCombinations(range(4),range(6),range(8),'xyz', blocksize=(2,3))
269 | partitions C{range(4)} and C{range(8)} with blocksize=2, while C{range(6)}
270 | and 'xyz' are partitioned with blocksize=3.
271 | '''
272 | combo = [None] * len(iterables)
273 | blocksize = kwds.get('blocksize', 1)
274 | if isinstance(blocksize, int):
275 | sizes = repeat(blocksize)
276 | else:
277 | sizes = cycle(iter(blocksize))
278 | block_lists = [list(_iterblocks(it,sz)) for it,sz in izip(iterables,sizes)]
279 | for block_combo in _iterCombinations(block_lists, [None] * len(iterables)):
280 | #for block_combo in _iterCombinations(block_lists, len(iterables)):
281 | combo = [None] * len(iterables)
282 | for _ in _iterCombinations(block_combo, combo):
283 | #for _ in _iterCombinations(block_combo, len(iterables)):
284 | yield combo
285 |
286 |
287 | def _iterCombinations(groups, combo_list, index=0):
288 | """
289 | >>> list(_iterCombinations('abc', [None, None, None]))
290 | [['a', 'b', 'c']]
291 | >>> #list(_iterCombinations('abc', 1))
292 |
293 | # note that combo_list persists over time... hence this problem
294 | >>> list(_iterCombinations(['abc', 'xyz'], [None]* 2))
295 | [['c', 'z'], ['c', 'z'], ['c', 'z'], ['c', 'z'], ['c', 'z'], ['c', 'z'], ['c', 'z'], ['c', 'z'], ['c', 'z']]
296 |
297 | """
298 | # generate recursively all combinations of groups, updating combo_list
299 | # *in-place* for each combination.
300 | if index < len(groups)-1:
301 | for x in groups[index]:
302 | combo_list[index] = x
303 | for foo in _iterCombinations(groups,combo_list,index+1):
304 | yield combo_list
305 | else: # optimization to avoid the last level of recursion
306 | assert index == len(groups)-1
307 | for x in groups[index]:
308 | combo_list[index] = x
309 | yield combo_list
310 |
311 |
312 | def _iterblocks(iterable, blocksize, factory=tuple):
313 | """
314 | >>> list(_iterblocks('abc', 1))
315 | [('a',), ('b',), ('c',)]
316 | >>> list(_iterblocks('abc', 2))
317 | [('a', 'b'), ('c',)]
318 | >>> list(_iterblocks('abc', 4)) == list(_iterblocks('abc', 3))
319 | True
320 | """
321 | # split the iterable into blocks of blocksize
322 | iterable = iter(iterable)
323 | while True:
324 | block = factory(islice(iterable,blocksize))
325 | if not block: break
326 | yield block
327 | if len(block) < blocksize: break
328 |
329 | def analyze_queries(query_lines, dburl, config_file, fout):
330 | config = load_config(config_file)
331 | for query in query_lines:
332 | a = Analyzer(query, dburl)
333 | a.run_config(config, fout)
334 |
335 | def _get_qf_queries(fin, query_separator=SEP):
336 | """
337 | Yield lines of a single query
338 | """
339 | lines = []
340 | for line in fin:
341 | if line == query_separator:
342 | if lines:
343 | yield ''.join(lines)
344 | lines = []
345 | else:
346 | lines.append(line)
347 | if lines:
348 | yield ''.join(lines)
349 |
350 |
351 | def get_log_queries(fin, fout, query_separator=SEP):
352 | for q_lines in _query_lines(fin):
353 | for line in q_lines:
354 | fout.write(line)
355 |         fout.write(query_separator)
356 |
357 | def _query_lines(fin):
358 | """
359 | Given a query log yield lines of a single query
360 | """
361 | in_q = False
362 | lines = []
363 | for line in fin:
364 | if in_q and not line.startswith('\t'):
365 | in_q = False
366 | if lines:
367 | yield lines
368 | lines = []
369 | if not in_q and STATEMENT in line:
370 | in_q = True
371 | start_idx = line.index(STATEMENT) + len(STATEMENT)
372 | line = line[start_idx:]
373 | if in_q:
374 | lines.append(line)
375 | if lines:
376 | yield lines
377 |
378 | def get_expected_real_row(lines):
379 | """
380 | >>> get_expected_real_row(['Seq Scan on promo_dim (cost=0.00..1.51 rows=51 width=11) (actual time=0.004..0.052 rows=54 loops=1)', 'Total runtime: 0.150 ms'])
381 | (51, 54)
382 | """
383 | ACT = 'actual time='
384 |     regex = re.compile(r'.*\(cost=.* rows=(?P<rows_est>\d*) .*\) \(actual.* rows=(?P<rows_real>\d*) .*\).*')
385 | for line in lines:
386 | if ACT in line:
387 | est = regex.search(line).group('rows_est')
388 | real = regex.search(line).group('rows_real')
389 | return int(est), int(real)
390 | return None
391 |
392 |
393 | def get_runtime(lines):
394 | """
395 | >>> get_runtime(['Seq Scan on promo_dim (cost=0.00..1.51 rows=51 width=11) (actual time=0.004..0.052 rows=51 loops=1)', 'Total runtime: 0.150 ms']) <= 0.15
396 | True
397 | """
398 | TOT = 'Total runtime: '
399 | for line in lines:
400 | if line.startswith(TOT):
401 | assert line.endswith(' ms')
402 | idx = line.index(TOT) + len(TOT)
403 | return float(line[idx:-3])
404 | return None
405 |
406 | def run_sql(cur, sql):
407 | cur.execute(sql)
408 | try:
409 | for row in cur.fetchall():
410 | yield row
411 |     except psycopg2.ProgrammingError as e:
412 |         if 'no results to fetch' in str(e):
413 | pass
414 | else:
415 | raise
416 |
417 | class TweakContext(object):
418 | def __init__(self, dburl, settings_dict):
419 | self.dburl = dburl
420 | self.settings_dict = settings_dict
421 | self.conn = None
422 | self.cur = None
423 | self.old_settings = {}
424 | self.elapsed = None
425 |
426 | def __enter__(self):
427 | self.conn = psycopg2.connect(get_conn_string(self.dburl))
428 | self.cur = self.conn.cursor()
429 | # get old db settings
430 | for key in self.settings_dict:
431 | sql = 'SHOW %s' % key
432 | results = list(run_sql(self.cur, sql))
433 | self.old_settings[key] = results[0][0]
434 |
435 | # set database settings
436 | for key, value in self.settings_dict.items():
437 | sql = "SET %s = '%s'" %(key, value)
438 | results = list(run_sql(self.cur, sql))
439 |
440 |         # return self so the "as" target of the with block is bound
441 | return self
442 |
443 | def run(self, query):
444 | start = time.time()
445 | for row in run_sql(self.cur, query):
446 | yield row
447 | self.elapsed = time.time() - start
448 |
449 | def __exit__(self, *args):
450 | # set old db settings
451 | for key, value in self.old_settings.items():
452 | sql = "SET %s = '%s'" %(key, value)
453 | results = list(run_sql(self.cur, sql))
454 |
455 | self.cur.close()
456 | self.conn.close()
457 |
458 |
459 |
460 | def main(prog_args):
461 | parser = optparse.OptionParser(version=meta.__version__)
462 | extract_group = optparse.OptionGroup(parser, 'Extract Queries from psql log')
463 | extract_group.add_option('--extract-queries', action='store_true', help='extract queries from a log file')
464 | extract_group.add_option('--log-file', help='specify log file')
465 | parser.add_option_group(extract_group)
466 |
467 | analyze_group = optparse.OptionGroup(parser, 'Analyze extracted queries')
468 | analyze_group.add_option('--analyze-queries', action='store_true', help='analyze query file (or stdin)')
469 | analyze_group.add_option('--query-file', help='specify query file (file with queries delimited by %s or stdin)' % SEP)
470 |     analyze_group.add_option('--dburl', help='SQLAlchemy-style connection string (we do not use SQLAlchemy, just its URL format, e.g. postgres://postgres:password@localhost:port/dbname)')
471 | analyze_group.add_option('--config-file', help='specify config file containing parameters for analyzing')
472 | parser.add_option_group(analyze_group)
473 |
474 |     parser.add_option('--gen-config', action='store_true', help='generate a sample config file with work_mem and maintenance_work_mem')
475 |
476 | parser.add_option('--test', action='store_true', help='run doctests')
477 |
478 |
479 |     opt, args = parser.parse_args(prog_args[1:])  # drop the program name
480 |
481 | if opt.extract_queries:
482 | if not opt.log_file:
483 |             sys.stderr.write('Specify --log-file\n')
484 |             return 1
485 |         fin = open(opt.log_file)
486 |         fout = sys.stdout
487 |         get_log_queries(fin, fout)
488 |         fin.close()
489 |
490 | if opt.analyze_queries:
491 | if not opt.query_file:
492 | fin = sys.stdin
493 | else:
494 | fin = open(opt.query_file)
495 | if not opt.config_file:
496 |             parser.error('Specify --config-file (--gen-config to generate sample)')
497 |
498 | analyze_queries(_get_qf_queries(fin), opt.dburl, open(opt.config_file), sys.stdout)
499 |
500 | if opt.gen_config:
501 | c = Config()
502 | c.add_setting('work_mem', ['1MB', '512MB'])
503 | c.add_setting('maintenance_work_mem', ['1MB', '512MB'])
504 | c.write(sys.stdout)
505 |
506 | if opt.test:
507 | import doctest
508 | doctest.testmod()
509 |
510 | if __name__ == '__main__':
511 | sys.exit(main(sys.argv))
512 |
513 |
--------------------------------------------------------------------------------
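
For completeness, a sketch of how the pieces in __init__.py fit together
when driven directly, mirroring what Analyzer.run_config does for each
combination in a config (assumes a reachable database; the dburl is an
example value):

    import sys

    import pgtweaklib

    dburl = 'postgres://postgres@localhost/testetl'  # example value
    settings = {'work_mem': '512MB'}

    with pgtweaklib.TweakContext(dburl, settings) as tweaker:
        query = pgtweaklib.add_explain_analyze('SELECT 1')
        for row in tweaker.run(query):
            sys.stdout.write('%s\n' % row[0])
    print('elapsed: %.3fs' % tweaker.elapsed)
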