├── indexdigest ├── test │ ├── cli │ │ ├── __init__.py │ │ └── test_script.py │ ├── core │ │ ├── __init__.py │ │ ├── test_utils.py │ │ ├── test_columns.py │ │ ├── test_indices.py │ │ ├── test_query.py │ │ └── test_database.py │ ├── linters │ │ ├── __init__.py │ │ ├── test_0164_empty_database.py │ │ ├── test_0074_single_column.py │ │ ├── test_0036_use_innodb.py │ │ ├── test_0094_generic_primary_key.py │ │ ├── test_0089_empty_tables.py │ │ ├── test_0002_not_used_indices.py │ │ ├── test_0026_full_table_scan.py │ │ ├── test_0034_missing_primary_index.py │ │ ├── test_0118_high_offset_selects.py │ │ ├── test_0075_test_tables.py │ │ ├── test_0031_low_cardinality_index.py │ │ ├── test_0028_data_too_old.py │ │ ├── test_0092_select_star.py │ │ ├── test_0019_queries_not_using_indices.py │ │ ├── test_0028_data_not_updated_recently.py │ │ ├── test_0027_selects_with_like.py │ │ ├── test_0093_having_clause.py │ │ ├── test_0004_redundant_indices.py │ │ ├── test_0070_insert_ignore.py │ │ ├── test_0020_big_table.py │ │ ├── test_0032_utf_latin_columns.py │ │ └── test_0006_not_used_columns_and_tables.py │ ├── test_schema.py │ ├── test_0107_schema_partitions.py │ ├── test_0089_handle_sql_errors.py │ ├── formatters │ │ ├── test_yaml.py │ │ ├── __init__.py │ │ ├── test_plain.py │ │ └── test_syslog.py │ ├── test_0004_redundant_indices_core.py │ └── __init__.py ├── __init__.py ├── formatters │ ├── __init__.py │ ├── yaml.py │ ├── syslog.py │ └── plain.py ├── cli │ ├── __init__.py │ ├── add_linter.py │ └── script.py ├── linters │ ├── linter_0074_single_column.py │ ├── linter_0089_empty_tables.py │ ├── linter_0075_test_tables.py │ ├── linter_0036_use_innodb.py │ ├── linter_0164_empty_database.py │ ├── linter_0034_missing_primary_index.py │ ├── linter_0094_generic_primary_key.py │ ├── linter_0118_high_offset_selects.py │ ├── linter_0093_having_clause.py │ ├── linter_0002_not_used_indices.py │ ├── linter_0027_selects_with_like.py │ ├── linter_0092_select_star.py │ ├── __init__.py │ ├── linter_0026_full_table_scan.py │ ├── linter_0032_utf_latin_columns.py │ ├── linter_0019_queries_not_using_indices.py │ ├── linter_0070_insert_ignore.py │ ├── linter_0004_redundant_indices.py │ ├── linter_0028_data_not_updated_recently.py │ ├── linter_0020_filesort_temporary_table.py │ ├── linter_0031_low_cardinality_index.py │ ├── linter_0028_data_too_old.py │ └── linter_0006_not_used_columns_and_tables.py ├── utils.py ├── schema.py └── database.py ├── MANIFEST.in ├── pylintrc ├── .coveragerc ├── .pylintrc ├── sql ├── 0026-full-table-scan-log ├── 0092-select-star-log ├── 0002-not-used-indices-log ├── 0027-selects-with-like-log ├── 0074-single-column.sql ├── 0006-not-used-columns-and-tables-log ├── 0075-test-tables.sql ├── 0093-having-clause-log ├── populate.sh ├── 0020-big-table-log ├── 0070-insert-ignore-log.sql ├── 0020-big-table.sql ├── 0036-use-innodb.sql ├── 0070-insert-ignore-log ├── 0089-empty-tables.sql ├── 0098-handle-sql-errors-log ├── 0002-not-used-indices.sql ├── 0107-schema-partitions.sql ├── 0028-data-not-updated-recently.sql ├── 0094-generic-primary-key.sql ├── 0118-high-offset-selects-log ├── README.md ├── 0019-queries-not-using-indices.sql ├── 0006-not-used-columns-and-tables.sql ├── 0000-core.sql ├── 0034-missing-primary-index.sql ├── 0019-queries-not-using-indices-log ├── 0032-utf-latin-columns.sql ├── 0004-redundant-indices.sql └── 0028-data-too-old.sql ├── .whitesource ├── setup.sql ├── .dockerignore ├── CONTRIBUTING.md ├── .github ├── dependabot.yml └── workflows │ ├── push-to-ghcr.yml │ ├── python-publish.yml │ ├── 
dependabot-automerge.yml │ ├── dockerimage.yml │ ├── tests.yml │ └── python.yml ├── Dockerfile ├── LICENSE ├── Makefile ├── .gitignore └── setup.py /indexdigest/test/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /indexdigest/test/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /indexdigest/test/linters/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | prune indexdigest/test 3 | -------------------------------------------------------------------------------- /indexdigest/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | index_digest Python module 3 | """ 4 | VERSION = '1.7.0' 5 | -------------------------------------------------------------------------------- /pylintrc: -------------------------------------------------------------------------------- 1 | [MESSAGES CONTROL] 2 | # Messages to disable 3 | disable = consider-using-f-string,fixme,use-dict-literal,R0801 4 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | # https://coverage.readthedocs.io/en/latest/source.html#source 3 | omit = 4 | indexdigest/cli/add_linter.py 5 | indexdigest/test/* 6 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MESSAGES CONTROL] 2 | disable=too-few-public-methods,fixme,no-member,duplicate-code,useless-object-inheritance,use-dict-literal 3 | [DESIGN] 4 | max-args=7 5 | -------------------------------------------------------------------------------- /sql/0026-full-table-scan-log: -------------------------------------------------------------------------------- 1 | -- full table scan 2 | SELECT * FROM 0020_big_table 3 | SELECT * FROM 0020_big_table LIMIT 5 4 | -- using index 5 | SELECT * FROM 0020_big_table WHERE item_id = 1 -------------------------------------------------------------------------------- /indexdigest/formatters/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains formatters used by CLI script 3 | """ 4 | # expose formatters 5 | from .plain import format_plain 6 | from .syslog import format_syslog 7 | from .yaml import format_yaml 8 | -------------------------------------------------------------------------------- /sql/0092-select-star-log: -------------------------------------------------------------------------------- 1 | -- report these as select queries with * 2 | SELECT * FROM foo; 3 | SELECT t.* FROM bar AS t; 4 | 5 | -- false positives 6 | SELECT 3 * 3; 7 | SELECT count(*) FROM foo; 8 | SELECT /* foo */ test FROM foo; 9 | -------------------------------------------------------------------------------- /.whitesource: -------------------------------------------------------------------------------- 1 | { 2 | "scanSettings": { 3 | "baseBranches": [] 4 | }, 5 | "checkRunSettings": { 6 | 
"vulnerableCheckRunConclusionLevel": "failure", 7 | "displayMode": "diff" 8 | }, 9 | "issueSettings": { 10 | "minSeverityLevel": "LOW" 11 | } 12 | } -------------------------------------------------------------------------------- /sql/0002-not-used-indices-log: -------------------------------------------------------------------------------- 1 | select * from `0002_not_used_indices` order by item_id 2 | select * from `0002_not_used_indices` where foo = 'foo' and item_id = 2 3 | select count(*) from `0002_not_used_indices` where foo = 'foo' 4 | select * from `0002_not_used_indices` where bar = 'foo' 5 | -------------------------------------------------------------------------------- /sql/0027-selects-with-like-log: -------------------------------------------------------------------------------- 1 | -- uses an index 2 | SELECT * FROM 0020_big_table WHERE text LIKE '00%' 3 | -- does not use an index 4 | SELECT * FROM 0020_big_table WHERE text LIKE '%00' 5 | -- does not use an index, but is not a LIKE query 6 | SELECT * FROM 0020_big_table WHERE val > 50 7 | -------------------------------------------------------------------------------- /sql/0074-single-column.sql: -------------------------------------------------------------------------------- 1 | -- Report tables with just a single column 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/74 4 | DROP TABLE IF EXISTS `0074_bag_of_ints`; 5 | CREATE TABLE `0074_bag_of_ints` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | PRIMARY KEY (`item_id`) 8 | ) CHARSET=utf8; 9 | -------------------------------------------------------------------------------- /sql/0006-not-used-columns-and-tables-log: -------------------------------------------------------------------------------- 1 | INSERT INTO bar VALUES(1, 'foo', 'bar', 'test'); 2 | SELECT /* a comment */ foo FROM `0006_not_used_columns` WHERE item_id = 1; 3 | SELECT test, item_id FROM `0006_not_used_columns` WHERE foo = 'a'; 4 | -- query with an error: #1146 - "Table 'index_digest.t' doesn't exist" 5 | SELECT test FROM t; -------------------------------------------------------------------------------- /indexdigest/test/test_schema.py: -------------------------------------------------------------------------------- 1 | from indexdigest.schema import Column 2 | 3 | 4 | def test_column_int_column_normalization(): 5 | col = Column(name='foo', column_type='int') 6 | assert col.type == 'int' 7 | 8 | # normalize int(N) from MySQL 8.0.16 and older to int 9 | col = Column(name='foo', column_type='int(11)') 10 | assert col.type == 'int' 11 | -------------------------------------------------------------------------------- /sql/0075-test-tables.sql: -------------------------------------------------------------------------------- 1 | -- Report tables with "test" word in their name 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/75 4 | DROP TABLE IF EXISTS `0075_some_guy_test_table`; 5 | CREATE TABLE `0075_some_guy_test_table` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | `name` varchar(255) NOT NULL, 8 | PRIMARY KEY (`item_id`) 9 | ) CHARSET=utf8; 10 | -------------------------------------------------------------------------------- /indexdigest/cli/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | A module containing command-line tool 3 | """ 4 | import logging 5 | 6 | from os import getenv 7 | 8 | if getenv('DEBUG') == '1': # pragma: no cover 9 | logging.basicConfig( 10 | level=logging.DEBUG, 11 | format='%(asctime)s 
%(name)-35s %(levelname)-8s %(message)s', 12 | datefmt="%Y-%m-%d %H:%M:%S" 13 | ) 14 | -------------------------------------------------------------------------------- /setup.sql: -------------------------------------------------------------------------------- 1 | -- create databases 2 | CREATE DATABASE IF NOT EXISTS index_digest; 3 | CREATE DATABASE IF NOT EXISTS index_digest_empty; -- #146 4 | 5 | -- create a user and grant access to our databases 6 | CREATE USER 'index_digest'@'%' IDENTIFIED BY 'qwerty'; 7 | 8 | GRANT ALL ON index_digest.* TO 'index_digest'@'%'; 9 | GRANT ALL ON index_digest_empty.* TO 'index_digest'@'%'; 10 | -------------------------------------------------------------------------------- /sql/0093-having-clause-log: -------------------------------------------------------------------------------- 1 | -- Rewriting the query's HAVING clause into a predicate will enable the use of indexes during query processing. 2 | SELECT * FROM foo HAVING bar = 2; 3 | SELECT s.cust_id,count(s.cust_id) FROM SH.sales s GROUP BY s.cust_id HAVING s.cust_id != '1660' AND s.cust_id != '2' 4 | SELECT * FROM `0019_queries_not_using_indices` WHERE foo = 'foo' HAVING bar = 'test'; 5 | -------------------------------------------------------------------------------- /sql/populate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script is used to populate the MySQL instance with the content of all SQL files in this directory 3 | FILES=`ls sql/*.sql` 4 | 5 | for FILE in $FILES 6 | do 7 | echo -n "* Importing ${FILE} ... " 8 | cat $FILE | mysql --protocol=tcp --port=53306 -uindex_digest -pqwerty index_digest 2>&1 | grep -v "Using a password" 9 | echo "done" 10 | done 11 | -------------------------------------------------------------------------------- /sql/0020-big-table-log: -------------------------------------------------------------------------------- 1 | -- Using where; Using index -- and that's good :) 2 | SELECT count(*) FROM 0020_big_table WHERE item_id BETWEEN 10 AND 20 3 | -- Using where; Using filesort 4 | SELECT * FROM 0020_big_table WHERE item_id BETWEEN 10 AND 20 ORDER BY val 5 | -- Using where; Using temporary; Using filesort 6 | SELECT val, count(*) FROM 0020_big_table WHERE item_id BETWEEN 10 AND 20 GROUP BY val ORDER BY val 7 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # hidden files and directories 2 | .* 3 | 4 | # shell scripts and Makefile 5 | *.sh 6 | Makefile 7 | 8 | # virtual environments 9 | env* 10 | 11 | # Python-specific stuff 12 | *.egg-info/ 13 | dist/ 14 | htmlcov/ 15 | coverage.xml 16 | 17 | # SQL fixtures and tests 18 | setup.sql 19 | sql/ 20 | 21 | # other stuff 22 | *.log 23 | CONTRIBUTING.md 24 | Dockerfile 25 | MANIFEST* 26 | 27 | **/__pycache__ 28 | **/test 29 | hooks/ 30 | -------------------------------------------------------------------------------- /sql/0070-insert-ignore-log.sql: -------------------------------------------------------------------------------- 1 | -- Report queries using INSERT IGNORE 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/70 4 | -- https://medium.com/legacy-systems-diary/things-to-avoid-episode-1-insert-ignore-535b4c24406b 5 | DROP TABLE IF EXISTS `0070_insert_ignore`; 6 | CREATE TABLE `0070_insert_ignore` ( 7 | `item_id` int(9) NOT NULL, 8 | `text` char(5) NOT NULL, 9 | `time` DATETIME, 10 | UNIQUE KEY (`item_id`) 11 | 
) CHARSET=utf8; 12 | -------------------------------------------------------------------------------- /indexdigest/test/test_0107_schema_partitions.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.test import DatabaseTestMixin 6 | 7 | 8 | class TestSchemaWithPartition(TestCase, DatabaseTestMixin): 9 | 10 | def test_schema_partitions(self): 11 | schema = self.connection.get_table_schema('0107_schema_partitions') 12 | print(schema) 13 | 14 | assert '/*!50100' not in schema 15 | -------------------------------------------------------------------------------- /sql/0020-big-table.sql: -------------------------------------------------------------------------------- 1 | -- Report queries that use filesort or a temporary file 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/20 4 | DROP TABLE IF EXISTS `0020_big_table`; 5 | CREATE TABLE `0020_big_table` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | `val` int(9) NOT NULL, 8 | `text` char(5) NOT NULL, 9 | `num` int(3) NOT NULL, 10 | PRIMARY KEY (`item_id`), 11 | KEY text_idx (`text`), 12 | KEY num_idx (`num`) -- low cardinality (#31) 13 | ) CHARSET=utf8; 14 | -------------------------------------------------------------------------------- /sql/0036-use-innodb.sql: -------------------------------------------------------------------------------- 1 | -- Report MyISAM tables and suggest using InnoDB 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/36 4 | DROP TABLE IF EXISTS `0036_use_innodb_myisam`; 5 | CREATE TABLE `0036_use_innodb_myisam` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | `foo` int(8), 8 | PRIMARY KEY (`item_id`) 9 | ) ENGINE=MyISAM; 10 | 11 | DROP TABLE IF EXISTS `0036_use_innodb`; 12 | CREATE TABLE `0036_use_innodb` ( 13 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 14 | `foo` int(8), 15 | PRIMARY KEY (`item_id`) 16 | ); 17 | -------------------------------------------------------------------------------- /sql/0070-insert-ignore-log: -------------------------------------------------------------------------------- 1 | -- inserts with the IGNORE flag 2 | INSERT IGNORE INTO `0070_insert_ignore` VALUES (9, '123', '2017-01-01'); 3 | /* foo */ INSERT IGNORE INTO `0070_insert_ignore` VALUES (9, '123', '2017-01-01'); 4 | INSERT IGNORE INTO `0070_insert_ignore` VALUES ('123', 9, '2017-01-01'); 5 | INSERT /* foo */ IGNORE INTO `0070_insert_ignore` VALUES ('2017-01-01', 9, 123); 6 | -- no IGNORE flag 7 | /* INSERT IGNORE */ INSERT INTO `0070_insert_ignore` VALUES ('2017-01-01', 9, 123); 8 | INSERT INTO `0070_insert_ignore` VALUES ('INSERT IGNORE', 9, 123); 9 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Developer notes 2 | =============== 3 | 4 | ## Testing locally with various versions of MySQL 5 | 6 | Assume that you want to test `index-digest` locally against MySQL v5.5: 7 | 8 | ``` 9 | docker pull mysql:5.5 10 | sudo service mysql stop 11 | docker run -e MYSQL_ALLOW_EMPTY_PASSWORD=yes -d -p 3306:3306 mysql:5.5 12 | ``` 13 | 14 | Wait for the MySQL instance to start up.
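For example (a sketch, assuming the `mysqladmin` client is installed on the host), you can poll the instance until it accepts connections:

```
until mysqladmin --protocol=tcp -u root ping 2>/dev/null; do sleep 1; done
```

`mysqladmin ping` prints `mysqld is alive` once the server is ready.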
Then from the repository's main directory run: 15 | 16 | ``` 17 | mysql --protocol=tcp -u root -v < setup.sql 18 | ./sql/populate.sh 19 | make sql-console 20 | ``` 21 | -------------------------------------------------------------------------------- /sql/0089-empty-tables.sql: -------------------------------------------------------------------------------- 1 | -- Report empty tables 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/89 4 | DROP TABLE IF EXISTS `0089_empty_table`; 5 | CREATE TABLE `0089_empty_table` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | `foo` int(9), 8 | PRIMARY KEY (`item_id`) 9 | ); 10 | 11 | DROP TABLE IF EXISTS `0089_not_empty_table`; 12 | CREATE TABLE `0089_not_empty_table` ( 13 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 14 | `foo` int(9) DEFAULT 0, 15 | PRIMARY KEY (`item_id`) 16 | ); 17 | 18 | INSERT INTO 0089_not_empty_table VALUES (1, NULL), (2, 5), (42, 56); 19 | -------------------------------------------------------------------------------- /sql/0098-handle-sql-errors-log: -------------------------------------------------------------------------------- 1 | -- ERROR 1140 (42000): In aggregated query without GROUP BY, expression #1 of SELECT list contains nonaggregated column 'index_digest.0020_big_table.val'; this is incompatible with sql_mode=only_full_group_by 2 | SELECT val, count(*) FROM `0020_big_table` WHERE item_id BETWEEN 10 AND 20; 3 | 4 | -- query with aliases 5 | SELECT t.val as value, count(*) FROM `0020_big_table` as t WHERE item_id BETWEEN 10 AND 20 GROUP BY val; 6 | SELECT val as value, count(*) FROM `0020_big_table` WHERE item_id BETWEEN 10 AND 20 GROUP BY val; 7 | 8 | -- invalid syntax 9 | SELEKT foo FROM bar; 10 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Basic setup 2 | # https://help.github.com/en/github/administering-a-repository/configuration-options-for-dependency-updates#package-ecosystem 3 | 4 | version: 2 5 | updates: 6 | 7 | # Maintain PyPI dependencies 8 | - package-ecosystem: "pip" 9 | directory: "/" 10 | schedule: 11 | interval: "daily" 12 | 13 | # GitHub Actions 14 | - package-ecosystem: "github-actions" 15 | directory: "/" 16 | schedule: 17 | interval: daily 18 | 19 | # Dockerfile 20 | - package-ecosystem: "docker" 21 | directory: "/" 22 | schedule: 23 | interval: daily 24 | -------------------------------------------------------------------------------- /sql/0002-not-used-indices.sql: -------------------------------------------------------------------------------- 1 | -- Report not used indices 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/2 4 | DROP TABLE IF EXISTS `0002_not_used_indices`; 5 | CREATE TABLE `0002_not_used_indices` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | `foo` varchar(16) NOT NULL DEFAULT '', 8 | `test` varchar(16) NOT NULL DEFAULT '', 9 | `bar` varchar(16), 10 | PRIMARY KEY (`item_id`), 11 | KEY `test_id_idx` (`test`, `item_id`), 12 | KEY `foo_id_idx` (`foo`, `item_id`) 13 | ); 14 | 15 | INSERT INTO 0002_not_used_indices VALUES 16 | (NULL, 'test', '', NULL), 17 | (NULL, 'foo', 'test', NULL), 18 | (NULL, 'foo', '', NULL); 19 | -------------------------------------------------------------------------------- /sql/0107-schema-partitions.sql: -------------------------------------------------------------------------------- 1 | -- Handle tables with partitions 2 | -- 3 | --
https://github.com/macbre/index-digest/issues/107 4 | DROP TABLE IF EXISTS `0107_schema_partitions`; 5 | CREATE TABLE `0107_schema_partitions` ( 6 | firstname VARCHAR(25) NOT NULL, 7 | lastname VARCHAR(25) NOT NULL, 8 | username VARCHAR(16) NOT NULL, 9 | email VARCHAR(35), 10 | joined DATE NOT NULL 11 | ) CHARSET=utf8 12 | PARTITION BY RANGE( YEAR(joined) ) ( 13 | PARTITION p0 VALUES LESS THAN (1960), 14 | PARTITION p1 VALUES LESS THAN (1970), 15 | PARTITION p2 VALUES LESS THAN (1980), 16 | PARTITION p3 VALUES LESS THAN (1990), 17 | PARTITION p4 VALUES LESS THAN MAXVALUE 18 | ); 19 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0164_empty_database.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0164_empty_database import check_empty_database 6 | from indexdigest.test import DatabaseTestMixin 7 | 8 | 9 | class TestLinter(TestCase, DatabaseTestMixin): 10 | 11 | def test_empty_database(self): 12 | reports = list(check_empty_database(self.connection)) 13 | 14 | print(reports, reports[0].context) 15 | 16 | assert len(reports) == 1 17 | 18 | assert str(reports[0]) == 'index_digest_empty: "index_digest_empty" database has no tables' 19 | assert reports[0].table_name == 'index_digest_empty' 20 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0074_single_column.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter reports tables with just a single column 3 | """ 4 | from indexdigest.utils import LinterEntry 5 | 6 | 7 | def check_single_column(database): 8 | """ 9 | :type database indexdigest.database.Database 10 | :rtype: list[LinterEntry] 11 | """ 12 | tables = [ 13 | table 14 | for table in database.get_tables() 15 | if len(database.get_table_columns(table)) == 1 16 | ] 17 | 18 | for table in tables: 19 | yield LinterEntry(linter_type='single_column', table_name=table, 20 | message='"{}" has just a single column'. 21 | format(table), 22 | context={'schema': database.get_table_schema(table)}) 23 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0074_single_column.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters import check_single_column 6 | from indexdigest.test import DatabaseTestMixin 7 | 8 | 9 | class TestSingleColumn(TestCase, DatabaseTestMixin): 10 | 11 | def test_check_single_column(self): 12 | reports = list(check_single_column(self.connection)) 13 | 14 | print(list(map(str, reports))) 15 | 16 | self.assertEqual(len(reports), 1) 17 | 18 | self.assertEqual(str(reports[0]), 19 | '0074_bag_of_ints: "0074_bag_of_ints" has just a single column') 20 | self.assertTrue('CREATE TABLE `0074_bag_of_ints` (' in reports[0].context['schema']) 21 | 22 | # assert False 23 | -------------------------------------------------------------------------------- /sql/0028-data-not-updated-recently.sql: -------------------------------------------------------------------------------- 1 | -- Report tables that were not updated recently 2 | -- They may contain archive data, or the script that updates them may have broken.
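-- An illustrative staleness check (an editor's sketch, not part of the original fixture):
-- compare the newest row timestamp against a threshold, e.g.
-- SELECT MAX(`timestamp`) < NOW() - INTERVAL 30 DAY AS is_stale FROM `0028_data_not_updated_recently`;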
3 | -- 4 | -- https://github.com/macbre/index-digest/issues/28 5 | DROP TABLE IF EXISTS `0028_data_not_updated_recently`; 6 | CREATE TABLE `0028_data_not_updated_recently` ( 7 | `item_id` int(8) unsigned NOT NULL AUTO_INCREMENT, 8 | `cnt` int(8) unsigned NOT NULL, 9 | `timestamp` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, 10 | PRIMARY KEY (`item_id`) 11 | ) ENGINE=InnoDB; 12 | 13 | -- table with stale data (rows last updated 40-50 days ago) 14 | INSERT INTO 0028_data_not_updated_recently(cnt, `timestamp`) VALUES 15 | (20, NOW() - INTERVAL 50 DAY), 16 | (20, NOW() - INTERVAL 45 DAY), 17 | (20, NOW() - INTERVAL 40 DAY); 18 | -------------------------------------------------------------------------------- /.github/workflows/push-to-ghcr.yml: -------------------------------------------------------------------------------- 1 | name: Build and publish a Docker image to ghcr.io and Docker Hub 2 | on: 3 | # publish on releases (tagged as "x.y.z" - "v" prefix is removed) 4 | release: 5 | types: [ published ] 6 | 7 | # publish on pushes to the main branch (tagged as "master") 8 | push: 9 | branches: 10 | - master 11 | 12 | # pull_request: # debug 13 | 14 | jobs: 15 | docker_publish: 16 | runs-on: "ubuntu-latest" 17 | 18 | steps: 19 | - uses: actions/checkout@v5.0.1 20 | 21 | - name: Build and publish a Docker image for ${{ github.repository }} 22 | uses: macbre/push-to-ghcr@master 23 | with: 24 | image_name: ${{ github.repository }} 25 | github_token: ${{ secrets.GITHUB_TOKEN }} 26 | docker_io_token: ${{ secrets.DOCKER_IO_ACCESS_TOKEN }} 27 | -------------------------------------------------------------------------------- /sql/0094-generic-primary-key.sql: -------------------------------------------------------------------------------- 1 | -- Report tables with a generic primary key (id) 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/94 4 | DROP TABLE IF EXISTS `0094_generic_primary_key`; 5 | CREATE TABLE `0094_generic_primary_key` ( 6 | `id` int(9) NOT NULL AUTO_INCREMENT, 7 | `foo` varchar(16) NOT NULL DEFAULT '', 8 | PRIMARY KEY (`id`) 9 | ); 10 | 11 | DROP TABLE IF EXISTS `0094_generic_primary_key_id_as_column`; 12 | CREATE TABLE `0094_generic_primary_key_id_as_column` ( 13 | `foo` int(9) NOT NULL AUTO_INCREMENT, 14 | `id` varchar(16) NOT NULL DEFAULT '', 15 | PRIMARY KEY (`foo`) 16 | ); 17 | 18 | DROP TABLE IF EXISTS `0094_non_generic_primary_key`; 19 | CREATE TABLE `0094_non_generic_primary_key` ( 20 | `row_id` int(9) NOT NULL AUTO_INCREMENT, 21 | `foo` varchar(16) NOT NULL DEFAULT '', 22 | PRIMARY KEY (`row_id`) 23 | ); 24 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0036_use_innodb.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0036_use_innodb import check_use_innodb 6 | from indexdigest.test import DatabaseTestMixin 7 | 8 | 9 | class TestLinter(TestCase, DatabaseTestMixin): 10 | 11 | def test_use_innodb(self): 12 | reports = list(check_use_innodb(self.connection)) 13 | 14 | print(reports, reports[0].context) 15 | 16 | self.assertEqual(len(reports), 1) 17 | 18 | self.assertEqual(str(reports[0]), 19 | '0036_use_innodb_myisam: "0036_use_innodb_myisam" uses MyISAM storage engine') 20 | self.assertEqual(reports[0].table_name, '0036_use_innodb_myisam') 21 | self.assertEqual(str(reports[0].context['engine']), "MyISAM") 22 | 23 | # assert False 24 | 
-------------------------------------------------------------------------------- /sql/0118-high-offset-selects-log: -------------------------------------------------------------------------------- 1 | -- no offset queries 2 | SELECT foo_limit FROM bar_offset 3 | -- offset not high enough 4 | SELECT foo_limit FROM bar_offset LIMIT 50 OFFSET 100 5 | select * from 0020_big_table order by id limit 50, 5; 6 | -- offset queries 7 | SELECT /* CategoryPaginationViewer::processSection */ page_namespace,page_title,page_len,page_is_redirect,cl_sortkey_prefix FROM `page` INNER JOIN `categorylinks` FORCE INDEX (cl_sortkey) ON ((cl_from = page_id)) WHERE cl_type = 'page' AND cl_to = 'Spotify/Song' ORDER BY cl_sortkey LIMIT 927600,200 8 | -- insert queries should be ignored (#140) 9 | /* 7388e26b */ insert into global_discussion_log.logs ( user_id, ip, site_id, location, action, user_agent, time, app_id ) values ( 33017624, X'', 2233, '', 0, 'content-changed-consumer', {ts '2018-03-15 23:20:18.316'}, null ) 10 | -------------------------------------------------------------------------------- /sql/README.md: -------------------------------------------------------------------------------- 1 | sql 2 | === 3 | 4 | This directory contains `*.sql` files with test schemas. Each reported task / bug should have a separate SQL file with a name `NNNN-short-description.sql` (e.g. `0004-redundant-indices.sql`, where 4 is the GitHub issue number). 5 | 6 | Each test schema should be self-contained (i.e. it has no dependencies on other files) and it should be possible to re-apply it, i.e. it contains the required `DROP TABLE IF EXISTS table_name` statements: 7 | 8 | ### An example 9 | 10 | ```sql 11 | -- Detect redundant indices 12 | -- 13 | -- https://github.com/macbre/index-digest/issues/4 14 | DROP TABLE IF EXISTS `0004_id_foo`; 15 | CREATE TABLE `0004_id_foo` ( 16 | `id` int(9) NOT NULL AUTO_INCREMENT, 17 | `foo` varbinary(16) NOT NULL DEFAULT '', 18 | PRIMARY KEY (`id`,`foo`), 19 | UNIQUE KEY `idx` (`id`,`foo`) 20 | ); 21 | -- ... 
22 | ``` 23 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0094_generic_primary_key.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0094_generic_primary_key import check_generic_primary_key 6 | from indexdigest.test import DatabaseTestMixin 7 | 8 | 9 | class TestLinter(TestCase, DatabaseTestMixin): 10 | 11 | def test_generic_primary_key(self): 12 | reports = list(check_generic_primary_key(self.connection)) 13 | 14 | print(list(map(str, reports))) 15 | 16 | assert len(reports) == 1 17 | 18 | assert str(reports[0]) == '0094_generic_primary_key: ' \ 19 | '"0094_generic_primary_key" has a primary key called id, use a more meaningful name' 20 | assert reports[0].table_name == '0094_generic_primary_key' 21 | assert 'CREATE TABLE `0094_generic_primary_key`' in reports[0].context['schema'] 22 | -------------------------------------------------------------------------------- /sql/0019-queries-not-using-indices.sql: -------------------------------------------------------------------------------- 1 | -- Report queries that do not use indices 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/19 4 | DROP TABLE IF EXISTS `0019_queries_not_using_indices`; 5 | CREATE TABLE `0019_queries_not_using_indices` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | `foo` varchar(16) NOT NULL DEFAULT '', 8 | `bar` varchar(16) NOT NULL DEFAULT '', 9 | PRIMARY KEY (`item_id`), 10 | KEY `bar_idx` (`bar`) 11 | ); 12 | 13 | -- https://github.com/macbre/index-digest/issues/210 14 | DROP TABLE IF EXISTS `0019_queries_not_using_indices_empty_table`; 15 | CREATE TABLE `0019_queries_not_using_indices_empty_table` ( 16 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 17 | `foo` varchar(16) NOT NULL DEFAULT '', 18 | PRIMARY KEY (`item_id`) 19 | ); 20 | 21 | INSERT INTO 0019_queries_not_using_indices VALUES 22 | (1, 'test', ''), 23 | (2, 'foo', 'test'), 24 | (3, 'foo', 'check'); 25 | -------------------------------------------------------------------------------- /sql/0006-not-used-columns-and-tables.sql: -------------------------------------------------------------------------------- 1 | -- Report not used columns and tables 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/6 4 | DROP TABLE IF EXISTS `0006_not_used_columns`; 5 | CREATE TABLE `0006_not_used_columns` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | `foo` varchar(16) NOT NULL DEFAULT '', 8 | `bar` varchar(16) NOT NULL DEFAULT '', 9 | `test` varchar(16) NOT NULL DEFAULT '', 10 | PRIMARY KEY (`item_id`) 11 | ); 12 | 13 | INSERT INTO 0006_not_used_columns VALUES 14 | (1, 'test', '', ''), 15 | (42, 'foo', 'test', ''), 16 | (3, 'foo', '', 'check'); 17 | 18 | DROP TABLE IF EXISTS `0006_not_used_tables`; 19 | CREATE TABLE `0006_not_used_tables` ( 20 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 21 | `foo` varchar(16) NOT NULL DEFAULT '', 22 | PRIMARY KEY (`item_id`) 23 | ); 24 | 25 | INSERT INTO 0006_not_used_tables VALUES 26 | (1, 'foo'), 27 | (2, 'foo'), 28 | (3, 'foo'); 29 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0089_empty_tables.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for empty tables 3 | """ 4 | from indexdigest.utils import LinterEntry 5 | 6 | 7 | def check_empty_tables(database): 8 | 
""" 9 | :type database indexdigest.database.Database 10 | :rtype: list[LinterEntry] 11 | """ 12 | empty_tables = [ 13 | table for table in database.get_tables() 14 | # use both "information_schema" and "explain select count(*)" based methods 15 | # to get the rows count estimate 16 | if database.get_table_metadata(table).get('rows') == 0 17 | or database.get_table_rows_estimate(table) == 0 18 | ] 19 | 20 | for table in empty_tables: 21 | yield LinterEntry(linter_type='empty_tables', table_name=table, 22 | message='"{}" table has no rows, is it really needed?'.format(table), 23 | context={'schema': database.get_table_schema(table)}) 24 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0089_empty_tables.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0089_empty_tables import check_empty_tables 6 | from indexdigest.test import DatabaseTestMixin 7 | 8 | 9 | class TestLinter(TestCase, DatabaseTestMixin): 10 | 11 | def test_empty_tables(self): 12 | reports = check_empty_tables(self.connection) 13 | 14 | # only include tables from our test case 15 | reports = [ 16 | report for report in reports 17 | if report.table_name.startswith('0089_') 18 | ] 19 | 20 | print(list(map(str, reports))) 21 | 22 | self.assertEqual(len(reports), 1) 23 | 24 | self.assertEqual(str(reports[0]), 25 | '0089_empty_table: "0089_empty_table" table has no rows, is it really needed?') 26 | self.assertTrue('CREATE TABLE `0089_empty_table` (' in reports[0].context['schema']) 27 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0002_not_used_indices.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0002_not_used_indices import check_not_used_indices 6 | from indexdigest.test import DatabaseTestMixin, read_queries_from_log 7 | 8 | 9 | class TestNotUsedIndices(TestCase, DatabaseTestMixin): 10 | 11 | def test_not_used_indices(self): 12 | reports = list(check_not_used_indices( 13 | database=self.connection, queries=read_queries_from_log('0002-not-used-indices-log'))) 14 | 15 | print(reports) 16 | 17 | self.assertEqual(len(reports), 1) 18 | self.assertEqual(str(reports[0]), '0002_not_used_indices: "test_id_idx" index was not used by provided queries') 19 | self.assertEqual(reports[0].table_name, '0002_not_used_indices') 20 | self.assertEqual(str(reports[0].context['not_used_index']), 'KEY test_id_idx (test, item_id)') 21 | 22 | # assert False 23 | -------------------------------------------------------------------------------- /sql/0000-core.sql: -------------------------------------------------------------------------------- 1 | -- Tables for core tests of Database class 2 | DROP TABLE IF EXISTS `0000_the_table`; 3 | CREATE TABLE `0000_the_table` ( 4 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 5 | `foo` varchar(16) NOT NULL DEFAULT '', 6 | PRIMARY KEY (`item_id`,`foo`), 7 | KEY `idx_foo` (`foo`) 8 | ) CHARACTER SET utf8; 9 | 10 | INSERT INTO 0000_the_table VALUES(1, 'test'), (2, 'foo'), (3, 'foo ąęź'); 11 | 12 | -- handle dashes in table names 13 | DROP TABLE IF EXISTS `0000_the_table-metadata`; 14 | CREATE TABLE `0000_the_table-metadata` ( 15 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 16 | `foo` 
varchar(16) NOT NULL DEFAULT '', 17 | PRIMARY KEY (`item_id`,`foo`), 18 | KEY `idx_foo` (`foo`) 19 | ) CHARACTER SET utf8; 20 | 21 | INSERT INTO `0000_the_table-metadata` VALUES(1, 'test'), (2, 'foo'), (3, 'foo ąęź'), (4, 'foo'); 22 | 23 | -- handle views, actually ignore them :) 24 | DROP VIEW IF EXISTS `0000_the_view`; 25 | CREATE VIEW 0000_the_view AS SELECT foo, COUNT(*) AS cnt FROM `0000_the_table-metadata` GROUP BY foo; 26 | -------------------------------------------------------------------------------- /sql/0034-missing-primary-index.sql: -------------------------------------------------------------------------------- 1 | -- Report missing primary or unique keys 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/34 4 | DROP TABLE IF EXISTS `0034_with_primary_key`; 5 | CREATE TABLE `0034_with_primary_key` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | `name` varchar(255) NOT NULL, 8 | PRIMARY KEY (`item_id`) 9 | ) CHARSET=utf8; 10 | 11 | DROP TABLE IF EXISTS `0034_with_unique_key`; 12 | CREATE TABLE `0034_with_unique_key` ( 13 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 14 | `name` varchar(255) NOT NULL, 15 | UNIQUE KEY idx (`item_id`) 16 | ) CHARSET=utf8; 17 | 18 | -- https://github.com/Wikia/app/pull/9863 19 | DROP TABLE IF EXISTS `0034_querycache`; 20 | CREATE TABLE `0034_querycache` ( 21 | `qc_type` varbinary(32) NOT NULL, 22 | `qc_value` int(10) unsigned NOT NULL DEFAULT '0', 23 | `qc_namespace` int(11) NOT NULL DEFAULT '0', 24 | `qc_title` varchar(255) CHARACTER SET latin1 COLLATE latin1_bin NOT NULL DEFAULT '', 25 | KEY `qc_type` (`qc_type`,`qc_value`) 26 | ) CHARSET=utf8; 27 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # https://hub.docker.com/_/python/ 2 | FROM python:3.15.0a3-alpine 3 | 4 | WORKDIR /opt/index-digest 5 | 6 | # copy files required to run "pip install" 7 | COPY setup.py README.md ./ 8 | COPY ./indexdigest/__init__.py ./indexdigest/__init__.py 9 | 10 | # installs mysql_config and pip dependencies 11 | # https://github.com/gliderlabs/docker-alpine/issues/181 12 | RUN apk upgrade \ 13 | && apk add --virtual build-deps gcc musl-dev \ 14 | && apk add mariadb-dev \ 15 | && pip install . \ 16 | && rm -rf /root/.cache \ 17 | && apk del build-deps 18 | 19 | ARG GITHUB_SHA="dev" 20 | ENV COMMIT_SHA=${GITHUB_SHA} 21 | 22 | # run as nobody 23 | ENV HOME=/opt/index-digest 24 | RUN chown -R nobody . 25 | USER nobody 26 | 27 | # install the remaining files 28 | COPY --chown=nobody . . 29 | 30 | # install the entire package 31 | RUN pip install --no-warn-script-location --user . 
\ 32 | && rm -rf ~/.cache 33 | 34 | RUN index_digest --version 35 | 36 | # docker run -t macbre/index-digest 37 | ENTRYPOINT ["index_digest"] 38 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0075_test_tables.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter reports tables with "test" or "temp" in their name 3 | """ 4 | import re 5 | 6 | from indexdigest.utils import LinterEntry 7 | 8 | TEST_TABLES = ( 9 | 'test', 10 | 'temp', 11 | ) 12 | 13 | 14 | def is_test_table(table_name): 15 | """ 16 | :type table_name str 17 | :rtype: bool 18 | """ 19 | return re.search(r'(^|_)({})(_|$)'.format('|'.join(TEST_TABLES)), table_name) is not None 20 | 21 | 22 | def check_test_tables(database): 23 | """ 24 | :type database indexdigest.database.Database 25 | :rtype: list[LinterEntry] 26 | """ 27 | test_tables = [ 28 | table for table in database.get_tables() 29 | if is_test_table(table) 30 | ] 31 | 32 | for table in test_tables: 33 | yield LinterEntry(linter_type='test_tables', table_name=table, 34 | message='"{}" seems to be a test table'. 35 | format(table), 36 | context={'schema': database.get_table_schema(table)}) 37 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0036_use_innodb.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for tables that do not use the InnoDB storage engine 3 | """ 4 | from collections import OrderedDict 5 | 6 | from indexdigest.utils import LinterEntry 7 | 8 | 9 | def check_use_innodb(database): 10 | """ 11 | :type database indexdigest.database.Database 12 | :rtype: list[LinterEntry] 13 | """ 14 | # in MySQL 8.0, information_schema column names are uppercase 15 | res = database.query_dict_rows("SELECT TABLE_NAME, ENGINE FROM information_schema.tables " 16 | "WHERE ENGINE <> 'InnoDB' and TABLE_SCHEMA = '{}'". 17 | format(database.db_name)) 18 | 19 | for row in res: 20 | context = OrderedDict() 21 | context['schema'] = database.get_table_schema(row['TABLE_NAME']) 22 | context['engine'] = row['ENGINE'] 23 | 24 | yield LinterEntry(linter_type='use_innodb', table_name=row['TABLE_NAME'], 25 | message='"{TABLE_NAME}" uses {ENGINE} storage engine'. 
26 | format(**row), 27 | context=context) 28 | -------------------------------------------------------------------------------- /sql/0019-queries-not-using-indices-log: -------------------------------------------------------------------------------- 1 | -- these use an index 2 | SELECT item_id FROM 0019_queries_not_using_indices WHERE item_id = 2; 3 | SELECT item_id FROM 0019_queries_not_using_indices WHERE item_id BETWEEN 1 AND 3; 4 | SELECT item_id FROM 0019_queries_not_using_indices WHERE foo = "test" AND item_id = 1; 5 | -- these do not use an index 6 | SELECT item_id FROM 0019_queries_not_using_indices WHERE foo = "test" OR item_id > 1; 7 | SELECT item_id FROM 0019_queries_not_using_indices WHERE foo = "test" 8 | -- no matching row in const table (#44) 9 | SELECT foo FROM 0019_queries_not_using_indices WHERE item_id = 5; 10 | -- #148: EXPLAIN's Extra says "No tables used" 11 | SELECT 1*1; 12 | SELECT 1 AS one FROM dual WHERE exists ( SELECT 1 FROM 0000_the_table WHERE item_id = 2 ); 13 | SELECT 1 AS one FROM dual WHERE exists ( SELECT item_id FROM 0019_queries_not_using_indices WHERE foo = "test" ); 14 | -- #210: EXPLAIN's Extra says "Select tables optimized away" 15 | SELECT max(item_id) FROM 0019_queries_not_using_indices; 16 | -- #210: EXPLAIN's Extra says "No matching min/max row" 17 | SELECT max(item_id) FROM 0019_queries_not_using_indices_empty_table; 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Maciej Brencz 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | install: 2 | pip install -U -e .[dev] 3 | 4 | test: 5 | pytest -vv -o log_cli=true -o log_cli_level=warning 6 | 7 | coverage: 8 | pytest -vv --cov=indexdigest --cov-report=term-missing --cov-report=html --cov-fail-under=96 9 | 10 | lint: 11 | pylint indexdigest/ --ignore=test 12 | 13 | demo: 14 | docker run --network=host -t macbre/index-digest:latest mysql://index_digest:qwerty@127.0.0.1/index_digest --analyze-data --skip-checks=non_utf_columns --skip-tables=0028_no_time 15 | 16 | sql-console: 17 | mysql --prompt='mysql@\h[\d]>' --protocol=tcp --port=53306 -uindex_digest -pqwerty index_digest 18 | 19 | publish: 20 | # run git tag -a v0.0.0 before running make publish 21 | python setup.py sdist 22 | twine upload --skip-existing dist/* 23 | 24 | # docker (tag with commit ID) 25 | VERSION = "1.2.1-"$(shell git rev-parse --short HEAD) 26 | 27 | build: 28 | @docker build -t macbre/index-digest:$(VERSION) . \ 29 | && docker tag macbre/index-digest:$(VERSION) macbre/index-digest:latest 30 | 31 | push: build 32 | @docker push macbre/index-digest:$(VERSION) \ 33 | && docker push macbre/index-digest:latest 34 | 35 | .PHONY: build 36 | -------------------------------------------------------------------------------- /sql/0032-utf-latin-columns.sql: -------------------------------------------------------------------------------- 1 | -- Report text columns that use non-utf collation 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/32 4 | DROP TABLE IF EXISTS `0032_utf8_table`; 5 | CREATE TABLE `0032_utf8_table` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | `name` varchar(255) NOT NULL, 8 | `latin_column` varchar(255) CHARACTER SET latin1 COLLATE latin1_bin NOT NULL, 9 | `big5_column` varchar(255) CHARACTER SET big5, 10 | `utf_blob` blob, 11 | PRIMARY KEY (`item_id`) 12 | ) CHARSET=utf8 COLLATE=utf8_polish_ci; 13 | 14 | DROP TABLE IF EXISTS `0032_latin1_table`; 15 | CREATE TABLE `0032_latin1_table` ( 16 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 17 | `name` varchar(255), 18 | `utf8_column` varchar(255) CHARACTER SET utf8 COLLATE utf8_polish_ci NOT NULL, 19 | `ucs2_column` varchar(255) CHARACTER SET ucs2, 20 | `utf8mb4_column` varchar(255) CHARACTER SET utf8mb4, 21 | `utf16_column` varchar(255) CHARACTER SET utf16, 22 | -- `utf16le_column` varchar(255) CHARACTER SET utf16le, -- not supported by MySQL 5.5 23 | `utf32_column` varchar(255) CHARACTER SET utf32, 24 | `binary_column` varchar(255) CHARACTER SET binary, 25 | `latin_blob` blob, 26 | PRIMARY KEY (`item_id`) 27 | ) CHARSET=latin1; 28 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0026_full_table_scan.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from indexdigest.linters import check_full_table_scan 4 | from indexdigest.test import BigTableTest, read_queries_from_log 5 | 6 | 7 | class TestFullTableScan(BigTableTest): 8 | 9 | def test_full_table_scan(self): 10 | reports = list(check_full_table_scan(self.connection, read_queries_from_log('0026-full-table-scan-log'))) 11 | 12 | self.assertEqual(len(reports), 2) 13 | 14 | self.assertEqual(str(reports[0]), 15 | '0020_big_table: "SELECT * FROM 0020_big_table" query triggered full table scan') 16 | self.assertEqual(reports[0].context['query'], 17 | 'SELECT 
* FROM 0020_big_table') 18 | self.assertTrue(reports[0].context['explain_rows'] > 8000) 19 | 20 | self.assertEqual(str(reports[1]), 21 | '0020_big_table: "SELECT * FROM 0020_big_table LIMIT 5" query triggered full table scan') 22 | self.assertEqual(reports[1].context['query'], 23 | 'SELECT * FROM 0020_big_table LIMIT 5') 24 | self.assertTrue(reports[1].context['explain_rows'] > 8000) 25 | 26 | # assert False 27 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0034_missing_primary_index.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters import check_missing_primary_index 6 | from indexdigest.test import Database, DatabaseTestMixin 7 | 8 | 9 | class LimitedViewDatabase(Database, DatabaseTestMixin): 10 | """ 11 | Limit test to tables from sql/0034-missing-primary-index 12 | """ 13 | def get_tables(self): 14 | return ['0034_with_primary_key', '0034_with_unique_key', '0034_querycache'] 15 | 16 | 17 | class TestMissingPrimaryIndex(TestCase): 18 | @property 19 | def connection(self): 20 | return LimitedViewDatabase.connect_dsn(DatabaseTestMixin.DSN) 21 | 22 | def test_missing_primary_index(self): 23 | reports = list(check_missing_primary_index(self.connection)) 24 | 25 | print(list(map(str, reports))) 26 | 27 | self.assertEqual(len(reports), 1) 28 | 29 | self.assertEqual(str(reports[0]), 30 | '0034_querycache: "0034_querycache" table does not have any primary or unique index') 31 | self.assertTrue('CREATE TABLE `0034_querycache` (' in reports[0].context['schema']) 32 | 33 | # assert False 34 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0164_empty_database.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for databases with no tables 3 | """ 4 | from indexdigest.utils import LinterEntry 5 | 6 | 7 | def get_empty_databases(database): 8 | """ 9 | :type database indexdigest.database.Database 10 | :rtype: list[str] 11 | """ 12 | for db_name in database.query_list('SHOW DATABASES'): 13 | # skip "core" MySQL databases 14 | if db_name in ['information_schema']: 15 | continue 16 | 17 | tables_count = database.query_field('SELECT COUNT(*) FROM information_schema.TABLES ' 18 | 'WHERE TABLE_SCHEMA = "{}" AND ' 19 | 'TABLE_TYPE = "BASE TABLE"'.format(db_name)) 20 | # print(db_name, tables_count) 21 | if tables_count == 0: 22 | yield db_name 23 | 24 | 25 | def check_empty_database(database): 26 | """ 27 | :type database indexdigest.database.Database 28 | :rtype: list[LinterEntry] 29 | """ 30 | for db_name in get_empty_databases(database): 31 | yield LinterEntry(linter_type='empty_database', table_name=db_name, 32 | message='"{}" database has no tables'.format(db_name)) 33 | -------------------------------------------------------------------------------- /indexdigest/test/test_0089_handle_sql_errors.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0006_not_used_columns_and_tables import get_used_tables_from_queries 6 | from indexdigest.utils import explain_queries 7 | 8 | from indexdigest.test import DatabaseTestMixin, read_queries_from_log 9 | 10 | 11 | class ErrorsHandlingTest(TestCase, DatabaseTestMixin): 12 | 13 | 
@property 14 | def queries(self): 15 | return read_queries_from_log('0098-handle-sql-errors-log') 16 | 17 | def test_get_used_tables_from_queries(self): 18 | tables = get_used_tables_from_queries(self.queries) 19 | 20 | print(tables) 21 | 22 | assert '0020_big_table' in tables 23 | # assert False 24 | 25 | def test_explain_queries(self): 26 | res = list(explain_queries(self.connection, self.queries)) 27 | tables_used = [item[1] for item in res] 28 | 29 | print(res, tables_used) 30 | 31 | assert '0020_big_table' in tables_used 32 | # assert False 33 | 34 | def test_get_table_columns(self): 35 | res = self.connection.get_table_columns('t') 36 | self.assertIsNone(res) 37 | 38 | # assert False 39 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0034_missing_primary_index.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter reports a missing primary / unique index 3 | """ 4 | from collections import OrderedDict 5 | 6 | from indexdigest.utils import LinterEntry 7 | 8 | 9 | def check_missing_primary_index(database): 10 | """ 11 | :type database indexdigest.database.Database 12 | :rtype: list[LinterEntry] 13 | """ 14 | for table in database.get_tables(): 15 | # keep primary and unique indices only 16 | # @see https://bugs.mysql.com/bug.php?id=76252 17 | # @see https://github.com/Wikia/app/pull/9863 18 | indices = [ 19 | index for index in database.get_table_indices(table) 20 | if index.is_primary or index.is_unique 21 | ] 22 | 23 | if indices: 24 | # so we have at least one primary or unique index defined 25 | continue 26 | 27 | context = OrderedDict() 28 | context['schema'] = database.get_table_schema(table) 29 | 30 | yield LinterEntry(linter_type='missing_primary_index', table_name=table, 31 | message='"{}" table does not have any primary or unique index'. 32 | format(table), 33 | context=context) 34 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0094_generic_primary_key.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter reports tables whose primary key is a generic "id" column 
3 | """ 4 | from indexdigest.utils import LinterEntry 5 | 6 | GENERIC_PRIMARY_KEY = 'id' 7 | 8 | 9 | def check_generic_primary_key(database): 10 | """ 11 | :type database indexdigest.database.Database 12 | :rtype: list[LinterEntry] 13 | """ 14 | for table_name in database.get_tables(): 15 | indices = [ 16 | index for index in database.get_table_indices(table_name) 17 | if index.is_primary 18 | ] 19 | 20 | # no primary index, a different check will take care of it 21 | if not indices: 22 | continue 23 | 24 | # there can be only one primary key, take the first one from the list 25 | primary_key = indices[0] 26 | # print(table_name, primary_key, primary_key.columns[0]) 27 | 28 | if primary_key.columns[0] == GENERIC_PRIMARY_KEY: 29 | yield LinterEntry(linter_type='generic_primary_key', table_name=table_name, 30 | message='"{}" has a primary key called id, ' 31 | 'use a more meaningful name'.format(table_name), 32 | context={"schema": database.get_table_schema(table_name)}) 33 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | name: Publish 4 | 5 | on: 6 | release: 7 | types: [created] 8 | 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | jobs: 13 | deploy: 14 | name: Upload to PyPI 15 | 16 | runs-on: ubuntu-latest 17 | 18 | permissions: 19 | # IMPORTANT: this permission is mandatory for Trusted Publishing 20 | id-token: write 21 | 22 | steps: 23 | - uses: actions/checkout@v5.0.1 24 | - name: Set up Python 25 | uses: actions/setup-python@v6 26 | with: 27 | python-version: '3.x' 28 | 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install setuptools wheel 33 | 34 | - name: Build 35 | run: | 36 | python setup.py sdist bdist_wheel 37 | ls -lh dist/ 38 | 39 | # https://github.com/pypa/gh-action-pypi-publish?tab=readme-ov-file#trusted-publishing 40 | - name: Publish package distributions to PyPI 41 | uses: pypa/gh-action-pypi-publish@release/v1 42 | -------------------------------------------------------------------------------- /indexdigest/test/formatters/test_yaml.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest import VERSION 6 | from indexdigest.formatters import format_yaml as formatter 7 | from . 
import FormatterTestMixin 8 | 9 | 10 | class TestFormatter(TestCase, FormatterTestMixin): 11 | 12 | def test_formatter(self): 13 | out = formatter(self.get_database_mock(), self.get_reports_mock()) 14 | print(out) 15 | 16 | # first check that it's a valid YAML 17 | res = yaml.safe_load(out) 18 | assert 'meta' in res 19 | assert 'reports' in res 20 | 21 | assert 'version: index-digest v' + VERSION + '\n database_name: test_database\n' \ 22 | ' database_host: test.local\n database_version: MySQL v1.2.3-test' in out 23 | 24 | assert 'message: Something is fishy here' in out 25 | 26 | # context fields order is maintained 27 | assert ' context:\n foo: 42\n test: bar\n' in out 28 | 29 | # properly marked YAML file 30 | assert out.startswith('---') 31 | assert out.endswith('...\n') 32 | # assert False 33 | 34 | def test_formatter_no_results(self): 35 | out = formatter(self.get_database_mock(), []) 36 | print(out) 37 | 38 | assert out.endswith('reports: []\n...\n') 39 | -------------------------------------------------------------------------------- /indexdigest/test/test_0004_redundant_indices_core.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.test import DatabaseTestMixin 6 | 7 | 8 | class RedundantIndicesTest(TestCase, DatabaseTestMixin): 9 | 10 | def test_redundant_index_with_primary(self): 11 | indices = self.connection.get_table_indices('0004_id_foo') 12 | print(indices) 13 | 14 | (idx, primary) = indices 15 | 16 | self.assertEqual(primary.name, 'PRIMARY') 17 | self.assertEqual(idx.name, 'idx') 18 | 19 | self.assertTrue(idx.is_covered_by(primary)) 20 | self.assertFalse(primary.is_covered_by(idx)) 21 | 22 | def test_redundant_indexes(self): 23 | indices = self.connection.get_table_indices('0004_id_foo_bar') 24 | print(indices) 25 | 26 | (idx_foo, idx_foo_bar, idx_id_foo, primary) = indices 27 | 28 | self.assertEqual(primary.name, 'PRIMARY') 29 | self.assertEqual(idx_foo.name, 'idx_foo') 30 | self.assertEqual(idx_foo_bar.name, 'idx_foo_bar') 31 | self.assertEqual(idx_id_foo.name, 'idx_id_foo') 32 | 33 | self.assertTrue(idx_foo.is_covered_by(idx_foo_bar)) 34 | 35 | self.assertFalse(idx_foo.is_covered_by(idx_id_foo)) 36 | self.assertFalse(idx_foo.is_covered_by(primary)) 37 | self.assertFalse(primary.is_covered_by(idx_foo)) 38 | -------------------------------------------------------------------------------- /.github/workflows/dependabot-automerge.yml: -------------------------------------------------------------------------------- 1 | # Based on https://docs.github.com/en/code-security/supply-chain-security/keeping-your-dependencies-updated-automatically/automating-dependabot-with-github-actions#enable-auto-merge-on-a-pull-request 2 | name: Dependabot auto-merge 3 | on: pull_request_target 4 | 5 | permissions: 6 | pull-requests: write 7 | contents: write 8 | 9 | jobs: 10 | dependabot: 11 | runs-on: ubuntu-latest 12 | if: ${{ github.actor == 'dependabot[bot]' }} 13 | steps: 14 | - name: Dependabot metadata 15 | id: metadata 16 | uses: dependabot/fetch-metadata@v2.4.0 17 | with: 18 | github-token: "${{ secrets.GITHUB_TOKEN }}" 19 | 20 | - name: Enable auto-merge for Dependabot PRs 21 | # Automatically merge semver-patch and semver-minor PRs 22 | if: "${{ steps.metadata.outputs.update-type == 23 | 'version-update:semver-minor' || 24 | steps.metadata.outputs.update-type == 25 | 'version-update:semver-patch' }}" 26 | 27 | # 
https://cli.github.com/manual/gh_pr_merge 28 | run: gh pr merge --auto --squash "$PR_URL" 29 | env: 30 | PR_URL: ${{github.event.pull_request.html_url}} 31 | GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} 32 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0118_high_offset_selects.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0118_high_offset_selects import check_high_offset_selects 6 | from indexdigest.test import DatabaseTestMixin, read_queries_from_log 7 | 8 | 9 | class TestLinter(TestCase, DatabaseTestMixin): 10 | 11 | def test_high_offset_selects(self): 12 | reports = list(check_high_offset_selects( 13 | self.connection, queries=read_queries_from_log('0118-high-offset-selects-log'))) 14 | 15 | print(reports, reports[0].context) 16 | 17 | self.assertEqual(len(reports), 1) 18 | 19 | self.assertEqual(str(reports[0]), 'page: "SELECT /* CategoryPaginationViewer::processSection..." query uses too high offset impacting the performance') 20 | self.assertEqual(reports[0].table_name, 'page') 21 | self.assertEqual(str(reports[0].context['query']), "SELECT /* CategoryPaginationViewer::processSection */ page_namespace,page_title,page_len,page_is_redirect,cl_sortkey_prefix FROM `page` INNER JOIN `categorylinks` FORCE INDEX (cl_sortkey) ON ((cl_from = page_id)) WHERE cl_type = 'page' AND cl_to = 'Spotify/Song' ORDER BY cl_sortkey LIMIT 927600,200") 22 | self.assertEqual(reports[0].context['limit'], 200) 23 | self.assertEqual(reports[0].context['offset'], 927600) 24 | 25 | # assert False 26 | -------------------------------------------------------------------------------- /indexdigest/test/formatters/__init__.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | from indexdigest.database import Database 4 | from indexdigest.utils import LinterEntry 5 | 6 | from .. 
import DatabaseTestMixin 7 | 8 | 9 | class DatabaseMock: 10 | VERSION = '1.2.3-test' 11 | HOST = 'test.local' 12 | 13 | @property 14 | def db_name(self): 15 | return 'test_database' 16 | 17 | def get_server_version(self): 18 | return self.VERSION 19 | 20 | def get_server_hostname(self): 21 | return self.HOST 22 | 23 | @staticmethod 24 | def get_queries(): 25 | return [] 26 | 27 | 28 | class FormatterTestMixin: 29 | @staticmethod 30 | def get_database_mock(): 31 | return DatabaseMock() 32 | 33 | @staticmethod 34 | def get_reports_mock(): 35 | context = OrderedDict() 36 | context['foo'] = 42 37 | context['test'] = 'bar' 38 | 39 | yield LinterEntry( 40 | linter_type='foo_linter', 41 | table_name='table_001', 42 | message='Something is fishy here', 43 | context=context 44 | ) 45 | 46 | yield LinterEntry( 47 | linter_type='bar_linter', 48 | table_name='table_042', 49 | message='An index is missing' 50 | ) 51 | 52 | @staticmethod 53 | def get_database(): 54 | return Database.connect_dsn(DatabaseTestMixin.DSN) 55 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0118_high_offset_selects.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for too high offset SELECT queries 3 | """ 4 | from collections import OrderedDict 5 | 6 | from sql_metadata.compat import get_query_limit_and_offset, get_query_tables 7 | 8 | from indexdigest.utils import LinterEntry, shorten_query, is_select_query 9 | 10 | 11 | OFFSET_THRESHOLD = 1000 12 | 13 | 14 | def check_high_offset_selects(_, queries): 15 | """ 16 | :type _ indexdigest.database.Database 17 | :type queries list[str] 18 | :rtype: list[LinterEntry] 19 | """ 20 | for query in queries: 21 | # ignore insert queries (#140) 22 | if not is_select_query(query): 23 | continue 24 | 25 | res = get_query_limit_and_offset(query) 26 | 27 | if res is None: 28 | continue 29 | 30 | (limit, offset) = res 31 | 32 | if offset < OFFSET_THRESHOLD: 33 | continue 34 | 35 | table_name = get_query_tables(query)[0] 36 | 37 | context = OrderedDict() 38 | context['query'] = query 39 | context['limit'] = limit 40 | context['offset'] = offset 41 | 42 | yield LinterEntry(linter_type='high_offset_selects', table_name=table_name, 43 | message='"{}" query uses too high offset impacting the performance'. 
44 | format(shorten_query(query)), 45 | context=context) 46 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0093_having_clause.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for select queries with HAVING clause 3 | """ 4 | from sqlparse.tokens import Keyword 5 | from sql_metadata.compat import preprocess_query, get_query_tables, get_query_tokens 6 | 7 | from indexdigest.utils import LinterEntry, shorten_query, is_select_query 8 | 9 | 10 | def query_has_having_clause(query): 11 | """ 12 | Checks if provided query uses HAVING clause 13 | :type query str 14 | :rtype bool 15 | """ 16 | if not is_select_query(query): 17 | return False 18 | 19 | query = preprocess_query(query) 20 | tokens = get_query_tokens(query) 21 | 22 | for token in tokens: 23 | if token.ttype is Keyword and str(token).upper() == 'HAVING': 24 | return True 25 | 26 | return False 27 | 28 | 29 | def check_having_clause(_, queries): 30 | """ 31 | :type queries list[str] 32 | :rtype: list[LinterEntry] 33 | """ 34 | queries_with_having_clause = [ 35 | query for query in queries 36 | if query_has_having_clause(query) 37 | ] 38 | 39 | for query in queries_with_having_clause: 40 | table_name = get_query_tables(query)[0] 41 | 42 | yield LinterEntry(linter_type='having_clause', table_name=table_name, 43 | message='"{}" query uses HAVING clause'. 44 | format(shorten_query(query)), 45 | context={"query": query}) 46 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0002_not_used_indices.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for not used indices by going through SELECT queries 3 | """ 4 | import logging 5 | 6 | from collections import defaultdict 7 | 8 | from indexdigest.utils import LinterEntry, explain_queries 9 | 10 | 11 | def check_not_used_indices(database, queries): 12 | """ 13 | :type database indexdigest.database.Database 14 | :type queries list[str] 15 | :rtype: list[LinterEntry] 16 | """ 17 | logger = logging.getLogger(__name__) 18 | 19 | used_indices = defaultdict(list) 20 | 21 | # EXPLAIN each query 22 | for (query, table_used, index_used, _) in explain_queries(database, queries): 23 | if index_used is not None: 24 | logger.info("Query <%s> uses %s index on `%s` table", query, index_used, table_used) 25 | used_indices[table_used].append(index_used) 26 | 27 | # analyze all tables used by the above queries 28 | # print(used_indices) 29 | for table_name, table_indices in used_indices.items(): 30 | for index in database.get_table_indices(table_name): 31 | 32 | if index.name not in table_indices: 33 | yield LinterEntry(linter_type='not_used_indices', table_name=table_name, 34 | message='"{}" index was not used by provided queries'. 35 | format(index.name), 36 | context={"not_used_index": str(index)}) 37 | -------------------------------------------------------------------------------- /indexdigest/test/formatters/test_plain.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | 4 | from unittest import TestCase 5 | 6 | from indexdigest import VERSION 7 | from indexdigest.formatters import format_plain as formatter 8 | from . 
import FormatterTestMixin 9 | 10 | 11 | class TestFormatter(TestCase, FormatterTestMixin): 12 | 13 | @staticmethod 14 | def _remove_ansi_styles(text): 15 | """ 16 | :type text str 17 | :rtype: str 18 | """ 19 | # '\033[0m' 20 | return re.sub(r'\033\[\d+m', '', text) 21 | 22 | def test_format_plain(self): 23 | out = formatter(self.get_database_mock(), self.get_reports_mock()) 24 | out = self._remove_ansi_styles(out) 25 | print(out) 26 | 27 | assert 'Found 2 issue(s) to report for "test_database" database' in out 28 | assert 'MySQL v1.2.3-test at test.local' in out 29 | assert 'index-digest v' + VERSION in out 30 | 31 | assert 'foo_linter → table affected: table_001' in out 32 | assert '✗ Something is fishy here' in out 33 | assert ' - foo: 42\n - test: bar' in out 34 | 35 | assert 'bar_linter → table affected: table_042' in out 36 | assert '✗ An index is missing' in out 37 | 38 | assert out.endswith('Queries performed: 0') 39 | # assert False 40 | 41 | def test_format_plain_no_results(self): 42 | out = formatter(self.get_database_mock(), []) 43 | assert out.endswith('Jolly, good! No issues to report') 44 | -------------------------------------------------------------------------------- /indexdigest/formatters/yaml.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provides --format=yaml results formatter 3 | """ 4 | from __future__ import absolute_import 5 | 6 | from collections import OrderedDict 7 | 8 | import yaml 9 | import yamlordereddictloader 10 | 11 | import indexdigest 12 | 13 | 14 | def format_report(report): 15 | """ 16 | :type report indexdigest.utils.LinterEntry 17 | :rtype: OrderedDict 18 | """ 19 | res = OrderedDict() 20 | 21 | res['type'] = report.linter_type 22 | res['table'] = report.table_name 23 | res['message'] = report.message 24 | 25 | if report.context: 26 | res['context'] = report.context 27 | 28 | return res 29 | 30 | 31 | def format_yaml(database, reports): 32 | """ 33 | :type database indexdigest.database.Database 34 | :type reports list 35 | :rtype: str 36 | """ 37 | report = OrderedDict() 38 | 39 | report['meta'] = OrderedDict() 40 | report['meta']['version'] = 'index-digest v{}'.format(indexdigest.VERSION) 41 | report['meta']['database_name'] = database.db_name 42 | report['meta']['database_host'] = database.get_server_hostname() 43 | report['meta']['database_version'] = 'MySQL v{}'.format(database.get_server_version()) 44 | 45 | report['reports'] = [format_report(item) for item in reports] 46 | 47 | return yaml.dump(report, 48 | Dumper=yamlordereddictloader.Dumper, 49 | default_flow_style=False, 50 | explicit_start=True, 51 | explicit_end=True) 52 | -------------------------------------------------------------------------------- /sql/0004-redundant-indices.sql: -------------------------------------------------------------------------------- 1 | -- Detect redundant indices 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/4 4 | DROP TABLE IF EXISTS `0004_id_foo`; 5 | CREATE TABLE `0004_id_foo` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | `foo` varbinary(16) NOT NULL DEFAULT '', 8 | PRIMARY KEY (`item_id`,`foo`), 9 | UNIQUE KEY `idx` (`item_id`,`foo`) 10 | ); 11 | 12 | DROP TABLE IF EXISTS `0004_id_foo_bar`; 13 | CREATE TABLE `0004_id_foo_bar` ( 14 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 15 | `foo` varbinary(16) NOT NULL DEFAULT '', 16 | `bar` varbinary(16) NOT NULL DEFAULT '', 17 | PRIMARY KEY (`item_id`), 18 | KEY `idx_foo` (`foo`), 19 | KEY `idx_foo_bar` (`foo`, `bar`), 20 | KEY `idx_id_foo` 
(`item_id`, `foo`) 21 | ); 22 | 23 | -- https://github.com/macbre/index-digest/issues/48 24 | DROP TABLE IF EXISTS `0004_indices_duplicating_each_other`; 25 | CREATE TABLE `0004_indices_duplicating_each_other` ( 26 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 27 | `foo` varbinary(16) NOT NULL DEFAULT '', 28 | PRIMARY KEY (`item_id`), 29 | UNIQUE KEY `idx_foo` (`foo`), 30 | UNIQUE KEY `idx_foo_2` (`foo`) 31 | ); 32 | 33 | -- https://github.com/macbre/index-digest/issues/49 34 | DROP TABLE IF EXISTS `0004_image_comment_temp`; 35 | CREATE TABLE /*_*/0004_image_comment_temp ( 36 | -- Key to img_name (ugh) 37 | imgcomment_name varchar(255) binary NOT NULL, 38 | -- Key to comment_id 39 | imgcomment_description_id bigint unsigned NOT NULL, 40 | PRIMARY KEY (imgcomment_name, imgcomment_description_id) 41 | ) /*$wgDBTableOptions*/; 42 | -- Ensure uniqueness 43 | CREATE UNIQUE INDEX /*i*/imgcomment_name ON /*_*/0004_image_comment_temp (imgcomment_name); 44 | -------------------------------------------------------------------------------- /indexdigest/formatters/syslog.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provides --format=syslog results formatter - pushes JSON messages via syslog 3 | """ 4 | from __future__ import absolute_import 5 | 6 | import json 7 | import syslog 8 | 9 | from collections import OrderedDict 10 | 11 | import indexdigest 12 | 13 | 14 | def _format_report(database, report): 15 | """ 16 | :type database indexdigest.database.Database 17 | :type report indexdigest.utils.LinterEntry 18 | :rtype: str 19 | """ 20 | res = OrderedDict() 21 | 22 | res['appname'] = 'index-digest' 23 | 24 | res['meta'] = OrderedDict() 25 | res['meta']['version'] = 'index-digest v{}'.format(indexdigest.VERSION) 26 | res['meta']['database_name'] = database.db_name 27 | res['meta']['database_host'] = database.get_server_hostname() 28 | res['meta']['database_version'] = 'MySQL v{}'.format(database.get_server_version()) 29 | 30 | res['report'] = OrderedDict() 31 | res['report']['type'] = report.linter_type 32 | res['report']['table'] = report.table_name 33 | res['report']['message'] = report.message 34 | 35 | if report.context: 36 | res['report']['context'] = report.context 37 | 38 | return json.dumps(res) 39 | 40 | 41 | def format_syslog(database, reports, ident='index-digest'): 42 | """ 43 | :type database indexdigest.database.Database 44 | :type reports list 45 | :type ident str 46 | :rtype: str 47 | """ 48 | syslog.openlog(ident=ident, logoption=syslog.LOG_PID, facility=syslog.LOG_USER) 49 | 50 | for report in reports: 51 | syslog.syslog(_format_report(database, report)) 52 | 53 | syslog.closelog() 54 | return '' 55 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0027_selects_with_like.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks SELECT queries that use LIKE '%foo' conditions 3 | """ 4 | import re 5 | 6 | from collections import OrderedDict 7 | 8 | from indexdigest.utils import LinterEntry, explain_queries, shorten_query 9 | 10 | 11 | def query_uses_leftmost_like(query): 12 | """ 13 | Returns True for queries with LIKE '%foo' conditions 14 | 15 | :type query str 16 | :rtype: bool 17 | """ 18 | # quit fast 19 | if 'like' not in query.lower(): 20 | return False 21 | 22 | matches = re.search(r'LIKE\s\s?[\'"]%\w', query, flags=re.IGNORECASE) 23 | return matches is not None 24 | 25 | 26 | def check_selects_with_like(database, 
queries): 27 | """ 28 | :type database indexdigest.database.Database 29 | :type queries list[str] 30 | :rtype: list[LinterEntry] 31 | """ 32 | for (query, table_used, index_used, explain_row) in explain_queries(database, queries): 33 | if index_used is None and query_uses_leftmost_like(query): 34 | context = OrderedDict() 35 | context['query'] = query 36 | 37 | # https://dev.mysql.com/doc/refman/5.7/en/explain-output.html#explain-extra-information 38 | context['explain_extra'] = explain_row['Extra'] 39 | context['explain_rows'] = int(explain_row['rows']) # string when using MariaDB 10.5 40 | 41 | yield LinterEntry(linter_type='selects_with_like', table_name=table_used, 42 | message='"{}" query uses LIKE with left-most wildcard'. 43 | format(shorten_query(query)), 44 | context=context) 45 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0092_select_star.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for select queries with * wildcard 3 | """ 4 | from sqlparse.tokens import Wildcard 5 | from sql_metadata.compat import preprocess_query, get_query_tables, get_query_tokens 6 | 7 | from indexdigest.utils import LinterEntry, shorten_query, is_select_query 8 | 9 | 10 | def is_wildcard_query(query): 11 | """ 12 | Checks if provided query selects using a * wildcard 13 | :type query str 14 | :rtype bool 15 | """ 16 | if not is_select_query(query): 17 | return False 18 | 19 | query = preprocess_query(query) 20 | tokens = get_query_tokens(query) 21 | 22 | last_token = None 23 | 24 | for token in tokens: 25 | if token.ttype is Wildcard: 26 | # print([query, token, 'last token', last_token]) 27 | 28 | # check what was before the wildcard 29 | # count(*) ? 30 | if last_token and str(last_token) not in ['(']: 31 | return True 32 | else: 33 | last_token = token 34 | 35 | return False 36 | 37 | 38 | def check_select_star(_, queries): 39 | """ 40 | :type queries list[str] 41 | :rtype: list[LinterEntry] 42 | """ 43 | queries_with_wildcard = [ 44 | query for query in queries 45 | if is_wildcard_query(query) 46 | ] 47 | 48 | for query in queries_with_wildcard: 49 | table_name = get_query_tables(query)[0] 50 | 51 | yield LinterEntry(linter_type='select_star', table_name=table_name, 52 | message='"{}" query uses SELECT *'. 53 | format(shorten_query(query)), 54 | context={"query": query}) 55 | -------------------------------------------------------------------------------- /indexdigest/linters/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains linters used to check the database for improvements. 
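Each check_* function exposed below is a generator: it takes an
indexdigest.database.Database instance (and, for the query-log based checks, a
list of SQL queries) and yields indexdigest.utils.LinterEntry items. A minimal
usage sketch (assuming a reachable database behind the given DSN):

    from indexdigest.database import Database
    from indexdigest.linters import check_redundant_indices

    database = Database.connect_dsn('mysql://user:password@localhost/test')
    for entry in check_redundant_indices(database):
        # str(entry) renders as 'table_name: message'
        print(entry)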
3 | """ 4 | # expose linters 5 | from .linter_0002_not_used_indices import check_not_used_indices 6 | from .linter_0004_redundant_indices import check_redundant_indices 7 | from .linter_0006_not_used_columns_and_tables import check_not_used_tables, check_not_used_columns 8 | from .linter_0019_queries_not_using_indices import check_queries_not_using_indices 9 | from .linter_0020_filesort_temporary_table import \ 10 | check_queries_using_filesort, check_queries_using_temporary 11 | from .linter_0026_full_table_scan import check_full_table_scan 12 | from .linter_0027_selects_with_like import check_selects_with_like 13 | from .linter_0028_data_too_old import check_data_too_old 14 | from .linter_0028_data_not_updated_recently import check_data_not_updated_recently 15 | from .linter_0032_utf_latin_columns import check_latin_columns 16 | from .linter_0034_missing_primary_index import check_missing_primary_index 17 | from .linter_0036_use_innodb import check_use_innodb 18 | from .linter_0070_insert_ignore import check_insert_ignore_queries 19 | from .linter_0074_single_column import check_single_column 20 | from .linter_0075_test_tables import check_test_tables 21 | from .linter_0089_empty_tables import check_empty_tables 22 | from .linter_0092_select_star import check_select_star 23 | from .linter_0093_having_clause import check_having_clause 24 | from .linter_0094_generic_primary_key import check_generic_primary_key 25 | from .linter_0118_high_offset_selects import check_high_offset_selects 26 | from .linter_0164_empty_database import check_empty_database 27 | from .linter_0031_low_cardinality_index import check_low_cardinality_index 28 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0026_full_table_scan.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for SELECT queries that use full table scan 3 | """ 4 | from collections import OrderedDict 5 | 6 | from indexdigest.utils import explain_queries, LinterEntry, shorten_query 7 | 8 | 9 | def check_full_table_scan(database, queries): 10 | """ 11 | Full table scan 12 | 13 | An operation that requires reading the entire contents of a table, rather than just selected 14 | portions using an index. Typically performed either with small lookup tables, or in data 15 | warehousing situations with large tables where all available data is aggregated and analyzed. 16 | How frequently these operations occur, and the sizes of the tables relative to available memory, 17 | have implications for the algorithms used in query optimization and managing the buffer pool. 18 | 19 | :type database indexdigest.database.Database 20 | :type queries list[str] 21 | :rtype: list[LinterEntry] 22 | """ 23 | for (query, table_used, _, row) in explain_queries(database, queries): 24 | # The output from EXPLAIN shows ALL in the type column when 25 | # MySQL uses a full table scan to resolve a query. 26 | if row['type'] != 'ALL': 27 | continue 28 | 29 | context = OrderedDict() 30 | context['query'] = query 31 | context['explain_rows'] = int(row['rows']) # we get string here when using MariaDB 10.5 32 | 33 | yield LinterEntry(linter_type='queries_using_full_table_scan', table_name=table_used, 34 | message='"{}" query triggered full table scan'. 
35 | format(shorten_query(query)), 36 | context=context) 37 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0075_test_tables.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters import check_test_tables 6 | from indexdigest.linters.linter_0075_test_tables import is_test_table 7 | from indexdigest.test import DatabaseTestMixin 8 | 9 | 10 | class TestTables(TestCase, DatabaseTestMixin): 11 | 12 | def test_is_test_table(self): 13 | assert is_test_table('test') is True 14 | assert is_test_table('some_guy_test_table') is True 15 | assert is_test_table('0075_some_guy_test_table') is True 16 | assert is_test_table('foo_test_bar') is True 17 | assert is_test_table('test_bar') is True 18 | assert is_test_table('foo_test') is True 19 | assert is_test_table('forum_creation_temp') is True 20 | 21 | assert is_test_table('foo_testing') is False 22 | assert is_test_table('test123') is False 23 | assert is_test_table('travis_tests') is False 24 | 25 | def test_check_test_table(self): 26 | reports = list(check_test_tables(self.connection)) 27 | 28 | print(list(map(str, reports))) 29 | 30 | self.assertEqual(len(reports), 2) 31 | 32 | self.assertEqual(str(reports[0]), 33 | '0004_image_comment_temp: "0004_image_comment_temp" seems to be a test table') 34 | self.assertTrue('CREATE TABLE `0004_image_comment_temp` (' in reports[0].context['schema']) 35 | 36 | self.assertEqual(str(reports[1]), 37 | '0075_some_guy_test_table: "0075_some_guy_test_table" seems to be a test table') 38 | self.assertTrue('CREATE TABLE `0075_some_guy_test_table` (' in reports[1].context['schema']) 39 | 40 | # assert False 41 | -------------------------------------------------------------------------------- /indexdigest/test/formatters/test_syslog.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase, mock 2 | 3 | from indexdigest import VERSION 4 | from indexdigest.formatters.syslog import _format_report, format_syslog 5 | from . 
import FormatterTestMixin 6 | 7 | from indexdigest.cli.script import get_reports 8 | 9 | 10 | class TestFormatter(TestCase, FormatterTestMixin): 11 | 12 | def test_format_report_helper(self): 13 | report = next(self.get_reports_mock()) 14 | out = _format_report(self.get_database_mock(), report) 15 | print(out, report) 16 | 17 | self.assertEqual( 18 | '{"appname": "index-digest", "meta": {"version": "index-digest v' + VERSION + '", "database_name": "test_database", ' 19 | '"database_host": "test.local", "database_version": "MySQL v1.2.3-test"}, ' 20 | '"report": {"type": "foo_linter", "table": "table_001", "message": "Something is fishy here", ' 21 | '"context": {"foo": 42, "test": "bar"}}}', 22 | out 23 | ) 24 | 25 | # assert False 26 | 27 | 28 | class TestFormatterIntegrationTest(TestCase, FormatterTestMixin): 29 | 30 | def test_format_for_real_reports(self): 31 | database = self.get_database() 32 | 33 | # pass all reports via syslog formatter 34 | for report in get_reports(database, analyze_data=True): 35 | _format_report(database, report) 36 | 37 | @mock.patch('syslog.syslog') 38 | def test_format_syslog(self, mocked_syslog: mock.MagicMock): 39 | reports = list(self.get_reports_mock()) 40 | format_syslog(database=self.get_database_mock(), reports=reports) 41 | 42 | assert mocked_syslog.called, 'syslog.syslog has been called' 43 | assert mocked_syslog.call_count == len(reports) 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | .pytest_cache/ 4 | *.py[cod] 5 | *$py.class 6 | *.swp 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env*/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # dotenv 85 | .env 86 | 87 | # virtualenv 88 | .venv 89 | venv/ 90 | ENV/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | 105 | .idea/ 106 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0031_low_cardinality_index.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0031_low_cardinality_index import \ 6 | check_low_cardinality_index, get_low_cardinality_indices, INDEX_CARDINALITY_THRESHOLD 7 | from indexdigest.test import DatabaseTestMixin 8 | 9 | 10 | class TestLinter(TestCase, DatabaseTestMixin): 11 | 12 | def setUp(self) -> None: 13 | self.skipTest(reason="test_0031_low_cardinality_index is not stable") 14 | 15 | def test_get_low_cardinality_indices(self): 16 | indices = list(get_low_cardinality_indices(self.connection)) 17 | 18 | print(indices) 19 | 20 | assert len(indices) == 1 21 | 22 | index = indices[0] 23 | assert index[0] == '0020_big_table' 24 | assert index[2]['INDEX_NAME'] == 'num_idx' 25 | assert index[2]['COLUMN_NAME'] == 'num' 26 | assert index[2]['CARDINALITY'] >= 1 27 | assert index[2]['CARDINALITY'] <= INDEX_CARDINALITY_THRESHOLD 28 | 29 | def test_low_cardinality_index(self): 30 | reports = list(check_low_cardinality_index(self.connection)) 31 | 32 | print(reports, reports[0].context) 33 | 34 | assert len(reports) == 1 35 | 36 | assert str(reports[0]) == '0020_big_table: "num_idx" index on "num" column ' \ 37 | 'has low cardinality, check if it is needed' 38 | assert reports[0].table_name == '0020_big_table' 39 | 40 | assert reports[0].context['column_name'] == 'num' 41 | assert reports[0].context['index_name'] == 'num_idx' 42 | assert isinstance(reports[0].context['index_cardinality'], int) 43 | 44 | self.assertAlmostEqual(int(reports[0].context['value_usage']), 50, delta=5) 45 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0032_utf_latin_columns.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter reports text columns that use a non-UTF character encoding (e.g. latin1) 3 | """ 4 | from collections import OrderedDict 5 | 6 | from indexdigest.utils import LinterEntry 7 | 8 | 9 | def is_text_column_latin(column): 10 | """ 11 | :type column indexdigest.schema.Column 12 | :rtype: bool 13 | """ 14 | if not column.is_text_type(): 15 | return False 16 | 17 | # ignore blob columns without specified character set 
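    # (BLOB and other binary-type columns typically report no character set, hence the None check)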
18 | if column.character_set is None: 19 | return False 20 | 21 | # ignore utf8 columns 22 | # utf8, ucs2, utf8mb4, utf16, utf16le, utf32 23 | # @see https://dev.mysql.com/doc/refman/5.7/en/charset-unicode.html 24 | if column.character_set.startswith('utf') or column.character_set in ['ucs2', 'binary']: 25 | return False 26 | 27 | return True 28 | 29 | 30 | def check_latin_columns(database): 31 | """ 32 | :type database indexdigest.database.Database 33 | :rtype: list[LinterEntry] 34 | """ 35 | for table in database.get_tables(): 36 | for column in database.get_table_columns(table): 37 | if not is_text_column_latin(column): 38 | continue 39 | 40 | # print([table, column, column.character_set, column.collation]) 41 | 42 | context = OrderedDict() 43 | context['column'] = column.name 44 | context['column_character_set'] = column.character_set 45 | context['column_collation'] = column.collation 46 | context['schema'] = database.get_table_schema(table) 47 | 48 | yield LinterEntry(linter_type='non_utf_columns', table_name=table, 49 | message='"{}" text column has "{}" character set defined'. 50 | format(column.name, column.character_set), 51 | context=context) 52 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0019_queries_not_using_indices.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks SELECT queries that do not use indices 3 | """ 4 | from collections import OrderedDict 5 | 6 | from indexdigest.utils import LinterEntry, explain_queries, shorten_query 7 | 8 | 9 | def check_queries_not_using_indices(database, queries): 10 | """ 11 | :type database indexdigest.database.Database 12 | :type queries list[str] 13 | :rtype: list[LinterEntry] 14 | """ 15 | for (query, table_used, index_used, explain_row) in explain_queries(database, queries): 16 | # print(query, explain_row) 17 | 18 | # EXPLAIN can return no matching row in const table in Extra column. 19 | # Do not consider this query as not using an index. -- see #44 and #210 20 | if explain_row['Extra'] in [ 21 | 'Impossible WHERE noticed after reading const tables', 22 | 'no matching row in const table', 23 | 'No tables used', 24 | 'Select tables optimized away', 25 | 'No matching min/max row', 26 | ]: 27 | continue 28 | 29 | if index_used is None: 30 | context = OrderedDict() 31 | context['query'] = query 32 | 33 | # https://dev.mysql.com/doc/refman/5.7/en/explain-output.html#explain-extra-information 34 | context['explain_extra'] = explain_row['Extra'] 35 | context['explain_rows'] = explain_row['rows'] 36 | context['explain_filtered'] = explain_row.get('filtered') # can be not set 37 | context['explain_possible_keys'] = explain_row['possible_keys'] 38 | 39 | yield LinterEntry(linter_type='queries_not_using_index', table_name=table_used, 40 | message='"{}" query did not make use of any index'. 
41 | format(shorten_query(query)), 42 | context=context) 43 | -------------------------------------------------------------------------------- /.github/workflows/dockerimage.yml: -------------------------------------------------------------------------------- 1 | name: Build and test a Docker image 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | 8 | jobs: 9 | 10 | docker_build: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v5.0.1 15 | 16 | - name: Build the Docker image 17 | env: 18 | # @see https://docs.docker.com/develop/develop-images/build_enhancements/ 19 | DOCKER_BUILDKIT: "1" 20 | run: | 21 | docker build . \ 22 | --cache-from ghcr.io/macbre/index-digest:latest \ 23 | --build-arg BUILDKIT_INLINE_CACHE=1 \ 24 | --build-arg GITHUB_SHA=$(git rev-parse --short HEAD) \ 25 | --tag ${{ github.repository }} 26 | 27 | docker images 28 | 29 | echo "## Image labels:" 30 | docker inspect --format='{{json .Config.Labels}}' ${{ github.repository }} | jq 31 | 32 | echo "## Image env vars:" 33 | docker inspect --format='{{json .Config.Env}}' ${{ github.repository }} | jq 34 | 35 | - name: Check the version 36 | run: | 37 | docker run ${{ github.repository }} --version 38 | 39 | docker_test: 40 | runs-on: ubuntu-latest 41 | needs: docker_build 42 | 43 | services: 44 | mysql: 45 | image: mysql:8.0.22 46 | env: 47 | MYSQL_ALLOW_EMPTY_PASSWORD: yes 48 | MYSQL_DATABASE: index_digest 49 | MYSQL_USER: test 50 | MYSQL_PASSWORD: p4ss 51 | ports: 52 | - "53306:3306" 53 | options: --health-cmd="mysqladmin ping" --health-interval=10s --health-timeout=5s --health-retries=3 54 | 55 | steps: 56 | - name: Run the container and connect to the test database 57 | run: | 58 | docker ps 59 | docker run --network=host ${{ github.repository }} mysql://test:p4ss@0.0.0.0:53306/index_digest | tee /tmp/results 60 | grep "Jolly, good! 
No issues to report" /tmp/results 61 | -------------------------------------------------------------------------------- /sql/0028-data-too-old.sql: -------------------------------------------------------------------------------- 1 | -- Report tables that have really old data 2 | -- Worth checking if such long data retention is actually needed 3 | -- 4 | -- https://github.com/macbre/index-digest/issues/28 5 | DROP TABLE IF EXISTS `0028_data_too_old`; 6 | CREATE TABLE `0028_data_too_old` ( 7 | `item_id` int(8) unsigned NOT NULL AUTO_INCREMENT, 8 | `cnt` int(8) unsigned NOT NULL, 9 | `timestamp` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, 10 | PRIMARY KEY (`item_id`) 11 | ) ENGINE=InnoDB; 12 | 13 | 14 | -- table with old data (6 months old) 15 | INSERT INTO 0028_data_too_old VALUES 16 | (1, 12, NOW() - INTERVAL 6 MONTH), 17 | (2, 20, NOW() - INTERVAL 3 MONTH), 18 | (3, 42, NOW()); 19 | 20 | INSERT INTO 0028_data_too_old(cnt) VALUES 21 | (52); 22 | 23 | 24 | -- table with no old data 25 | DROP TABLE IF EXISTS `0028_data_ok`; 26 | CREATE TABLE `0028_data_ok` LIKE `0028_data_too_old`; 27 | 28 | INSERT INTO 0028_data_ok(cnt, `timestamp`) VALUES 29 | (1, NOW() - INTERVAL 7 DAY); 30 | 31 | 32 | -- empty tables should be simply ignored 33 | DROP TABLE IF EXISTS `0028_data_empty`; 34 | CREATE TABLE `0028_data_empty` LIKE `0028_data_too_old`; 35 | 36 | -- table with no time columns 37 | DROP TABLE IF EXISTS `0028_no_time`; 38 | CREATE TABLE `0028_no_time` ( 39 | `item_id` int(8) unsigned NOT NULL AUTO_INCREMENT, 40 | `cnt` int(8) unsigned NOT NULL, 41 | PRIMARY KEY (`item_id`) 42 | ) ENGINE=InnoDB; 43 | 44 | -- MediaWiki timestamp columns 45 | -- @see https://www.mediawiki.org/wiki/Manual:Revision_table 46 | DROP TABLE IF EXISTS `0028_revision`; 47 | CREATE TABLE `0028_revision` ( 48 | `rev_id` int(10) unsigned NOT NULL AUTO_INCREMENT, 49 | `rev_timestamp` binary(14) NOT NULL, 50 | PRIMARY KEY (`rev_id`) 51 | ) ENGINE=InnoDB; 52 | 53 | INSERT INTO 0028_revision(rev_id, `rev_timestamp`) VALUES 54 | (1, '20180101000000'), 55 | (2, '2018010'); -- #129: Incorrect datetime value 56 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0028_data_too_old.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.database import Database 6 | from indexdigest.linters.linter_0028_data_too_old import check_data_too_old 7 | from indexdigest.test import DatabaseTestMixin 8 | 9 | 10 | class LimitedViewDatabase(Database, DatabaseTestMixin): 11 | """ 12 | Limit test to tables 13 | """ 14 | def get_tables(self): 15 | return [ 16 | '0028_data_too_old', 17 | '0028_data_ok', 18 | '0028_data_empty', 19 | '0028_no_time', 20 | '0028_data_not_updated_recently', 21 | '0028_revision', 22 | ] 23 | 24 | 25 | class TestLinter(TestCase, DatabaseTestMixin): 26 | 27 | @property 28 | def connection(self): 29 | return LimitedViewDatabase.connect_dsn(self.DSN) 30 | 31 | def test_data_too_old(self): 32 | reports = list(check_data_too_old(self.connection)) 33 | 34 | print(list(map(str, reports))) 35 | 36 | assert len(reports) == 1 37 | 38 | assert str(reports[0]).startswith('0028_data_too_old: "0028_data_too_old" has rows added 18') # .. 
184 days ago 39 | assert str(reports[0]).endswith('consider changing retention policy') 40 | # self.assertAlmostEquals(reports[0].context['diff_days'], 184) 41 | assert reports[0].table_name == '0028_data_too_old' 42 | 43 | assert 'data_since' in reports[0].context 44 | assert 'data_until' in reports[0].context 45 | assert 'table_size_mb' in reports[0].context 46 | 47 | assert reports[0].context['date_column_name'] == 'timestamp' 48 | 49 | def test_data_too_old_with_custom_threshold(self): 50 | env = { 51 | 'INDEX_DIGEST_DATA_TOO_OLD_THRESHOLD_DAYS': str(365 * 86400) 52 | } 53 | 54 | reports = list(check_data_too_old(self.connection, env)) 55 | 56 | print(list(map(str, reports))) 57 | assert len(reports) == 0 58 | -------------------------------------------------------------------------------- /indexdigest/test/core/test_utils.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from indexdigest.utils import is_select_query, parse_dsn, shorten_query 4 | 5 | 6 | class TestUtils(TestCase): 7 | 8 | def test_parse_dsn(self): 9 | parsed = parse_dsn('mysql://alex:pwd@localhost/test') 10 | 11 | self.assertEqual('localhost', parsed['host']) 12 | self.assertEqual(3306, parsed['port']) 13 | self.assertEqual('alex', parsed['user']) 14 | self.assertEqual('pwd', parsed['passwd']) 15 | self.assertEqual('test', parsed['db']) 16 | 17 | def test_parse_dsn_with_port(self): 18 | parsed = parse_dsn('mysql://alex:pwd@localhost:5000/test') 19 | 20 | self.assertEqual('localhost', parsed['host']) 21 | self.assertEqual(5000, parsed['port']) 22 | self.assertEqual('alex', parsed['user']) 23 | self.assertEqual('pwd', parsed['passwd']) 24 | self.assertEqual('test', parsed['db']) 25 | 26 | def test_is_select_query(self): 27 | assert is_select_query('SELECT * FROM foo') 28 | assert is_select_query('select * from foo') 29 | assert is_select_query('SELECT * FROM foo;') 30 | assert is_select_query(' SELECT * FROM foo;') 31 | assert is_select_query('/* foo */ SELECT * FROM foo;') 32 | 33 | assert is_select_query('BEGIN') is False 34 | assert is_select_query('COMMIT') is False 35 | assert is_select_query('/* SELECT */ COMMIT') is False 36 | assert is_select_query('TRUNCATE foo;') is False 37 | assert is_select_query('UPDATE foo SET bar=42 WHERE id=1') is False 38 | 39 | def test_shorten_query(self): 40 | self.assertEqual('SELECT * FROM foo', shorten_query('SELECT * FROM foo')) 41 | self.assertEqual('SELECT * FROM foo', shorten_query('SELECT * FROM foo', max_len=18)) 42 | self.assertEqual('SELECT * FROM foo', shorten_query('SELECT * FROM foo', max_len=17)) 43 | self.assertEqual('SELECT * FROM fo...', shorten_query('SELECT * FROM foo', max_len=16)) 44 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0092_select_star.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0092_select_star import check_select_star, is_wildcard_query 6 | from indexdigest.test import DatabaseTestMixin, read_queries_from_log 7 | 8 | 9 | class TestLinter(TestCase, DatabaseTestMixin): 10 | 11 | def test_is_wildcard_query(self): 12 | assert is_wildcard_query('SELECT * FROM foo;') 13 | assert is_wildcard_query('SELECT t.* FROM foo AS t;') 14 | assert is_wildcard_query('SELECT * FROM `user` WHERE user_id = 34994913 LIMIT 1') 15 | assert is_wildcard_query('/* 
User::loadFromDatabase */ SELECT * FROM `user` WHERE user_id = 34994913 LIMIT 1') 16 | assert is_wildcard_query('SELECT /* User::loadFromDatabase */ * FROM `user` WHERE user_id = 34994913 LIMIT 1') 17 | 18 | assert is_wildcard_query('SELECT id FROM foo') is False 19 | assert is_wildcard_query('SELECT (id+2) * 2 FROM foo') is False 20 | assert is_wildcard_query('SELECT 3 * 3') is False 21 | assert is_wildcard_query('SELECT count(*) FROM foo') is False 22 | assert is_wildcard_query('SELECT /* foo */ test FROM foo') is False 23 | 24 | assert is_wildcard_query('INSERT * INTO foo') is False 25 | 26 | # assert False 27 | 28 | def test_check_select_star(self): 29 | reports = list(check_select_star(self.connection, read_queries_from_log('0092-select-star-log'))) 30 | 31 | print(list(map(str, reports))) 32 | 33 | assert len(reports) == 2 34 | 35 | assert str(reports[0]) == 'foo: "SELECT * FROM foo" query uses SELECT *' 36 | assert reports[0].table_name == 'foo' 37 | assert reports[0].context['query'] == 'SELECT * FROM foo;' 38 | 39 | assert str(reports[1]) == 'bar: "SELECT t.* FROM bar AS t" query uses SELECT *' 40 | assert reports[1].table_name == 'bar' 41 | assert reports[1].context['query'] == 'SELECT t.* FROM bar AS t;' 42 | 43 | # assert False 44 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0019_queries_not_using_indices.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0019_queries_not_using_indices import check_queries_not_using_indices 6 | from indexdigest.test import DatabaseTestMixin, read_queries_from_log 7 | 8 | 9 | class TestQueriesNotUsingIndices(TestCase, DatabaseTestMixin): 10 | 11 | def test_queries(self): 12 | reports = list(check_queries_not_using_indices( 13 | database=self.connection, queries=read_queries_from_log('0019-queries-not-using-indices-log'))) 14 | 15 | print(*[f"{report.message} ({report.context['explain_extra']})" for report in reports], sep="\n") 16 | assert len(reports) == 3 17 | 18 | self.assertEqual(str(reports[0]), '0019_queries_not_using_indices: "SELECT item_id FROM 0019_queries_not_using_indices..." 
query did not make use of any index') 19 | self.assertEqual(reports[0].table_name, '0019_queries_not_using_indices') 20 | self.assertEqual(str(reports[0].context['query']), 'SELECT item_id FROM 0019_queries_not_using_indices WHERE foo = "test" OR item_id > 1;') 21 | self.assertEqual(str(reports[0].context['explain_extra']), 'Using where') 22 | self.assertEqual(str(reports[0].context['explain_rows']), '3') 23 | 24 | self.assertEqual(reports[1].table_name, '0019_queries_not_using_indices') 25 | self.assertEqual(str(reports[1].context['query']), 'SELECT item_id FROM 0019_queries_not_using_indices WHERE foo = "test"') 26 | self.assertEqual(str(reports[1].context['explain_extra']), 'Using where') 27 | self.assertEqual(str(reports[1].context['explain_rows']), '3') 28 | 29 | self.assertEqual(reports[2].table_name, '0019_queries_not_using_indices') 30 | self.assertEqual(str(reports[2].context['query']), 'SELECT 1 AS one FROM dual WHERE exists ( SELECT item_id FROM 0019_queries_not_using_indices WHERE foo = "test" );') 31 | self.assertEqual(str(reports[2].context['explain_extra']), 'Using where') 32 | self.assertEqual(str(reports[2].context['explain_rows']), '3') 33 | 34 | # assert False 35 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0070_insert_ignore.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks INSERT IGNORE queries 3 | 4 | If you use the IGNORE modifier, errors that occur while executing the INSERT statement are ignored. 5 | For example, without IGNORE, a row that duplicates an existing UNIQUE index or PRIMARY KEY value 6 | in the table causes a duplicate-key error and the statement is aborted. With IGNORE, the row is 7 | discarded and no error occurs. Ignored errors generate warnings instead. 8 | 9 | Data conversions that would trigger errors abort the statement if IGNORE is not specified. 10 | With IGNORE, invalid values are adjusted to the closest values and inserted; warnings 11 | are produced but the statement does not abort. 12 | 13 | @see https://medium.com/legacy-systems-diary/things-to-avoid-episode-1-insert-ignore-535b4c24406b 14 | """ 15 | import re 16 | 17 | from collections import OrderedDict 18 | from sql_metadata.compat import get_query_tables 19 | 20 | from indexdigest.utils import LinterEntry, shorten_query 21 | 22 | 23 | def remove_comments(sql): 24 | """ 25 | :type sql str 26 | :rtype: str 27 | """ 28 | return re.sub(r'/\*[^*]+\*/', '', sql) 29 | 30 | 31 | def is_insert_ignore_query(sql): 32 | """ 33 | :type sql str 34 | :rtype: bool 35 | """ 36 | sql = remove_comments(sql).lstrip() 37 | return re.match(r'^INSERT\s+IGNORE\s', sql, flags=re.IGNORECASE) is not None 38 | 39 | 40 | def check_insert_ignore_queries(database, queries): 41 | """ 42 | :type database indexdigest.database.Database 43 | :type queries list[str] 44 | :rtype: list[LinterEntry] 45 | """ 46 | queries = [query for query in queries if is_insert_ignore_query(query)] 47 | 48 | for query in queries: 49 | table_used = get_query_tables(query)[0] 50 | 51 | context = OrderedDict() 52 | context['query'] = query 53 | context['schema'] = database.get_table_schema(table_used) 54 | 55 | yield LinterEntry(linter_type='insert_ignore', table_name=table_used, 56 | message='"{}" query uses a risky INSERT IGNORE'. 
57 | format(shorten_query(query)), 58 | context=context) 59 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0028_data_not_updated_recently.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0028_data_not_updated_recently \ 6 | import check_data_not_updated_recently, get_time_columns 7 | from indexdigest.test import DatabaseTestMixin 8 | from .test_0028_data_too_old import LimitedViewDatabase 9 | 10 | 11 | class TestLinter(TestCase, DatabaseTestMixin): 12 | 13 | @property 14 | def connection(self): 15 | return LimitedViewDatabase.connect_dsn(self.DSN) 16 | 17 | def test_get_time_columns(self): 18 | columns = list(get_time_columns(self.connection)) 19 | 20 | assert len(columns) == 5 21 | 22 | assert columns[0][0] == '0028_data_too_old' 23 | assert columns[0][1].name == 'timestamp' 24 | 25 | assert columns[4][0] == '0028_revision' 26 | assert columns[4][1].name == 'rev_timestamp' 27 | 28 | print(list(columns)) 29 | # assert False 30 | 31 | def test_data_not_updated_recently(self): 32 | reports = list(check_data_not_updated_recently(self.connection)) 33 | 34 | print(list(map(str, reports))) 35 | 36 | assert len(reports) == 1 37 | 38 | assert str(reports[0]).startswith('0028_data_not_updated_recently: "0028_data_not_updated_recently" ' 39 | 'has the latest row added ') 40 | assert str(reports[0]).endswith('consider checking if it should be up-to-date') 41 | assert abs(reports[0].context['diff_days'] - 40) < 2, 'diff_days is around 40 days' 42 | assert reports[0].table_name == '0028_data_not_updated_recently' 43 | 44 | assert 'data_since' in reports[0].context 45 | assert 'data_until' in reports[0].context 46 | assert 'table_size_mb' in reports[0].context 47 | 48 | def test_data_not_updated_recently_with_custom_threshold(self): 49 | env = { 50 | 'INDEX_DIGEST_DATA_NOT_UPDATED_RECENTLY_THRESHOLD_DAYS': str(60 * 86400) 51 | } 52 | 53 | reports = list(check_data_not_updated_recently(self.connection, env)) 54 | 55 | print(list(map(str, reports))) 56 | assert len(reports) == 0 57 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0027_selects_with_like.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0027_selects_with_like import check_selects_with_like, query_uses_leftmost_like 6 | from indexdigest.test import DatabaseTestMixin, read_queries_from_log 7 | 8 | 9 | class TestSelectsWithLike(TestCase, DatabaseTestMixin): 10 | 11 | def test_query_uses_leftmost_like(self): 12 | self.assertTrue(query_uses_leftmost_like("SELECT * FROM foo WHERE bar LIKE '%baz';")) 13 | self.assertTrue(query_uses_leftmost_like('SELECT * FROM foo WHERE bar LIKE "%baz";')) 14 | self.assertTrue(query_uses_leftmost_like('SELECT * FROM foo WHERE bar like "%baz";')) 15 | self.assertTrue(query_uses_leftmost_like('SELECT * FROM foo WHERE bar like "%123";')) 16 | self.assertTrue(query_uses_leftmost_like('SELECT * FROM foo WHERE bar like\n"%123";')) 17 | self.assertTrue(query_uses_leftmost_like('SELECT * FROM foo WHERE bar like "%123";')) 18 | 19 | self.assertFalse(query_uses_leftmost_like("SELECT * FROM foo WHERE bar = 'baz'")) 20 | self.assertFalse(query_uses_leftmost_like("SELECT * FROM foo WHERE like 
= 'foo'")) 21 | self.assertFalse(query_uses_leftmost_like("SELECT * FROM foo WHERE bar LIKE 'b%z';")) 22 | self.assertFalse(query_uses_leftmost_like("SELECT * FROM foo WHERE bar LIKE 'ba%';")) 23 | 24 | def test_queries(self): 25 | reports = list(check_selects_with_like( 26 | database=self.connection, queries=read_queries_from_log('0027-selects-with-like-log'))) 27 | 28 | print(reports, reports[0].context) 29 | 30 | self.assertEqual(len(reports), 1) 31 | 32 | self.assertEqual(str(reports[0]), '0020_big_table: "SELECT * FROM 0020_big_table WHERE text LIKE \'%00\'" query uses LIKE with left-most wildcard') 33 | self.assertEqual(reports[0].table_name, '0020_big_table') 34 | self.assertEqual(str(reports[0].context['query']), 'SELECT * FROM 0020_big_table WHERE text LIKE \'%00\'') 35 | self.assertEqual(str(reports[0].context['explain_extra']), 'Using where') 36 | self.assertTrue(reports[0].context['explain_rows'] > 10000) 37 | 38 | # assert False 39 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Integration tests 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | 8 | jobs: 9 | integrations_tests: 10 | runs-on: ubuntu-latest 11 | 12 | strategy: 13 | # Do not fail if one the tests did not pass 14 | fail-fast: false 15 | 16 | matrix: 17 | # Docker images of MySQL-compliant databases to run the tests suite on 18 | database: 19 | # https://hub.docker.com/_/mysql?tab=tags 20 | - "mysql:5.7.32" 21 | - "mysql:8.0.22" 22 | - "mysql:8.1.0" 23 | - "mysql:9.4.0" 24 | # https://hub.docker.com/_/mariadb?tab=tags 25 | - "mariadb:10.1" 26 | - "mariadb:10.2" 27 | - "mariadb:10.5" 28 | - "mariadb:10.6" 29 | - "mariadb:11.8" 30 | - "mariadb:12.0" 31 | # https://hub.docker.com/_/percona?tab=tags 32 | - "percona:8.0.22-13" 33 | 34 | services: 35 | mysql: 36 | image: ${{ matrix.database }} 37 | env: 38 | MYSQL_ALLOW_EMPTY_PASSWORD: yes 39 | MYSQL_DATABASE: index_digest 40 | ports: 41 | - "53306:3306" 42 | options: --health-cmd="mysqladmin ping || mariadb-admin ping" --health-interval=10s --health-timeout=5s --health-retries=6 43 | 44 | steps: 45 | - uses: actions/checkout@v5.0.1 46 | 47 | # https://github.com/actions/setup-python?tab=readme-ov-file#caching-packages-dependencies 48 | - name: Set up Python 49 | uses: actions/setup-python@v6 50 | with: 51 | python-version: "3.14" 52 | cache: 'pip' # dependencies caching 53 | cache-dependency-path: 'setup.py' 54 | 55 | - name: Install dependencies 56 | run: | 57 | pip install wheel 58 | make install 59 | 60 | - name: Set up the database 61 | run: | 62 | docker ps 63 | mysql --protocol=tcp --port=53306 -u root --password="" -v < setup.sql 64 | # import the test schema files 65 | "./sql/populate.sh" 66 | mysql --protocol=tcp --port=53306 -uindex_digest -pqwerty index_digest -v -e '\s; SHOW TABLES; SHOW DATABASES;' 67 | 68 | - name: Tests 69 | run: make test 70 | -------------------------------------------------------------------------------- /indexdigest/formatters/plain.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Provides --format=plain results formatter 4 | """ 5 | from termcolor import colored 6 | 7 | import indexdigest 8 | from indexdigest.utils import LinterEntry 9 | 10 | 11 | def format_context(context): 12 | """ 13 | :type context dict 14 | :rtype: str 15 | """ 16 | return '\n '.join([ 17 | "- {key}: 
{value}".format( 18 | key=colored(key, color='green', attrs=['bold']), 19 | value=str(value).replace("\n", "\n ") 20 | ) 21 | for (key, value) in context.items() 22 | ]) 23 | 24 | 25 | def format_plain(database, reports): 26 | """ 27 | :type database indexdigest.database.Database 28 | :type reports list 29 | :rtype: str 30 | """ 31 | out = '' 32 | 33 | # cast to a list (to be able to count reports) 34 | reports = list(reports) 35 | 36 | # emit results 37 | line = '-' * 60 + "\n" 38 | 39 | out += line 40 | out += 'Found {} issue(s) to report for "{}" database\n'.format( 41 | len(reports), database.db_name) 42 | out += line 43 | out += 'MySQL v{} at {}\n'.format( 44 | database.get_server_version(), database.get_server_hostname()) 45 | out += 'index-digest v{}\n'.format(indexdigest.VERSION) 46 | out += line 47 | 48 | if reports: 49 | for report in reports: 50 | assert isinstance(report, LinterEntry) 51 | 52 | out += colored(report.linter_type, color='blue', attrs=['bold']) + \ 53 | ' → table affected: ' + \ 54 | colored(report.table_name, attrs=['bold']) + \ 55 | '\n' 56 | 57 | out += colored( 58 | '\n{} {}\n'.format(colored('✗', color='red', attrs=['bold']), report.message), 59 | color='white') 60 | 61 | if report.context is not None: 62 | out += '\n {}\n'.format(format_context(report.context)) 63 | 64 | out += '\n' 65 | out += line 66 | 67 | out += 'Queries performed: {}'.format(len(database.get_queries())) 68 | # out += '\n'.join(map(str, database.get_queries()))) 69 | else: 70 | out += 'Jolly, good! No issues to report' 71 | 72 | return out 73 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0004_redundant_indices.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for redundant indices from a given set of them 3 | """ 4 | import logging 5 | 6 | from collections import OrderedDict 7 | 8 | from indexdigest.utils import LinterEntry 9 | 10 | 11 | def get_redundant_indices(indices): 12 | """ 13 | :type indices list[indexdigest.schema.Index] 14 | :rtype: list[tuple] 15 | """ 16 | redundant_indices = [] 17 | 18 | for index in indices: 19 | for compare in indices: 20 | if index.is_covered_by(compare): 21 | redundant_indices.append((index, compare, )) 22 | 23 | return redundant_indices 24 | 25 | 26 | def check_redundant_indices(database): 27 | """ 28 | :type database indexdigest.database.Database 29 | :rtype: list[LinterEntry] 30 | """ 31 | logger = logging.getLogger(__name__) 32 | 33 | for table in database.get_tables(): 34 | logger.info("Checking %s table", table) 35 | 36 | indices = database.get_table_indices(table) 37 | meta = database.get_table_metadata(table) 38 | schema = database.get_table_schema(table) 39 | 40 | redundant_indices = set() 41 | 42 | for (redundant_index, suggested_index) in get_redundant_indices(indices): 43 | # the index we're about to suggest was reported as redundant - #48 44 | if suggested_index in redundant_indices: 45 | continue 46 | 47 | context = OrderedDict() 48 | context['redundant'] = str(redundant_index) 49 | context['covered_by'] = str(suggested_index) 50 | context['schema'] = schema 51 | context['table_data_size_mb'] = 1. * meta['data_size'] / 1024 / 1024 52 | context['table_index_size_mb'] = 1. 
* meta['index_size'] / 1024 / 1024 53 | 54 | # add to the list to avoid redundant indices being reported in a loop - #48 55 | redundant_indices.add(redundant_index) 56 | 57 | yield LinterEntry(linter_type='redundant_indices', table_name=table, 58 | message='"{}" index can be removed as redundant (covered by "{}")'. 59 | format(redundant_index.name, suggested_index.name), 60 | context=context) 61 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0093_having_clause.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0093_having_clause import query_has_having_clause, check_having_clause 6 | from indexdigest.test import DatabaseTestMixin, read_queries_from_log 7 | 8 | 9 | class TestLinter(TestCase, DatabaseTestMixin): 10 | 11 | def test_query_has_having_clause(self): 12 | assert query_has_having_clause('SELECT * FROM foo having bar = 2') 13 | assert query_has_having_clause('SELECT * FROM foo HAVING bar = 2') 14 | 15 | assert query_has_having_clause("SELECT * FROM 0019_queries_not_using_indices " 16 | "WHERE foo = 'foo' HAVING bar = 'test'") 17 | assert query_has_having_clause("SELECT s.cust_id,count(s.cust_id) FROM SH.sales s " 18 | "GROUP BY s.cust_id HAVING s.cust_id != '1660' AND s.cust_id != '2'") 19 | 20 | assert query_has_having_clause('SELECT * FROM foo') is False 21 | assert query_has_having_clause('SELECT * FROM foo_having LIMIT 10') is False 22 | assert query_has_having_clause('SELECT /* having */ id FROM foo') is False 23 | 24 | assert query_has_having_clause('INSERT 42 INTO having') is False 25 | 26 | def test_having_clause(self): 27 | reports = list(check_having_clause(self.connection, read_queries_from_log('0093-having-clause-log'))) 28 | 29 | print(list(map(str, reports))) 30 | 31 | assert len(reports) == 3 32 | 33 | assert str(reports[0]) == 'foo: "SELECT * FROM foo HAVING bar = 2" query uses HAVING clause' 34 | assert reports[0].table_name == 'foo' 35 | assert reports[0].context['query'] == 'SELECT * FROM foo HAVING bar = 2;' 36 | 37 | assert str(reports[1]) == 'SH.sales: "SELECT s.cust_id,count(s.cust_id) ' \ 38 | 'FROM SH.sales s ..." query uses HAVING clause' 39 | assert reports[1].table_name == 'SH.sales' 40 | 41 | assert str(reports[2]) == '0019_queries_not_using_indices: "SELECT * FROM ' \ 42 | '`0019_queries_not_using_indices` WHE..." 
query uses HAVING clause' 43 | assert reports[2].table_name == '0019_queries_not_using_indices' 44 | 45 | # assert False 46 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0028_data_not_updated_recently.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter looks for tables that were not updated recently 3 | """ 4 | from collections import OrderedDict 5 | from datetime import datetime 6 | from time import time 7 | 8 | from indexdigest.utils import LinterEntry 9 | 10 | from .linter_0028_data_too_old import get_time_columns, get_boundary_times 11 | 12 | 13 | def check_data_not_updated_recently(database, env=None): 14 | """ 15 | :type database indexdigest.database.Database 16 | :type env dict 17 | :rtype: list[LinterEntry] 18 | """ 19 | now = int(time())  # I will probably never understand dates handling in Python 20 | 21 | # set up a diff threshold (in days) 22 | env = env if env else dict() 23 | diff_threshold = int(env.get('INDEX_DIGEST_DATA_NOT_UPDATED_RECENTLY_THRESHOLD_DAYS', 30)) 24 | 25 | for (table_name, column) in get_time_columns(database): 26 | timestamps = get_boundary_times(database, table_name, column) 27 | if timestamps is None: 28 | continue 29 | 30 | diff = now - timestamps.get('max') 31 | # print(table_name, column, timestamps, now, diff) 32 | 33 | if diff > diff_threshold * 86400: 34 | diff_days = int(diff / 86400) 35 | 36 | metadata = database.get_table_metadata(table_name) 37 | 38 | context = OrderedDict() 39 | context['diff_days'] = diff_days 40 | context['data_since'] = str(datetime.fromtimestamp(timestamps.get('min'))) 41 | context['data_until'] = str(datetime.fromtimestamp(timestamps.get('max'))) 42 | context['date_column_name'] = str(column) 43 | context['schema'] = database.get_table_schema(table_name) 44 | context['rows'] = database.get_table_rows_estimate(table_name) 45 | context['table_size_mb'] = \ 46 | 1. * (metadata['data_size'] + metadata['index_size']) / 1024 / 1024 47 | 48 | yield LinterEntry(linter_type='data_not_updated_recently', table_name=table_name, 49 | message='"{}" has the latest row added {} days ago, ' 50 | 'consider checking if it should be up-to-date'. 
51 | format(table_name, diff_days), 52 | context=context) 53 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0004_redundant_indices.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0004_redundant_indices import get_redundant_indices, check_redundant_indices 6 | from indexdigest.test import DatabaseTestMixin 7 | 8 | 9 | class TestRedundantIndices(TestCase, DatabaseTestMixin): 10 | 11 | def test_get_redundant_indices_from_database(self): 12 | indices = self.connection.get_table_indices('0004_id_foo_bar') 13 | redundant_indices = get_redundant_indices(indices) 14 | (entry,) = redundant_indices 15 | 16 | print(entry) 17 | 18 | self.assertEqual(len(redundant_indices), 1) 19 | 20 | # idx_foo is covered by idx_foo_bar 21 | self.assertEqual(entry[0].name, 'idx_foo') 22 | self.assertEqual(entry[1].name, 'idx_foo_bar') 23 | 24 | # assert False 25 | 26 | def test_check_redundant_indices(self): 27 | reports = check_redundant_indices(self.connection) 28 | reports = list(filter( 29 | lambda i: i.table_name.startswith('0004_'), 30 | reports 31 | )) 32 | 33 | print(list(map(str,reports))) 34 | 35 | self.assertEqual(len(reports), 3) 36 | self.assertEqual(str(reports[0]), '0004_id_foo: "idx" index can be removed as redundant (covered by "PRIMARY")') 37 | self.assertEqual(str(reports[1]), '0004_id_foo_bar: "idx_foo" index can be removed as redundant (covered by "idx_foo_bar")') 38 | self.assertEqual(str(reports[2]), '0004_indices_duplicating_each_other: "idx_foo" index can be removed as redundant (covered by "idx_foo_2")') 39 | 40 | report = reports[0] 41 | 42 | print(report, report.context) 43 | 44 | self.assertEqual(str(report.context['redundant']), 'UNIQUE KEY idx (item_id, foo)') 45 | self.assertEqual(str(report.context['covered_by']), 'PRIMARY KEY (item_id, foo)') 46 | 47 | # and we have size reported as well (see #16) 48 | self.assertTrue(report.context['table_data_size_mb'] > 0) 49 | self.assertTrue(report.context['table_index_size_mb'] > 0) 50 | 51 | # and we a schema reported in the context (see #16) 52 | self.assertTrue('CREATE TABLE' in report.context['schema']) 53 | self.assertTrue('AUTO_INCREMENT' in report.context['schema']) 54 | self.assertTrue('ENGINE=' in report.context['schema']) 55 | 56 | # assert False 57 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import re 2 | from setuptools import setup, find_packages 3 | 4 | # take the version 5 | with open("indexdigest/__init__.py", "r") as fh: 6 | # e.g. 
VERSION = '1.5.0' 7 | last_line = fh.readlines()[-1] 8 | VERSION = re.search(r'[\d.]+', last_line).group(0) 9 | 10 | # @see https://packaging.python.org/tutorials/packaging-projects/#creating-setup-py 11 | with open("README.md", "r") as fh: 12 | long_description = fh.read() 13 | 14 | # @see https://github.com/pypa/sampleproject/blob/master/setup.py 15 | setup( 16 | name='indexdigest', 17 | version=VERSION, 18 | author='Maciej Brencz', 19 | author_email='maciej.brencz@gmail.com', 20 | license='MIT', 21 | description='Analyses your database queries and schema and suggests indices and schema improvements', 22 | long_description=long_description, 23 | long_description_content_type="text/markdown", 24 | url='https://github.com/macbre/index-digest', 25 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 26 | classifiers=[ 27 | # How mature is this project? Common values are 28 | # 3 - Alpha 29 | # 4 - Beta 30 | # 5 - Production/Stable 31 | 'Development Status :: 5 - Production/Stable', 32 | 33 | # Indicate who your project is intended for 34 | 'Intended Audience :: Developers', 35 | 'Intended Audience :: System Administrators', 36 | 'Topic :: Database', 37 | 38 | # Specify the Python versions you support here. In particular, ensure 39 | # that you indicate whether you support Python 2, Python 3 or both. 40 | 'Programming Language :: Python :: 3', 41 | ], 42 | packages=find_packages(), 43 | python_requires='>=3.9', 44 | extras_require={ 45 | 'dev': [ 46 | 'coverage==7.10.7', 47 | 'pylint==3.3.9', 48 | 'pytest==8.4.2', 49 | 'pytest-cov==7.0.0', 50 | 'twine==6.2.0', 51 | ] 52 | }, 53 | install_requires=[ 54 | 'docopt==0.6.2', 55 | 'PyYAML==6.0.3', 56 | 'mysqlclient==2.2.7', 57 | 'sql_metadata==2.19.0', 58 | 'termcolor==3.1.0', 59 | 'yamlordereddictloader==0.4.2' 60 | ], 61 | entry_points={ 62 | 'console_scripts': [ 63 | 'add_linter=indexdigest.cli.add_linter:main', # creates a new linter from a template 64 | 'index_digest=indexdigest.cli.script:main', 65 | ], 66 | } 67 | ) 68 | -------------------------------------------------------------------------------- /indexdigest/test/core/test_columns.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from indexdigest.schema import Column 4 | 5 | 6 | class TestColumn(TestCase): 7 | 8 | def test_is_text_type(self): 9 | text_types = [ 10 | 'CHAR(16)', 11 | 'VARCHAR(16)', 12 | 'CHAR(32)', 13 | 'VARCHAR(32)', 14 | 'BINARY(16)', 15 | 'VARBINARY(16)', 16 | 'BINARY(32)', 17 | 'VARBINARY(32)', 18 | 'TEXT', 19 | 'BLOB', 20 | "SET('a', 'b', 'c', 'd')", 21 | "ENUM('x-small', 'small', 'medium', 'large', 'x-large')", 22 | ] 23 | 24 | not_text_types = [ 25 | 'INT', 26 | 'BIGINT', 27 | 'INT(9)', 28 | 'TIMESTAMP', 29 | 'DATETIME', 30 | ] 31 | 32 | for text_type in text_types: 33 | self.assertTrue( 34 | expr=Column('foo', column_type=text_type, character_set='utf8').is_text_type(), 35 | msg=text_type) 36 | 37 | for not_text_type in not_text_types: 38 | self.assertFalse( 39 | expr=Column('foo', column_type=not_text_type, character_set='utf8').is_text_type(), 40 | msg=not_text_type) 41 | 42 | def test_is_timestamp_type(self): 43 | timestamp_types = [ 44 | 'TIMESTAMP', 45 | 'DATETIME', 46 | 'DATE', 47 | 'TIME', 48 | 'YEAR', 49 | ] 50 | 51 | not_timestamp_types = [ 52 | 'INT', 53 | 'BIGINT', 54 | 'INT(9)', 55 | 'CHAR(16)', 56 | 'VARCHAR(16)', 57 | 'CHAR(32)', 58 | 'VARCHAR(32)', 59 | 'BINARY(16)', 60 | 'VARBINARY(16)', 61 | 'BINARY(32)', 62 | 'VARBINARY(32)', 63 | 'TEXT', 64 | 'BLOB', 65 | 
"SET('a', 'b', 'c', 'd')", 66 | "ENUM('x-small', 'small', 'medium', 'large', 'x-large')", 67 | ] 68 | 69 | for timestamp_type in timestamp_types: 70 | self.assertTrue( 71 | expr=Column('foo', column_type=timestamp_type, character_set='utf8').is_timestamp_type(), 72 | msg=timestamp_type) 73 | 74 | for not_timestamp_type in not_timestamp_types: 75 | self.assertFalse( 76 | expr=Column('foo', column_type=not_timestamp_type, character_set='utf8').is_timestamp_type(), 77 | msg=not_timestamp_type) 78 | -------------------------------------------------------------------------------- /.github/workflows/python.yml: -------------------------------------------------------------------------------- 1 | name: Test against different Python version 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | 8 | jobs: 9 | python_tests: 10 | runs-on: ubuntu-latest 11 | 12 | strategy: 13 | # Do not fail if one the tests did not pass 14 | fail-fast: false 15 | 16 | matrix: 17 | # Python version(s) to use when running the tests 18 | # https://github.com/actions/python-versions/blob/main/versions-manifest.json 19 | python-version: 20 | - "3.9" 21 | - "3.10" 22 | - "3.11" 23 | - "3.12" 24 | - "3.13" 25 | - "3.14" 26 | 27 | # Docker images of MySQL-compliant databases to run the tests suite on 28 | database: 29 | - "mysql:8.0.20" 30 | 31 | services: 32 | mysql: 33 | image: ${{ matrix.database }} 34 | env: 35 | MYSQL_ALLOW_EMPTY_PASSWORD: yes 36 | MYSQL_DATABASE: index_digest 37 | ports: 38 | - "53306:3306" 39 | options: --health-cmd="mysqladmin ping" --health-interval=10s --health-timeout=5s --health-retries=3 40 | 41 | steps: 42 | - uses: actions/checkout@v5.0.1 43 | - name: Set up Python 44 | uses: actions/setup-python@v6 45 | with: 46 | python-version: ${{ matrix.python-version }} 47 | cache: 'pip' # dependencies caching 48 | cache-dependency-path: 'setup.py' 49 | 50 | - name: Install dependencies 51 | run: | 52 | python -m pip install --upgrade pip 53 | pip install wheel 54 | make install 55 | 56 | - name: Linter 57 | run: make lint 58 | 59 | - name: Set up the database 60 | run: | 61 | docker ps 62 | mysql --protocol=tcp --port=53306 -u root --password="" -v < setup.sql 63 | # import the test schema files 64 | "./sql/populate.sh" 65 | mysql --protocol=tcp --port=53306 -uindex_digest -pqwerty index_digest -v -e '\s; SHOW TABLES; SHOW DATABASES;' 66 | 67 | - name: Tests with code coverage 68 | run: make coverage 69 | 70 | # https://coveralls-python.readthedocs.io/en/latest/usage/index.html 71 | # upload coverage report for just one of Python version matrix runs 72 | - name: Upload coverage report to Coveralls 73 | if: matrix.python-version == '3.12' 74 | env: 75 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 76 | run: | 77 | pip install coveralls 78 | coveralls --service=github 79 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0070_insert_ignore.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0070_insert_ignore import \ 6 | remove_comments, is_insert_ignore_query, check_insert_ignore_queries 7 | from indexdigest.test import DatabaseTestMixin, read_queries_from_log 8 | 9 | 10 | class TestInsertIgnore(TestCase, DatabaseTestMixin): 11 | 12 | def test_remove_comments(self): 13 | self.assertEqual( 14 | 'INSERT IGNORE', 15 | remove_comments('INSERT /* foo */ IGNORE') 16 | ) 17 | 18 | 
self.assertEqual( 19 | 'SELECT foo', 20 | remove_comments('/* foo */SELECT/* test*/ foo') 21 | ) 22 | 23 | def test_is_insert_ignore_query(self): 24 | assert is_insert_ignore_query("INSERT IGNORE INTO 0070_insert_ignore VALUES ('2017-01-01', 9, 123);") is True 25 | assert is_insert_ignore_query("INSERT IGN/*bar*/ORE INTO 0070_insert_ignore VALUES ('2017-01-01', 9, 123);") is True 26 | assert is_insert_ignore_query("Insert /* foo */ Ignore INTO 0070_insert_ignore VALUES ('2017-01-01', 9, 123);") is True 27 | assert is_insert_ignore_query("/* foo */ INSERT IGNORE INTO `0070_insert_ignore` VALUES (9, '123', '2017-01-01');") is True 28 | 29 | assert is_insert_ignore_query("/* INSERT IGNORE */ INSERT INTO 0070_insert_ignore VALUES ('2017-01-01', 9, 123);") is False 30 | assert is_insert_ignore_query("INSERT INTO 0070_insert_ignore VALUES ('INSERT IGNORE', 9, 123);") is False 31 | 32 | def test_queries(self): 33 | reports = list(check_insert_ignore_queries( 34 | database=self.connection, queries=read_queries_from_log('0070-insert-ignore-log'))) 35 | 36 | print(reports) 37 | 38 | self.assertEqual(len(reports), 4) 39 | 40 | self.assertEqual(str(reports[0]), '0070_insert_ignore: "INSERT IGNORE INTO `0070_insert_ignore` VALUES (9,..." query uses a risky INSERT IGNORE') 41 | self.assertEqual(reports[0].table_name, '0070_insert_ignore') 42 | self.assertEqual(str(reports[0].context['query']), "INSERT IGNORE INTO `0070_insert_ignore` VALUES (9, '123', '2017-01-01');") 43 | assert 'CREATE TABLE `0070_insert_ignore` (' in str(reports[0].context['schema']) 44 | 45 | self.assertEqual(str(reports[1].context['query']), "/* foo */ INSERT IGNORE INTO `0070_insert_ignore` VALUES (9, '123', '2017-01-01');") 46 | self.assertEqual(str(reports[2].context['query']), "INSERT IGNORE INTO `0070_insert_ignore` VALUES ('123', 9, '2017-01-01');") 47 | self.assertEqual(str(reports[3].context['query']), "INSERT /* foo */ IGNORE INTO `0070_insert_ignore` VALUES ('2017-01-01', 9, 123);") 48 | # assert False 49 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0020_big_table.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from indexdigest.linters import check_queries_using_filesort, check_queries_using_temporary 4 | from indexdigest.test import BigTableTest, read_queries_from_log 5 | 6 | 7 | class TestBigTableLinters(BigTableTest): 8 | 9 | def test_get_table_rows_estimate(self): 10 | # this table has 100000, but assume the returned estimate is above 75k 11 | self.assertGreater(self.connection.get_table_rows_estimate('0020_big_table'), 75000) 12 | 13 | def test_filesort(self): 14 | reports = list(check_queries_using_filesort(self.connection, read_queries_from_log('0020-big-table-log'))) 15 | 16 | self.assertEqual(len(reports), 2) 17 | 18 | self.assertEqual(str(reports[0]), 19 | '0020_big_table: "SELECT * FROM 0020_big_table WHERE item_id BETWEEN..." query used filesort') 20 | self.assertEqual(reports[0].context['query'], 21 | 'SELECT * FROM 0020_big_table WHERE item_id BETWEEN 10 AND 20 ORDER BY val') 22 | self.assertEqual(reports[0].context['explain_extra'], 'Using where; Using filesort') 23 | self.assertEqual(reports[0].context['explain_rows'], 11) 24 | self.assertEqual(reports[0].context['explain_key'], 'PRIMARY') 25 | 26 | self.assertEqual(str(reports[1]), 27 | '0020_big_table: "SELECT val, count(*) FROM 0020_big_table WHERE ite..." 
query used filesort') 28 | self.assertEqual(reports[1].context['query'], 29 | 'SELECT val, count(*) FROM 0020_big_table WHERE item_id BETWEEN 10 AND 20 GROUP BY val ORDER BY val') 30 | self.assertEqual(reports[1].context['explain_extra'], 'Using where; Using temporary; Using filesort') 31 | self.assertEqual(reports[1].context['explain_rows'], 11) 32 | self.assertEqual(reports[1].context['explain_key'], 'PRIMARY') 33 | 34 | # assert False 35 | 36 | def test_temporary(self): 37 | reports = list(check_queries_using_temporary(self.connection, read_queries_from_log('0020-big-table-log'))) 38 | 39 | self.assertEqual(len(reports), 1) 40 | 41 | self.assertEqual(str(reports[0]), 42 | '0020_big_table: "SELECT val, count(*) FROM 0020_big_table WHERE ite..." query used temporary') 43 | self.assertEqual(reports[0].context['query'], 44 | 'SELECT val, count(*) FROM 0020_big_table WHERE item_id BETWEEN 10 AND 20 GROUP BY val ORDER BY val') 45 | self.assertEqual(reports[0].context['explain_extra'], 'Using where; Using temporary; Using filesort') 46 | self.assertEqual(reports[0].context['explain_rows'], 11) 47 | self.assertEqual(reports[0].context['explain_key'], 'PRIMARY') 48 | 49 | # assert False 50 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0032_utf_latin_columns.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0032_utf_latin_columns import \ 6 | check_latin_columns, is_text_column_latin 7 | from indexdigest.schema import Column 8 | from indexdigest.test import Database, DatabaseTestMixin 9 | 10 | 11 | class TestIsTextColumnLatin(TestCase): 12 | 13 | def test_is_text_column_non_latin(self): 14 | for character_set in ['utf8', 'ucs2', 'utf8mb4', 'utf16', 'utf16le', 'utf32', 'binary']: 15 | column = Column(name='foo', column_type='varchar', character_set=character_set) 16 | 17 | assert is_text_column_latin(column) is False, character_set 18 | 19 | def test_is_text_column_latin(self): 20 | # @see https://dev.mysql.com/doc/refman/5.7/en/charset-mysql.html 21 | for character_set in ['big5', 'latin1', 'latin2']: 22 | column = Column(name='foo', column_type='varchar', character_set=character_set) 23 | 24 | assert is_text_column_latin(column) is True, character_set 25 | 26 | def test_blob_column(self): 27 | assert is_text_column_latin(Column(name='foo', column_type='blob')) is False 28 | 29 | 30 | class LimitedViewDatabase(Database, DatabaseTestMixin): 31 | """ 32 | Limit test to tables from sql/0032-utf-latin-columns.sql 33 | """ 34 | def get_tables(self): 35 | return ['0032_utf8_table', '0032_latin1_table'] 36 | 37 | 38 | class TestFullTableScan(TestCase): 39 | @property 40 | def connection(self): 41 | return LimitedViewDatabase.connect_dsn(DatabaseTestMixin.DSN) 42 | 43 | def test_latin1_columns(self): 44 | reports = list(check_latin_columns(self.connection)) 45 | 46 | print(list(map(str, reports))) 47 | 48 | self.assertEqual(len(reports), 3) 49 | 50 | self.assertEqual(str(reports[0]), 51 | '0032_utf8_table: "latin_column" text column has "latin1" character set defined') 52 | self.assertEqual(reports[0].context['column'], 'latin_column') 53 | self.assertEqual(reports[0].context['column_character_set'], 'latin1') 54 | self.assertEqual(reports[0].context['column_collation'], 'latin1_bin') 55 | assert 'CREATE TABLE `0032_utf8_table` (' in reports[0].context['schema'] 56 | 57 | 
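# as TestIsTextColumnLatin above documents, Unicode and binary character sets (utf8, utf8mb4, ucs2, utf16, utf16le, utf32, binary) are not reported, while legacy sets such as latin1, latin2 or big5 are - hence the three reports asserted in this test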
self.assertEqual(str(reports[1]), 58 | '0032_utf8_table: "big5_column" text column has "big5" character set defined') 59 | self.assertEqual(reports[1].context['column'], 'big5_column') 60 | self.assertEqual(reports[1].context['column_character_set'], 'big5') 61 | self.assertEqual(reports[1].context['column_collation'], 'big5_chinese_ci') 62 | 63 | self.assertEqual(str(reports[2]), 64 | '0032_latin1_table: "name" text column has "latin1" character set defined') 65 | self.assertEqual(reports[2].context['column'], 'name') 66 | self.assertEqual(reports[2].context['column_character_set'], 'latin1') 67 | self.assertEqual(reports[2].context['column_collation'], 'latin1_swedish_ci') 68 | 69 | # assert False 70 | -------------------------------------------------------------------------------- /indexdigest/test/core/test_indices.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from indexdigest.schema import Index 4 | 5 | 6 | class TestIndex(TestCase): 7 | 8 | def test_repr(self): 9 | self.assertEqual(' KEY foo (id, bar)', repr(Index(name='foo', columns=['id', 'bar']))) 10 | self.assertEqual(' PRIMARY KEY (id)', repr(Index(name='key', columns=['id'], primary=True))) 11 | self.assertEqual(' UNIQUE KEY idx_bar (bar)', repr(Index(name='idx_bar', columns=['bar'], unique=True))) 12 | 13 | def test_is_covered_by(self): 14 | # #1 case 15 | primary = Index(name='base', columns=['id', 'bar'], primary=True) 16 | second = Index(name='base', columns=['id', 'bar']) 17 | 18 | self.assertFalse(primary.is_covered_by(second)) 19 | self.assertTrue(second.is_covered_by(primary)) 20 | 21 | # self-check 22 | self.assertFalse(second.is_covered_by(second)) 23 | 24 | # #2 case 25 | first = Index(name='base', columns=['id', 'bar', 'foo']) 26 | second = Index(name='base', columns=['id', 'bar']) 27 | 28 | self.assertFalse(first.is_covered_by(second)) 29 | self.assertTrue(second.is_covered_by(first)) 30 | 31 | # #3 case 32 | first = Index(name='base', columns=['id', 'bar', 'foo']) 33 | second = Index(name='base', columns=['id', 'bar', 'foo']) 34 | 35 | self.assertTrue(first.is_covered_by(second)) 36 | self.assertTrue(second.is_covered_by(first)) 37 | 38 | # #4 case 39 | first = Index(name='base', columns=['id', 'bar', 'foo']) 40 | second = Index(name='base', columns=['bar', 'foo']) 41 | 42 | self.assertFalse(first.is_covered_by(second)) 43 | self.assertFalse(second.is_covered_by(first)) 44 | 45 | def test_primary_and_unique_keys_coverage(self): 46 | # @see https://github.com/macbre/index-digest/issues/49 47 | 48 | # second key adds a uniqueness constraint, keep it 49 | first = Index(name='base', columns=['bar', 'foo'], primary=True) 50 | second = Index(name='base', columns=['bar'], unique=True) 51 | 52 | self.assertFalse(first.is_covered_by(second)) 53 | self.assertFalse(second.is_covered_by(first)) 54 | 55 | # these keys are the same (primary is unique) 56 | first = Index(name='base', columns=['bar', 'foo'], primary=True) 57 | second = Index(name='base', columns=['bar', 'foo'], unique=True) 58 | 59 | self.assertFalse(first.is_covered_by(second)) 60 | self.assertTrue(second.is_covered_by(first)) 61 | 62 | # prefer unique over non-unique 63 | first = Index(name='base', columns=['bar', 'foo'], unique=True) 64 | second = Index(name='base', columns=['bar', 'foo']) 65 | 66 | self.assertFalse(first.is_covered_by(second)) 67 | self.assertTrue(second.is_covered_by(first)) 68 | 69 | # identical unique indices 70 | first = Index(name='base', columns=['bar', 
'foo'], unique=True) 71 | second = Index(name='base', columns=['bar', 'foo'], unique=True) 72 | 73 | self.assertTrue(first.is_covered_by(second)) 74 | self.assertTrue(second.is_covered_by(first)) 75 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0020_filesort_temporary_table.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks whether SELECT queries trigger a filesort or a temporary table 3 | """ 4 | from collections import OrderedDict 5 | 6 | from indexdigest.utils import explain_queries, LinterEntry, shorten_query 7 | 8 | 9 | def filter_explain_extra(database, queries, check): 10 | """ 11 | Parse "Extra" column from EXPLAIN query results, e.g. 12 | 13 | "Using where; Using temporary; Using filesort" 14 | 15 | :type database indexdigest.database.Database 16 | :type queries list[str] 17 | :type check str 18 | :rtype: list 19 | """ 20 | for (query, table_used, _, explain_row) in explain_queries(database, queries): 21 | extra_parsed = str(explain_row['Extra']).split('; ') 22 | 23 | if check in extra_parsed: 24 | context = OrderedDict() 25 | context['query'] = query 26 | 27 | context['explain_extra'] = explain_row['Extra'] 28 | context['explain_rows'] = int(explain_row['rows'])  # string when using MariaDB 10.5 29 | context['explain_filtered'] = explain_row.get('filtered')  # may not be set 30 | context['explain_key'] = explain_row['key'] 31 | 32 | yield (query, table_used, context) 33 | 34 | 35 | def check_queries_using_filesort(database, queries): 36 | """ 37 | Using filesort 38 | 39 | MySQL must do an extra pass to find out how to retrieve the rows in sorted order. The sort is 40 | done by going through all rows according to the join type and storing the sort key and pointer 41 | to the row for all rows that match the WHERE clause. The keys are then sorted and the rows are 42 | retrieved in sorted order. 43 | 44 | Percona says: The truth is, filesort is badly named. Anytime a sort can't be performed from an 45 | index, it's a filesort. It has nothing to do with files. Filesort should be called "sort." 46 | It is quicksort at heart. 47 | 48 | :type database indexdigest.database.Database 49 | :type queries list[str] 50 | :rtype: list[LinterEntry] 51 | """ 52 | filtered = filter_explain_extra(database, queries, check='Using filesort') 53 | 54 | for (query, table_used, context) in filtered: 55 | yield LinterEntry(linter_type='queries_using_filesort', table_name=table_used, 56 | message='"{}" query used filesort'.format(shorten_query(query)), 57 | context=context) 58 | 59 | 60 | def check_queries_using_temporary(database, queries): 61 | """ 62 | Using temporary 63 | 64 | To resolve the query, MySQL needs to create a temporary table to hold the result. This 65 | typically happens if the query contains GROUP BY and ORDER BY clauses that list columns 66 | differently. 
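For example, this query from sql/0020-big-table-log is reported by both checks in this module, as its EXPLAIN row carries 'Using where; Using temporary; Using filesort' in the Extra column (see the 0020 tests): SELECT val, count(*) FROM 0020_big_table WHERE item_id BETWEEN 10 AND 20 GROUP BY val ORDER BY val.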
67 | 68 | :type database indexdigest.database.Database 69 | :type queries list[str] 70 | :rtype: list[LinterEntry] 71 | """ 72 | filtered = filter_explain_extra(database, queries, check='Using temporary') 73 | 74 | for (query, table_used, context) in filtered: 75 | yield LinterEntry(linter_type='queries_using_temporary', table_name=table_used, 76 | message='"{}" query used temporary'.format(shorten_query(query)), 77 | context=context) 78 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0031_low_cardinality_index.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for low cardinality indices that may not be needed 3 | """ 4 | from collections import OrderedDict 5 | 6 | from indexdigest.utils import LinterEntry 7 | 8 | # skip small tables 9 | ROWS_COUNT_THRESHOLD = 100000 10 | 11 | # cardinality threshold 12 | INDEX_CARDINALITY_THRESHOLD = 6 13 | 14 | # the least frequent value should be used by at most x% of rows 15 | INDEX_VALUE_PERCENTAGE_THRESHOLD = 20 16 | 17 | 18 | def get_low_cardinality_indices(database): 19 | """ 20 | :type database indexdigest.database.Database 21 | :rtype: list 22 | """ 23 | for table_name in database.get_tables(): 24 | rows_count = database.get_table_rows_estimate(table_name) 25 | if rows_count < ROWS_COUNT_THRESHOLD: 26 | continue 27 | 28 | # get table indices statistics 29 | # @see https://dev.mysql.com/doc/refman/5.7/en/show-index.html 30 | # @see https://www.percona.com/blog/2007/08/28/do-you-always-need-index-on-where-column/ 31 | indices = database.query_dict_rows( 32 | "select TABLE_NAME, INDEX_NAME, COLUMN_NAME, CARDINALITY from" 33 | " INFORMATION_SCHEMA.STATISTICS where" 34 | " TABLE_NAME = '{table_name}' AND TABLE_SCHEMA = '{database_name}'".format( 35 | table_name=table_name, database_name=database.db_name) 36 | ) 37 | 38 | for index in indices: 39 | # the cardinality is too high for this check, skip the index 40 | if index['CARDINALITY'] > INDEX_CARDINALITY_THRESHOLD: 41 | continue 42 | 43 | yield table_name, rows_count, index 44 | 45 | 46 | def check_low_cardinality_index(database): 47 | """ 48 | :type database indexdigest.database.Database 49 | :rtype: list[LinterEntry] 50 | """ 51 | for table_name, rows_count, index in get_low_cardinality_indices(database): 52 | # the least frequent value should be used in up to 20% of rows 53 | # https://www.percona.com/blog/2007/08/28/do-you-always-need-index-on-where-column/ 54 | row = database.query_dict_row( 55 | 'SELECT {column} AS value, COUNT(*) AS cnt FROM `{table}` ' 56 | 'GROUP BY 1 ORDER BY 2 ASC LIMIT 1'.format( 57 | column=index['COLUMN_NAME'], table=index['TABLE_NAME'] 58 | ) 59 | ) 60 | 61 | value_usage = 100. * row['cnt'] / rows_count 62 | # print(row, value_usage) 63 | 64 | # the least frequent value is quite rare - it makes sense to have an index here 65 | if value_usage < INDEX_VALUE_PERCENTAGE_THRESHOLD: 66 | continue 67 | 68 | # print(value_usage, index, table_name) 69 | 70 | context = OrderedDict() 71 | context['column_name'] = index['COLUMN_NAME'] 72 | context['index_name'] = index['INDEX_NAME'] 73 | context['index_cardinality'] = int(index['CARDINALITY']) 74 | context['schema'] = database.get_table_schema(table_name) 75 | context['value_usage'] = value_usage 76 | 77 | yield LinterEntry(linter_type='low_cardinality_index', table_name=table_name, 78 | message='"{}" index on "{}" column has low cardinality, ' 79 | 'check if it is needed'. 
80 | format(index['INDEX_NAME'], index['COLUMN_NAME']), 81 | context=context) 82 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0028_data_too_old.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter looks for tables that have really old data 3 | """ 4 | from collections import OrderedDict 5 | from datetime import datetime 6 | from time import time 7 | 8 | from indexdigest.utils import LinterEntry, memoize 9 | 10 | 11 | def get_time_columns(database): 12 | """ 13 | :type database indexdigest.database.Database 14 | :rtype: list 15 | """ 16 | for table_name in database.get_tables(): 17 | time_columns = [ 18 | column 19 | for column in database.get_table_columns(table_name) 20 | if column.is_timestamp_type() or 'time' in column.name 21 | ] 22 | 23 | # there are no time type columns, skip 24 | if not time_columns: 25 | continue 26 | 27 | # for now just check the first time column 28 | yield (table_name, time_columns[0]) 29 | 30 | 31 | @memoize 32 | def get_boundary_times(database, table_name, column): 33 | """ 34 | :type database indexdigest.database.Database 35 | :type table_name str 36 | :type column indexdigest.schema.Column 37 | :rtype: dict 38 | """ 39 | # this may take a while when {column} is not indexed! 40 | query = 'SELECT /* index-digest */ UNIX_TIMESTAMP(MIN(`{column}`)) as `min`, ' \ 41 | 'UNIX_TIMESTAMP(MAX(`{column}`)) as `max` FROM `{table}`'.\ 42 | format( 43 | column=column.name, 44 | table=table_name 45 | ) 46 | 47 | timestamps = database.query_dict_row(query) 48 | 49 | # if there's no data in the table, return None 50 | return timestamps if timestamps.get('min') and timestamps.get('max') else None 51 | 52 | 53 | def check_data_too_old(database, env=None): 54 | """ 55 | :type database indexdigest.database.Database 56 | :type env dict 57 | :rtype: list[LinterEntry] 58 | """ 59 | now = int(time())  # I will probably never understand dates handling in Python 60 | 61 | # set up a diff threshold (in days) 62 | env = env if env else dict() 63 | diff_threshold = int(env.get('INDEX_DIGEST_DATA_TOO_OLD_THRESHOLD_DAYS', 3 * 30)) 64 | 65 | for (table_name, column) in get_time_columns(database): 66 | timestamps = get_boundary_times(database, table_name, column) 67 | 68 | if timestamps is None: 69 | continue 70 | 71 | diff = now - timestamps.get('min') 72 | # print(table_name, column, timestamps, now, diff) 73 | 74 | if diff > diff_threshold * 86400: 75 | diff_days = int(diff / 86400) 76 | 77 | metadata = database.get_table_metadata(table_name) 78 | 79 | context = OrderedDict() 80 | context['diff_days'] = diff_days 81 | context['data_since'] = str(datetime.fromtimestamp(timestamps.get('min'))) 82 | context['data_until'] = str(datetime.fromtimestamp(timestamps.get('max'))) 83 | context['date_column_name'] = str(column) 84 | context['schema'] = database.get_table_schema(table_name) 85 | context['rows'] = database.get_table_rows_estimate(table_name) 86 | context['table_size_mb'] = \ 87 | 1. * (metadata['data_size'] + metadata['index_size']) / 1024 / 1024 88 | 89 | yield LinterEntry(linter_type='data_too_old', table_name=table_name, 90 | message='"{}" has rows added {} days ago, ' 91 | 'consider changing retention policy'. 
92 | format(table_name, diff_days), 93 | context=context) 94 | -------------------------------------------------------------------------------- /indexdigest/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains utility functions and classes 3 | """ 4 | from urllib.parse import urlparse 5 | 6 | import functools 7 | import logging 8 | import re 9 | 10 | 11 | def parse_dsn(dsn): 12 | """ 13 | Parses a given Data Source Name string into a dict that can be passed to a database connector. 14 | 15 | Example: mysql://alex:pwd@localhost/test 16 | DSN: scheme://username:password@host:port/database 17 | 18 | @see https://mysqlclient.readthedocs.io/user_guide.html#mysqldb 19 | 20 | :type dsn str 21 | :rtype: dict 22 | """ 23 | parsed = urlparse(dsn) 24 | 25 | return { 26 | 'host': parsed.hostname, 27 | 'port': int(parsed.port) if parsed.port else 3306, 28 | 'user': parsed.username, 29 | 'passwd': parsed.password, 30 | 'db': str(parsed.path).lstrip('/') 31 | } 32 | 33 | 34 | def is_select_query(query): 35 | """ 36 | :type query str 37 | :rtype bool 38 | """ 39 | query = query.lstrip().lower() 40 | query = re.sub(r'^/\*[^*]+\*/', '', query)  # remove SQL comments 41 | 42 | return query.lstrip().startswith('select') 43 | 44 | 45 | def explain_queries(database, queries): 46 | """ 47 | Yields EXPLAIN result rows for given queries 48 | 49 | :type database indexdigest.database.Database 50 | :type queries list[str] 51 | :rtype: tuple[str,str,str,dict] 52 | """ 53 | # analyze only SELECT queries from the log 54 | for query in filter(is_select_query, queries): 55 | try: 56 | for row in database.explain_query(query): 57 | table_used = row['table'] 58 | index_used = row['key'] 59 | 60 | yield (query, table_used, index_used, row) 61 | except IndexDigestError: 62 | logger = logging.getLogger('explain_queries') 63 | logger.error('Cannot explain the query: %s', query) 64 | 65 | 66 | def shorten_query(query, max_len=50): 67 | """ 68 | :type query str 69 | :type max_len int 70 | :rtype: str 71 | """ 72 | query = query.rstrip('; ') 73 | return '{}...'.format(query[:max_len]) if len(query) > max_len else query 74 | 75 | 76 | def memoize(func): 77 | """ 78 | Implements the memoization pattern 79 | 80 | :type func 81 | :rtype func 82 | """ 83 | # @see https://medium.com/@nkhaja/memoization-and-decorators-with-python-32f607439f84 84 | cache = func.cache = {} 85 | 86 | @functools.wraps(func) 87 | def memoized_func(*args, **kwargs): 88 | """ 89 | :type args 90 | :type kwargs 91 | """ 92 | key = str(args) + str(kwargs) 93 | if key not in cache: 94 | cache[key] = func(*args, **kwargs) 95 | return cache[key] 96 | return memoized_func 97 | 98 | 99 | # pylint:disable=too-few-public-methods 100 | class LinterEntry: 101 | """ 102 | Wraps a single linter entry. Various formatters may display this data differently. 
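A minimal usage sketch (the values are illustrative): entry = LinterEntry(linter_type='example_check', table_name='foo', message='"bar" column was not used') - given __str__() below, str(entry) evaluates to 'foo: "bar" column was not used'.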
103 | """ 104 | def __init__(self, linter_type, table_name, message, context=None): 105 | """ 106 | :type linter_type str 107 | :type table_name str 108 | :type message str 109 | :type context dict 110 | """ 111 | self.linter_type = linter_type 112 | self.table_name = table_name 113 | self.message = message 114 | self.context = context 115 | 116 | def __str__(self): 117 | return '{table_name}: {message}'.format( 118 | table_name=self.table_name, message=self.message) 119 | 120 | 121 | class IndexDigestError(Exception): 122 | """ 123 | index-digest base exception class 124 | """ 125 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0006_not_used_columns_and_tables.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0006_not_used_columns_and_tables import check_not_used_tables, check_not_used_columns, \ 6 | get_used_tables_from_queries 7 | from indexdigest.database import Database 8 | from indexdigest.test import DatabaseTestMixin, read_queries_from_log 9 | 10 | 11 | class LimitedViewDatabase(Database, DatabaseTestMixin): 12 | """ 13 | Limit test to tables from sql/0006-not-used-columns-and-tables.sql 14 | """ 15 | def get_tables(self): 16 | return ['0006_not_used_columns', '0006_not_used_tables'] 17 | 18 | 19 | class TestNotUsedTables(TestCase): 20 | 21 | @property 22 | def connection(self): 23 | return LimitedViewDatabase.connect_dsn(DatabaseTestMixin.DSN) 24 | 25 | def test_not_used_tables(self): 26 | reports = list(check_not_used_tables( 27 | database=self.connection, queries=read_queries_from_log('0006-not-used-columns-and-tables-log'))) 28 | 29 | print(reports) 30 | 31 | self.assertEqual(len(reports), 1) 32 | self.assertEqual(str(reports[0]), '0006_not_used_tables: "0006_not_used_tables" table was not used by provided queries') 33 | self.assertEqual(reports[0].table_name, '0006_not_used_tables') 34 | 35 | assert str(reports[0].context['schema']).startswith('CREATE TABLE `0006_not_used_tables` (\n') 36 | 37 | # these are estimates, can't assert a certain value 38 | assert reports[0].context['table_size_mb'] > 0.0001 39 | assert reports[0].context['rows_estimated'] > 0 40 | 41 | def test_get_used_tables_from_queries(self): 42 | queries = [ 43 | 'SELECT /* a comment */ foo FROM `0006_not_used_columns` AS r WHERE item_id = 1;', # table alias 44 | 'SELECT 1 FROM `0006_not_used_tables` WHERE item_id = 3;', 45 | ] 46 | 47 | tables = get_used_tables_from_queries(queries) 48 | 49 | print(tables) 50 | 51 | self.assertListEqual(tables, ['0006_not_used_columns', '0006_not_used_tables']) 52 | 53 | # assert False 54 | 55 | 56 | class TestNotUsedColumns(TestCase): 57 | 58 | @property 59 | def connection(self): 60 | return LimitedViewDatabase.connect_dsn(DatabaseTestMixin.DSN) 61 | 62 | def test_not_used_columns(self): 63 | queries = [ 64 | 'SELECT test, item_id FROM `0006_not_used_columns` WHERE foo = "a"' 65 | ] 66 | 67 | reports = list(check_not_used_columns(database=self.connection, queries=queries)) 68 | 69 | self.assertEqual(len(reports), 1) 70 | self.assertEqual(str(reports[0]), '0006_not_used_columns: "bar" column was not used by provided queries') 71 | self.assertEqual(reports[0].table_name, '0006_not_used_columns') 72 | self.assertEqual(reports[0].context['column_name'], 'bar') 73 | self.assertEqual(reports[0].context['column_type'], 'varchar(16)') 74 | 75 | # assert False 76 | 
77 | def test_not_used_columns_two(self): 78 | queries = [ 79 | 'SELECT test FROM `0006_not_used_columns` WHERE foo = "a"' 80 | ] 81 | 82 | reports = list(check_not_used_columns(database=self.connection, queries=queries)) 83 | 84 | # reports ordered is the same as schema columns order 85 | self.assertEqual(len(reports), 2) 86 | self.assertEqual(reports[0].context['column_name'], 'item_id') 87 | self.assertEqual(reports[0].context['column_type'], 'int') 88 | self.assertEqual(reports[1].context['column_name'], 'bar') 89 | self.assertEqual(reports[1].context['column_type'], 'varchar(16)') 90 | 91 | # assert False 92 | 93 | def test_parsing_errors_handling(self): 94 | queries = [ 95 | 'SELECT test', 96 | 'SELECT 0020_big_table t WHERE id BETWEEN 10 AND 20 GROUP BY val' 97 | ] 98 | 99 | reports = list(check_not_used_columns(database=self.connection, queries=queries)) 100 | self.assertEqual(len(reports), 0) 101 | 102 | # assert False 103 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0006_not_used_columns_and_tables.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for not used columns and tables by going through SELECT queries 3 | """ 4 | import logging 5 | 6 | from collections import defaultdict, OrderedDict 7 | from sql_metadata.compat import get_query_columns, get_query_tables 8 | 9 | from indexdigest.utils import LinterEntry, is_select_query 10 | 11 | 12 | def get_used_tables_from_queries(queries): 13 | """ 14 | :type queries list[str] 15 | :rtype: list[str] 16 | """ 17 | used_tables = [] 18 | queries = filter(is_select_query, queries) 19 | 20 | for query in queries: 21 | # parse each query from the log 22 | tables = get_query_tables(query) 23 | if tables and tables[0] not in used_tables: 24 | used_tables.append(tables[0]) 25 | 26 | return used_tables 27 | 28 | 29 | def check_not_used_tables(database, queries): 30 | """ 31 | :type database indexdigest.database.Database 32 | :type queries list[str] 33 | :rtype: list[LinterEntry] 34 | """ 35 | logger = logging.getLogger(__name__) 36 | 37 | # get database meta-data 38 | tables = database.get_tables() 39 | 40 | # analyze only SELECT queries from the log 41 | used_tables = get_used_tables_from_queries(queries) 42 | logger.info("These tables were used by provided queries: %s", used_tables) 43 | 44 | # now check which tables were not used 45 | not_used_tables = [table for table in tables if table not in used_tables] 46 | 47 | # generate reports 48 | for table in not_used_tables: 49 | metadata = database.get_table_metadata(table) 50 | context = OrderedDict() 51 | 52 | context['schema'] = database.get_table_schema(table) 53 | context['table_size_mb'] = \ 54 | 1. 
* (metadata['data_size'] + metadata['index_size']) / 1024 / 1024 55 | context['rows_estimated'] = database.get_table_rows_estimate(table) 56 | 57 | yield LinterEntry(linter_type='not_used_tables', table_name=table, 58 | message='"{}" table was not used by provided queries'.format(table), 59 | context=context) 60 | 61 | 62 | def check_not_used_columns(database, queries): 63 | """ 64 | :type database indexdigest.database.Database 65 | :type queries list[str] 66 | :rtype: list[LinterEntry] 67 | :raises Exception 68 | """ 69 | logger = logging.getLogger(__name__) 70 | 71 | # analyze only SELECT queries from the log 72 | queries = list(filter(is_select_query, queries)) 73 | 74 | used_tables = get_used_tables_from_queries(queries) 75 | used_columns = defaultdict(list) 76 | 77 | logger.info("Will check these tables: %s", used_tables) 78 | 79 | # analyze given queries and collect used columns for each table 80 | for query in queries: 81 | tables = get_query_tables(query) 82 | if tables: 83 | columns = get_query_columns(query) 84 | 85 | # print(query, table, columns) 86 | 87 | # add used columns per table 88 | # FIXME: assume we're querying just a single table for now 89 | used_columns[tables[0]] += columns 90 | else: 91 | logger.error('Unable to extract tables and columns used from the query: %s', 92 | query) 93 | 94 | # analyze table schemas and report not used columns for each table 95 | for table in used_tables: 96 | logger.info("Checking %s table", table) 97 | table_columns = database.get_table_columns(table) 98 | 99 | # now get the difference and report them 100 | not_used_columns = [ 101 | column for column in table_columns 102 | if column.name not in set(used_columns[table]) 103 | ] if table_columns else [] 104 | 105 | for column in not_used_columns: 106 | yield LinterEntry(linter_type='not_used_columns', table_name=table, 107 | message='"{}" column was not used by provided queries'.format(column), 108 | context={'column_name': column.name, 'column_type': column.type}) 109 | -------------------------------------------------------------------------------- /indexdigest/test/core/test_query.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from sql_metadata.compat import get_query_columns, get_query_tables 4 | 5 | 6 | class TestUtils(TestCase): 7 | 8 | def test_get_query_columns(self): 9 | self.assertListEqual(['*'], 10 | get_query_columns('SELECT * FROM `test_table`')) 11 | 12 | self.assertListEqual(['foo'], 13 | get_query_columns('SELECT foo FROM `test_table`')) 14 | 15 | self.assertListEqual(['id', 'foo'], 16 | get_query_columns('SELECT id, foo FROM test_table WHERE id = 3')) 17 | 18 | self.assertListEqual(['foo', 'id'], 19 | get_query_columns('SELECT foo, count(*) as bar FROM `test_table` WHERE id = 3')) 20 | 21 | self.assertListEqual(['foo', 'test'], 22 | get_query_columns('SELECT foo, test as bar FROM `test_table`')) 23 | 24 | self.assertListEqual(['bar'], 25 | get_query_columns('SELECT /* a comment */ bar FROM test_table')) 26 | 27 | # assert False 28 | 29 | def test_get_query_tables(self): 30 | self.assertListEqual(['test_table'], 31 | get_query_tables('SELECT * FROM `test_table`')) 32 | 33 | self.assertListEqual(['0001_test_table'], 34 | get_query_tables('SELECT * FROM `0001_test_table`')) 35 | 36 | self.assertListEqual(['test_table'], 37 | get_query_tables('SELECT foo FROM `test_table`')) 38 | 39 | self.assertListEqual(['test_table'], 40 | get_query_tables('SELECT foo FROM test_table WHERE id = 1')) 41 | 42 
| self.assertListEqual(['test_table', 'second_table'], 43 | get_query_tables('SELECT foo FROM test_table, second_table WHERE id = 1')) 44 | 45 | self.assertListEqual(['revision', 'page', 'wikicities_user'], 46 | get_query_tables('SELECT rev_id,rev_page,rev_text_id,rev_timestamp,rev_comment,rev_user_text,rev_user,rev_minor_edit,rev_deleted,rev_len,rev_parent_id,rev_shaN,page_namespace,page_title,page_id,page_latest,user_name FROM `revision` INNER JOIN `page` ON ((page_id = rev_page)) LEFT JOIN `wikicities_user` ON ((rev_user != N) AND (user_id = rev_user)) WHERE rev_id = X LIMIT N')) 47 | 48 | self.assertListEqual(['events'], 49 | get_query_tables("SELECT COUNT( 0 ) AS cnt, date_format(event_date, '%Y-%m-%d') AS date FROM events WHERE event_date BETWEEN '2017-10-18 00:00:00' AND '2017-10-24 23:59:59' AND wiki_id = '1289985' GROUP BY date WITH ROLLUP")) 50 | 51 | # complex queries 52 | # @see https://github.com/macbre/query-digest/issues/16 53 | self.assertListEqual(['report_wiki_recent_pageviews', 'dimension_wikis'], 54 | get_query_tables("SELECT r.wiki_id AS id, pageviews_Nday AS pageviews FROM report_wiki_recent_pageviews AS r INNER JOIN dimension_wikis AS d ON r.wiki_id = d.wiki_id WHERE d.public = X AND r.lang = X AND r.hub_name = X ORDER BY pageviews DESC LIMIT N")) 55 | 56 | self.assertListEqual(['dimension_wikis', 'fact_wam_scores'], 57 | get_query_tables("SELECT DISTINCT dw.lang FROM `dimension_wikis` `dw` INNER JOIN `fact_wam_scores` `fwN` ON ((dw.wiki_id = fwN.wiki_id)) WHERE fwN.time_id = FROM_UNIXTIME(N) ORDER BY dw.lang ASC")) 58 | 59 | self.assertListEqual(['fact_wam_scores', 'dimension_wikis'], 60 | get_query_tables("SELECT count(fwN.wiki_id) as wam_results_total FROM `fact_wam_scores` `fwN` left join `fact_wam_scores` `fwN` ON ((fwN.wiki_id = fwN.wiki_id) AND (fwN.time_id = FROM_UNIXTIME(N))) left join `dimension_wikis` `dw` ON ((fwN.wiki_id = dw.wiki_id)) WHERE (fwN.time_id = FROM_UNIXTIME(N)) AND (dw.url like X OR dw.title like X) AND fwN.vertical_id IN (XYZ) AND dw.lang = X AND (fwN.wiki_id NOT IN (XYZ)) AND ((dw.url IS NOT NULL AND dw.title IS NOT NULL))")) 61 | 62 | # INSERT queries 63 | self.assertListEqual(['0070_insert_ignore_table'], 64 | get_query_tables("INSERT IGNORE INTO `0070_insert_ignore_table` VALUES (9, '123', '2017-01-01');")) 65 | 66 | self.assertListEqual(['0070_insert_ignore_table'], 67 | get_query_tables("INSERT into `0070_insert_ignore_table` VALUES (9, '123', '2017-01-01');")) 68 | 69 | # assert False 70 | -------------------------------------------------------------------------------- /indexdigest/test/cli/test_script.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from _pytest.monkeypatch import MonkeyPatch 4 | 5 | from indexdigest import VERSION 6 | from indexdigest.cli.script import filter_reports_by_type, filter_reports_by_table, get_version 7 | from indexdigest.utils import LinterEntry 8 | 9 | 10 | class FilterReportsByTypeTest(TestCase): 11 | 12 | REPORT_TYPES = [ 13 | 'foo', 14 | 'bar', 15 | 'test', 16 | 'test', 17 | 'foobar', 18 | ] 19 | 20 | @staticmethod 21 | def get_reports_mock(linter_types): 22 | """ 23 | :type linter_types list[str] 24 | :rtype: list[LinterEntry] 25 | """ 26 | return [ 27 | LinterEntry(linter_type=linter_type, table_name='foo', message='message') 28 | for linter_type in linter_types 29 | ] 30 | 31 | def test_noop(self): 32 | reports = self.get_reports_mock(self.REPORT_TYPES) 33 | 34 | filtered = filter_reports_by_type(reports) 35 | 
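# with neither checks nor skip_checks given, filter_reports_by_type() should act as a pass-through - e.g. filter_reports_by_type(reports, checks='foo') would instead keep only the entries whose linter_type equals 'foo' (compare the tests below)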
print(filtered) 36 | 37 | assert len(filtered) == len(self.REPORT_TYPES) 38 | 39 | def test_checks_switch(self): 40 | reports = self.get_reports_mock(self.REPORT_TYPES) 41 | 42 | filtered = filter_reports_by_type(reports, checks='foo,test') 43 | print(filtered) 44 | 45 | assert len(filtered) == 3 46 | assert filtered[0].linter_type == 'foo' 47 | assert filtered[1].linter_type == 'test' 48 | assert filtered[2].linter_type == 'test' 49 | 50 | def test_checks_switch_single(self): 51 | reports = self.get_reports_mock(self.REPORT_TYPES) 52 | 53 | filtered = filter_reports_by_type(reports, checks='test') 54 | print(filtered) 55 | 56 | assert len(filtered) == 2 57 | assert filtered[0].linter_type == 'test' 58 | assert filtered[1].linter_type == 'test' 59 | 60 | def test_skip_checks_switch(self): 61 | reports = self.get_reports_mock(self.REPORT_TYPES) 62 | 63 | filtered = filter_reports_by_type(reports, skip_checks='foo,test') 64 | print(filtered) 65 | 66 | assert len(filtered) == 2 67 | assert filtered[0].linter_type == 'bar' 68 | assert filtered[1].linter_type == 'foobar' 69 | 70 | 71 | class FilterReportsByTableTest(TestCase): 72 | 73 | REPORT_TABLES = [ 74 | 'foo', 75 | 'bar', 76 | 'test', 77 | 'test', 78 | 'foobar', 79 | ] 80 | 81 | @staticmethod 82 | def get_reports_mock(tables): 83 | """ 84 | :type tables list[str] 85 | :rtype: list[LinterEntry] 86 | """ 87 | return [ 88 | LinterEntry(linter_type='foo', table_name=table, message='message') 89 | for table in tables 90 | ] 91 | 92 | def test_noop(self): 93 | reports = self.get_reports_mock(self.REPORT_TABLES) 94 | 95 | filtered = filter_reports_by_table(reports) 96 | print(filtered) 97 | 98 | assert len(filtered) == len(self.REPORT_TABLES) 99 | 100 | def test_tables_switch(self): 101 | reports = self.get_reports_mock(self.REPORT_TABLES) 102 | 103 | filtered = filter_reports_by_table(reports, tables='foo,test') 104 | print(filtered) 105 | 106 | assert len(filtered) == 3 107 | assert filtered[0].table_name == 'foo' 108 | assert filtered[1].table_name == 'test' 109 | assert filtered[2].table_name == 'test' 110 | 111 | def test_tables_switch_single(self): 112 | reports = self.get_reports_mock(self.REPORT_TABLES) 113 | 114 | filtered = filter_reports_by_table(reports, tables='test') 115 | print(filtered) 116 | 117 | assert len(filtered) == 2 118 | assert filtered[0].table_name == 'test' 119 | assert filtered[1].table_name == 'test' 120 | 121 | def test_skip_tables_switch(self): 122 | reports = self.get_reports_mock(self.REPORT_TABLES) 123 | 124 | filtered = filter_reports_by_table(reports, skip_tables='foo,test') 125 | print(filtered) 126 | 127 | assert len(filtered) == 2 128 | assert filtered[0].table_name == 'bar' 129 | assert filtered[1].table_name == 'foobar' 130 | 131 | 132 | def test_get_version(monkeypatch: MonkeyPatch): 133 | monkeypatch.setenv('COMMIT_SHA', '1234567890abc') 134 | assert get_version() == f'{VERSION} (git 1234567)' 135 | -------------------------------------------------------------------------------- /indexdigest/cli/add_linter.py: -------------------------------------------------------------------------------- 1 | """ 2 | A helper script used to create files for new linter 3 | """ 4 | from __future__ import print_function 5 | 6 | import logging 7 | import re 8 | import sys 9 | 10 | logging.basicConfig( 11 | level=logging.DEBUG, 12 | format='%(levelname)-8s %(message)s', 13 | ) 14 | 15 | 16 | def add_linter(linter_id, linter_name): 17 | """ 18 | :type linter_id int 19 | :type linter_name str 20 | """ 21 | logger = 
logging.getLogger('add_linter') 22 | 23 | # normalize values 24 | linter_id_fmt = '{:04d}'.format(linter_id) 25 | linter_name = re.sub(r'[^a-z]+', '-', linter_name.strip().lower()) 26 | 27 | logger.info("Creating a new linter: %s - %s ...", linter_id_fmt, linter_name) 28 | 29 | # /sql directory 30 | sql_name = 'sql/{}-{}'.format(linter_id_fmt, linter_name.replace('_', '-')) 31 | logger.info("Add SQL schema and log files (%s) ...", sql_name) 32 | 33 | with open(sql_name + '.sql', mode='wt', encoding='utf-8') as file_name: 34 | # 0002_not_used_indices 35 | table_name = '{}_{}'.format(linter_id_fmt, linter_name.replace('-', '_')) 36 | 37 | file_name.writelines([ 38 | '-- Report ...\n', 39 | '--\n', 40 | '-- https://github.com/macbre/index-digest/issues/{}\n'.format(linter_id), 41 | 'DROP TABLE IF EXISTS `{}`;\n'.format(table_name), 42 | 'CREATE TABLE `{}` (\n'.format(table_name), 43 | '-- \n', 44 | ');\n', 45 | ]) 46 | 47 | logger.info('... %s created', file_name.name) 48 | 49 | with open(sql_name + '-log', mode='wt', encoding='utf-8') as file_name: 50 | file_name.writelines([ 51 | '-- \n', 52 | ]) 53 | 54 | logger.info('... %s created', file_name.name) 55 | 56 | # /indexdigest/linters directory 57 | linter_name = linter_name.replace('-', '_') 58 | logger.info("Add a Python code for %s linter ...", linter_name) 59 | 60 | with open('indexdigest/linters/linter_{}_{}.py'. 61 | format(linter_id_fmt, linter_name), mode='wt', encoding='utf-8') as file_name: 62 | file_name.writelines([ 63 | '"""\n', 64 | 'This linter checks for ...\n', 65 | '"""\n', 66 | 'from collections import defaultdict\n', 67 | '\n', 68 | 'from indexdigest.utils import LinterEntry, explain_queries\n', 69 | '\n', 70 | '\n', 71 | 'def check_{}(database, queries):\n'.format(linter_name), 72 | ' """\n', 73 | ' :type database indexdigest.database.Database\n', 74 | ' :type queries list[str]\n', 75 | ' :rtype: list[LinterEntry]\n', 76 | ' """\n', 77 | ' yield LinterEntry(linter_type=\'{}\', table_name=table_name,\n'. 78 | format(linter_name), 79 | ' message=\'"{}" ...\'.\n', 80 | ' format("foo"),\n', 81 | ' context={"foo": str("bar")})\n', 82 | ]) 83 | 84 | logger.info('... %s created', file_name.name) 85 | 86 | logger.info("Add a test ...") 87 | 88 | with open('indexdigest/test/linters/test_{}_{}.py'.format(linter_id_fmt, linter_name), 89 | mode='wt', encoding='utf-8') \ 90 | as file_name: 91 | file_name.writelines([ 92 | 'from __future__ import print_function\n', 93 | '\n', 94 | 'from unittest import TestCase\n', 95 | '\n', 96 | 'from indexdigest.linters.linter_{0}_{1} import check_{1}\n'. 97 | format(linter_id_fmt, linter_name), 98 | 'from indexdigest.test import DatabaseTestMixin, read_queries_from_log\n', 99 | '\n', 100 | '\n', 101 | 'class TestLinter(TestCase, DatabaseTestMixin):\n', 102 | '\n', 103 | ' def test_{}(self):\n'.format(linter_name), 104 | ' pass\n', 105 | ]) 106 | 107 | logger.info('... 
%s created', file_name.name) 108 | 109 | 110 | def main(): 111 |     """ 112 |     usage: add_linter 89 empty_tables 113 |     """ 114 |     try: 115 |         linter_id = int(sys.argv[1]) 116 |         linter_name = str(sys.argv[2]) 117 | 118 |         add_linter(linter_id, linter_name) 119 |     except (IndexError, ValueError): 120 |         print('Usage: add_linter 89 empty_tables') 121 |         sys.exit(1) 122 | -------------------------------------------------------------------------------- /indexdigest/test/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from ..database import Database 4 | 5 | from unittest import TestCase 6 | 7 | 8 | def read_queries_from_log(log_file): 9 |     """ 10 |     :type log_file str 11 |     :rtype: list[str] 12 |     """ 13 |     with open('sql/{}'.format(log_file), encoding='utf-8') as fp: 14 |         queries = fp.readlines() 15 |         queries = list(map(str.strip, queries))  # remove trailing spaces 16 | 17 |     return queries 18 | 19 | 20 | class DatabaseTestMixin: 21 |     DSN = 'mysql://index_digest:qwerty@127.0.0.1:53306/index_digest' 22 |     DBNAME = 'index_digest' 23 | 24 |     @property 25 |     def connection(self): 26 |         """ 27 |         :rtype: Database 28 |         """ 29 |         return Database.connect_dsn(self.DSN) 30 | 31 | 32 | class BigTableTest(TestCase, DatabaseTestMixin): 33 | 34 |     ROWS = 100000  # how many rows to generate 35 |     BATCH = 5000  # perform INSERT in batches 36 | 37 |     BIG_TABLE_NAME = '0020_big_table' 38 |     PREPARED = False 39 | 40 |     def setUp(self): 41 |         super(BigTableTest, self).setUp() 42 | 43 |         # prepare the big table only once 44 |         if not BigTableTest.PREPARED: 45 |             self._prepare_big_table() 46 |             BigTableTest.PREPARED = True 47 | 48 |         assert self.table_populated(), 'Table 0020_big_table should be populated with values' 49 | 50 |     def _rows(self): 51 |         """ 52 |         Iterate from 1 to self.ROWS 53 |         :rtype: list[int] 54 |         """ 55 |         r = 0 56 |         while r < self.ROWS: 57 |             r += 1 58 |             yield r 59 | 60 |     @staticmethod 61 |     def _insert_values(cursor, values): 62 |         """ 63 |         :type cursor MySQLdb.cursors.BaseCursor 64 |         :type values list[tuple] 65 |         """ 66 |         if len(values) == 0: 67 |             return 68 | 69 |         # @see https://dev.mysql.com/doc/refman/5.7/en/insert.html 70 |         cursor.executemany('INSERT INTO 0020_big_table(item_id,val,text,num) VALUES(%s,%s,%s,%s)', values) 71 |         # print(values[0], cursor.lastrowid) 72 | 73 |     def _prepare_big_table(self): 74 |         """ 75 |         Fill the table with values 76 |         """ 77 |         # @see http://www.mysqltutorial.org/python-mysql-insert/ 78 |         val = 1 79 |         values = [] 80 | 81 |         # use the same connection throughout the function 82 |         connection = self.connection 83 |         cursor = connection.connection.cursor() 84 | 85 |         # is table already populated? 86 |         if self.table_populated(): 87 |             return 88 | 89 |         # no?
populate it 90 | for row in self._rows(): 91 | # Report low cardinality indices, use only a few distinct values (#31) 92 | num = row % 2 93 | 94 | values.append((row, val, '{:05x}'.format(row)[:5], num)) 95 | 96 | if row % 5 == 0: 97 | val += 1 98 | 99 | if len(values) == self.BATCH: 100 | self._insert_values(cursor, values) 101 | values = [] 102 | 103 | # insert any remaining values 104 | self._insert_values(cursor, values) 105 | 106 | # save changes to the database 107 | connection.connection.commit() 108 | 109 | cursor.close() 110 | 111 | # update key distribution statistics (#31) 112 | self.connection.query('ANALYZE TABLE 0020_big_table') 113 | 114 | cardinality_stats = self.connection.query_dict_rows( 115 | "select TABLE_NAME, INDEX_NAME, COLUMN_NAME, CARDINALITY from" 116 | " INFORMATION_SCHEMA.STATISTICS where" 117 | " TABLE_NAME = '{table_name}' AND TABLE_SCHEMA = '{database_name}'".format( 118 | table_name=self.BIG_TABLE_NAME, database_name=self.DBNAME) 119 | ) 120 | logging.warning('Big table initialized, cardinality: %r', list(cardinality_stats)) 121 | 122 | def table_populated(self): 123 | """ 124 | :rtype: bool 125 | """ 126 | return self.connection.query_field('SELECT COUNT(*) FROM 0020_big_table') == self.ROWS 127 | 128 | 129 | class DatabaseWithMockedRow(Database): 130 | 131 | def __init__(self, mocked_row): 132 | super(DatabaseWithMockedRow, self).__init__(db='', host='', passwd='', user='') 133 | self.row = mocked_row 134 | 135 | @property 136 | def connection(self): 137 | raise Exception('Class {} needs to mock the query_* method'.format(self.__class__.__name__)) 138 | 139 | def query(self, sql, cursor=None): 140 | self._queries.append(sql) 141 | self.query_logger.info(sql) 142 | return [self.row] 143 | 144 | def query_row(self, sql): 145 | self._queries.append(sql) 146 | self.query_logger.info(sql) 147 | return self.row 148 | -------------------------------------------------------------------------------- /indexdigest/schema.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data structures for handling schema-related things like indices and columns 3 | """ 4 | 5 | 6 | class Index: 7 | """ 8 | Keeps a single index meta-data 9 | """ 10 | def __init__(self, name, columns, unique=False, primary=False): 11 | """ 12 | :type name str 13 | :type columns list[str] 14 | :type unique bool 15 | :type primary bool 16 | """ 17 | self._name = name 18 | self._columns = columns 19 | self._unique = unique 20 | self._primary = primary 21 | 22 | def is_covered_by(self, index): 23 | """ 24 | Checks if a current index is covered by a different one 25 | 26 | Examples: 27 | 28 | PRIMARY KEY (`id`,`foo`), 29 | UNIQUE KEY `idx` (`id`,`foo`) # redundant 30 | 31 | PRIMARY KEY (`id`), 32 | KEY `idx_foo` (`foo`), # redundant (covered by idx_foo_bar) 33 | KEY `idx_foo_bar` (`foo`, `bar`), 34 | KEY `idx_id_foo` (`id`, `foo`) 35 | 36 | :type index Index 37 | :rtype: bool 38 | """ 39 | # @see https://github.com/macbre/index-digest/issues/4 40 | 41 | # assume primary is never covered by other indices (plus self check) 42 | if self.is_primary or self == index: 43 | return False 44 | 45 | # equal indices - prefer unique over non unique indices 46 | # and primary keys over unique ones 47 | # @see https://github.com/macbre/index-digest/issues/49 48 | if self.columns == index.columns and self.is_unique: 49 | # we're covered by the same unique key or a primary key 50 | if index.is_unique or index.is_primary: 51 | return True 52 | 53 | return False 54 | 55 | # now 
take the subset of columns from the index we're comparing ourselves to 56 |         columns_cnt = len(self.columns) 57 | 58 |         if self.columns == index.columns[:columns_cnt]: 59 |             if self.is_unique and index.is_primary: 60 |                 # the unique key adds a uniqueness bit to the primary key - #49 61 |                 return False 62 | 63 |             return True 64 | 65 |         return False 66 | 67 |     @property 68 |     def name(self): 69 |         """ 70 |         :rtype: str 71 |         """ 72 |         return self._name 73 | 74 |     @property 75 |     def columns(self): 76 |         """ 77 |         :rtype: list[str] 78 |         """ 79 |         return self._columns 80 | 81 |     @property 82 |     def is_unique(self): 83 |         """ 84 |         :rtype: bool 85 |         """ 86 |         return self._unique is True 87 | 88 |     @property 89 |     def is_primary(self): 90 |         """ 91 |         :rtype: bool 92 |         """ 93 |         return self._primary is True 94 | 95 |     def __repr__(self): 96 |         """ 97 |         :rtype: str 98 |         """ 99 |         return '<{}> {}'.format(self.__class__.__name__, str(self)) 100 | 101 |     def __str__(self): 102 |         """ 103 |         :rtype: str 104 |         """ 105 |         return '{type}{name} ({columns})'.format( 106 |             type='PRIMARY KEY' if self.is_primary else 'UNIQUE KEY ' if self.is_unique else 'KEY ', 107 |             name=self.name if not self.is_primary else '', 108 |             columns=', '.join(self.columns) 109 |         ) 110 | 111 | 112 | class Column: 113 |     """ 114 |     Keeps a single table column meta-data 115 | 116 |     @see https://dev.mysql.com/doc/refman/5.7/en/columns-table.html 117 |     """ 118 |     def __init__(self, name, column_type, character_set=None, collation=None): 119 |         """ 120 |         :type name str 121 |         :type column_type str 122 |         :type character_set str 123 |         :type collation str 124 |         """ 125 |         self._name = name 126 |         self._type = column_type 127 |         self._character_set = character_set 128 |         self._collation = collation 129 | 130 |         # As of MySQL 8.0.17, the ZEROFILL attribute is deprecated for numeric data types 131 |         # and support for it will be removed in a future MySQL version. Consider using 132 |         # an alternative means of producing the effect of this attribute. 133 |         # 134 |         # For example, applications could use the LPAD() function to zero-pad numbers up to 135 |         # the desired width, or they could store the formatted numbers in CHAR columns.
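        #
        # An illustrative note on the normalization below (added for clarity, based on
        # the comment above): MySQL 8.0.16 and older report a column declared as INT(9)
        # with its display width, i.e. "int(9)"; newer servers report a plain "int".
        # Stripping the "(N)" part makes both forms compare equal.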
136 |         # 137 |         # https://dev.mysql.com/doc/refman/8.0/en/numeric-type-syntax.html 138 |         if 'int(' in self._type: 139 |             # normalize int(N) from MySQL 8.0.16 and older to int 140 |             self._type = self._type.split('(')[0] 141 | 142 |     @property 143 |     def name(self): 144 |         """ 145 |         :rtype: str 146 |         """ 147 |         return self._name 148 | 149 |     @property 150 |     def type(self): 151 |         """ 152 |         :rtype: str 153 |         """ 154 |         return self._type 155 | 156 |     @property 157 |     def character_set(self): 158 |         """ 159 |         :rtype: str 160 |         """ 161 |         return self._character_set 162 | 163 |     @property 164 |     def collation(self): 165 |         """ 166 |         :rtype: str 167 |         """ 168 |         return self._collation 169 | 170 |     def is_text_type(self): 171 |         """ 172 |         :rtype: bool 173 |         """ 174 |         base_type = self.type.split('(')[0].upper() 175 |         # @see https://dev.mysql.com/doc/refman/5.7/en/string-types.html 176 |         return base_type in \ 177 |             ['CHAR', 'VARCHAR', 'BINARY', 'VARBINARY', 'BLOB', 'TEXT', 'ENUM', 'SET'] 178 | 179 |     def is_timestamp_type(self): 180 |         """ 181 |         :rtype: bool 182 |         """ 183 |         base_type = self.type.upper() 184 |         # @see https://dev.mysql.com/doc/refman/5.7/en/date-and-time-types.html 185 |         return base_type in \ 186 |             ['DATE', 'TIME', 'DATETIME', 'TIMESTAMP', 'YEAR'] 187 | 188 |     def __repr__(self): 189 |         """ 190 |         :rtype: str 191 |         """ 192 |         return '<{}> {}'.format(self.__class__.__name__, str(self)) 193 | 194 |     def __str__(self): 195 |         """ 196 |         :rtype: str 197 |         """ 198 |         return self._name 199 | -------------------------------------------------------------------------------- /indexdigest/cli/script.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=line-too-long 2 | """index_digest 3 | 4 | Analyses your database queries and schema and suggests indices improvements. 5 | 6 | Usage: 7 |   index_digest DSN [--sql-log=<file>] [--format=<format>] [--analyze-data] [--check-empty-databases] [--checks=<checks> | --skip-checks=<skip-checks>] [--tables=<tables> | --skip-tables=<skip-tables>] 8 |   index_digest (-h | --help) 9 |   index_digest --version 10 | 11 | Options: 12 |   DSN                          Data Source Name of database to check 13 |   --sql-log=<file>             Text file with SQL queries to check against the database 14 |   --format=<format>            Use a given results formatter (plain, syslog, yaml) 15 |   --analyze-data               Run additional checks that will query table data (can be slow!) 16 |   --check-empty-databases      Detect empty databases on this MySQL server 17 |   --checks=<checks>            Comma-separated list of checks to report 18 |   --skip-checks=<skip-checks>  Comma-separated list of checks to skip from the report 19 |   --tables=<tables>            Comma-separated list of tables to report 20 |   --skip-tables=<skip-tables>  Comma-separated list of tables to skip from the report 21 |   -h --help                    Show this screen. 22 |   --version                    Show version.
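  DSN format (for reference, matching the Examples below): mysql://user:password@host[:port]/dbname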
23 | 24 | Examples: 25 |   index_digest mysql://username:password@localhost/dbname 26 |   index_digest mysql://index_digest:qwerty@localhost/index_digest --sql-log=sql.log 27 |   index_digest mysql://index_digest:qwerty@localhost/index_digest --skip-checks=non_utf_columns 28 |   index_digest mysql://index_digest:qwerty@localhost/index_digest --analyze-data --checks=data_too_old,data_not_updated_recently 29 |   index_digest mysql://index_digest:qwerty@localhost/index_digest --analyze-data --skip-tables=DATABASECHANGELOG,DATABASECHANGELOGLOCK 30 | 31 | Visit <https://github.com/macbre/index-digest> 32 | """ 33 | from __future__ import print_function 34 | 35 | import logging 36 | from itertools import chain 37 | from os import getenv, environ 38 | 39 | from docopt import docopt 40 | 41 | import indexdigest 42 | from indexdigest.database import Database 43 | from indexdigest.utils import IndexDigestError 44 | from indexdigest.formatters import \ 45 |     format_plain, \ 46 |     format_syslog, \ 47 |     format_yaml 48 | from indexdigest.linters import \ 49 |     check_queries_using_filesort, check_queries_using_temporary, \ 50 |     check_not_used_indices, check_queries_not_using_indices, \ 51 |     check_not_used_tables, check_not_used_columns, \ 52 |     check_redundant_indices, \ 53 |     check_full_table_scan, \ 54 |     check_latin_columns, \ 55 |     check_selects_with_like, \ 56 |     check_missing_primary_index, \ 57 |     check_test_tables, \ 58 |     check_insert_ignore_queries, \ 59 |     check_single_column, \ 60 |     check_empty_tables, \ 61 |     check_select_star, \ 62 |     check_having_clause, \ 63 |     check_data_too_old, \ 64 |     check_data_not_updated_recently, \ 65 |     check_generic_primary_key, \ 66 |     check_high_offset_selects, \ 67 |     check_use_innodb, \ 68 |     check_empty_database, \ 69 |     check_low_cardinality_index 70 | 71 | 72 | def get_version() -> str: 73 |     """ 74 |     Return version string, e.g.
75 | 1.5.0 (git 1a258db) 76 | """ 77 | return '{version} (git {commit})'.format( 78 | version=indexdigest.VERSION, 79 | commit=getenv('COMMIT_SHA', 'dev')[:7] 80 | ) 81 | 82 | 83 | def get_reports(database, sql_log=None, analyze_data=False, check_empty_databases=False): 84 | """ 85 | :type database Database 86 | :type sql_log str 87 | :type analyze_data bool 88 | :type check_empty_databases bool 89 | :rtype: list[indexdigest.utils.LinterEntry] 90 | """ 91 | logger = logging.getLogger(__name__) 92 | 93 | # read SQL log file (if provided) 94 | if sql_log: 95 | logger.debug('Trying to open SQL log file: %s', sql_log) 96 | 97 | with open(sql_log, encoding='utf-8') as log_file: 98 | queries = log_file.readlines() 99 | queries = list(map(str.strip, queries)) # remove trailing spaces 100 | logger.debug('Got %d entries in SQL log file', len(queries)) 101 | else: 102 | queries = None 103 | 104 | # run all checks 105 | reports = chain( 106 | check_redundant_indices(database), 107 | check_latin_columns(database), 108 | check_missing_primary_index(database), 109 | check_test_tables(database), 110 | check_single_column(database), 111 | check_empty_tables(database), 112 | check_generic_primary_key(database), 113 | check_use_innodb(database), 114 | check_low_cardinality_index(database), 115 | ) 116 | 117 | # checks that use SQL log 118 | if queries: 119 | reports = chain( 120 | reports, 121 | check_not_used_indices(database, queries=queries), 122 | check_not_used_tables(database, queries=queries), 123 | check_not_used_columns(database, queries=queries), 124 | check_queries_not_using_indices(database, queries=queries), 125 | check_queries_using_filesort(database, queries=queries), 126 | check_queries_using_temporary(database, queries=queries), 127 | check_full_table_scan(database, queries=queries), 128 | check_selects_with_like(database, queries=queries), 129 | check_insert_ignore_queries(database, queries=queries), 130 | check_select_star(database, queries=queries), 131 | check_having_clause(database, queries=queries), 132 | check_high_offset_selects(database, queries=queries), 133 | ) 134 | 135 | # checks that require --analyze-data switch to be on (see #28) 136 | if analyze_data is True: 137 | logger.info("Will run data analyzing checks, can take a while...") 138 | 139 | reports = chain( 140 | reports, 141 | check_data_too_old(database, env=environ), 142 | check_data_not_updated_recently(database, env=environ), 143 | ) 144 | 145 | # --check-empty-databases switch to be on to run "empty_database" (see #146) 146 | if check_empty_databases is True: 147 | logger.info("Will analyze databases on this MySQL server, can take a while...") 148 | 149 | reports = chain( 150 | reports, 151 | check_empty_database(database), 152 | ) 153 | 154 | return reports 155 | 156 | 157 | def filter_reports_by_type(reports, checks=None, skip_checks=None): 158 | """ 159 | :type reports list[indexdigest.utils.LinterEntry] 160 | :type checks str 161 | :type skip_checks str 162 | :rtype: list[indexdigest.utils.LinterEntry] 163 | """ 164 | if checks: 165 | return [ 166 | report for report in reports 167 | if report.linter_type in checks.split(',') 168 | ] 169 | 170 | if skip_checks: 171 | return [ 172 | report for report in reports 173 | if report.linter_type not in skip_checks.split(',') 174 | ] 175 | 176 | return reports 177 | 178 | 179 | def filter_reports_by_table(reports, tables=None, skip_tables=None): 180 | """ 181 | :type reports list[indexdigest.utils.LinterEntry] 182 | :type tables str 183 | :type skip_tables str 184 | 
:rtype: list[indexdigest.utils.LinterEntry] 185 | """ 186 | if tables: 187 | return [ 188 | report for report in reports 189 | if report.table_name in tables.split(',') 190 | ] 191 | 192 | if skip_tables: 193 | return [ 194 | report for report in reports 195 | if report.table_name not in skip_tables.split(',') 196 | ] 197 | 198 | return reports 199 | 200 | 201 | def main(): # pragma: no cover 202 | """ Main entry point for CLI""" 203 | logger = logging.getLogger(__name__) 204 | 205 | arguments = docopt(__doc__, version=f'index_digest {get_version()}') 206 | logger.debug('Options: %s', arguments) 207 | 208 | if 'DSN' not in arguments: 209 | return 210 | 211 | # connect to the database 212 | database = Database.connect_dsn(arguments['DSN']) 213 | logger.debug('Connected to MySQL server v%s', database.get_server_version()) 214 | 215 | reports = get_reports( 216 | database, 217 | sql_log=arguments.get('--sql-log'), 218 | analyze_data=arguments.get('--analyze-data'), 219 | check_empty_databases=arguments.get('--check-empty-databases') 220 | ) 221 | 222 | # handle --checks / --skip-checks 223 | reports = filter_reports_by_type( 224 | reports, 225 | checks=arguments.get('--checks'), 226 | skip_checks=arguments.get('--skip-checks') 227 | ) 228 | 229 | # handle --tables / --skip-tables 230 | reports = filter_reports_by_table( 231 | reports, 232 | tables=arguments.get('--tables'), 233 | skip_tables=arguments.get('--skip-tables') 234 | ) 235 | 236 | # handle --format 237 | formatter = arguments.get('--format') or 'plain' 238 | logger.info("Using formatter: %s", formatter) 239 | 240 | if formatter == 'plain': 241 | print(format_plain(database, reports)) 242 | elif formatter == 'syslog': 243 | ident = getenv('SYSLOG_IDENT', 'index-digest') 244 | logger.info('Using syslog ident: %s', ident) 245 | print(format_syslog(database, reports, ident)) 246 | elif formatter == 'yaml': 247 | print(format_yaml(database, reports)) 248 | else: 249 | raise IndexDigestError('Unknown formatter provided: {}'.format(formatter)) 250 | -------------------------------------------------------------------------------- /indexdigest/database.py: -------------------------------------------------------------------------------- 1 | """ 2 | Database connector wrapper 3 | """ 4 | import logging 5 | import re 6 | from collections import OrderedDict, defaultdict 7 | from warnings import filterwarnings 8 | 9 | import MySQLdb 10 | from MySQLdb.cursors import DictCursor 11 | from MySQLdb._exceptions import OperationalError, ProgrammingError 12 | 13 | from indexdigest.schema import Column, Index 14 | from indexdigest.utils import parse_dsn, memoize, IndexDigestError 15 | 16 | 17 | class IndexDigestQueryError(IndexDigestError): 18 | """ 19 | A wrapper for _mysql_exceptions.OperationalError: 20 | """ 21 | 22 | 23 | class DatabaseBase: 24 | """ 25 | A generic wrapper of database connection with basic querying functionality. 
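    Illustrative usage sketch (the DSN below is a placeholder, not a repository default):

        db = DatabaseBase.connect_dsn('mysql://user:password@127.0.0.1:3306/dbname')
        for variable, value in db.query_key_value('SHOW VARIABLES').items():
            print(variable, value)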
26 | 27 | Sub-class this to mock database connection 28 | """ 29 | 30 | # pylint:disable=too-many-positional-arguments,too-many-arguments,invalid-name 31 | def __init__(self, host: str, user: str, passwd: str, db: str, port: int=3306): 32 | """ 33 | Connects to a given database 34 | """ 35 | self.logger = logging.getLogger(__name__) 36 | self.query_logger = logging.getLogger(__name__ + '.query') 37 | 38 | # lazy connect 39 | self._connection_params = dict(host=host, port=port, user=user, passwd=passwd, db=db) 40 | self._connection = None 41 | self.db_name = db 42 | 43 | # Suppress MySQL warnings when EXPLAIN is run (#63) 44 | filterwarnings('ignore', category=MySQLdb.Warning) 45 | 46 | # register queries 47 | self._queries = [] 48 | 49 | @classmethod 50 | def connect_dsn(cls, dsn): 51 | """ 52 | :type dsn str 53 | :rtype DatabaseBase 54 | """ 55 | parsed = parse_dsn(dsn) 56 | return cls(**parsed) 57 | 58 | @property 59 | def connection(self): 60 | """ 61 | Lazy connection 62 | 63 | :rtype: Connection 64 | """ 65 | if self._connection is None: 66 | self.logger.info('Lazy connecting to %s:%i and using %s database', 67 | self._connection_params['host'], self._connection_params['port'], 68 | self._connection_params['db']) 69 | 70 | self._connection = MySQLdb.connect(**self._connection_params) 71 | 72 | return self._connection 73 | 74 | def get_queries(self): 75 | """ 76 | :rtype: list[str] 77 | """ 78 | return self._queries 79 | 80 | def query(self, sql, cursor_class=None): 81 | """ 82 | :type sql str 83 | :type cursor_class MySQLdb.cursors.BaseCursor 84 | :rtype: MySQLdb.cursors.Cursor 85 | :raises IndexDigestQueryError 86 | """ 87 | self.query_logger.info('%s', sql) 88 | 89 | cursor = self.connection.cursor(cursorclass=cursor_class) 90 | 91 | try: 92 | # Python 3: query should be str (unicode) when executing % 93 | try: 94 | sql = sql.encode('utf8') 95 | except UnicodeDecodeError: 96 | pass 97 | 98 | cursor.execute(sql) 99 | except (OperationalError, ProgrammingError) as ex: 100 | # e.g. (1054, "Unknown column 'test' in 'field list'") - OperationalError 101 | # e.g. (1146, "Table 'index_digest.t' doesn't exist") - ProgrammingError 102 | (code, message) = ex.args 103 | self.query_logger.error('Database error #%d: %s', code, message) 104 | raise IndexDigestQueryError(message) from ex 105 | 106 | # register the query 107 | self._queries.append(sql) 108 | 109 | return cursor 110 | 111 | def query_row(self, sql): 112 | """ 113 | :type sql str 114 | :rtype: list 115 | """ 116 | return self.query(sql).fetchone() 117 | 118 | def query_dict_row(self, sql): 119 | """ 120 | Return a single row as a dictionary 121 | 122 | :type sql str 123 | :rtype: dict 124 | """ 125 | # DictCursor is a Cursor class that returns rows as dictionaries 126 | return self.query(sql, cursor_class=DictCursor).fetchone() 127 | 128 | def query_dict_rows(self, sql): 129 | """ 130 | Return all rows as dictionaries 131 | 132 | :type sql str 133 | :rtype: dict[] 134 | """ 135 | # DictCursor is a Cursor class that returns rows as dictionaries 136 | yield from self.query(sql, cursor_class=DictCursor) 137 | 138 | def query_field(self, sql): 139 | """ 140 | :type sql str 141 | :rtype: str 142 | """ 143 | return self.query_row(sql)[0] 144 | 145 | def query_list(self, sql): 146 | """ 147 | Returns an iterator with the first field on each row. 148 | 149 | e.g. 
SHOW TABLES 150 | 151 |         :type sql str 152 |         :rtype: list[str] 153 |         """ 154 |         for row in self.query(sql): 155 |             yield str(row[0]) 156 | 157 |     def query_key_value(self, sql): 158 |         """ 159 |         Returns an ordered dictionary with key / value taken from the first two fields of each row. 160 | 161 |         e.g. SHOW VARIABLES 162 | 163 |         :type sql str 164 |         :rtype: OrderedDict 165 |         """ 166 |         res = OrderedDict() 167 | 168 |         for row in self.query(sql): 169 |             res[row[0]] = row[1] 170 | 171 |         return res 172 | 173 | 174 | class Database(DatabaseBase): 175 |     """ 176 |     Database wrapper extended with some stats-related queries 177 |     """ 178 | 179 |     @memoize 180 |     def get_server_version(self): 181 |         """ 182 |         Returns server version (e.g. "5.5.57-0+deb8u1") 183 | 184 |         :rtype: str 185 |         """ 186 |         return self.query_field('SELECT VERSION()') 187 | 188 |     def get_server_hostname(self): 189 |         """ 190 |         Return hostname of the server 191 |         :rtype: str 192 |         """ 193 |         return self.get_variables(like='hostname').get('hostname') 194 | 195 |     @memoize 196 |     def get_tables(self): 197 |         """ 198 |         Returns the alphabetically sorted list of tables (ignore views) 199 | 200 |         :rtype: list[str] 201 |         """ 202 |         return sorted(self.query_list( 203 |             'SELECT TABLE_NAME FROM information_schema.tables ' 204 |             'WHERE table_schema = "{}" and TABLE_TYPE = "BASE TABLE"'. 205 |             format(self.db_name) 206 |         )) 207 | 208 |     def get_variables(self, like=None): 209 |         """ 210 |         Returns the key / value dictionary with server variables 211 | 212 |         :type like str 213 |         :rtype: OrderedDict 214 |         """ 215 |         sql = 'SHOW VARIABLES' 216 |         if like is not None: 217 |             sql += ' LIKE "{}%"'.format(like) 218 | 219 |         return self.query_key_value(sql) 220 | 221 |     @memoize 222 |     def explain_query(self, sql): 223 |         """ 224 |         Runs EXPLAIN query for a given SQL 225 | 226 |         :type sql str 227 |         :rtype: list 228 |         """ 229 |         # @see https://dev.mysql.com/doc/refman/5.7/en/explain-output.html 230 |         return list(self.query_dict_rows('EXPLAIN {}'.format(sql))) 231 | 232 |     @memoize 233 |     def get_table_schema(self, table_name): 234 |         """ 235 |         Run SHOW CREATE TABLE query for a given table 236 |         :type table_name str 237 |         :rtype: str 238 |         """ 239 |         # @see https://dev.mysql.com/doc/refman/5.7/en/show-create-table.html 240 |         schema = str(self.query_row('SHOW CREATE TABLE `{}`'.format(table_name))[1]) 241 | 242 |         # remove partitions definition (#107) 243 |         schema = re.sub(r'/\*!50100[^*]+\*/', '', schema) 244 | 245 |         return schema.rstrip() 246 | 247 |     def _get_information_schema_where(self, table_name): 248 |         """ 249 |         :type table_name str 250 |         :rtype: str 251 |         """ 252 |         # @see https://dev.mysql.com/doc/refman/5.7/en/information-schema.html 253 |         return "WHERE TABLE_SCHEMA='{db}' AND TABLE_NAME='{table_name}'".format( 254 |             db=self._connection_params['db'], table_name=table_name) 255 | 256 |     @memoize 257 |     def get_table_metadata(self, table_name): 258 |         """ 259 |         Return table's metadata 260 | 261 |         :type table_name str 262 |         :rtype: dict 263 |         """ 264 |         # https://dev.mysql.com/doc/refman/5.7/en/tables-table.html 265 |         # https://mariadb.com/kb/en/information-schema-tables-table/ 266 |         stats = self.query_dict_row( 267 |             "SELECT ENGINE, TABLE_ROWS, DATA_LENGTH, INDEX_LENGTH " 268 |             "FROM information_schema.TABLES " + self._get_information_schema_where(table_name)) 269 | 270 |         # TODO: introduce dataclass 271 |         return { 272 |             'engine': stats['ENGINE'], 273 |             'rows': stats['TABLE_ROWS'],  # For InnoDB the row count is only a rough estimate 274 |             'data_size': stats['DATA_LENGTH'], 275 |
'index_size': stats['INDEX_LENGTH'], 276 |         } 277 | 278 |     @memoize 279 |     def get_table_columns(self, table_name): 280 |         """ 281 |         Return the list of columns for a given table 282 | 283 |         :type table_name str 284 |         :rtype: list[Column] 285 |         """ 286 |         # @see https://dev.mysql.com/doc/refman/8.0/en/show-columns.html 287 |         try: 288 |             columns = [ 289 |                 row['Field'] 290 |                 for row in self.query_dict_rows("SHOW COLUMNS FROM `{}`".format(table_name)) 291 |             ] 292 |         except IndexDigestQueryError: 293 |             logger = logging.getLogger('get_table_columns') 294 |             logger.error('Cannot get columns list for table: %s', table_name) 295 |             return None 296 | 297 |         # @see https://dev.mysql.com/doc/refman/8.0/en/information-schema-columns-table.html 298 |         rows = self.query_dict_rows( 299 |             "SELECT COLUMN_NAME as NAME, COLUMN_TYPE as TYPE, CHARACTER_SET_NAME, COLLATION_NAME " 300 |             "FROM information_schema.COLUMNS " + self._get_information_schema_where(table_name)) 301 | 302 |         meta = dict() 303 | 304 |         for row in rows: 305 |             # TYPE: e.g. int(9), varchar(32) 306 |             meta[row['NAME']] = Column(name=row['NAME'], column_type=row['TYPE'], 307 |                                        character_set=row['CHARACTER_SET_NAME'], 308 |                                        collation=row['COLLATION_NAME']) 309 | 310 |         # keep the order taken from SHOW COLUMNS 311 |         return [ 312 |             meta[column] 313 |             for column in columns 314 |         ] 315 | 316 |     @memoize 317 |     def get_table_indices(self, table_name): 318 |         """ 319 |         Return the list of indices for a given table 320 | 321 |         :type table_name str 322 |         :rtype: list[Index] 323 |         """ 324 |         # @see https://dev.mysql.com/doc/refman/5.7/en/statistics-table.html 325 |         # @see https://dev.mysql.com/doc/refman/5.7/en/show-index.html 326 |         res = self.query_dict_rows( 327 |             "SELECT INDEX_NAME, NON_UNIQUE, SEQ_IN_INDEX, COLUMN_NAME, CARDINALITY " + 328 |             "FROM information_schema.STATISTICS " + self._get_information_schema_where(table_name) + 329 |             " ORDER BY INDEX_NAME, SEQ_IN_INDEX") 330 | 331 |         index_columns = defaultdict(list) 332 |         index_meta = OrderedDict() 333 | 334 |         for row in res: 335 |             index_name = row['INDEX_NAME'] 336 |             index_columns[index_name].append(row['COLUMN_NAME']) 337 | 338 |             if index_name not in index_meta: 339 |                 index_meta[index_name] = { 340 |                     'unique': int(row['NON_UNIQUE']) == 0, 341 |                     'primary': row['INDEX_NAME'] == 'PRIMARY', 342 |                 } 343 | 344 |         ret = [] 345 | 346 |         for index_name, meta in index_meta.items(): 347 |             columns = index_columns[index_name] 348 |             ret.append(Index( 349 |                 name=index_name, columns=columns, primary=meta['primary'], unique=meta['unique'])) 350 | 351 |         return ret 352 | 353 |     @memoize 354 |     def get_table_rows_estimate(self, table_name): 355 |         """ 356 |         Estimate table's rows count by running EXPLAIN SELECT COUNT(*) FROM foo 357 | 358 |         #96 - For MySQL 8.0 we fall back to a "raw" query: SELECT COUNT(*) FROM foo 359 | 360 |         :type table_name str 361 |         :rtype: int 362 |         """ 363 |         sql = "SELECT COUNT(*) FROM `{}`".format(table_name) 364 |         explain_row = self.explain_query(sql)[0] 365 | 366 |         # EXPLAIN query returned rows count 367 |         if explain_row['rows'] is not None: 368 |             return int(explain_row['rows']) 369 | 370 |         # "Select tables optimized away" was returned by the query (see #96) 371 |         self.logger.info("'EXPLAIN %s' query returned '%s' in Extra field", 372 |                          sql, explain_row['Extra']) 373 | 374 |         return self.query_field(sql) 375 | -------------------------------------------------------------------------------- /indexdigest/test/core/test_database.py: -------------------------------------------------------------------------------- 1 | # -*-
coding: utf8 -*- 2 | from __future__ import print_function 3 | 4 | from unittest import TestCase 5 | 6 | from indexdigest.test import DatabaseTestMixin, DatabaseWithMockedRow 7 | from indexdigest.database import DatabaseBase 8 | 9 | 10 | class TestDatabaseBase(TestCase, DatabaseTestMixin): 11 | 12 | def test_database_connect(self): 13 | conn = DatabaseBase(host='127.0.0.1', user='index_digest', passwd='qwerty', db='index_digest') 14 | self.assertIsInstance(conn, DatabaseBase) 15 | 16 | def test_database_connect_dsn(self): 17 | self.assertIsInstance(self.connection, DatabaseBase) 18 | 19 | def test_query_list(self): 20 | res = list(self.connection.query_list('SHOW DATABASES')) 21 | 22 | self.assertTrue('information_schema' in res, res) 23 | self.assertTrue('index_digest' in res, res) 24 | 25 | def test_query_field(self): 26 | cnt = self.connection.query_field('SELECT count(*) FROM 0000_the_table') 27 | 28 | self.assertEqual(cnt, 3) 29 | 30 | def test_query_row(self): 31 | row = self.connection.query_row('SELECT * FROM 0000_the_table WHERE item_id = 1') 32 | 33 | self.assertEqual(row[0], 1) 34 | self.assertEqual(row[1], 'test') 35 | 36 | def test_query_dict_row(self): 37 | row = self.connection.query_dict_row('SELECT * FROM 0000_the_table ORDER BY 1') 38 | print(row) 39 | 40 | self.assertEqual(row['item_id'], 1) 41 | self.assertEqual(row['foo'], 'test') 42 | 43 | def test_query_dict_rows(self): 44 | rows = list(self.connection.query_dict_rows('SELECT * FROM 0000_the_table ORDER BY 1')) 45 | row = rows[0] 46 | print(row) 47 | 48 | self.assertEqual(len(rows), 3) 49 | 50 | self.assertEqual(row['item_id'], 1) 51 | self.assertEqual(row['foo'], 'test') 52 | 53 | 54 | class TestDatabase(TestCase, DatabaseTestMixin): 55 | 56 | TABLE_NAME = '0000_the_table' 57 | 58 | def test_database_version(self): 59 | # 5.5.57-0+deb8u1 / 8.0.3-rc-log / 9.4.0 MySQL Community Server - GPL 60 | # 10.2.10-MariaDB-10.2.10+maria~jessie / 12.0.2-MariaDB-ubu2404 mariadb.org binary distribution 61 | version = self.connection.get_server_version() 62 | 63 | self.assertTrue( 64 | version.startswith('5.') or version.startswith('8.') or version.startswith('9.') or 'MariaDB' in version, 65 | 'MySQL server should be from 5.x/8.x/9.x line or have MariaDB part') 66 | 67 | def test_get_tables(self): 68 | tables = list(self.connection.get_tables()) 69 | print(tables) 70 | 71 | assert self.TABLE_NAME in tables 72 | assert '0000_the_table-metadata' in tables 73 | assert '0000_the_view' not in tables 74 | 75 | def test_get_variables(self): 76 | variables = self.connection.get_variables() 77 | print(variables) 78 | 79 | self.assertTrue('version_compile_os' in variables) 80 | self.assertTrue('innodb_data_home_dir' in variables) 81 | 82 | def test_get_variables_like(self): 83 | variables = self.connection.get_variables(like='innodb') 84 | print(variables) 85 | 86 | self.assertFalse('version_compile_os' in variables) # this variable does not match given like 87 | self.assertTrue('innodb_data_home_dir' in variables) 88 | 89 | def test_explain_and_utf_query(self): 90 | """ 91 | mysql> explain SELECT * FROM 0000_the_table WHERE foo = "foo ąęź"; 92 | +----+-------------+----------------+------+---------------+---------+---------+-------+------+--------------------------+ 93 | | id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra | 94 | +----+-------------+----------------+------+---------------+---------+---------+-------+------+--------------------------+ 95 | | 1 | SIMPLE | 0000_the_table | ref | idx_foo | 
idx_foo | 50 | const | 1 | Using where; Using index | 96 | +----+-------------+----------------+------+---------------+---------+---------+-------+------+--------------------------+ 97 | 1 row in set (0.00 sec) 98 | """ 99 | res = list(self.connection.explain_query('SELECT * FROM {} WHERE foo = "foo ąęź"'.format(self.TABLE_NAME))) 100 | row = res[0] 101 | print(row) 102 | 103 | self.assertEqual(len(res), 1) 104 | self.assertEqual(row['key'], 'idx_foo') 105 | self.assertEqual(row['table'], self.TABLE_NAME) 106 | self.assertTrue('Using index' in row['Extra']) 107 | 108 | def test_get_table_indices(self): 109 | """ 110 | mysql> SELECT INDEX_NAME, NON_UNIQUE, SEQ_IN_INDEX, COLUMN_NAME, CARDINALITY 111 | FROM INFORMATION_SCHEMA.STATISTICS WHERE table_name = '0000_the_table' 112 | ORDER BY INDEX_NAME, SEQ_IN_INDEX; 113 | +------------+------------+--------------+-------------+-------------+ 114 | | INDEX_NAME | NON_UNIQUE | SEQ_IN_INDEX | COLUMN_NAME | CARDINALITY | 115 | +------------+------------+--------------+-------------+-------------+ 116 | | idx_foo | 1 | 1 | foo | 3 | 117 | | PRIMARY | 0 | 1 | id | 3 | 118 | | PRIMARY | 0 | 2 | foo | 3 | 119 | +------------+------------+--------------+-------------+-------------+ 120 | 3 rows in set (0.00 sec) 121 | """ 122 | (idx, primary) = self.connection.get_table_indices(self.TABLE_NAME) 123 | print(idx, primary) 124 | 125 | self.assertEqual(idx.name, 'idx_foo') 126 | self.assertEqual(primary.name, 'PRIMARY') 127 | 128 | self.assertListEqual(idx.columns, ['foo']) 129 | self.assertListEqual(primary.columns, ['item_id', 'foo']) 130 | 131 | self.assertFalse(idx.is_primary) 132 | self.assertFalse(idx.is_unique) 133 | self.assertTrue(primary.is_primary, 'Primary key is correctly detected') 134 | self.assertTrue(primary.is_unique, 'Primary key should be treated as a unique one') 135 | 136 | # assert False 137 | 138 | def test_get_table_schema(self): 139 | schema = self.connection.get_table_schema(self.TABLE_NAME) 140 | print(schema) 141 | 142 | self.assertTrue('CREATE TABLE `0000_the_table` (' in schema) 143 | self.assertTrue('PRIMARY KEY (`item_id`,`foo`),' in schema) 144 | self.assertTrue('ENGINE=InnoDB' in schema) 145 | 146 | # assert False 147 | 148 | def test_get_table_metadata(self): 149 | meta = self.connection.get_table_metadata(self.TABLE_NAME) 150 | print(meta) 151 | 152 | # stats 153 | self.assertEqual(meta['engine'], 'InnoDB') 154 | self.assertAlmostEqual(meta['rows'], 3, delta=1) 155 | self.assertTrue(meta['index_size'] > 0) 156 | self.assertTrue(meta['data_size'] > 0) 157 | 158 | # assert False 159 | 160 | def test_get_table_columns(self): 161 | columns = self.connection.get_table_columns(self.TABLE_NAME) 162 | print(columns) 163 | 164 | # the columns order is maintained 165 | column_names = [column.name for column in columns] 166 | 167 | # columns 168 | self.assertTrue('item_id' in column_names) 169 | self.assertTrue('foo' in column_names) 170 | 171 | self.assertEqual(columns[0].name, 'item_id') 172 | self.assertEqual(columns[0].type, 'int') 173 | self.assertIsNone(columns[0].character_set) # numeric column 174 | 175 | self.assertEqual(columns[1].name, 'foo') 176 | self.assertEqual(columns[1].type, 'varchar(16)') 177 | self.assertIn(columns[1].character_set, ['utf8', 'utf8mb3']) 178 | 179 | self.assertEqual(len(columns), 2) 180 | 181 | # assert False 182 | 183 | def test_get_table_rows_estimate(self): 184 | self.assertAlmostEqual(self.connection.get_table_rows_estimate(self.TABLE_NAME), 3, delta=1) 185 | 186 | 187 | class 
TestsWithDatabaseMocked(TestCase): 188 | 189 |     def test_database_hostname(self): 190 |         db = DatabaseWithMockedRow(mocked_row=['hostname', 'kopytko.foo.net']) 191 |         self.assertEqual(db.get_server_hostname(), 'kopytko.foo.net') 192 | 193 |     def test_database_version(self): 194 |         db = DatabaseWithMockedRow(mocked_row=['5.5.58-0+deb8u1']) 195 |         self.assertEqual(db.get_server_version(), '5.5.58-0+deb8u1') 196 | 197 | 198 | class TestMemoization(TestCase, DatabaseTestMixin): 199 | 200 |     def test_get_queries(self): 201 |         db = DatabaseWithMockedRow(mocked_row=['foo']) 202 | 203 |         # query method is not memoized, so let's count all queries (even the same ones) 204 |         for _ in range(5): 205 |             self.assertEqual(db.query_row('SELECT FOO'), ['foo']) 206 | 207 |         self.assertEqual(len(db.get_queries()), 5) 208 |         self.assertEqual(db.get_queries()[0], 'SELECT FOO') 209 | 210 |     def test_cached_get_tables(self): 211 |         tables_list = ['foo'] 212 |         db = DatabaseWithMockedRow(mocked_row=tables_list) 213 | 214 |         # this would have made five queries to the database if not for the memoization in get_tables 215 |         for _ in range(5): 216 |             self.assertEqual(db.get_tables(), tables_list) 217 | 218 |         # however, only one is made :) 219 |         self.assertEqual(len(db.get_queries()), 1) 220 | 221 |     def test_cached_explain_query(self): 222 |         db = self.connection 223 | 224 |         # this would have made ten queries to the database if not for the memoization in explain_query 225 |         # also test that @memoize decorator correctly handles different arguments 226 |         for _ in range(5): 227 |             (row,) = db.explain_query('SELECT * FROM 0000_the_table') 228 |             self.assertEqual(row['table'], '0000_the_table') 229 | 230 |             (row,) = db.explain_query('SELECT * FROM 0002_not_used_indices') 231 |             self.assertEqual(row['table'], '0002_not_used_indices') 232 | 233 |         queries = db.get_queries() 234 |         print(queries) 235 | 236 |         # however, only two are made :) 237 |         self.assertEqual(len(queries), 2) 238 | 239 |         self.assertTrue('EXPLAIN SELECT * FROM 0000_the_table' in str(queries[0])) 240 |         self.assertTrue('EXPLAIN SELECT * FROM 0002_not_used_indices' in str(queries[1])) 241 | 242 |     def test_cached_get_indices(self): 243 |         db = self.connection 244 | 245 |         # this would have made ten queries to the database if not for the memoization in get_table_indices 246 |         # also test that @memoize decorator correctly handles different arguments 247 |         for _ in range(5): 248 |             (_, primary) = db.get_table_indices(table_name='0000_the_table') 249 |             self.assertTrue(primary.is_primary) 250 | 251 |             (idx, _, _) = db.get_table_indices(table_name='0002_not_used_indices') 252 |             self.assertEqual(idx.name, 'foo_id_idx') 253 | 254 |         queries = db.get_queries() 255 |         print(queries) 256 | 257 |         # however, only two are made :) 258 |         self.assertEqual(len(queries), 2) 259 | 260 |         self.assertTrue('0000_the_table' in str(queries[0])) 261 |         self.assertTrue('0002_not_used_indices' in str(queries[1])) 262 | 263 |     def test_cached_get_columns(self): 264 |         db = self.connection 265 | 266 |         # this would have made ten queries to the database if not for the memoization in get_table_columns 267 |         # also test that @memoize decorator correctly handles different arguments 268 |         for _ in range(5): 269 |             (col, _) = db.get_table_columns(table_name='0000_the_table') 270 |             self.assertEqual(col.name, 'item_id') 271 | 272 |             (_, col, _, _) = db.get_table_columns(table_name='0002_not_used_indices') 273 |             self.assertEqual(col.name, 'foo') 274 | 275 |         queries = db.get_queries() 276 |         print(queries) 277 | 278 |         # however, only four are made :) 279 |         self.assertEqual(len(queries), 4) 280 | 281 |
self.assertTrue("SHOW COLUMNS FROM `0000_the_table`" in str(queries[0])) 282 | self.assertTrue("information_schema.COLUMNS WHERE TABLE_SCHEMA='index_digest' AND TABLE_NAME='0000_the_table'" in str(queries[1])) 283 | self.assertTrue("SHOW COLUMNS FROM `0002_not_used_indices`" in str(queries[2])) 284 | self.assertTrue("information_schema.COLUMNS WHERE TABLE_SCHEMA='index_digest' AND TABLE_NAME='0002_not_used_indices'" in str(queries[3])) 285 | 286 | def test_cached_get_table_schema(self): 287 | db = DatabaseWithMockedRow(mocked_row=[None, 'CREATE TABLE foo;']) 288 | 289 | # this would made ten queries to database if not memoization in get_table_schema 290 | # also test that @memoize decorator correctly handles different arguments 291 | for _ in range(5): 292 | schema = db.get_table_schema('0000_the_table') 293 | self.assertEqual(schema, 'CREATE TABLE foo;') 294 | 295 | schema = db.get_table_schema('0002_not_used_indices') 296 | self.assertEqual(schema, 'CREATE TABLE foo;') 297 | 298 | queries = db.get_queries() 299 | print(queries) 300 | 301 | # however, only two are made :) 302 | self.assertEqual(len(queries), 2) 303 | 304 | self.assertEqual('SHOW CREATE TABLE `0000_the_table`', str(queries[0])) 305 | self.assertEqual('SHOW CREATE TABLE `0002_not_used_indices`', str(queries[1])) 306 | --------------------------------------------------------------------------------