├── indexdigest ├── test │ ├── cli │ │ ├── __init__.py │ │ └── test_script.py │ ├── core │ │ ├── __init__.py │ │ ├── test_utils.py │ │ ├── test_columns.py │ │ ├── test_indices.py │ │ ├── test_query.py │ │ └── test_database.py │ ├── linters │ │ ├── __init__.py │ │ ├── test_0164_empty_database.py │ │ ├── test_0074_single_column.py │ │ ├── test_0036_use_innodb.py │ │ ├── test_0094_generic_primary_key.py │ │ ├── test_0089_empty_tables.py │ │ ├── test_0002_not_used_indices.py │ │ ├── test_0026_full_table_scan.py │ │ ├── test_0034_missing_primary_index.py │ │ ├── test_0118_high_offset_selects.py │ │ ├── test_0075_test_tables.py │ │ ├── test_0031_low_cardinality_index.py │ │ ├── test_0028_data_too_old.py │ │ ├── test_0092_select_star.py │ │ ├── test_0019_queries_not_using_indices.py │ │ ├── test_0028_data_not_updated_recently.py │ │ ├── test_0027_selects_with_like.py │ │ ├── test_0093_having_clause.py │ │ ├── test_0004_redundant_indices.py │ │ ├── test_0070_insert_ignore.py │ │ ├── test_0020_big_table.py │ │ ├── test_0032_utf_latin_columns.py │ │ └── test_0006_not_used_columns_and_tables.py │ ├── test_schema.py │ ├── test_0107_schema_partitions.py │ ├── test_0089_handle_sql_errors.py │ ├── formatters │ │ ├── test_yaml.py │ │ ├── __init__.py │ │ ├── test_plain.py │ │ └── test_syslog.py │ ├── test_0004_redundant_indices_core.py │ └── __init__.py ├── __init__.py ├── formatters │ ├── __init__.py │ ├── yaml.py │ ├── syslog.py │ └── plain.py ├── cli │ ├── __init__.py │ ├── add_linter.py │ └── script.py ├── linters │ ├── linter_0074_single_column.py │ ├── linter_0089_empty_tables.py │ ├── linter_0075_test_tables.py │ ├── linter_0036_use_innodb.py │ ├── linter_0164_empty_database.py │ ├── linter_0034_missing_primary_index.py │ ├── linter_0094_generic_primary_key.py │ ├── linter_0118_high_offset_selects.py │ ├── linter_0093_having_clause.py │ ├── linter_0002_not_used_indices.py │ ├── linter_0027_selects_with_like.py │ ├── linter_0092_select_star.py │ ├── __init__.py │ ├── linter_0026_full_table_scan.py │ ├── linter_0032_utf_latin_columns.py │ ├── linter_0019_queries_not_using_indices.py │ ├── linter_0070_insert_ignore.py │ ├── linter_0004_redundant_indices.py │ ├── linter_0028_data_not_updated_recently.py │ ├── linter_0020_filesort_temporary_table.py │ ├── linter_0031_low_cardinality_index.py │ ├── linter_0028_data_too_old.py │ └── linter_0006_not_used_columns_and_tables.py ├── utils.py ├── schema.py └── database.py ├── MANIFEST.in ├── pylintrc ├── .coveragerc ├── .pylintrc ├── sql ├── 0026-full-table-scan-log ├── 0092-select-star-log ├── 0002-not-used-indices-log ├── 0027-selects-with-like-log ├── 0074-single-column.sql ├── 0006-not-used-columns-and-tables-log ├── 0075-test-tables.sql ├── 0093-having-clause-log ├── populate.sh ├── 0020-big-table-log ├── 0070-insert-ignore-log.sql ├── 0020-big-table.sql ├── 0036-use-innodb.sql ├── 0070-insert-ignore-log ├── 0089-empty-tables.sql ├── 0098-handle-sql-errors-log ├── 0002-not-used-indices.sql ├── 0107-schema-partitions.sql ├── 0028-data-not-updated-recently.sql ├── 0094-generic-primary-key.sql ├── 0118-high-offset-selects-log ├── README.md ├── 0019-queries-not-using-indices.sql ├── 0006-not-used-columns-and-tables.sql ├── 0000-core.sql ├── 0034-missing-primary-index.sql ├── 0019-queries-not-using-indices-log ├── 0032-utf-latin-columns.sql ├── 0004-redundant-indices.sql └── 0028-data-too-old.sql ├── .whitesource ├── setup.sql ├── .dockerignore ├── CONTRIBUTING.md ├── .github ├── dependabot.yml └── workflows │ ├── push-to-ghcr.yml │ ├── python-publish.yml │ ├── 
dependabot-automerge.yml │ ├── dockerimage.yml │ ├── tests.yml │ └── python.yml ├── Dockerfile ├── LICENSE ├── Makefile ├── .gitignore └── setup.py /indexdigest/test/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /indexdigest/test/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /indexdigest/test/linters/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | prune indexdigest/test 3 | -------------------------------------------------------------------------------- /indexdigest/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | index_digest Python module 3 | """ 4 | VERSION = '1.7.0' 5 | -------------------------------------------------------------------------------- /pylintrc: -------------------------------------------------------------------------------- 1 | [MESSAGES CONTROL] 2 | # Messages to disable 3 | disable = consider-using-f-string,fixme,use-dict-literal,R0801 4 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | # https://coverage.readthedocs.io/en/latest/source.html#source 3 | omit = 4 | indexdigest/cli/add_linter.py 5 | indexdigest/test/* 6 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MESSAGES CONTROL] 2 | disable=too-few-public-methods,fixme,no-member,duplicate-code,useless-object-inheritance,use-dict-literal 3 | [DESIGN] 4 | max-args=7 5 | -------------------------------------------------------------------------------- /sql/0026-full-table-scan-log: -------------------------------------------------------------------------------- 1 | -- full table scan 2 | SELECT * FROM 0020_big_table 3 | SELECT * FROM 0020_big_table LIMIT 5 4 | -- using index 5 | SELECT * FROM 0020_big_table WHERE item_id = 1 -------------------------------------------------------------------------------- /indexdigest/formatters/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains formatters used by CLI script 3 | """ 4 | # expose formatters 5 | from .plain import format_plain 6 | from .syslog import format_syslog 7 | from .yaml import format_yaml 8 | -------------------------------------------------------------------------------- /sql/0092-select-star-log: -------------------------------------------------------------------------------- 1 | -- report these as select queries with * 2 | SELECT * FROM foo; 3 | SELECT t.* FROM bar AS t; 4 | 5 | -- false positives 6 | SELECT 3 * 3; 7 | SELECT count(*) FROM foo; 8 | SELECT /* foo */ test FROM foo; 9 | -------------------------------------------------------------------------------- /.whitesource: -------------------------------------------------------------------------------- 1 | { 2 | "scanSettings": { 3 | "baseBranches": [] 4 | }, 5 | "checkRunSettings": { 6 | 
"vulnerableCheckRunConclusionLevel": "failure", 7 | "displayMode": "diff" 8 | }, 9 | "issueSettings": { 10 | "minSeverityLevel": "LOW" 11 | } 12 | } -------------------------------------------------------------------------------- /sql/0002-not-used-indices-log: -------------------------------------------------------------------------------- 1 | select * from `0002_not_used_indices` order by item_id 2 | select * from `0002_not_used_indices` where foo = 'foo' and item_id = 2 3 | select count(*) from `0002_not_used_indices` where foo = 'foo' 4 | select * from `0002_not_used_indices` where bar = 'foo' 5 | -------------------------------------------------------------------------------- /sql/0027-selects-with-like-log: -------------------------------------------------------------------------------- 1 | -- uses an index 2 | SELECT * FROM 0020_big_table WHERE text LIKE '00%' 3 | -- does not use an index 4 | SELECT * FROM 0020_big_table WHERE text LIKE '%00' 5 | -- does not use an index, but is not a LIKE query 6 | SELECT * FROM 0020_big_table WHERE val > 50 7 | -------------------------------------------------------------------------------- /sql/0074-single-column.sql: -------------------------------------------------------------------------------- 1 | -- Report tables with just a single column 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/74 4 | DROP TABLE IF EXISTS `0074_bag_of_ints`; 5 | CREATE TABLE `0074_bag_of_ints` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | PRIMARY KEY (`item_id`) 8 | ) CHARSET=utf8; 9 | -------------------------------------------------------------------------------- /sql/0006-not-used-columns-and-tables-log: -------------------------------------------------------------------------------- 1 | INSERT INTO bar VALUES(1, 'foo', 'bar', 'test'); 2 | SELECT /* a comment */ foo FROM `0006_not_used_columns` WHERE item_id = 1; 3 | SELECT test, item_id FROM `0006_not_used_columns` WHERE foo = 'a'; 4 | -- query with an error: #1146 - "Table 'index_digest.t' doesn't exist" 5 | SELECT test FROM t; -------------------------------------------------------------------------------- /indexdigest/test/test_schema.py: -------------------------------------------------------------------------------- 1 | from indexdigest.schema import Column 2 | 3 | 4 | def test_column_int_column_normalization(): 5 | col = Column(name='foo', column_type='int') 6 | assert col.type == 'int' 7 | 8 | # normalize int(N) from MySQL 8.0.16 and older to int 9 | col = Column(name='foo', column_type='int(11)') 10 | assert col.type == 'int' 11 | -------------------------------------------------------------------------------- /sql/0075-test-tables.sql: -------------------------------------------------------------------------------- 1 | -- Report tables with "test" word in their name 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/75 4 | DROP TABLE IF EXISTS `0075_some_guy_test_table`; 5 | CREATE TABLE `0075_some_guy_test_table` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | `name` varchar(255) NOT NULL, 8 | PRIMARY KEY (`item_id`) 9 | ) CHARSET=utf8; 10 | -------------------------------------------------------------------------------- /indexdigest/cli/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | A module containing command-line tool 3 | """ 4 | import logging 5 | 6 | from os import getenv 7 | 8 | if getenv('DEBUG') == '1': # pragma: no cover 9 | logging.basicConfig( 10 | level=logging.DEBUG, 11 | format='%(asctime)s 
%(name)-35s %(levelname)-8s %(message)s', 12 | datefmt="%Y-%m-%d %H:%M:%S" 13 | ) 14 | -------------------------------------------------------------------------------- /setup.sql: -------------------------------------------------------------------------------- 1 | -- create databases 2 | CREATE DATABASE IF NOT EXISTS index_digest; 3 | CREATE DATABASE IF NOT EXISTS index_digest_empty; -- #146 4 | 5 | -- create a user and grant access to our databases 6 | CREATE USER 'index_digest'@'%' IDENTIFIED BY 'qwerty'; 7 | 8 | GRANT ALL ON index_digest.* TO 'index_digest'@'%'; 9 | GRANT ALL ON index_digest_empty.* TO 'index_digest'@'%'; 10 | -------------------------------------------------------------------------------- /sql/0093-having-clause-log: -------------------------------------------------------------------------------- 1 | -- Rewriting the query's HAVING clause into a predicate will enable the use of indexes during query processing. 2 | SELECT * FROM foo HAVING bar = 2; 3 | SELECT s.cust_id,count(s.cust_id) FROM SH.sales s GROUP BY s.cust_id HAVING s.cust_id != '1660' AND s.cust_id != '2' 4 | SELECT * FROM `0019_queries_not_using_indices` WHERE foo = 'foo' HAVING bar = 'test'; 5 | -------------------------------------------------------------------------------- /sql/populate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script is used to populate the MySQL instance with the content of all SQL files in this directory 3 | FILES=`ls sql/*.sql` 4 | 5 | for FILE in $FILES 6 | do 7 | echo -n "* Importing ${FILE} ... " 8 | cat $FILE | mysql --protocol=tcp --port=53306 -uindex_digest -pqwerty index_digest 2>&1 | grep -v "Using a password" 9 | echo "done" 10 | done 11 | -------------------------------------------------------------------------------- /sql/0020-big-table-log: -------------------------------------------------------------------------------- 1 | -- Using where; Using index -- and that's good :) 2 | SELECT count(*) FROM 0020_big_table WHERE item_id BETWEEN 10 AND 20 3 | -- Using where; Using filesort 4 | SELECT * FROM 0020_big_table WHERE item_id BETWEEN 10 AND 20 ORDER BY val 5 | -- Using where; Using temporary; Using filesort 6 | SELECT val, count(*) FROM 0020_big_table WHERE item_id BETWEEN 10 AND 20 GROUP BY val ORDER BY val 7 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # hidden files and directories 2 | .* 3 | 4 | # shell scripts and Makefile 5 | *.sh 6 | Makefile 7 | 8 | # virtual environments 9 | env* 10 | 11 | # Python-specific stuff 12 | *.egg-info/ 13 | dist/ 14 | htmlcov/ 15 | coverage.xml 16 | 17 | # SQL fixtures and tests 18 | setup.sql 19 | sql/ 20 | 21 | # other stuff 22 | *.log 23 | CONTRIBUTING.md 24 | Dockerfile 25 | MANIFEST* 26 | 27 | **/__pycache__ 28 | **/test 29 | hooks/ 30 | -------------------------------------------------------------------------------- /sql/0070-insert-ignore-log.sql: -------------------------------------------------------------------------------- 1 | -- Report queries using INSERT IGNORE 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/70 4 | -- https://medium.com/legacy-systems-diary/things-to-avoid-episode-1-insert-ignore-535b4c24406b 5 | DROP TABLE IF EXISTS `0070_insert_ignore`; 6 | CREATE TABLE `0070_insert_ignore` ( 7 | `item_id` int(9) NOT NULL, 8 | `text` char(5) NOT NULL, 9 | `time` DATETIME, 10 | UNIQUE KEY (`item_id`) 11 | 
) CHARSET=utf8; 12 | -------------------------------------------------------------------------------- /indexdigest/test/test_0107_schema_partitions.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.test import DatabaseTestMixin 6 | 7 | 8 | class TestSchemaWithPartition(TestCase, DatabaseTestMixin): 9 | 10 | def test_schema_partitions(self): 11 | schema = self.connection.get_table_schema('0107_schema_partitions') 12 | print(schema) 13 | 14 | assert '/*!50100' not in schema 15 | -------------------------------------------------------------------------------- /sql/0020-big-table.sql: -------------------------------------------------------------------------------- 1 | -- Report queries that use filesort or a temporary file 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/20 4 | DROP TABLE IF EXISTS `0020_big_table`; 5 | CREATE TABLE `0020_big_table` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | `val` int(9) NOT NULL, 8 | `text` char(5) NOT NULL, 9 | `num` int(3) NOT NULL, 10 | PRIMARY KEY (`item_id`), 11 | KEY text_idx (`text`), 12 | KEY num_idx (`num`) -- low cardinality (#31) 13 | ) CHARSET=utf8; 14 | -------------------------------------------------------------------------------- /sql/0036-use-innodb.sql: -------------------------------------------------------------------------------- 1 | -- Report MyISAM tables and suggest using InnoDB 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/36 4 | DROP TABLE IF EXISTS `0036_use_innodb_myisam`; 5 | CREATE TABLE `0036_use_innodb_myisam` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | `foo` int(8), 8 | PRIMARY KEY (`item_id`) 9 | ) ENGINE=MyISAM; 10 | 11 | DROP TABLE IF EXISTS `0036_use_innodb`; 12 | CREATE TABLE `0036_use_innodb` ( 13 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 14 | `foo` int(8), 15 | PRIMARY KEY (`item_id`) 16 | ); 17 | -------------------------------------------------------------------------------- /sql/0070-insert-ignore-log: -------------------------------------------------------------------------------- 1 | -- inserts with the IGNORE flag 2 | INSERT IGNORE INTO `0070_insert_ignore` VALUES (9, '123', '2017-01-01'); 3 | /* foo */ INSERT IGNORE INTO `0070_insert_ignore` VALUES (9, '123', '2017-01-01'); 4 | INSERT IGNORE INTO `0070_insert_ignore` VALUES ('123', 9, '2017-01-01'); 5 | INSERT /* foo */ IGNORE INTO `0070_insert_ignore` VALUES ('2017-01-01', 9, 123); 6 | -- no IGNORE flag 7 | /* INSERT IGNORE */ INSERT INTO `0070_insert_ignore` VALUES ('2017-01-01', 9, 123); 8 | INSERT INTO `0070_insert_ignore` VALUES ('INSERT IGNORE', 9, 123); 9 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Developer notes 2 | =============== 3 | 4 | ## Testing locally with various versions of MySQL 5 | 6 | Assume that you want to test `index-digest` locally against MySQL v5.5: 7 | 8 | ``` 9 | docker pull mysql:5.5 10 | sudo service mysql stop 11 | docker run -e MYSQL_ALLOW_EMPTY_PASSWORD=yes -d -p 3306:3306 mysql:5.5 12 | ``` 13 | 14 | Wait for the MySQL instance to start up.
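For example (a sketch, assuming the `mysqladmin` client is installed on the host), you can poll the instance until it accepts connections:

```
until mysqladmin --protocol=tcp -u root ping 2>/dev/null; do sleep 1; done
```

`mysqladmin ping` prints `mysqld is alive` once the server is ready.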
Then from the repository's main directory run: 15 | 16 | ``` 17 | mysql --protocol=tcp -u root -v < setup.sql 18 | ./sql/populate.sh 19 | make sql-console 20 | ``` 21 | -------------------------------------------------------------------------------- /sql/0089-empty-tables.sql: -------------------------------------------------------------------------------- 1 | -- Report empty tables 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/89 4 | DROP TABLE IF EXISTS `0089_empty_table`; 5 | CREATE TABLE `0089_empty_table` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | `foo` int(9), 8 | PRIMARY KEY (`item_id`) 9 | ); 10 | 11 | DROP TABLE IF EXISTS `0089_not_empty_table`; 12 | CREATE TABLE `0089_not_empty_table` ( 13 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 14 | `foo` int(9) DEFAULT 0, 15 | PRIMARY KEY (`item_id`) 16 | ); 17 | 18 | INSERT INTO 0089_not_empty_table VALUES (1, NULL), (2, 5), (42, 56); 19 | -------------------------------------------------------------------------------- /sql/0098-handle-sql-errors-log: -------------------------------------------------------------------------------- 1 | -- ERROR 1140 (42000): In aggregated query without GROUP BY, expression #1 of SELECT list contains nonaggregated column 'index_digest.0020_big_table.val'; this is incompatible with sql_mode=only_full_group_by 2 | SELECT val, count(*) FROM `0020_big_table` WHERE item_id BETWEEN 10 AND 20; 3 | 4 | -- query with aliases 5 | SELECT t.val as value, count(*) FROM `0020_big_table` as t WHERE item_id BETWEEN 10 AND 20 GROUP BY val; 6 | SELECT val as value, count(*) FROM `0020_big_table` WHERE item_id BETWEEN 10 AND 20 GROUP BY val; 7 | 8 | -- invalid syntax 9 | SELEKT foo FROM bar; 10 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Basic setup 2 | # https://help.github.com/en/github/administering-a-repository/configuration-options-for-dependency-updates#package-ecosystem 3 | 4 | version: 2 5 | updates: 6 | 7 | # Maintain PyPI dependencies 8 | - package-ecosystem: "pip" 9 | directory: "/" 10 | schedule: 11 | interval: "daily" 12 | 13 | # GitHub Actions 14 | - package-ecosystem: "github-actions" 15 | directory: "/" 16 | schedule: 17 | interval: daily 18 | 19 | # Dockerfile 20 | - package-ecosystem: "docker" 21 | directory: "/" 22 | schedule: 23 | interval: daily 24 | -------------------------------------------------------------------------------- /sql/0002-not-used-indices.sql: -------------------------------------------------------------------------------- 1 | -- Report not used indices 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/2 4 | DROP TABLE IF EXISTS `0002_not_used_indices`; 5 | CREATE TABLE `0002_not_used_indices` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | `foo` varchar(16) NOT NULL DEFAULT '', 8 | `test` varchar(16) NOT NULL DEFAULT '', 9 | `bar` varchar(16), 10 | PRIMARY KEY (`item_id`), 11 | KEY `test_id_idx` (`test`, `item_id`), 12 | KEY `foo_id_idx` (`foo`, `item_id`) 13 | ); 14 | 15 | INSERT INTO 0002_not_used_indices VALUES 16 | (NULL, 'test', '', NULL), 17 | (NULL, 'foo', 'test', NULL), 18 | (NULL, 'foo', '', NULL); 19 | -------------------------------------------------------------------------------- /sql/0107-schema-partitions.sql: -------------------------------------------------------------------------------- 1 | -- Handle tables with partitions 2 | -- 3 | --
https://github.com/macbre/index-digest/issues/107 4 | DROP TABLE IF EXISTS `0107_schema_partitions`; 5 | CREATE TABLE `0107_schema_partitions` ( 6 | firstname VARCHAR(25) NOT NULL, 7 | lastname VARCHAR(25) NOT NULL, 8 | username VARCHAR(16) NOT NULL, 9 | email VARCHAR(35), 10 | joined DATE NOT NULL 11 | ) CHARSET=utf8 12 | PARTITION BY RANGE( YEAR(joined) ) ( 13 | PARTITION p0 VALUES LESS THAN (1960), 14 | PARTITION p1 VALUES LESS THAN (1970), 15 | PARTITION p2 VALUES LESS THAN (1980), 16 | PARTITION p3 VALUES LESS THAN (1990), 17 | PARTITION p4 VALUES LESS THAN MAXVALUE 18 | ); 19 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0164_empty_database.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0164_empty_database import check_empty_database 6 | from indexdigest.test import DatabaseTestMixin 7 | 8 | 9 | class TestLinter(TestCase, DatabaseTestMixin): 10 | 11 | def test_empty_database(self): 12 | reports = list(check_empty_database(self.connection)) 13 | 14 | print(reports, reports[0].context) 15 | 16 | assert len(reports) == 1 17 | 18 | assert str(reports[0]) == 'index_digest_empty: "index_digest_empty" database has no tables' 19 | assert reports[0].table_name == 'index_digest_empty' 20 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0074_single_column.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter reports tables with just a single column 3 | """ 4 | from indexdigest.utils import LinterEntry 5 | 6 | 7 | def check_single_column(database): 8 | """ 9 | :type database indexdigest.database.Database 10 | :rtype: list[LinterEntry] 11 | """ 12 | tables = [ 13 | table 14 | for table in database.get_tables() 15 | if len(database.get_table_columns(table)) == 1 16 | ] 17 | 18 | for table in tables: 19 | yield LinterEntry(linter_type='single_column', table_name=table, 20 | message='"{}" has just a single column'. 21 | format(table), 22 | context={'schema': database.get_table_schema(table)}) 23 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0074_single_column.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters import check_single_column 6 | from indexdigest.test import DatabaseTestMixin 7 | 8 | 9 | class TestSingleColumn(TestCase, DatabaseTestMixin): 10 | 11 | def test_check_single_column(self): 12 | reports = list(check_single_column(self.connection)) 13 | 14 | print(list(map(str, reports))) 15 | 16 | self.assertEqual(len(reports), 1) 17 | 18 | self.assertEqual(str(reports[0]), 19 | '0074_bag_of_ints: "0074_bag_of_ints" has just a single column') 20 | self.assertTrue('CREATE TABLE `0074_bag_of_ints` (' in reports[0].context['schema']) 21 | 22 | # assert False 23 | -------------------------------------------------------------------------------- /sql/0028-data-not-updated-recently.sql: -------------------------------------------------------------------------------- 1 | -- Report tables that were not updated recently 2 | -- They may contain archive data, or the script that updates them may have broken.
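-- An illustrative staleness check (an editor's sketch, not part of the original fixture):
-- compare the newest row timestamp against a threshold, e.g.
-- SELECT MAX(`timestamp`) < NOW() - INTERVAL 30 DAY AS is_stale FROM `0028_data_not_updated_recently`;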
3 | -- 4 | -- https://github.com/macbre/index-digest/issues/28 5 | DROP TABLE IF EXISTS `0028_data_not_updated_recently`; 6 | CREATE TABLE `0028_data_not_updated_recently` ( 7 | `item_id` int(8) unsigned NOT NULL AUTO_INCREMENT, 8 | `cnt` int(8) unsigned NOT NULL, 9 | `timestamp` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, 10 | PRIMARY KEY (`item_id`) 11 | ) ENGINE=InnoDB; 12 | 13 | -- table with stale data (rows last updated 40-50 days ago) 14 | INSERT INTO 0028_data_not_updated_recently(cnt, `timestamp`) VALUES 15 | (20, NOW() - INTERVAL 50 DAY), 16 | (20, NOW() - INTERVAL 45 DAY), 17 | (20, NOW() - INTERVAL 40 DAY); 18 | -------------------------------------------------------------------------------- /.github/workflows/push-to-ghcr.yml: -------------------------------------------------------------------------------- 1 | name: Build and publish a Docker image to ghcr.io and Docker Hub 2 | on: 3 | # publish on releases (tagged as "x.y.z" - "v" prefix is removed) 4 | release: 5 | types: [ published ] 6 | 7 | # publish on pushes to the main branch (tagged as "master") 8 | push: 9 | branches: 10 | - master 11 | 12 | # pull_request: # debug 13 | 14 | jobs: 15 | docker_publish: 16 | runs-on: "ubuntu-latest" 17 | 18 | steps: 19 | - uses: actions/checkout@v5.0.1 20 | 21 | - name: Build and publish a Docker image for ${{ github.repository }} 22 | uses: macbre/push-to-ghcr@master 23 | with: 24 | image_name: ${{ github.repository }} 25 | github_token: ${{ secrets.GITHUB_TOKEN }} 26 | docker_io_token: ${{ secrets.DOCKER_IO_ACCESS_TOKEN }} 27 | -------------------------------------------------------------------------------- /sql/0094-generic-primary-key.sql: -------------------------------------------------------------------------------- 1 | -- Report tables with a generic primary key (id) 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/94 4 | DROP TABLE IF EXISTS `0094_generic_primary_key`; 5 | CREATE TABLE `0094_generic_primary_key` ( 6 | `id` int(9) NOT NULL AUTO_INCREMENT, 7 | `foo` varchar(16) NOT NULL DEFAULT '', 8 | PRIMARY KEY (`id`) 9 | ); 10 | 11 | DROP TABLE IF EXISTS `0094_generic_primary_key_id_as_column`; 12 | CREATE TABLE `0094_generic_primary_key_id_as_column` ( 13 | `foo` int(9) NOT NULL AUTO_INCREMENT, 14 | `id` varchar(16) NOT NULL DEFAULT '', 15 | PRIMARY KEY (`foo`) 16 | ); 17 | 18 | DROP TABLE IF EXISTS `0094_non_generic_primary_key`; 19 | CREATE TABLE `0094_non_generic_primary_key` ( 20 | `row_id` int(9) NOT NULL AUTO_INCREMENT, 21 | `foo` varchar(16) NOT NULL DEFAULT '', 22 | PRIMARY KEY (`row_id`) 23 | ); 24 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0036_use_innodb.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0036_use_innodb import check_use_innodb 6 | from indexdigest.test import DatabaseTestMixin 7 | 8 | 9 | class TestLinter(TestCase, DatabaseTestMixin): 10 | 11 | def test_use_innodb(self): 12 | reports = list(check_use_innodb(self.connection)) 13 | 14 | print(reports, reports[0].context) 15 | 16 | self.assertEqual(len(reports), 1) 17 | 18 | self.assertEqual(str(reports[0]), 19 | '0036_use_innodb_myisam: "0036_use_innodb_myisam" uses MyISAM storage engine') 20 | self.assertEqual(reports[0].table_name, '0036_use_innodb_myisam') 21 | self.assertEqual(str(reports[0].context['engine']), "MyISAM") 22 | 23 | # assert False 24 | 
-------------------------------------------------------------------------------- /sql/0118-high-offset-selects-log: -------------------------------------------------------------------------------- 1 | -- no offset queries 2 | SELECT foo_limit FROM bar_offset 3 | -- offset not high enough 4 | SELECT foo_limit FROM bar_offset LIMIT 50 OFFSET 100 5 | select * from 0020_big_table order by id limit 50, 5; 6 | -- offset queries 7 | SELECT /* CategoryPaginationViewer::processSection */ page_namespace,page_title,page_len,page_is_redirect,cl_sortkey_prefix FROM `page` INNER JOIN `categorylinks` FORCE INDEX (cl_sortkey) ON ((cl_from = page_id)) WHERE cl_type = 'page' AND cl_to = 'Spotify/Song' ORDER BY cl_sortkey LIMIT 927600,200 8 | -- insert queries should be ignored (#140) 9 | /* 7388e26b */ insert into global_discussion_log.logs ( user_id, ip, site_id, location, action, user_agent, time, app_id ) values ( 33017624, X'', 2233, '', 0, 'content-changed-consumer', {ts '2018-03-15 23:20:18.316'}, null ) 10 | -------------------------------------------------------------------------------- /sql/README.md: -------------------------------------------------------------------------------- 1 | sql 2 | === 3 | 4 | This directory contains `*.sql` files with test schemas. Each reported task / bug should have a separate SQL file with a name `NNNN-short-description.sql` (e.g. `0004-redundant-indices.sql`, where 4 is the GitHub issue number). 5 | 6 | Each test schema should be self-contained (i.e. it has no dependencies on other files) and it should be possible to re-apply it, i.e. it contains the required `DROP TABLE IF EXISTS table_name` statements: 7 | 8 | ### An example 9 | 10 | ```sql 11 | -- Detect redundant indices 12 | -- 13 | -- https://github.com/macbre/index-digest/issues/4 14 | DROP TABLE IF EXISTS `0004_id_foo`; 15 | CREATE TABLE `0004_id_foo` ( 16 | `id` int(9) NOT NULL AUTO_INCREMENT, 17 | `foo` varbinary(16) NOT NULL DEFAULT '', 18 | PRIMARY KEY (`id`,`foo`), 19 | UNIQUE KEY `idx` (`id`,`foo`) 20 | ); 21 | -- ... 
22 | ``` 23 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0094_generic_primary_key.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0094_generic_primary_key import check_generic_primary_key 6 | from indexdigest.test import DatabaseTestMixin 7 | 8 | 9 | class TestLinter(TestCase, DatabaseTestMixin): 10 | 11 | def test_generic_primary_key(self): 12 | reports = list(check_generic_primary_key(self.connection)) 13 | 14 | print(list(map(str, reports))) 15 | 16 | assert len(reports) == 1 17 | 18 | assert str(reports[0]) == '0094_generic_primary_key: ' \ 19 | '"0094_generic_primary_key" has a primary key called id, use a more meaningful name' 20 | assert reports[0].table_name == '0094_generic_primary_key' 21 | assert 'CREATE TABLE `0094_generic_primary_key`' in reports[0].context['schema'] 22 | -------------------------------------------------------------------------------- /sql/0019-queries-not-using-indices.sql: -------------------------------------------------------------------------------- 1 | -- Report queries that do not use indices 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/19 4 | DROP TABLE IF EXISTS `0019_queries_not_using_indices`; 5 | CREATE TABLE `0019_queries_not_using_indices` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | `foo` varchar(16) NOT NULL DEFAULT '', 8 | `bar` varchar(16) NOT NULL DEFAULT '', 9 | PRIMARY KEY (`item_id`), 10 | KEY `bar_idx` (`bar`) 11 | ); 12 | 13 | -- https://github.com/macbre/index-digest/issues/210 14 | DROP TABLE IF EXISTS `0019_queries_not_using_indices_empty_table`; 15 | CREATE TABLE `0019_queries_not_using_indices_empty_table` ( 16 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 17 | `foo` varchar(16) NOT NULL DEFAULT '', 18 | PRIMARY KEY (`item_id`) 19 | ); 20 | 21 | INSERT INTO 0019_queries_not_using_indices VALUES 22 | (1, 'test', ''), 23 | (2, 'foo', 'test'), 24 | (3, 'foo', 'check'); 25 | -------------------------------------------------------------------------------- /sql/0006-not-used-columns-and-tables.sql: -------------------------------------------------------------------------------- 1 | -- Report not used columns and tables 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/6 4 | DROP TABLE IF EXISTS `0006_not_used_columns`; 5 | CREATE TABLE `0006_not_used_columns` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | `foo` varchar(16) NOT NULL DEFAULT '', 8 | `bar` varchar(16) NOT NULL DEFAULT '', 9 | `test` varchar(16) NOT NULL DEFAULT '', 10 | PRIMARY KEY (`item_id`) 11 | ); 12 | 13 | INSERT INTO 0006_not_used_columns VALUES 14 | (1, 'test', '', ''), 15 | (42, 'foo', 'test', ''), 16 | (3, 'foo', '', 'check'); 17 | 18 | DROP TABLE IF EXISTS `0006_not_used_tables`; 19 | CREATE TABLE `0006_not_used_tables` ( 20 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 21 | `foo` varchar(16) NOT NULL DEFAULT '', 22 | PRIMARY KEY (`item_id`) 23 | ); 24 | 25 | INSERT INTO 0006_not_used_tables VALUES 26 | (1, 'foo'), 27 | (2, 'foo'), 28 | (3, 'foo'); 29 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0089_empty_tables.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for empty tables 3 | """ 4 | from indexdigest.utils import LinterEntry 5 | 6 | 7 | def check_empty_tables(database): 8 | 
""" 9 | :type database indexdigest.database.Database 10 | :rtype: list[LinterEntry] 11 | """ 12 | empty_tables = [ 13 | table for table in database.get_tables() 14 | # use both "information_schema" and "explain select count(*)" based methods 15 | # to get the rows count estimate 16 | if database.get_table_metadata(table).get('rows') == 0 17 | or database.get_table_rows_estimate(table) == 0 18 | ] 19 | 20 | for table in empty_tables: 21 | yield LinterEntry(linter_type='empty_tables', table_name=table, 22 | message='"{}" table has no rows, is it really needed?'.format(table), 23 | context={'schema': database.get_table_schema(table)}) 24 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0089_empty_tables.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0089_empty_tables import check_empty_tables 6 | from indexdigest.test import DatabaseTestMixin 7 | 8 | 9 | class TestLinter(TestCase, DatabaseTestMixin): 10 | 11 | def test_empty_tables(self): 12 | reports = check_empty_tables(self.connection) 13 | 14 | # only include tables from our test case 15 | reports = [ 16 | report for report in reports 17 | if report.table_name.startswith('0089_') 18 | ] 19 | 20 | print(list(map(str, reports))) 21 | 22 | self.assertEqual(len(reports), 1) 23 | 24 | self.assertEqual(str(reports[0]), 25 | '0089_empty_table: "0089_empty_table" table has no rows, is it really needed?') 26 | self.assertTrue('CREATE TABLE `0089_empty_table` (' in reports[0].context['schema']) 27 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0002_not_used_indices.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0002_not_used_indices import check_not_used_indices 6 | from indexdigest.test import DatabaseTestMixin, read_queries_from_log 7 | 8 | 9 | class TestNotUsedIndices(TestCase, DatabaseTestMixin): 10 | 11 | def test_not_used_indices(self): 12 | reports = list(check_not_used_indices( 13 | database=self.connection, queries=read_queries_from_log('0002-not-used-indices-log'))) 14 | 15 | print(reports) 16 | 17 | self.assertEqual(len(reports), 1) 18 | self.assertEqual(str(reports[0]), '0002_not_used_indices: "test_id_idx" index was not used by provided queries') 19 | self.assertEqual(reports[0].table_name, '0002_not_used_indices') 20 | self.assertEqual(str(reports[0].context['not_used_index']), 'KEY test_id_idx (test, item_id)') 21 | 22 | # assert False 23 | -------------------------------------------------------------------------------- /sql/0000-core.sql: -------------------------------------------------------------------------------- 1 | -- Tables for core tests of Database class 2 | DROP TABLE IF EXISTS `0000_the_table`; 3 | CREATE TABLE `0000_the_table` ( 4 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 5 | `foo` varchar(16) NOT NULL DEFAULT '', 6 | PRIMARY KEY (`item_id`,`foo`), 7 | KEY `idx_foo` (`foo`) 8 | ) CHARACTER SET utf8; 9 | 10 | INSERT INTO 0000_the_table VALUES(1, 'test'), (2, 'foo'), (3, 'foo ąęź'); 11 | 12 | -- handle dashes in table names 13 | DROP TABLE IF EXISTS `0000_the_table-metadata`; 14 | CREATE TABLE `0000_the_table-metadata` ( 15 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 16 | `foo` 
varchar(16) NOT NULL DEFAULT '', 17 | PRIMARY KEY (`item_id`,`foo`), 18 | KEY `idx_foo` (`foo`) 19 | ) CHARACTER SET utf8; 20 | 21 | INSERT INTO `0000_the_table-metadata` VALUES(1, 'test'), (2, 'foo'), (3, 'foo ąęź'), (4, 'foo'); 22 | 23 | -- handle views, actually ignore them :) 24 | DROP VIEW IF EXISTS `0000_the_view`; 25 | CREATE VIEW 0000_the_view AS SELECT foo, COUNT(*) AS cnt FROM `0000_the_table-metadata` GROUP BY foo; 26 | -------------------------------------------------------------------------------- /sql/0034-missing-primary-index.sql: -------------------------------------------------------------------------------- 1 | -- Report missing primary or unique keys 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/34 4 | DROP TABLE IF EXISTS `0034_with_primary_key`; 5 | CREATE TABLE `0034_with_primary_key` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | `name` varchar(255) NOT NULL, 8 | PRIMARY KEY (`item_id`) 9 | ) CHARSET=utf8; 10 | 11 | DROP TABLE IF EXISTS `0034_with_unique_key`; 12 | CREATE TABLE `0034_with_unique_key` ( 13 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 14 | `name` varchar(255) NOT NULL, 15 | UNIQUE KEY idx (`item_id`) 16 | ) CHARSET=utf8; 17 | 18 | -- https://github.com/Wikia/app/pull/9863 19 | DROP TABLE IF EXISTS `0034_querycache`; 20 | CREATE TABLE `0034_querycache` ( 21 | `qc_type` varbinary(32) NOT NULL, 22 | `qc_value` int(10) unsigned NOT NULL DEFAULT '0', 23 | `qc_namespace` int(11) NOT NULL DEFAULT '0', 24 | `qc_title` varchar(255) CHARACTER SET latin1 COLLATE latin1_bin NOT NULL DEFAULT '', 25 | KEY `qc_type` (`qc_type`,`qc_value`) 26 | ) CHARSET=utf8; 27 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # https://hub.docker.com/_/python/ 2 | FROM python:3.15.0a3-alpine 3 | 4 | WORKDIR /opt/index-digest 5 | 6 | # copy files required to run "pip install" 7 | COPY setup.py README.md ./ 8 | COPY ./indexdigest/__init__.py ./indexdigest/__init__.py 9 | 10 | # installs mysql_config and pip dependencies 11 | # https://github.com/gliderlabs/docker-alpine/issues/181 12 | RUN apk upgrade \ 13 | && apk add --virtual build-deps gcc musl-dev \ 14 | && apk add mariadb-dev \ 15 | && pip install . \ 16 | && rm -rf /root/.cache \ 17 | && apk del build-deps 18 | 19 | ARG GITHUB_SHA="dev" 20 | ENV COMMIT_SHA=${GITHUB_SHA} 21 | 22 | # run as nobody 23 | ENV HOME=/opt/index-digest 24 | RUN chown -R nobody . 25 | USER nobody 26 | 27 | # install the remaining files 28 | COPY --chown=nobody . . 29 | 30 | # install the entire package 31 | RUN pip install --no-warn-script-location --user . 
\ 32 | && rm -rf ~/.cache 33 | 34 | RUN index_digest --version 35 | 36 | # docker run -t macbre/index-digest 37 | ENTRYPOINT ["index_digest"] 38 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0075_test_tables.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter reports tables with "test" or "temp" in their name 3 | """ 4 | import re 5 | 6 | from indexdigest.utils import LinterEntry 7 | 8 | TEST_TABLES = ( 9 | 'test', 10 | 'temp', 11 | ) 12 | 13 | 14 | def is_test_table(table_name): 15 | """ 16 | :type table_name str 17 | :rtype: bool 18 | """ 19 | return re.search(r'(^|_)({})(_|$)'.format('|'.join(TEST_TABLES)), table_name) is not None 20 | 21 | 22 | def check_test_tables(database): 23 | """ 24 | :type database indexdigest.database.Database 25 | :rtype: list[LinterEntry] 26 | """ 27 | test_tables = [ 28 | table for table in database.get_tables() 29 | if is_test_table(table) 30 | ] 31 | 32 | for table in test_tables: 33 | yield LinterEntry(linter_type='test_tables', table_name=table, 34 | message='"{}" seems to be a test table'. 35 | format(table), 36 | context={'schema': database.get_table_schema(table)}) 37 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0036_use_innodb.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for tables that do not use the InnoDB storage engine 3 | """ 4 | from collections import OrderedDict 5 | 6 | from indexdigest.utils import LinterEntry 7 | 8 | 9 | def check_use_innodb(database): 10 | """ 11 | :type database indexdigest.database.Database 12 | :rtype: list[LinterEntry] 13 | """ 14 | # in MySQL 8.0, information_schema column names are uppercase 15 | res = database.query_dict_rows("SELECT TABLE_NAME, ENGINE FROM information_schema.tables " 16 | "WHERE ENGINE <> 'InnoDB' and TABLE_SCHEMA = '{}'". 17 | format(database.db_name)) 18 | 19 | for row in res: 20 | context = OrderedDict() 21 | context['schema'] = database.get_table_schema(row['TABLE_NAME']) 22 | context['engine'] = row['ENGINE'] 23 | 24 | yield LinterEntry(linter_type='use_innodb', table_name=row['TABLE_NAME'], 25 | message='"{TABLE_NAME}" uses {ENGINE} storage engine'. 
26 | format(**row), 27 | context=context) 28 | -------------------------------------------------------------------------------- /sql/0019-queries-not-using-indices-log: -------------------------------------------------------------------------------- 1 | -- these use an index 2 | SELECT item_id FROM 0019_queries_not_using_indices WHERE item_id = 2; 3 | SELECT item_id FROM 0019_queries_not_using_indices WHERE item_id BETWEEN 1 AND 3; 4 | SELECT item_id FROM 0019_queries_not_using_indices WHERE foo = "test" AND item_id = 1; 5 | -- these do not use an index 6 | SELECT item_id FROM 0019_queries_not_using_indices WHERE foo = "test" OR item_id > 1; 7 | SELECT item_id FROM 0019_queries_not_using_indices WHERE foo = "test" 8 | -- no matching row in const table (#44) 9 | SELECT foo FROM 0019_queries_not_using_indices WHERE item_id = 5; 10 | -- #148: EXPLAIN's Extra says "No tables used" 11 | SELECT 1*1; 12 | SELECT 1 AS one FROM dual WHERE exists ( SELECT 1 FROM 0000_the_table WHERE item_id = 2 ); 13 | SELECT 1 AS one FROM dual WHERE exists ( SELECT item_id FROM 0019_queries_not_using_indices WHERE foo = "test" ); 14 | -- #210: EXPLAIN's Extra says "Select tables optimized away" 15 | SELECT max(item_id) FROM 0019_queries_not_using_indices; 16 | -- #210: EXPLAIN's Extra says "No matching min/max row" 17 | SELECT max(item_id) FROM 0019_queries_not_using_indices_empty_table; 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Maciej Brencz 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | install: 2 | pip install -U -e .[dev] 3 | 4 | test: 5 | pytest -vv -o log_cli=true -o log_cli_level=warning 6 | 7 | coverage: 8 | pytest -vv --cov=indexdigest --cov-report=term-missing --cov-report=html --cov-fail-under=96 9 | 10 | lint: 11 | pylint indexdigest/ --ignore=test 12 | 13 | demo: 14 | docker run --network=host -t macbre/index-digest:latest mysql://index_digest:qwerty@127.0.0.1/index_digest --analyze-data --skip-checks=non_utf_columns --skip-tables=0028_no_time 15 | 16 | sql-console: 17 | mysql --prompt='mysql@\h[\d]>' --protocol=tcp --port=53306 -uindex_digest -pqwerty index_digest 18 | 19 | publish: 20 | # run git tag -a v0.0.0 before running make publish 21 | python setup.py sdist 22 | twine upload --skip-existing dist/* 23 | 24 | # docker (tag with commit ID) 25 | VERSION = "1.2.1-"$(shell git rev-parse --short HEAD) 26 | 27 | build: 28 | @docker build -t macbre/index-digest:$(VERSION) . \ 29 | && docker tag macbre/index-digest:$(VERSION) macbre/index-digest:latest 30 | 31 | push: build 32 | @docker push macbre/index-digest:$(VERSION) \ 33 | && docker push macbre/index-digest:latest 34 | 35 | .PHONY: build 36 | -------------------------------------------------------------------------------- /sql/0032-utf-latin-columns.sql: -------------------------------------------------------------------------------- 1 | -- Report text columns that use non-utf collation 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/32 4 | DROP TABLE IF EXISTS `0032_utf8_table`; 5 | CREATE TABLE `0032_utf8_table` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | `name` varchar(255) NOT NULL, 8 | `latin_column` varchar(255) CHARACTER SET latin1 COLLATE latin1_bin NOT NULL, 9 | `big5_column` varchar(255) CHARACTER SET big5, 10 | `utf_blob` blob, 11 | PRIMARY KEY (`item_id`) 12 | ) CHARSET=utf8 COLLATE=utf8_polish_ci; 13 | 14 | DROP TABLE IF EXISTS `0032_latin1_table`; 15 | CREATE TABLE `0032_latin1_table` ( 16 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 17 | `name` varchar(255), 18 | `utf8_column` varchar(255) CHARACTER SET utf8 COLLATE utf8_polish_ci NOT NULL, 19 | `ucs2_column` varchar(255) CHARACTER SET ucs2, 20 | `utf8mb4_column` varchar(255) CHARACTER SET utf8mb4, 21 | `utf16_column` varchar(255) CHARACTER SET utf16, 22 | -- `utf16le_column` varchar(255) CHARACTER SET utf16le, -- not supported by MySQL 5.5 23 | `utf32_column` varchar(255) CHARACTER SET utf32, 24 | `binary_column` varchar(255) CHARACTER SET binary, 25 | `latin_blob` blob, 26 | PRIMARY KEY (`item_id`) 27 | ) CHARSET=latin1; 28 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0026_full_table_scan.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from indexdigest.linters import check_full_table_scan 4 | from indexdigest.test import BigTableTest, read_queries_from_log 5 | 6 | 7 | class TestFullTableScan(BigTableTest): 8 | 9 | def test_full_table_scan(self): 10 | reports = list(check_full_table_scan(self.connection, read_queries_from_log('0026-full-table-scan-log'))) 11 | 12 | self.assertEqual(len(reports), 2) 13 | 14 | self.assertEqual(str(reports[0]), 15 | '0020_big_table: "SELECT * FROM 0020_big_table" query triggered full table scan') 16 | self.assertEqual(reports[0].context['query'], 17 | 'SELECT 
* FROM 0020_big_table') 18 | self.assertTrue(reports[0].context['explain_rows'] > 8000) 19 | 20 | self.assertEqual(str(reports[1]), 21 | '0020_big_table: "SELECT * FROM 0020_big_table LIMIT 5" query triggered full table scan') 22 | self.assertEqual(reports[1].context['query'], 23 | 'SELECT * FROM 0020_big_table LIMIT 5') 24 | self.assertTrue(reports[1].context['explain_rows'] > 8000) 25 | 26 | # assert False 27 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0034_missing_primary_index.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters import check_missing_primary_index 6 | from indexdigest.test import Database, DatabaseTestMixin 7 | 8 | 9 | class LimitedViewDatabase(Database, DatabaseTestMixin): 10 | """ 11 | Limit test to tables from sql/0034-missing-primary-index 12 | """ 13 | def get_tables(self): 14 | return ['0034_with_primary_key', '0034_with_unique_key', '0034_querycache'] 15 | 16 | 17 | class TestMissingPrimaryIndex(TestCase): 18 | @property 19 | def connection(self): 20 | return LimitedViewDatabase.connect_dsn(DatabaseTestMixin.DSN) 21 | 22 | def test_missing_primary_index(self): 23 | reports = list(check_missing_primary_index(self.connection)) 24 | 25 | print(list(map(str, reports))) 26 | 27 | self.assertEqual(len(reports), 1) 28 | 29 | self.assertEqual(str(reports[0]), 30 | '0034_querycache: "0034_querycache" table does not have any primary or unique index') 31 | self.assertTrue('CREATE TABLE `0034_querycache` (' in reports[0].context['schema']) 32 | 33 | # assert False 34 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0164_empty_database.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for databases with no tables 3 | """ 4 | from indexdigest.utils import LinterEntry 5 | 6 | 7 | def get_empty_databases(database): 8 | """ 9 | :type database indexdigest.database.Database 10 | :rtype: list[str] 11 | """ 12 | for db_name in database.query_list('SHOW DATABASES'): 13 | # skip "core" MySQL databases 14 | if db_name in ['information_schema']: 15 | continue 16 | 17 | tables_count = database.query_field('SELECT COUNT(*) FROM information_schema.TABLES ' 18 | 'WHERE TABLE_SCHEMA = "{}" AND ' 19 | 'TABLE_TYPE = "BASE TABLE"'.format(db_name)) 20 | # print(db_name, tables_count) 21 | if tables_count == 0: 22 | yield db_name 23 | 24 | 25 | def check_empty_database(database): 26 | """ 27 | :type database indexdigest.database.Database 28 | :rtype: list[LinterEntry] 29 | """ 30 | for db_name in get_empty_databases(database): 31 | yield LinterEntry(linter_type='empty_database', table_name=db_name, 32 | message='"{}" database has no tables'.format(db_name)) 33 | -------------------------------------------------------------------------------- /indexdigest/test/test_0089_handle_sql_errors.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0006_not_used_columns_and_tables import get_used_tables_from_queries 6 | from indexdigest.utils import explain_queries 7 | 8 | from indexdigest.test import DatabaseTestMixin, read_queries_from_log 9 | 10 | 11 | class ErrorsHandlingTest(TestCase, DatabaseTestMixin): 12 | 13 | 
@property 14 | def queries(self): 15 | return read_queries_from_log('0098-handle-sql-errors-log') 16 | 17 | def test_get_used_tables_from_queries(self): 18 | tables = get_used_tables_from_queries(self.queries) 19 | 20 | print(tables) 21 | 22 | assert '0020_big_table' in tables 23 | # assert False 24 | 25 | def test_explain_queries(self): 26 | res = list(explain_queries(self.connection, self.queries)) 27 | tables_used = [item[1] for item in res] 28 | 29 | print(res, tables_used) 30 | 31 | assert '0020_big_table' in tables_used 32 | # assert False 33 | 34 | def test_get_table_columns(self): 35 | res = self.connection.get_table_columns('t') 36 | self.assertIsNone(res) 37 | 38 | # assert False 39 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0034_missing_primary_index.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter reports a missing primary / unique index 3 | """ 4 | from collections import OrderedDict 5 | 6 | from indexdigest.utils import LinterEntry 7 | 8 | 9 | def check_missing_primary_index(database): 10 | """ 11 | :type database indexdigest.database.Database 12 | :rtype: list[LinterEntry] 13 | """ 14 | for table in database.get_tables(): 15 | # keep primary and unique indices only 16 | # @see https://bugs.mysql.com/bug.php?id=76252 17 | # @see https://github.com/Wikia/app/pull/9863 18 | indices = [ 19 | index for index in database.get_table_indices(table) 20 | if index.is_primary or index.is_unique 21 | ] 22 | 23 | if indices: 24 | # so we have at least one primary or unique index defined 25 | continue 26 | 27 | context = OrderedDict() 28 | context['schema'] = database.get_table_schema(table) 29 | 30 | yield LinterEntry(linter_type='missing_primary_index', table_name=table, 31 | message='"{}" table does not have any primary or unique index'. 32 | format(table), 33 | context=context) 34 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0094_generic_primary_key.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter reports tables whose primary key is a generic "id" column 
3 | """ 4 | from indexdigest.utils import LinterEntry 5 | 6 | GENERIC_PRIMARY_KEY = 'id' 7 | 8 | 9 | def check_generic_primary_key(database): 10 | """ 11 | :type database indexdigest.database.Database 12 | :rtype: list[LinterEntry] 13 | """ 14 | for table_name in database.get_tables(): 15 | indices = [ 16 | index for index in database.get_table_indices(table_name) 17 | if index.is_primary 18 | ] 19 | 20 | # no primary index, a different check will take care of it 21 | if not indices: 22 | continue 23 | 24 | # there can be only one primary key, take the first one from the list 25 | primary_key = indices[0] 26 | # print(table_name, primary_key, primary_key.columns[0]) 27 | 28 | if primary_key.columns[0] == GENERIC_PRIMARY_KEY: 29 | yield LinterEntry(linter_type='generic_primary_key', table_name=table_name, 30 | message='"{}" has a primary key called id, ' 31 | 'use a more meaningful name'.format(table_name), 32 | context={"schema": database.get_table_schema(table_name)}) 33 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | name: Publish 4 | 5 | on: 6 | release: 7 | types: [created] 8 | 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | jobs: 13 | deploy: 14 | name: Upload to PyPI 15 | 16 | runs-on: ubuntu-latest 17 | 18 | permissions: 19 | # IMPORTANT: this permission is mandatory for Trusted Publishing 20 | id-token: write 21 | 22 | steps: 23 | - uses: actions/checkout@v5.0.1 24 | - name: Set up Python 25 | uses: actions/setup-python@v6 26 | with: 27 | python-version: '3.x' 28 | 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install setuptools wheel 33 | 34 | - name: Build 35 | run: | 36 | python setup.py sdist bdist_wheel 37 | ls -lh dist/ 38 | 39 | # https://github.com/pypa/gh-action-pypi-publish?tab=readme-ov-file#trusted-publishing 40 | - name: Publish package distributions to PyPI 41 | uses: pypa/gh-action-pypi-publish@release/v1 42 | -------------------------------------------------------------------------------- /indexdigest/test/formatters/test_yaml.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest import VERSION 6 | from indexdigest.formatters import format_yaml as formatter 7 | from . 
import FormatterTestMixin 8 | 9 | 10 | class TestFormatter(TestCase, FormatterTestMixin): 11 | 12 | def test_formatter(self): 13 | out = formatter(self.get_database_mock(), self.get_reports_mock()) 14 | print(out) 15 | 16 | # first check that it's a valid YAML 17 | res = yaml.safe_load(out) 18 | assert 'meta' in res 19 | assert 'reports' in res 20 | 21 | assert 'version: index-digest v' + VERSION + '\n database_name: test_database\n' \ 22 | ' database_host: test.local\n database_version: MySQL v1.2.3-test' in out 23 | 24 | assert 'message: Something is fishy here' in out 25 | 26 | # context fields order is maintained 27 | assert ' context:\n foo: 42\n test: bar\n' in out 28 | 29 | # properly marked YAML file 30 | assert out.startswith('---') 31 | assert out.endswith('...\n') 32 | # assert False 33 | 34 | def test_formatter_no_results(self): 35 | out = formatter(self.get_database_mock(), []) 36 | print(out) 37 | 38 | assert out.endswith('reports: []\n...\n') 39 | -------------------------------------------------------------------------------- /indexdigest/test/test_0004_redundant_indices_core.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.test import DatabaseTestMixin 6 | 7 | 8 | class RedundantIndicesTest(TestCase, DatabaseTestMixin): 9 | 10 | def test_redundant_index_with_primary(self): 11 | indices = self.connection.get_table_indices('0004_id_foo') 12 | print(indices) 13 | 14 | (idx, primary) = indices 15 | 16 | self.assertEqual(primary.name, 'PRIMARY') 17 | self.assertEqual(idx.name, 'idx') 18 | 19 | self.assertTrue(idx.is_covered_by(primary)) 20 | self.assertFalse(primary.is_covered_by(idx)) 21 | 22 | def test_redundant_indexes(self): 23 | indices = self.connection.get_table_indices('0004_id_foo_bar') 24 | print(indices) 25 | 26 | (idx_foo, idx_foo_bar, idx_id_foo, primary) = indices 27 | 28 | self.assertEqual(primary.name, 'PRIMARY') 29 | self.assertEqual(idx_foo.name, 'idx_foo') 30 | self.assertEqual(idx_foo_bar.name, 'idx_foo_bar') 31 | self.assertEqual(idx_id_foo.name, 'idx_id_foo') 32 | 33 | self.assertTrue(idx_foo.is_covered_by(idx_foo_bar)) 34 | 35 | self.assertFalse(idx_foo.is_covered_by(idx_id_foo)) 36 | self.assertFalse(idx_foo.is_covered_by(primary)) 37 | self.assertFalse(primary.is_covered_by(idx_foo)) 38 | -------------------------------------------------------------------------------- /.github/workflows/dependabot-automerge.yml: -------------------------------------------------------------------------------- 1 | # Based on https://docs.github.com/en/code-security/supply-chain-security/keeping-your-dependencies-updated-automatically/automating-dependabot-with-github-actions#enable-auto-merge-on-a-pull-request 2 | name: Dependabot auto-merge 3 | on: pull_request_target 4 | 5 | permissions: 6 | pull-requests: write 7 | contents: write 8 | 9 | jobs: 10 | dependabot: 11 | runs-on: ubuntu-latest 12 | if: ${{ github.actor == 'dependabot[bot]' }} 13 | steps: 14 | - name: Dependabot metadata 15 | id: metadata 16 | uses: dependabot/fetch-metadata@v2.4.0 17 | with: 18 | github-token: "${{ secrets.GITHUB_TOKEN }}" 19 | 20 | - name: Enable auto-merge for Dependabot PRs 21 | # Automatically merge semver-patch and semver-minor PRs 22 | if: "${{ steps.metadata.outputs.update-type == 23 | 'version-update:semver-minor' || 24 | steps.metadata.outputs.update-type == 25 | 'version-update:semver-patch' }}" 26 | 27 | # 
https://cli.github.com/manual/gh_pr_merge 28 | run: gh pr merge --auto --squash "$PR_URL" 29 | env: 30 | PR_URL: ${{github.event.pull_request.html_url}} 31 | GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} 32 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0118_high_offset_selects.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0118_high_offset_selects import check_high_offset_selects 6 | from indexdigest.test import DatabaseTestMixin, read_queries_from_log 7 | 8 | 9 | class TestLinter(TestCase, DatabaseTestMixin): 10 | 11 | def test_high_offset_selects(self): 12 | reports = list(check_high_offset_selects( 13 | self.connection, queries=read_queries_from_log('0118-high-offset-selects-log'))) 14 | 15 | print(reports, reports[0].context) 16 | 17 | self.assertEqual(len(reports), 1) 18 | 19 | self.assertEqual(str(reports[0]), 'page: "SELECT /* CategoryPaginationViewer::processSection..." query uses too high offset impacting the performance') 20 | self.assertEqual(reports[0].table_name, 'page') 21 | self.assertEqual(str(reports[0].context['query']), "SELECT /* CategoryPaginationViewer::processSection */ page_namespace,page_title,page_len,page_is_redirect,cl_sortkey_prefix FROM `page` INNER JOIN `categorylinks` FORCE INDEX (cl_sortkey) ON ((cl_from = page_id)) WHERE cl_type = 'page' AND cl_to = 'Spotify/Song' ORDER BY cl_sortkey LIMIT 927600,200") 22 | self.assertEqual(reports[0].context['limit'], 200) 23 | self.assertEqual(reports[0].context['offset'], 927600) 24 | 25 | # assert False 26 | -------------------------------------------------------------------------------- /indexdigest/test/formatters/__init__.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | from indexdigest.database import Database 4 | from indexdigest.utils import LinterEntry 5 | 6 | from .. 
import DatabaseTestMixin 7 | 8 | 9 | class DatabaseMock: 10 | VERSION = '1.2.3-test' 11 | HOST = 'test.local' 12 | 13 | @property 14 | def db_name(self): 15 | return 'test_database' 16 | 17 | def get_server_version(self): 18 | return self.VERSION 19 | 20 | def get_server_hostname(self): 21 | return self.HOST 22 | 23 | @staticmethod 24 | def get_queries(): 25 | return [] 26 | 27 | 28 | class FormatterTestMixin: 29 | @staticmethod 30 | def get_database_mock(): 31 | return DatabaseMock() 32 | 33 | @staticmethod 34 | def get_reports_mock(): 35 | context = OrderedDict() 36 | context['foo'] = 42 37 | context['test'] = 'bar' 38 | 39 | yield LinterEntry( 40 | linter_type='foo_linter', 41 | table_name='table_001', 42 | message='Something is fishy here', 43 | context=context 44 | ) 45 | 46 | yield LinterEntry( 47 | linter_type='bar_linter', 48 | table_name='table_042', 49 | message='An index is missing' 50 | ) 51 | 52 | @staticmethod 53 | def get_database(): 54 | return Database.connect_dsn(DatabaseTestMixin.DSN) 55 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0118_high_offset_selects.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for too high offset SELECT queries 3 | """ 4 | from collections import OrderedDict 5 | 6 | from sql_metadata.compat import get_query_limit_and_offset, get_query_tables 7 | 8 | from indexdigest.utils import LinterEntry, shorten_query, is_select_query 9 | 10 | 11 | OFFSET_THRESHOLD = 1000 12 | 13 | 14 | def check_high_offset_selects(_, queries): 15 | """ 16 | :type _ indexdigest.database.Database 17 | :type queries list[str] 18 | :rtype: list[LinterEntry] 19 | """ 20 | for query in queries: 21 | # ignore insert queries (#140) 22 | if not is_select_query(query): 23 | continue 24 | 25 | res = get_query_limit_and_offset(query) 26 | 27 | if res is None: 28 | continue 29 | 30 | (limit, offset) = res 31 | 32 | if offset < OFFSET_THRESHOLD: 33 | continue 34 | 35 | table_name = get_query_tables(query)[0] 36 | 37 | context = OrderedDict() 38 | context['query'] = query 39 | context['limit'] = limit 40 | context['offset'] = offset 41 | 42 | yield LinterEntry(linter_type='high_offset_selects', table_name=table_name, 43 | message='"{}" query uses too high offset impacting the performance'. 
44 | format(shorten_query(query)), 45 | context=context) 46 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0093_having_clause.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for select queries with HAVING clause 3 | """ 4 | from sqlparse.tokens import Keyword 5 | from sql_metadata.compat import preprocess_query, get_query_tables, get_query_tokens 6 | 7 | from indexdigest.utils import LinterEntry, shorten_query, is_select_query 8 | 9 | 10 | def query_has_having_clause(query): 11 | """ 12 | Checks if provided query uses HAVING clause 13 | :type query str 14 | :rtype bool 15 | """ 16 | if not is_select_query(query): 17 | return False 18 | 19 | query = preprocess_query(query) 20 | tokens = get_query_tokens(query) 21 | 22 | for token in tokens: 23 | if token.ttype is Keyword and str(token).upper() == 'HAVING': 24 | return True 25 | 26 | return False 27 | 28 | 29 | def check_having_clause(_, queries): 30 | """ 31 | :type queries list[str] 32 | :rtype: list[LinterEntry] 33 | """ 34 | queries_with_having_clause = [ 35 | query for query in queries 36 | if query_has_having_clause(query) 37 | ] 38 | 39 | for query in queries_with_having_clause: 40 | table_name = get_query_tables(query)[0] 41 | 42 | yield LinterEntry(linter_type='having_clause', table_name=table_name, 43 | message='"{}" query uses HAVING clause'. 44 | format(shorten_query(query)), 45 | context={"query": query}) 46 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0002_not_used_indices.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for not used indices by going through SELECT queries 3 | """ 4 | import logging 5 | 6 | from collections import defaultdict 7 | 8 | from indexdigest.utils import LinterEntry, explain_queries 9 | 10 | 11 | def check_not_used_indices(database, queries): 12 | """ 13 | :type database indexdigest.database.Database 14 | :type queries list[str] 15 | :rtype: list[LinterEntry] 16 | """ 17 | logger = logging.getLogger(__name__) 18 | 19 | used_indices = defaultdict(list) 20 | 21 | # EXPLAIN each query 22 | for (query, table_used, index_used, _) in explain_queries(database, queries): 23 | if index_used is not None: 24 | logger.info("Query <%s> uses %s index on `%s` table", query, index_used, table_used) 25 | used_indices[table_used].append(index_used) 26 | 27 | # analyze all tables used by the above queries 28 | # print(used_indices) 29 | for table_name, table_indices in used_indices.items(): 30 | for index in database.get_table_indices(table_name): 31 | 32 | if index.name not in table_indices: 33 | yield LinterEntry(linter_type='not_used_indices', table_name=table_name, 34 | message='"{}" index was not used by provided queries'. 35 | format(index.name), 36 | context={"not_used_index": str(index)}) 37 | -------------------------------------------------------------------------------- /indexdigest/test/formatters/test_plain.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | 4 | from unittest import TestCase 5 | 6 | from indexdigest import VERSION 7 | from indexdigest.formatters import format_plain as formatter 8 | from . 
import FormatterTestMixin 9 | 10 | 11 | class TestFormatter(TestCase, FormatterTestMixin): 12 | 13 | @staticmethod 14 | def _remove_ansi_styles(text): 15 | """ 16 | :type text str 17 | :rtype: str 18 | """ 19 | # '\033[0m' 20 | return re.sub(r'\033\[\d+m', '', text) 21 | 22 | def test_format_plain(self): 23 | out = formatter(self.get_database_mock(), self.get_reports_mock()) 24 | out = self._remove_ansi_styles(out) 25 | print(out) 26 | 27 | assert 'Found 2 issue(s) to report for "test_database" database' in out 28 | assert 'MySQL v1.2.3-test at test.local' in out 29 | assert 'index-digest v' + VERSION in out 30 | 31 | assert 'foo_linter → table affected: table_001' in out 32 | assert '✗ Something is fishy here' in out 33 | assert ' - foo: 42\n - test: bar' in out 34 | 35 | assert 'bar_linter → table affected: table_042' in out 36 | assert '✗ An index is missing' in out 37 | 38 | assert out.endswith('Queries performed: 0') 39 | # assert False 40 | 41 | def test_format_plain_no_results(self): 42 | out = formatter(self.get_database_mock(), []) 43 | assert out.endswith('Jolly, good! No issues to report') 44 | -------------------------------------------------------------------------------- /indexdigest/formatters/yaml.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provides --format=yaml results formatter 3 | """ 4 | from __future__ import absolute_import 5 | 6 | from collections import OrderedDict 7 | 8 | import yaml 9 | import yamlordereddictloader 10 | 11 | import indexdigest 12 | 13 | 14 | def format_report(report): 15 | """ 16 | :type report indexdigest.utils.LinterEntry 17 | :rtype: OrderedDict 18 | """ 19 | res = OrderedDict() 20 | 21 | res['type'] = report.linter_type 22 | res['table'] = report.table_name 23 | res['message'] = report.message 24 | 25 | if report.context: 26 | res['context'] = report.context 27 | 28 | return res 29 | 30 | 31 | def format_yaml(database, reports): 32 | """ 33 | :type database indexdigest.database.Database 34 | :type reports list 35 | :rtype: str 36 | """ 37 | report = OrderedDict() 38 | 39 | report['meta'] = OrderedDict() 40 | report['meta']['version'] = 'index-digest v{}'.format(indexdigest.VERSION) 41 | report['meta']['database_name'] = database.db_name 42 | report['meta']['database_host'] = database.get_server_hostname() 43 | report['meta']['database_version'] = 'MySQL v{}'.format(database.get_server_version()) 44 | 45 | report['reports'] = [format_report(item) for item in reports] 46 | 47 | return yaml.dump(report, 48 | Dumper=yamlordereddictloader.Dumper, 49 | default_flow_style=False, 50 | explicit_start=True, 51 | explicit_end=True) 52 | -------------------------------------------------------------------------------- /sql/0004-redundant-indices.sql: -------------------------------------------------------------------------------- 1 | -- Detect redundant indices 2 | -- 3 | -- https://github.com/macbre/index-digest/issues/4 4 | DROP TABLE IF EXISTS `0004_id_foo`; 5 | CREATE TABLE `0004_id_foo` ( 6 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 7 | `foo` varbinary(16) NOT NULL DEFAULT '', 8 | PRIMARY KEY (`item_id`,`foo`), 9 | UNIQUE KEY `idx` (`item_id`,`foo`) 10 | ); 11 | 12 | DROP TABLE IF EXISTS `0004_id_foo_bar`; 13 | CREATE TABLE `0004_id_foo_bar` ( 14 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 15 | `foo` varbinary(16) NOT NULL DEFAULT '', 16 | `bar` varbinary(16) NOT NULL DEFAULT '', 17 | PRIMARY KEY (`item_id`), 18 | KEY `idx_foo` (`foo`), 19 | KEY `idx_foo_bar` (`foo`, `bar`), 20 | KEY `idx_id_foo` 
(`item_id`, `foo`) 21 | ); 22 | 23 | -- https://github.com/macbre/index-digest/issues/48 24 | DROP TABLE IF EXISTS `0004_indices_duplicating_each_other`; 25 | CREATE TABLE `0004_indices_duplicating_each_other` ( 26 | `item_id` int(9) NOT NULL AUTO_INCREMENT, 27 | `foo` varbinary(16) NOT NULL DEFAULT '', 28 | PRIMARY KEY (`item_id`), 29 | UNIQUE KEY `idx_foo` (`foo`), 30 | UNIQUE KEY `idx_foo_2` (`foo`) 31 | ); 32 | 33 | -- https://github.com/macbre/index-digest/issues/49 34 | DROP TABLE IF EXISTS `0004_image_comment_temp`; 35 | CREATE TABLE /*_*/0004_image_comment_temp ( 36 | -- Key to img_name (ugh) 37 | imgcomment_name varchar(255) binary NOT NULL, 38 | -- Key to comment_id 39 | imgcomment_description_id bigint unsigned NOT NULL, 40 | PRIMARY KEY (imgcomment_name, imgcomment_description_id) 41 | ) /*$wgDBTableOptions*/; 42 | -- Ensure uniqueness 43 | CREATE UNIQUE INDEX /*i*/imgcomment_name ON /*_*/0004_image_comment_temp (imgcomment_name); 44 | -------------------------------------------------------------------------------- /indexdigest/formatters/syslog.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provides --format=syslog results formatter - pushes JSON messages via syslog 3 | """ 4 | from __future__ import absolute_import 5 | 6 | import json 7 | import syslog 8 | 9 | from collections import OrderedDict 10 | 11 | import indexdigest 12 | 13 | 14 | def _format_report(database, report): 15 | """ 16 | :type database indexdigest.database.Database 17 | :type report indexdigest.utils.LinterEntry 18 | :rtype: str 19 | """ 20 | res = OrderedDict() 21 | 22 | res['appname'] = 'index-digest' 23 | 24 | res['meta'] = OrderedDict() 25 | res['meta']['version'] = 'index-digest v{}'.format(indexdigest.VERSION) 26 | res['meta']['database_name'] = database.db_name 27 | res['meta']['database_host'] = database.get_server_hostname() 28 | res['meta']['database_version'] = 'MySQL v{}'.format(database.get_server_version()) 29 | 30 | res['report'] = OrderedDict() 31 | res['report']['type'] = report.linter_type 32 | res['report']['table'] = report.table_name 33 | res['report']['message'] = report.message 34 | 35 | if report.context: 36 | res['report']['context'] = report.context 37 | 38 | return json.dumps(res) 39 | 40 | 41 | def format_syslog(database, reports, ident='index-digest'): 42 | """ 43 | :type database indexdigest.database.Database 44 | :type reports list 45 | :type ident str 46 | :rtype: str 47 | """ 48 | syslog.openlog(ident=ident, logoption=syslog.LOG_PID, facility=syslog.LOG_USER) 49 | 50 | for report in reports: 51 | syslog.syslog(_format_report(database, report)) 52 | 53 | syslog.closelog() 54 | return '' 55 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0027_selects_with_like.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks SELECT queries that use LIKE '%foo' conditions 3 | """ 4 | import re 5 | 6 | from collections import OrderedDict 7 | 8 | from indexdigest.utils import LinterEntry, explain_queries, shorten_query 9 | 10 | 11 | def query_uses_leftmost_like(query): 12 | """ 13 | Returns True for queries with LIKE '%foo' conditions 14 | 15 | :type query str 16 | :rtype: bool 17 | """ 18 | # quit fast 19 | if 'like' not in query.lower(): 20 | return False 21 | 22 | matches = re.search(r'LIKE\s\s?[\'"]%\w', query, flags=re.IGNORECASE) 23 | return matches is not None 24 | 25 | 26 | def check_selects_with_like(database, 
queries): 27 | """ 28 | :type database indexdigest.database.Database 29 | :type queries list[str] 30 | :rtype: list[LinterEntry] 31 | """ 32 | for (query, table_used, index_used, explain_row) in explain_queries(database, queries): 33 | if index_used is None and query_uses_leftmost_like(query): 34 | context = OrderedDict() 35 | context['query'] = query 36 | 37 | # https://dev.mysql.com/doc/refman/5.7/en/explain-output.html#explain-extra-information 38 | context['explain_extra'] = explain_row['Extra'] 39 | context['explain_rows'] = int(explain_row['rows']) # string when using MariaDB 10.5 40 | 41 | yield LinterEntry(linter_type='selects_with_like', table_name=table_used, 42 | message='"{}" query uses LIKE with left-most wildcard'. 43 | format(shorten_query(query)), 44 | context=context) 45 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0092_select_star.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for select queries with * wildcard 3 | """ 4 | from sqlparse.tokens import Wildcard 5 | from sql_metadata.compat import preprocess_query, get_query_tables, get_query_tokens 6 | 7 | from indexdigest.utils import LinterEntry, shorten_query, is_select_query 8 | 9 | 10 | def is_wildcard_query(query): 11 | """ 12 | Checks if provided query selects using a * wildcard 13 | :type query str 14 | :rtype bool 15 | """ 16 | if not is_select_query(query): 17 | return False 18 | 19 | query = preprocess_query(query) 20 | tokens = get_query_tokens(query) 21 | 22 | last_token = None 23 | 24 | for token in tokens: 25 | if token.ttype is Wildcard: 26 | # print([query, token, 'last token', last_token]) 27 | 28 | # check what was before the wildcard 29 | # count(*) ? 30 | if last_token and str(last_token) not in ['(']: 31 | return True 32 | else: 33 | last_token = token 34 | 35 | return False 36 | 37 | 38 | def check_select_star(_, queries): 39 | """ 40 | :type queries list[str] 41 | :rtype: list[LinterEntry] 42 | """ 43 | queries_with_wildcard = [ 44 | query for query in queries 45 | if is_wildcard_query(query) 46 | ] 47 | 48 | for query in queries_with_wildcard: 49 | table_name = get_query_tables(query)[0] 50 | 51 | yield LinterEntry(linter_type='select_star', table_name=table_name, 52 | message='"{}" query uses SELECT *'. 53 | format(shorten_query(query)), 54 | context={"query": query}) 55 | -------------------------------------------------------------------------------- /indexdigest/linters/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains linters used to check the database for improvements. 
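Each check_* function exposed below is a generator: it takes an
indexdigest.database.Database instance (and, for the query-log based checks, a
list of SQL queries) and yields indexdigest.utils.LinterEntry items. A minimal
usage sketch (assuming a reachable database behind the given DSN):

    from indexdigest.database import Database
    from indexdigest.linters import check_redundant_indices

    database = Database.connect_dsn('mysql://user:password@localhost/test')
    for entry in check_redundant_indices(database):
        # str(entry) renders as 'table_name: message'
        print(entry)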
3 | """ 4 | # expose linters 5 | from .linter_0002_not_used_indices import check_not_used_indices 6 | from .linter_0004_redundant_indices import check_redundant_indices 7 | from .linter_0006_not_used_columns_and_tables import check_not_used_tables, check_not_used_columns 8 | from .linter_0019_queries_not_using_indices import check_queries_not_using_indices 9 | from .linter_0020_filesort_temporary_table import \ 10 | check_queries_using_filesort, check_queries_using_temporary 11 | from .linter_0026_full_table_scan import check_full_table_scan 12 | from .linter_0027_selects_with_like import check_selects_with_like 13 | from .linter_0028_data_too_old import check_data_too_old 14 | from .linter_0028_data_not_updated_recently import check_data_not_updated_recently 15 | from .linter_0032_utf_latin_columns import check_latin_columns 16 | from .linter_0034_missing_primary_index import check_missing_primary_index 17 | from .linter_0036_use_innodb import check_use_innodb 18 | from .linter_0070_insert_ignore import check_insert_ignore_queries 19 | from .linter_0074_single_column import check_single_column 20 | from .linter_0075_test_tables import check_test_tables 21 | from .linter_0089_empty_tables import check_empty_tables 22 | from .linter_0092_select_star import check_select_star 23 | from .linter_0093_having_clause import check_having_clause 24 | from .linter_0094_generic_primary_key import check_generic_primary_key 25 | from .linter_0118_high_offset_selects import check_high_offset_selects 26 | from .linter_0164_empty_database import check_empty_database 27 | from .linter_0031_low_cardinality_index import check_low_cardinality_index 28 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0026_full_table_scan.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for SELECT queries that use full table scan 3 | """ 4 | from collections import OrderedDict 5 | 6 | from indexdigest.utils import explain_queries, LinterEntry, shorten_query 7 | 8 | 9 | def check_full_table_scan(database, queries): 10 | """ 11 | Full table scan 12 | 13 | An operation that requires reading the entire contents of a table, rather than just selected 14 | portions using an index. Typically performed either with small lookup tables, or in data 15 | warehousing situations with large tables where all available data is aggregated and analyzed. 16 | How frequently these operations occur, and the sizes of the tables relative to available memory, 17 | have implications for the algorithms used in query optimization and managing the buffer pool. 18 | 19 | :type database indexdigest.database.Database 20 | :type queries list[str] 21 | :rtype: list[LinterEntry] 22 | """ 23 | for (query, table_used, _, row) in explain_queries(database, queries): 24 | # The output from EXPLAIN shows ALL in the type column when 25 | # MySQL uses a full table scan to resolve a query. 26 | if row['type'] != 'ALL': 27 | continue 28 | 29 | context = OrderedDict() 30 | context['query'] = query 31 | context['explain_rows'] = int(row['rows']) # we get string here when using MariaDB 10.5 32 | 33 | yield LinterEntry(linter_type='queries_using_full_table_scan', table_name=table_used, 34 | message='"{}" query triggered full table scan'. 
35 | format(shorten_query(query)), 36 | context=context) 37 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0075_test_tables.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters import check_test_tables 6 | from indexdigest.linters.linter_0075_test_tables import is_test_table 7 | from indexdigest.test import DatabaseTestMixin 8 | 9 | 10 | class TestTables(TestCase, DatabaseTestMixin): 11 | 12 | def test_is_test_table(self): 13 | assert is_test_table('test') is True 14 | assert is_test_table('some_guy_test_table') is True 15 | assert is_test_table('0075_some_guy_test_table') is True 16 | assert is_test_table('foo_test_bar') is True 17 | assert is_test_table('test_bar') is True 18 | assert is_test_table('foo_test') is True 19 | assert is_test_table('forum_creation_temp') is True 20 | 21 | assert is_test_table('foo_testing') is False 22 | assert is_test_table('test123') is False 23 | assert is_test_table('travis_tests') is False 24 | 25 | def test_check_test_table(self): 26 | reports = list(check_test_tables(self.connection)) 27 | 28 | print(list(map(str, reports))) 29 | 30 | self.assertEqual(len(reports), 2) 31 | 32 | self.assertEqual(str(reports[0]), 33 | '0004_image_comment_temp: "0004_image_comment_temp" seems to be a test table') 34 | self.assertTrue('CREATE TABLE `0004_image_comment_temp` (' in reports[0].context['schema']) 35 | 36 | self.assertEqual(str(reports[1]), 37 | '0075_some_guy_test_table: "0075_some_guy_test_table" seems to be a test table') 38 | self.assertTrue('CREATE TABLE `0075_some_guy_test_table` (' in reports[1].context['schema']) 39 | 40 | # assert False 41 | -------------------------------------------------------------------------------- /indexdigest/test/formatters/test_syslog.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase, mock 2 | 3 | from indexdigest import VERSION 4 | from indexdigest.formatters.syslog import _format_report, format_syslog 5 | from . 
import FormatterTestMixin 6 | 7 | from indexdigest.cli.script import get_reports 8 | 9 | 10 | class TestFormatter(TestCase, FormatterTestMixin): 11 | 12 | def test_format_report_helper(self): 13 | report = next(self.get_reports_mock()) 14 | out = _format_report(self.get_database_mock(), report) 15 | print(out, report) 16 | 17 | self.assertEqual( 18 | '{"appname": "index-digest", "meta": {"version": "index-digest v' + VERSION + '", "database_name": "test_database", ' 19 | '"database_host": "test.local", "database_version": "MySQL v1.2.3-test"}, ' 20 | '"report": {"type": "foo_linter", "table": "table_001", "message": "Something is fishy here", ' 21 | '"context": {"foo": 42, "test": "bar"}}}', 22 | out 23 | ) 24 | 25 | # assert False 26 | 27 | 28 | class TestFormatterIntegrationTest(TestCase, FormatterTestMixin): 29 | 30 | def test_format_for_real_reports(self): 31 | database = self.get_database() 32 | 33 | # pass all reports via syslog formatter 34 | for report in get_reports(database, analyze_data=True): 35 | _format_report(database, report) 36 | 37 | @mock.patch('syslog.syslog') 38 | def test_format_syslog(self, mocked_syslog: mock.MagicMock): 39 | reports = list(self.get_reports_mock()) 40 | format_syslog(database=self.get_database_mock(), reports=reports) 41 | 42 | assert mocked_syslog.called, 'syslog.syslog has been called' 43 | assert mocked_syslog.call_count == len(reports) 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | .pytest_cache/ 4 | *.py[cod] 5 | *$py.class 6 | *.swp 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env*/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # dotenv 85 | .env 86 | 87 | # virtualenv 88 | .venv 89 | venv/ 90 | ENV/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | 105 | .idea/ 106 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0031_low_cardinality_index.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0031_low_cardinality_index import \ 6 | check_low_cardinality_index, get_low_cardinality_indices, INDEX_CARDINALITY_THRESHOLD 7 | from indexdigest.test import DatabaseTestMixin 8 | 9 | 10 | class TestLinter(TestCase, DatabaseTestMixin): 11 | 12 | def setUp(self) -> None: 13 | self.skipTest(reason="test_0031_low_cardinality_index is not stable") 14 | 15 | def test_get_low_cardinality_indices(self): 16 | indices = list(get_low_cardinality_indices(self.connection)) 17 | 18 | print(indices) 19 | 20 | assert len(indices) == 1 21 | 22 | index = indices[0] 23 | assert index[0] == '0020_big_table' 24 | assert index[2]['INDEX_NAME'] == 'num_idx' 25 | assert index[2]['COLUMN_NAME'] == 'num' 26 | assert index[2]['CARDINALITY'] >= 1 27 | assert index[2]['CARDINALITY'] <= INDEX_CARDINALITY_THRESHOLD 28 | 29 | def test_low_cardinality_index(self): 30 | reports = list(check_low_cardinality_index(self.connection)) 31 | 32 | print(reports, reports[0].context) 33 | 34 | assert len(reports) == 1 35 | 36 | assert str(reports[0]) == '0020_big_table: "num_idx" index on "num" column ' \ 37 | 'has low cardinality, check if it is needed' 38 | assert reports[0].table_name == '0020_big_table' 39 | 40 | assert reports[0].context['column_name'] == 'num' 41 | assert reports[0].context['index_name'] == 'num_idx' 42 | assert isinstance(reports[0].context['index_cardinality'], int) 43 | 44 | self.assertAlmostEqual(int(reports[0].context['value_usage']), 50, delta=5) 45 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0032_utf_latin_columns.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter reports text columns that use a non-UTF character encoding (e.g. latin1) 3 | """ 4 | from collections import OrderedDict 5 | 6 | from indexdigest.utils import LinterEntry 7 | 8 | 9 | def is_text_column_latin(column): 10 | """ 11 | :type column indexdigest.schema.Column 12 | :rtype: bool 13 | """ 14 | if not column.is_text_type(): 15 | return False 16 | 17 | # ignore blob columns without specified character set 
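    # (BLOB and other binary-type columns typically report no character set, hence the None check)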
18 | if column.character_set is None: 19 | return False 20 | 21 | # ignore utf8 columns 22 | # utf8, ucs2, utf8mb4, utf16, utf16le, utf32 23 | # @see https://dev.mysql.com/doc/refman/5.7/en/charset-unicode.html 24 | if column.character_set.startswith('utf') or column.character_set in ['ucs2', 'binary']: 25 | return False 26 | 27 | return True 28 | 29 | 30 | def check_latin_columns(database): 31 | """ 32 | :type database indexdigest.database.Database 33 | :rtype: list[LinterEntry] 34 | """ 35 | for table in database.get_tables(): 36 | for column in database.get_table_columns(table): 37 | if not is_text_column_latin(column): 38 | continue 39 | 40 | # print([table, column, column.character_set, column.collation]) 41 | 42 | context = OrderedDict() 43 | context['column'] = column.name 44 | context['column_character_set'] = column.character_set 45 | context['column_collation'] = column.collation 46 | context['schema'] = database.get_table_schema(table) 47 | 48 | yield LinterEntry(linter_type='non_utf_columns', table_name=table, 49 | message='"{}" text column has "{}" character set defined'. 50 | format(column.name, column.character_set), 51 | context=context) 52 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0019_queries_not_using_indices.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks SELECT queries that do not use indices 3 | """ 4 | from collections import OrderedDict 5 | 6 | from indexdigest.utils import LinterEntry, explain_queries, shorten_query 7 | 8 | 9 | def check_queries_not_using_indices(database, queries): 10 | """ 11 | :type database indexdigest.database.Database 12 | :type queries list[str] 13 | :rtype: list[LinterEntry] 14 | """ 15 | for (query, table_used, index_used, explain_row) in explain_queries(database, queries): 16 | # print(query, explain_row) 17 | 18 | # EXPLAIN can return no matching row in const table in Extra column. 19 | # Do not consider this query as not using an index. -- see #44 and #210 20 | if explain_row['Extra'] in [ 21 | 'Impossible WHERE noticed after reading const tables', 22 | 'no matching row in const table', 23 | 'No tables used', 24 | 'Select tables optimized away', 25 | 'No matching min/max row', 26 | ]: 27 | continue 28 | 29 | if index_used is None: 30 | context = OrderedDict() 31 | context['query'] = query 32 | 33 | # https://dev.mysql.com/doc/refman/5.7/en/explain-output.html#explain-extra-information 34 | context['explain_extra'] = explain_row['Extra'] 35 | context['explain_rows'] = explain_row['rows'] 36 | context['explain_filtered'] = explain_row.get('filtered') # can be not set 37 | context['explain_possible_keys'] = explain_row['possible_keys'] 38 | 39 | yield LinterEntry(linter_type='queries_not_using_index', table_name=table_used, 40 | message='"{}" query did not make use of any index'. 
41 | format(shorten_query(query)), 42 | context=context) 43 | -------------------------------------------------------------------------------- /.github/workflows/dockerimage.yml: -------------------------------------------------------------------------------- 1 | name: Build and test a Docker image 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | 8 | jobs: 9 | 10 | docker_build: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v5.0.1 15 | 16 | - name: Build the Docker image 17 | env: 18 | # @see https://docs.docker.com/develop/develop-images/build_enhancements/ 19 | DOCKER_BUILDKIT: "1" 20 | run: | 21 | docker build . \ 22 | --cache-from ghcr.io/macbre/index-digest:latest \ 23 | --build-arg BUILDKIT_INLINE_CACHE=1 \ 24 | --build-arg GITHUB_SHA=$(git rev-parse --short HEAD) \ 25 | --tag ${{ github.repository }} 26 | 27 | docker images 28 | 29 | echo "## Image labels:" 30 | docker inspect --format='{{json .Config.Labels}}' ${{ github.repository }} | jq 31 | 32 | echo "## Image env vars:" 33 | docker inspect --format='{{json .Config.Env}}' ${{ github.repository }} | jq 34 | 35 | - name: Check the version 36 | run: | 37 | docker run ${{ github.repository }} --version 38 | 39 | docker_test: 40 | runs-on: ubuntu-latest 41 | needs: docker_build 42 | 43 | services: 44 | mysql: 45 | image: mysql:8.0.22 46 | env: 47 | MYSQL_ALLOW_EMPTY_PASSWORD: yes 48 | MYSQL_DATABASE: index_digest 49 | MYSQL_USER: test 50 | MYSQL_PASSWORD: p4ss 51 | ports: 52 | - "53306:3306" 53 | options: --health-cmd="mysqladmin ping" --health-interval=10s --health-timeout=5s --health-retries=3 54 | 55 | steps: 56 | - name: Run the container and connect to the test database 57 | run: | 58 | docker ps 59 | docker run --network=host ${{ github.repository }} mysql://test:p4ss@0.0.0.0:53306/index_digest | tee /tmp/results 60 | grep "Jolly, good! 
No issues to report" /tmp/results 61 | -------------------------------------------------------------------------------- /sql/0028-data-too-old.sql: -------------------------------------------------------------------------------- 1 | -- Report tables that have really old data 2 | -- Worth checking if such long data retention is actually needed 3 | -- 4 | -- https://github.com/macbre/index-digest/issues/28 5 | DROP TABLE IF EXISTS `0028_data_too_old`; 6 | CREATE TABLE `0028_data_too_old` ( 7 | `item_id` int(8) unsigned NOT NULL AUTO_INCREMENT, 8 | `cnt` int(8) unsigned NOT NULL, 9 | `timestamp` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, 10 | PRIMARY KEY (`item_id`) 11 | ) ENGINE=InnoDB; 12 | 13 | 14 | -- table with old data (6 months old) 15 | INSERT INTO 0028_data_too_old VALUES 16 | (1, 12, NOW() - INTERVAL 6 MONTH), 17 | (2, 20, NOW() - INTERVAL 3 MONTH), 18 | (3, 42, NOW()); 19 | 20 | INSERT INTO 0028_data_too_old(cnt) VALUES 21 | (52); 22 | 23 | 24 | -- table with no old data 25 | DROP TABLE IF EXISTS `0028_data_ok`; 26 | CREATE TABLE `0028_data_ok` LIKE `0028_data_too_old`; 27 | 28 | INSERT INTO 0028_data_ok(cnt, `timestamp`) VALUES 29 | (1, NOW() - INTERVAL 7 DAY); 30 | 31 | 32 | -- empty tables should be simply ignored 33 | DROP TABLE IF EXISTS `0028_data_empty`; 34 | CREATE TABLE `0028_data_empty` LIKE `0028_data_too_old`; 35 | 36 | -- table with no time columns 37 | DROP TABLE IF EXISTS `0028_no_time`; 38 | CREATE TABLE `0028_no_time` ( 39 | `item_id` int(8) unsigned NOT NULL AUTO_INCREMENT, 40 | `cnt` int(8) unsigned NOT NULL, 41 | PRIMARY KEY (`item_id`) 42 | ) ENGINE=InnoDB; 43 | 44 | -- MediaWiki timestamp columns 45 | -- @see https://www.mediawiki.org/wiki/Manual:Revision_table 46 | DROP TABLE IF EXISTS `0028_revision`; 47 | CREATE TABLE `0028_revision` ( 48 | `rev_id` int(10) unsigned NOT NULL AUTO_INCREMENT, 49 | `rev_timestamp` binary(14) NOT NULL, 50 | PRIMARY KEY (`rev_id`) 51 | ) ENGINE=InnoDB; 52 | 53 | INSERT INTO 0028_revision(rev_id, `rev_timestamp`) VALUES 54 | (1, '20180101000000'), 55 | (2, '2018010'); -- #129: Incorrect datetime value 56 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0028_data_too_old.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.database import Database 6 | from indexdigest.linters.linter_0028_data_too_old import check_data_too_old 7 | from indexdigest.test import DatabaseTestMixin 8 | 9 | 10 | class LimitedViewDatabase(Database, DatabaseTestMixin): 11 | """ 12 | Limit test to tables 13 | """ 14 | def get_tables(self): 15 | return [ 16 | '0028_data_too_old', 17 | '0028_data_ok', 18 | '0028_data_empty', 19 | '0028_no_time', 20 | '0028_data_not_updated_recently', 21 | '0028_revision', 22 | ] 23 | 24 | 25 | class TestLinter(TestCase, DatabaseTestMixin): 26 | 27 | @property 28 | def connection(self): 29 | return LimitedViewDatabase.connect_dsn(self.DSN) 30 | 31 | def test_data_too_old(self): 32 | reports = list(check_data_too_old(self.connection)) 33 | 34 | print(list(map(str, reports))) 35 | 36 | assert len(reports) == 1 37 | 38 | assert str(reports[0]).startswith('0028_data_too_old: "0028_data_too_old" has rows added 18') # .. 
184 days ago 39 | assert str(reports[0]).endswith('consider changing retention policy') 40 | # self.assertAlmostEquals(reports[0].context['diff_days'], 184) 41 | assert reports[0].table_name == '0028_data_too_old' 42 | 43 | assert 'data_since' in reports[0].context 44 | assert 'data_until' in reports[0].context 45 | assert 'table_size_mb' in reports[0].context 46 | 47 | assert reports[0].context['date_column_name'] == 'timestamp' 48 | 49 | def test_data_too_old_with_custom_threshold(self): 50 | env = { 51 | 'INDEX_DIGEST_DATA_TOO_OLD_THRESHOLD_DAYS': str(365 * 86400) 52 | } 53 | 54 | reports = list(check_data_too_old(self.connection, env)) 55 | 56 | print(list(map(str, reports))) 57 | assert len(reports) == 0 58 | -------------------------------------------------------------------------------- /indexdigest/test/core/test_utils.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from indexdigest.utils import is_select_query, parse_dsn, shorten_query 4 | 5 | 6 | class TestUtils(TestCase): 7 | 8 | def test_parse_dsn(self): 9 | parsed = parse_dsn('mysql://alex:pwd@localhost/test') 10 | 11 | self.assertEqual('localhost', parsed['host']) 12 | self.assertEqual(3306, parsed['port']) 13 | self.assertEqual('alex', parsed['user']) 14 | self.assertEqual('pwd', parsed['passwd']) 15 | self.assertEqual('test', parsed['db']) 16 | 17 | def test_parse_dsn_with_port(self): 18 | parsed = parse_dsn('mysql://alex:pwd@localhost:5000/test') 19 | 20 | self.assertEqual('localhost', parsed['host']) 21 | self.assertEqual(5000, parsed['port']) 22 | self.assertEqual('alex', parsed['user']) 23 | self.assertEqual('pwd', parsed['passwd']) 24 | self.assertEqual('test', parsed['db']) 25 | 26 | def test_is_select_query(self): 27 | assert is_select_query('SELECT * FROM foo') 28 | assert is_select_query('select * from foo') 29 | assert is_select_query('SELECT * FROM foo;') 30 | assert is_select_query(' SELECT * FROM foo;') 31 | assert is_select_query('/* foo */ SELECT * FROM foo;') 32 | 33 | assert is_select_query('BEGIN') is False 34 | assert is_select_query('COMMIT') is False 35 | assert is_select_query('/* SELECT */ COMMIT') is False 36 | assert is_select_query('TRUNCATE foo;') is False 37 | assert is_select_query('UPDATE foo SET bar=42 WHERE id=1') is False 38 | 39 | def test_shorten_query(self): 40 | self.assertEqual('SELECT * FROM foo', shorten_query('SELECT * FROM foo')) 41 | self.assertEqual('SELECT * FROM foo', shorten_query('SELECT * FROM foo', max_len=18)) 42 | self.assertEqual('SELECT * FROM foo', shorten_query('SELECT * FROM foo', max_len=17)) 43 | self.assertEqual('SELECT * FROM fo...', shorten_query('SELECT * FROM foo', max_len=16)) 44 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0092_select_star.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0092_select_star import check_select_star, is_wildcard_query 6 | from indexdigest.test import DatabaseTestMixin, read_queries_from_log 7 | 8 | 9 | class TestLinter(TestCase, DatabaseTestMixin): 10 | 11 | def test_is_wildcard_query(self): 12 | assert is_wildcard_query('SELECT * FROM foo;') 13 | assert is_wildcard_query('SELECT t.* FROM foo AS t;') 14 | assert is_wildcard_query('SELECT * FROM `user` WHERE user_id = 34994913 LIMIT 1') 15 | assert is_wildcard_query('/* 
User::loadFromDatabase */ SELECT * FROM `user` WHERE user_id = 34994913 LIMIT 1') 16 | assert is_wildcard_query('SELECT /* User::loadFromDatabase */ * FROM `user` WHERE user_id = 34994913 LIMIT 1') 17 | 18 | assert is_wildcard_query('SELECT id FROM foo') is False 19 | assert is_wildcard_query('SELECT (id+2) * 2 FROM foo') is False 20 | assert is_wildcard_query('SELECT 3 * 3') is False 21 | assert is_wildcard_query('SELECT count(*) FROM foo') is False 22 | assert is_wildcard_query('SELECT /* foo */ test FROM foo') is False 23 | 24 | assert is_wildcard_query('INSERT * INTO foo') is False 25 | 26 | # assert False 27 | 28 | def test_check_select_star(self): 29 | reports = list(check_select_star(self.connection, read_queries_from_log('0092-select-star-log'))) 30 | 31 | print(list(map(str, reports))) 32 | 33 | assert len(reports) == 2 34 | 35 | assert str(reports[0]) == 'foo: "SELECT * FROM foo" query uses SELECT *' 36 | assert reports[0].table_name == 'foo' 37 | assert reports[0].context['query'] == 'SELECT * FROM foo;' 38 | 39 | assert str(reports[1]) == 'bar: "SELECT t.* FROM bar AS t" query uses SELECT *' 40 | assert reports[1].table_name == 'bar' 41 | assert reports[1].context['query'] == 'SELECT t.* FROM bar AS t;' 42 | 43 | # assert False 44 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0019_queries_not_using_indices.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0019_queries_not_using_indices import check_queries_not_using_indices 6 | from indexdigest.test import DatabaseTestMixin, read_queries_from_log 7 | 8 | 9 | class TestQueriesNotUsingIndices(TestCase, DatabaseTestMixin): 10 | 11 | def test_queries(self): 12 | reports = list(check_queries_not_using_indices( 13 | database=self.connection, queries=read_queries_from_log('0019-queries-not-using-indices-log'))) 14 | 15 | print(*[f"{report.message} ({report.context['explain_extra']})" for report in reports], sep="\n") 16 | assert len(reports) == 3 17 | 18 | self.assertEqual(str(reports[0]), '0019_queries_not_using_indices: "SELECT item_id FROM 0019_queries_not_using_indices..." 
query did not make use of any index') 19 | self.assertEqual(reports[0].table_name, '0019_queries_not_using_indices') 20 | self.assertEqual(str(reports[0].context['query']), 'SELECT item_id FROM 0019_queries_not_using_indices WHERE foo = "test" OR item_id > 1;') 21 | self.assertEqual(str(reports[0].context['explain_extra']), 'Using where') 22 | self.assertEqual(str(reports[0].context['explain_rows']), '3') 23 | 24 | self.assertEqual(reports[1].table_name, '0019_queries_not_using_indices') 25 | self.assertEqual(str(reports[1].context['query']), 'SELECT item_id FROM 0019_queries_not_using_indices WHERE foo = "test"') 26 | self.assertEqual(str(reports[1].context['explain_extra']), 'Using where') 27 | self.assertEqual(str(reports[1].context['explain_rows']), '3') 28 | 29 | self.assertEqual(reports[2].table_name, '0019_queries_not_using_indices') 30 | self.assertEqual(str(reports[2].context['query']), 'SELECT 1 AS one FROM dual WHERE exists ( SELECT item_id FROM 0019_queries_not_using_indices WHERE foo = "test" );') 31 | self.assertEqual(str(reports[2].context['explain_extra']), 'Using where') 32 | self.assertEqual(str(reports[2].context['explain_rows']), '3') 33 | 34 | # assert False 35 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0070_insert_ignore.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks INSERT IGNORE queries 3 | 4 | If you use the IGNORE modifier, errors that occur while executing the INSERT statement are ignored. 5 | For example, without IGNORE, a row that duplicates an existing UNIQUE index or PRIMARY KEY value 6 | in the table causes a duplicate-key error and the statement is aborted. With IGNORE, the row is 7 | discarded and no error occurs. Ignored errors generate warnings instead. 8 | 9 | Data conversions that would trigger errors abort the statement if IGNORE is not specified. 10 | With IGNORE, invalid values are adjusted to the closest values and inserted; warnings 11 | are produced but the statement does not abort. 12 | 13 | @see https://medium.com/legacy-systems-diary/things-to-avoid-episode-1-insert-ignore-535b4c24406b 14 | """ 15 | import re 16 | 17 | from collections import OrderedDict 18 | from sql_metadata.compat import get_query_tables 19 | 20 | from indexdigest.utils import LinterEntry, shorten_query 21 | 22 | 23 | def remove_comments(sql): 24 | """ 25 | :type sql str 26 | :rtype: str 27 | """ 28 | return re.sub(r'/\*[^*]+\*/', '', sql) 29 | 30 | 31 | def is_insert_ignore_query(sql): 32 | """ 33 | :type sql str 34 | :rtype: bool 35 | """ 36 | sql = remove_comments(sql).lstrip() 37 | return re.match(r'^INSERT\s+IGNORE\s', sql, flags=re.IGNORECASE) is not None 38 | 39 | 40 | def check_insert_ignore_queries(database, queries): 41 | """ 42 | :type database indexdigest.database.Database 43 | :type queries list[str] 44 | :rtype: list[LinterEntry] 45 | """ 46 | queries = [query for query in queries if is_insert_ignore_query(query)] 47 | 48 | for query in queries: 49 | table_used = get_query_tables(query)[0] 50 | 51 | context = OrderedDict() 52 | context['query'] = query 53 | context['schema'] = database.get_table_schema(table_used) 54 | 55 | yield LinterEntry(linter_type='insert_ignore', table_name=table_used, 56 | message='"{}" query uses a risky INSERT IGNORE'. 
57 | format(shorten_query(query)), 58 | context=context) 59 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0028_data_not_updated_recently.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0028_data_not_updated_recently \ 6 | import check_data_not_updated_recently, get_time_columns 7 | from indexdigest.test import DatabaseTestMixin 8 | from .test_0028_data_too_old import LimitedViewDatabase 9 | 10 | 11 | class TestLinter(TestCase, DatabaseTestMixin): 12 | 13 | @property 14 | def connection(self): 15 | return LimitedViewDatabase.connect_dsn(self.DSN) 16 | 17 | def test_get_time_columns(self): 18 | columns = list(get_time_columns(self.connection)) 19 | 20 | assert len(columns) == 5 21 | 22 | assert columns[0][0] == '0028_data_too_old' 23 | assert columns[0][1].name == 'timestamp' 24 | 25 | assert columns[4][0] == '0028_revision' 26 | assert columns[4][1].name == 'rev_timestamp' 27 | 28 | print(list(columns)) 29 | # assert False 30 | 31 | def test_data_not_updated_recently(self): 32 | reports = list(check_data_not_updated_recently(self.connection)) 33 | 34 | print(list(map(str, reports))) 35 | 36 | assert len(reports) == 1 37 | 38 | assert str(reports[0]).startswith('0028_data_not_updated_recently: "0028_data_not_updated_recently" ' 39 | 'has the latest row added ') 40 | assert str(reports[0]).endswith('consider checking if it should be up-to-date') 41 | assert abs(reports[0].context['diff_days'] - 40) < 2, 'diff_days is around 40 days' 42 | assert reports[0].table_name == '0028_data_not_updated_recently' 43 | 44 | assert 'data_since' in reports[0].context 45 | assert 'data_until' in reports[0].context 46 | assert 'table_size_mb' in reports[0].context 47 | 48 | def test_data_not_updated_recently_with_custom_threshold(self): 49 | env = { 50 | 'INDEX_DIGEST_DATA_NOT_UPDATED_RECENTLY_THRESHOLD_DAYS': str(60 * 86400) 51 | } 52 | 53 | reports = list(check_data_not_updated_recently(self.connection, env)) 54 | 55 | print(list(map(str, reports))) 56 | assert len(reports) == 0 57 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0027_selects_with_like.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0027_selects_with_like import check_selects_with_like, query_uses_leftmost_like 6 | from indexdigest.test import DatabaseTestMixin, read_queries_from_log 7 | 8 | 9 | class TestSelectsWithLike(TestCase, DatabaseTestMixin): 10 | 11 | def test_query_uses_leftmost_like(self): 12 | self.assertTrue(query_uses_leftmost_like("SELECT * FROM foo WHERE bar LIKE '%baz';")) 13 | self.assertTrue(query_uses_leftmost_like('SELECT * FROM foo WHERE bar LIKE "%baz";')) 14 | self.assertTrue(query_uses_leftmost_like('SELECT * FROM foo WHERE bar like "%baz";')) 15 | self.assertTrue(query_uses_leftmost_like('SELECT * FROM foo WHERE bar like "%123";')) 16 | self.assertTrue(query_uses_leftmost_like('SELECT * FROM foo WHERE bar like\n"%123";')) 17 | self.assertTrue(query_uses_leftmost_like('SELECT * FROM foo WHERE bar like "%123";')) 18 | 19 | self.assertFalse(query_uses_leftmost_like("SELECT * FROM foo WHERE bar = 'baz'")) 20 | self.assertFalse(query_uses_leftmost_like("SELECT * FROM foo WHERE like 
= 'foo'")) 21 | self.assertFalse(query_uses_leftmost_like("SELECT * FROM foo WHERE bar LIKE 'b%z';")) 22 | self.assertFalse(query_uses_leftmost_like("SELECT * FROM foo WHERE bar LIKE 'ba%';")) 23 | 24 | def test_queries(self): 25 | reports = list(check_selects_with_like( 26 | database=self.connection, queries=read_queries_from_log('0027-selects-with-like-log'))) 27 | 28 | print(reports, reports[0].context) 29 | 30 | self.assertEqual(len(reports), 1) 31 | 32 | self.assertEqual(str(reports[0]), '0020_big_table: "SELECT * FROM 0020_big_table WHERE text LIKE \'%00\'" query uses LIKE with left-most wildcard') 33 | self.assertEqual(reports[0].table_name, '0020_big_table') 34 | self.assertEqual(str(reports[0].context['query']), 'SELECT * FROM 0020_big_table WHERE text LIKE \'%00\'') 35 | self.assertEqual(str(reports[0].context['explain_extra']), 'Using where') 36 | self.assertTrue(reports[0].context['explain_rows'] > 10000) 37 | 38 | # assert False 39 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Integration tests 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | 8 | jobs: 9 | integrations_tests: 10 | runs-on: ubuntu-latest 11 | 12 | strategy: 13 | # Do not fail if one the tests did not pass 14 | fail-fast: false 15 | 16 | matrix: 17 | # Docker images of MySQL-compliant databases to run the tests suite on 18 | database: 19 | # https://hub.docker.com/_/mysql?tab=tags 20 | - "mysql:5.7.32" 21 | - "mysql:8.0.22" 22 | - "mysql:8.1.0" 23 | - "mysql:9.4.0" 24 | # https://hub.docker.com/_/mariadb?tab=tags 25 | - "mariadb:10.1" 26 | - "mariadb:10.2" 27 | - "mariadb:10.5" 28 | - "mariadb:10.6" 29 | - "mariadb:11.8" 30 | - "mariadb:12.0" 31 | # https://hub.docker.com/_/percona?tab=tags 32 | - "percona:8.0.22-13" 33 | 34 | services: 35 | mysql: 36 | image: ${{ matrix.database }} 37 | env: 38 | MYSQL_ALLOW_EMPTY_PASSWORD: yes 39 | MYSQL_DATABASE: index_digest 40 | ports: 41 | - "53306:3306" 42 | options: --health-cmd="mysqladmin ping || mariadb-admin ping" --health-interval=10s --health-timeout=5s --health-retries=6 43 | 44 | steps: 45 | - uses: actions/checkout@v5.0.1 46 | 47 | # https://github.com/actions/setup-python?tab=readme-ov-file#caching-packages-dependencies 48 | - name: Set up Python 49 | uses: actions/setup-python@v6 50 | with: 51 | python-version: "3.14" 52 | cache: 'pip' # dependencies caching 53 | cache-dependency-path: 'setup.py' 54 | 55 | - name: Install dependencies 56 | run: | 57 | pip install wheel 58 | make install 59 | 60 | - name: Set up the database 61 | run: | 62 | docker ps 63 | mysql --protocol=tcp --port=53306 -u root --password="" -v < setup.sql 64 | # import the test schema files 65 | "./sql/populate.sh" 66 | mysql --protocol=tcp --port=53306 -uindex_digest -pqwerty index_digest -v -e '\s; SHOW TABLES; SHOW DATABASES;' 67 | 68 | - name: Tests 69 | run: make test 70 | -------------------------------------------------------------------------------- /indexdigest/formatters/plain.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Provides --format=plain results formatter 4 | """ 5 | from termcolor import colored 6 | 7 | import indexdigest 8 | from indexdigest.utils import LinterEntry 9 | 10 | 11 | def format_context(context): 12 | """ 13 | :type context dict 14 | :rtype: str 15 | """ 16 | return '\n '.join([ 17 | "- {key}: 
{value}".format( 18 | key=colored(key, color='green', attrs=['bold']), 19 | value=str(value).replace("\n", "\n ") 20 | ) 21 | for (key, value) in context.items() 22 | ]) 23 | 24 | 25 | def format_plain(database, reports): 26 | """ 27 | :type database indexdigest.database.Database 28 | :type reports list 29 | :rtype: str 30 | """ 31 | out = '' 32 | 33 | # cast to a list (to be able to count reports) 34 | reports = list(reports) 35 | 36 | # emit results 37 | line = '-' * 60 + "\n" 38 | 39 | out += line 40 | out += 'Found {} issue(s) to report for "{}" database\n'.format( 41 | len(reports), database.db_name) 42 | out += line 43 | out += 'MySQL v{} at {}\n'.format( 44 | database.get_server_version(), database.get_server_hostname()) 45 | out += 'index-digest v{}\n'.format(indexdigest.VERSION) 46 | out += line 47 | 48 | if reports: 49 | for report in reports: 50 | assert isinstance(report, LinterEntry) 51 | 52 | out += colored(report.linter_type, color='blue', attrs=['bold']) + \ 53 | ' → table affected: ' + \ 54 | colored(report.table_name, attrs=['bold']) + \ 55 | '\n' 56 | 57 | out += colored( 58 | '\n{} {}\n'.format(colored('✗', color='red', attrs=['bold']), report.message), 59 | color='white') 60 | 61 | if report.context is not None: 62 | out += '\n {}\n'.format(format_context(report.context)) 63 | 64 | out += '\n' 65 | out += line 66 | 67 | out += 'Queries performed: {}'.format(len(database.get_queries())) 68 | # out += '\n'.join(map(str, database.get_queries()))) 69 | else: 70 | out += 'Jolly, good! No issues to report' 71 | 72 | return out 73 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0004_redundant_indices.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for redundant indices from a given set of them 3 | """ 4 | import logging 5 | 6 | from collections import OrderedDict 7 | 8 | from indexdigest.utils import LinterEntry 9 | 10 | 11 | def get_redundant_indices(indices): 12 | """ 13 | :type indices list[indexdigest.schema.Index] 14 | :rtype: list[tuple] 15 | """ 16 | redundant_indices = [] 17 | 18 | for index in indices: 19 | for compare in indices: 20 | if index.is_covered_by(compare): 21 | redundant_indices.append((index, compare, )) 22 | 23 | return redundant_indices 24 | 25 | 26 | def check_redundant_indices(database): 27 | """ 28 | :type database indexdigest.database.Database 29 | :rtype: list[LinterEntry] 30 | """ 31 | logger = logging.getLogger(__name__) 32 | 33 | for table in database.get_tables(): 34 | logger.info("Checking %s table", table) 35 | 36 | indices = database.get_table_indices(table) 37 | meta = database.get_table_metadata(table) 38 | schema = database.get_table_schema(table) 39 | 40 | redundant_indices = set() 41 | 42 | for (redundant_index, suggested_index) in get_redundant_indices(indices): 43 | # the index we're about to suggest was reported as redundant - #48 44 | if suggested_index in redundant_indices: 45 | continue 46 | 47 | context = OrderedDict() 48 | context['redundant'] = str(redundant_index) 49 | context['covered_by'] = str(suggested_index) 50 | context['schema'] = schema 51 | context['table_data_size_mb'] = 1. * meta['data_size'] / 1024 / 1024 52 | context['table_index_size_mb'] = 1. 
* meta['index_size'] / 1024 / 1024 53 | 54 | # add to the list to avoid redundant indices being reported in a loop - #48 55 | redundant_indices.add(redundant_index) 56 | 57 | yield LinterEntry(linter_type='redundant_indices', table_name=table, 58 | message='"{}" index can be removed as redundant (covered by "{}")'. 59 | format(redundant_index.name, suggested_index.name), 60 | context=context) 61 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0093_having_clause.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0093_having_clause import query_has_having_clause, check_having_clause 6 | from indexdigest.test import DatabaseTestMixin, read_queries_from_log 7 | 8 | 9 | class TestLinter(TestCase, DatabaseTestMixin): 10 | 11 | def test_query_has_having_clause(self): 12 | assert query_has_having_clause('SELECT * FROM foo having bar = 2') 13 | assert query_has_having_clause('SELECT * FROM foo HAVING bar = 2') 14 | 15 | assert query_has_having_clause("SELECT * FROM 0019_queries_not_using_indices " 16 | "WHERE foo = 'foo' HAVING bar = 'test'") 17 | assert query_has_having_clause("SELECT s.cust_id,count(s.cust_id) FROM SH.sales s " 18 | "GROUP BY s.cust_id HAVING s.cust_id != '1660' AND s.cust_id != '2'") 19 | 20 | assert query_has_having_clause('SELECT * FROM foo') is False 21 | assert query_has_having_clause('SELECT * FROM foo_having LIMIT 10') is False 22 | assert query_has_having_clause('SELECT /* having */ id FROM foo') is False 23 | 24 | assert query_has_having_clause('INSERT 42 INTO having') is False 25 | 26 | def test_having_clause(self): 27 | reports = list(check_having_clause(self.connection, read_queries_from_log('0093-having-clause-log'))) 28 | 29 | print(list(map(str, reports))) 30 | 31 | assert len(reports) == 3 32 | 33 | assert str(reports[0]) == 'foo: "SELECT * FROM foo HAVING bar = 2" query uses HAVING clause' 34 | assert reports[0].table_name == 'foo' 35 | assert reports[0].context['query'] == 'SELECT * FROM foo HAVING bar = 2;' 36 | 37 | assert str(reports[1]) == 'SH.sales: "SELECT s.cust_id,count(s.cust_id) ' \ 38 | 'FROM SH.sales s ..." query uses HAVING clause' 39 | assert reports[1].table_name == 'SH.sales' 40 | 41 | assert str(reports[2]) == '0019_queries_not_using_indices: "SELECT * FROM ' \ 42 | '`0019_queries_not_using_indices` WHE..." 
query uses HAVING clause' 43 | assert reports[2].table_name == '0019_queries_not_using_indices' 44 | 45 | # assert False 46 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0028_data_not_updated_recently.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter looks for tables that were not updated recently 3 | """ 4 | from collections import OrderedDict 5 | from datetime import datetime 6 | from time import time 7 | 8 | from indexdigest.utils import LinterEntry 9 | 10 | from .linter_0028_data_too_old import get_time_columns, get_boundary_times 11 | 12 | 13 | def check_data_not_updated_recently(database, env=None): 14 | """ 15 | :type database indexdigest.database.Database 16 | :type env dict 17 | :rtype: list[LinterEntry] 18 | """ 19 | now = int(time())  # I will probably never understand dates handling in Python 20 | 21 | # set up a diff threshold (in days) 22 | env = env if env else dict() 23 | diff_threshold = int(env.get('INDEX_DIGEST_DATA_NOT_UPDATED_RECENTLY_THRESHOLD_DAYS', 30)) 24 | 25 | for (table_name, column) in get_time_columns(database): 26 | timestamps = get_boundary_times(database, table_name, column) 27 | if timestamps is None: 28 | continue 29 | 30 | diff = now - timestamps.get('max') 31 | # print(table_name, column, timestamps, now, diff) 32 | 33 | if diff > diff_threshold * 86400: 34 | diff_days = int(diff / 86400) 35 | 36 | metadata = database.get_table_metadata(table_name) 37 | 38 | context = OrderedDict() 39 | context['diff_days'] = diff_days 40 | context['data_since'] = str(datetime.fromtimestamp(timestamps.get('min'))) 41 | context['data_until'] = str(datetime.fromtimestamp(timestamps.get('max'))) 42 | context['date_column_name'] = str(column) 43 | context['schema'] = database.get_table_schema(table_name) 44 | context['rows'] = database.get_table_rows_estimate(table_name) 45 | context['table_size_mb'] = \ 46 | 1. * (metadata['data_size'] + metadata['index_size']) / 1024 / 1024 47 | 48 | yield LinterEntry(linter_type='data_not_updated_recently', table_name=table_name, 49 | message='"{}" has the latest row added {} days ago, ' 50 | 'consider checking if it should be up-to-date'. 
51 | format(table_name, diff_days), 52 | context=context) 53 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0004_redundant_indices.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0004_redundant_indices import get_redundant_indices, check_redundant_indices 6 | from indexdigest.test import DatabaseTestMixin 7 | 8 | 9 | class TestRedundantIndices(TestCase, DatabaseTestMixin): 10 | 11 | def test_get_redundant_indices_from_database(self): 12 | indices = self.connection.get_table_indices('0004_id_foo_bar') 13 | redundant_indices = get_redundant_indices(indices) 14 | (entry,) = redundant_indices 15 | 16 | print(entry) 17 | 18 | self.assertEqual(len(redundant_indices), 1) 19 | 20 | # idx_foo is covered by idx_foo_bar 21 | self.assertEqual(entry[0].name, 'idx_foo') 22 | self.assertEqual(entry[1].name, 'idx_foo_bar') 23 | 24 | # assert False 25 | 26 | def test_check_redundant_indices(self): 27 | reports = check_redundant_indices(self.connection) 28 | reports = list(filter( 29 | lambda i: i.table_name.startswith('0004_'), 30 | reports 31 | )) 32 | 33 | print(list(map(str,reports))) 34 | 35 | self.assertEqual(len(reports), 3) 36 | self.assertEqual(str(reports[0]), '0004_id_foo: "idx" index can be removed as redundant (covered by "PRIMARY")') 37 | self.assertEqual(str(reports[1]), '0004_id_foo_bar: "idx_foo" index can be removed as redundant (covered by "idx_foo_bar")') 38 | self.assertEqual(str(reports[2]), '0004_indices_duplicating_each_other: "idx_foo" index can be removed as redundant (covered by "idx_foo_2")') 39 | 40 | report = reports[0] 41 | 42 | print(report, report.context) 43 | 44 | self.assertEqual(str(report.context['redundant']), 'UNIQUE KEY idx (item_id, foo)') 45 | self.assertEqual(str(report.context['covered_by']), 'PRIMARY KEY (item_id, foo)') 46 | 47 | # and we have size reported as well (see #16) 48 | self.assertTrue(report.context['table_data_size_mb'] > 0) 49 | self.assertTrue(report.context['table_index_size_mb'] > 0) 50 | 51 | # and we a schema reported in the context (see #16) 52 | self.assertTrue('CREATE TABLE' in report.context['schema']) 53 | self.assertTrue('AUTO_INCREMENT' in report.context['schema']) 54 | self.assertTrue('ENGINE=' in report.context['schema']) 55 | 56 | # assert False 57 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import re 2 | from setuptools import setup, find_packages 3 | 4 | # take the version 5 | with open("indexdigest/__init__.py", "r") as fh: 6 | # e.g. 
VERSION = '1.5.0' 7 | last_line = fh.readlines()[-1] 8 | VERSION = re.search(r'[\d.]+', last_line).group(0) 9 | 10 | # @see https://packaging.python.org/tutorials/packaging-projects/#creating-setup-py 11 | with open("README.md", "r") as fh: 12 | long_description = fh.read() 13 | 14 | # @see https://github.com/pypa/sampleproject/blob/master/setup.py 15 | setup( 16 | name='indexdigest', 17 | version=VERSION, 18 | author='Maciej Brencz', 19 | author_email='maciej.brencz@gmail.com', 20 | license='MIT', 21 | description='Analyses your database queries and schema and suggests indices and schema improvements', 22 | long_description=long_description, 23 | long_description_content_type="text/markdown", 24 | url='https://github.com/macbre/index-digest', 25 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 26 | classifiers=[ 27 | # How mature is this project? Common values are 28 | # 3 - Alpha 29 | # 4 - Beta 30 | # 5 - Production/Stable 31 | 'Development Status :: 5 - Production/Stable', 32 | 33 | # Indicate who your project is intended for 34 | 'Intended Audience :: Developers', 35 | 'Intended Audience :: System Administrators', 36 | 'Topic :: Database', 37 | 38 | # Specify the Python versions you support here. In particular, ensure 39 | # that you indicate whether you support Python 2, Python 3 or both. 40 | 'Programming Language :: Python :: 3', 41 | ], 42 | packages=find_packages(), 43 | python_requires='>=3.9', 44 | extras_require={ 45 | 'dev': [ 46 | 'coverage==7.10.7', 47 | 'pylint==3.3.9', 48 | 'pytest==8.4.2', 49 | 'pytest-cov==7.0.0', 50 | 'twine==6.2.0', 51 | ] 52 | }, 53 | install_requires=[ 54 | 'docopt==0.6.2', 55 | 'PyYAML==6.0.3', 56 | 'mysqlclient==2.2.7', 57 | 'sql_metadata==2.19.0', 58 | 'termcolor==3.1.0', 59 | 'yamlordereddictloader==0.4.2' 60 | ], 61 | entry_points={ 62 | 'console_scripts': [ 63 | 'add_linter=indexdigest.cli.add_linter:main', # creates a new linter from a template 64 | 'index_digest=indexdigest.cli.script:main', 65 | ], 66 | } 67 | ) 68 | -------------------------------------------------------------------------------- /indexdigest/test/core/test_columns.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from indexdigest.schema import Column 4 | 5 | 6 | class TestColumn(TestCase): 7 | 8 | def test_is_text_type(self): 9 | text_types = [ 10 | 'CHAR(16)', 11 | 'VARCHAR(16)', 12 | 'CHAR(32)', 13 | 'VARCHAR(32)', 14 | 'BINARY(16)', 15 | 'VARBINARY(16)', 16 | 'BINARY(32)', 17 | 'VARBINARY(32)', 18 | 'TEXT', 19 | 'BLOB', 20 | "SET('a', 'b', 'c', 'd')", 21 | "ENUM('x-small', 'small', 'medium', 'large', 'x-large')", 22 | ] 23 | 24 | not_text_types = [ 25 | 'INT', 26 | 'BIGINT', 27 | 'INT(9)', 28 | 'TIMESTAMP', 29 | 'DATETIME', 30 | ] 31 | 32 | for text_type in text_types: 33 | self.assertTrue( 34 | expr=Column('foo', column_type=text_type, character_set='utf8').is_text_type(), 35 | msg=text_type) 36 | 37 | for not_text_type in not_text_types: 38 | self.assertFalse( 39 | expr=Column('foo', column_type=not_text_type, character_set='utf8').is_text_type(), 40 | msg=not_text_type) 41 | 42 | def test_is_timestamp_type(self): 43 | timestamp_types = [ 44 | 'TIMESTAMP', 45 | 'DATETIME', 46 | 'DATE', 47 | 'TIME', 48 | 'YEAR', 49 | ] 50 | 51 | not_timestamp_types = [ 52 | 'INT', 53 | 'BIGINT', 54 | 'INT(9)', 55 | 'CHAR(16)', 56 | 'VARCHAR(16)', 57 | 'CHAR(32)', 58 | 'VARCHAR(32)', 59 | 'BINARY(16)', 60 | 'VARBINARY(16)', 61 | 'BINARY(32)', 62 | 'VARBINARY(32)', 63 | 'TEXT', 64 | 'BLOB', 65 | 
"SET('a', 'b', 'c', 'd')", 66 | "ENUM('x-small', 'small', 'medium', 'large', 'x-large')", 67 | ] 68 | 69 | for timestamp_type in timestamp_types: 70 | self.assertTrue( 71 | expr=Column('foo', column_type=timestamp_type, character_set='utf8').is_timestamp_type(), 72 | msg=timestamp_type) 73 | 74 | for not_timestamp_type in not_timestamp_types: 75 | self.assertFalse( 76 | expr=Column('foo', column_type=not_timestamp_type, character_set='utf8').is_timestamp_type(), 77 | msg=not_timestamp_type) 78 | -------------------------------------------------------------------------------- /.github/workflows/python.yml: -------------------------------------------------------------------------------- 1 | name: Test against different Python version 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | 8 | jobs: 9 | python_tests: 10 | runs-on: ubuntu-latest 11 | 12 | strategy: 13 | # Do not fail if one the tests did not pass 14 | fail-fast: false 15 | 16 | matrix: 17 | # Python version(s) to use when running the tests 18 | # https://github.com/actions/python-versions/blob/main/versions-manifest.json 19 | python-version: 20 | - "3.9" 21 | - "3.10" 22 | - "3.11" 23 | - "3.12" 24 | - "3.13" 25 | - "3.14" 26 | 27 | # Docker images of MySQL-compliant databases to run the tests suite on 28 | database: 29 | - "mysql:8.0.20" 30 | 31 | services: 32 | mysql: 33 | image: ${{ matrix.database }} 34 | env: 35 | MYSQL_ALLOW_EMPTY_PASSWORD: yes 36 | MYSQL_DATABASE: index_digest 37 | ports: 38 | - "53306:3306" 39 | options: --health-cmd="mysqladmin ping" --health-interval=10s --health-timeout=5s --health-retries=3 40 | 41 | steps: 42 | - uses: actions/checkout@v5.0.1 43 | - name: Set up Python 44 | uses: actions/setup-python@v6 45 | with: 46 | python-version: ${{ matrix.python-version }} 47 | cache: 'pip' # dependencies caching 48 | cache-dependency-path: 'setup.py' 49 | 50 | - name: Install dependencies 51 | run: | 52 | python -m pip install --upgrade pip 53 | pip install wheel 54 | make install 55 | 56 | - name: Linter 57 | run: make lint 58 | 59 | - name: Set up the database 60 | run: | 61 | docker ps 62 | mysql --protocol=tcp --port=53306 -u root --password="" -v < setup.sql 63 | # import the test schema files 64 | "./sql/populate.sh" 65 | mysql --protocol=tcp --port=53306 -uindex_digest -pqwerty index_digest -v -e '\s; SHOW TABLES; SHOW DATABASES;' 66 | 67 | - name: Tests with code coverage 68 | run: make coverage 69 | 70 | # https://coveralls-python.readthedocs.io/en/latest/usage/index.html 71 | # upload coverage report for just one of Python version matrix runs 72 | - name: Upload coverage report to Coveralls 73 | if: matrix.python-version == '3.12' 74 | env: 75 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 76 | run: | 77 | pip install coveralls 78 | coveralls --service=github 79 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0070_insert_ignore.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0070_insert_ignore import \ 6 | remove_comments, is_insert_ignore_query, check_insert_ignore_queries 7 | from indexdigest.test import DatabaseTestMixin, read_queries_from_log 8 | 9 | 10 | class TestInsertIgnore(TestCase, DatabaseTestMixin): 11 | 12 | def test_remove_comments(self): 13 | self.assertEqual( 14 | 'INSERT IGNORE', 15 | remove_comments('INSERT /* foo */ IGNORE') 16 | ) 17 | 18 | 
self.assertEqual( 19 | 'SELECT foo', 20 | remove_comments('/* foo */SELECT/* test*/ foo') 21 | ) 22 | 23 | def test_is_insert_ignore_query(self): 24 | assert is_insert_ignore_query("INSERT IGNORE INTO 0070_insert_ignore VALUES ('2017-01-01', 9, 123);") is True 25 | assert is_insert_ignore_query("INSERT IGN/*bar*/ORE INTO 0070_insert_ignore VALUES ('2017-01-01', 9, 123);") is True 26 | assert is_insert_ignore_query("Insert /* foo */ Ignore INTO 0070_insert_ignore VALUES ('2017-01-01', 9, 123);") is True 27 | assert is_insert_ignore_query("/* foo */ INSERT IGNORE INTO `0070_insert_ignore` VALUES (9, '123', '2017-01-01');") is True 28 | 29 | assert is_insert_ignore_query("/* INSERT IGNORE */ INSERT INTO 0070_insert_ignore VALUES ('2017-01-01', 9, 123);") is False 30 | assert is_insert_ignore_query("INSERT INTO 0070_insert_ignore VALUES ('INSERT IGNORE', 9, 123);") is False 31 | 32 | def test_queries(self): 33 | reports = list(check_insert_ignore_queries( 34 | database=self.connection, queries=read_queries_from_log('0070-insert-ignore-log'))) 35 | 36 | print(reports) 37 | 38 | self.assertEqual(len(reports), 4) 39 | 40 | self.assertEqual(str(reports[0]), '0070_insert_ignore: "INSERT IGNORE INTO `0070_insert_ignore` VALUES (9,..." query uses a risky INSERT IGNORE') 41 | self.assertEqual(reports[0].table_name, '0070_insert_ignore') 42 | self.assertEqual(str(reports[0].context['query']), "INSERT IGNORE INTO `0070_insert_ignore` VALUES (9, '123', '2017-01-01');") 43 | assert 'CREATE TABLE `0070_insert_ignore` (' in str(reports[0].context['schema']) 44 | 45 | self.assertEqual(str(reports[1].context['query']), "/* foo */ INSERT IGNORE INTO `0070_insert_ignore` VALUES (9, '123', '2017-01-01');") 46 | self.assertEqual(str(reports[2].context['query']), "INSERT IGNORE INTO `0070_insert_ignore` VALUES ('123', 9, '2017-01-01');") 47 | self.assertEqual(str(reports[3].context['query']), "INSERT /* foo */ IGNORE INTO `0070_insert_ignore` VALUES ('2017-01-01', 9, 123);") 48 | # assert False 49 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0020_big_table.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from indexdigest.linters import check_queries_using_filesort, check_queries_using_temporary 4 | from indexdigest.test import BigTableTest, read_queries_from_log 5 | 6 | 7 | class TestBigTableLinters(BigTableTest): 8 | 9 | def test_get_table_rows_estimate(self): 10 | # this table has 100000, but assume the returned estimate is above 75k 11 | self.assertGreater(self.connection.get_table_rows_estimate('0020_big_table'), 75000) 12 | 13 | def test_filesort(self): 14 | reports = list(check_queries_using_filesort(self.connection, read_queries_from_log('0020-big-table-log'))) 15 | 16 | self.assertEqual(len(reports), 2) 17 | 18 | self.assertEqual(str(reports[0]), 19 | '0020_big_table: "SELECT * FROM 0020_big_table WHERE item_id BETWEEN..." query used filesort') 20 | self.assertEqual(reports[0].context['query'], 21 | 'SELECT * FROM 0020_big_table WHERE item_id BETWEEN 10 AND 20 ORDER BY val') 22 | self.assertEqual(reports[0].context['explain_extra'], 'Using where; Using filesort') 23 | self.assertEqual(reports[0].context['explain_rows'], 11) 24 | self.assertEqual(reports[0].context['explain_key'], 'PRIMARY') 25 | 26 | self.assertEqual(str(reports[1]), 27 | '0020_big_table: "SELECT val, count(*) FROM 0020_big_table WHERE ite..." 
query used filesort') 28 | self.assertEqual(reports[1].context['query'], 29 | 'SELECT val, count(*) FROM 0020_big_table WHERE item_id BETWEEN 10 AND 20 GROUP BY val ORDER BY val') 30 | self.assertEqual(reports[1].context['explain_extra'], 'Using where; Using temporary; Using filesort') 31 | self.assertEqual(reports[1].context['explain_rows'], 11) 32 | self.assertEqual(reports[1].context['explain_key'], 'PRIMARY') 33 | 34 | # assert False 35 | 36 | def test_temporary(self): 37 | reports = list(check_queries_using_temporary(self.connection, read_queries_from_log('0020-big-table-log'))) 38 | 39 | self.assertEqual(len(reports), 1) 40 | 41 | self.assertEqual(str(reports[0]), 42 | '0020_big_table: "SELECT val, count(*) FROM 0020_big_table WHERE ite..." query used temporary') 43 | self.assertEqual(reports[0].context['query'], 44 | 'SELECT val, count(*) FROM 0020_big_table WHERE item_id BETWEEN 10 AND 20 GROUP BY val ORDER BY val') 45 | self.assertEqual(reports[0].context['explain_extra'], 'Using where; Using temporary; Using filesort') 46 | self.assertEqual(reports[0].context['explain_rows'], 11) 47 | self.assertEqual(reports[0].context['explain_key'], 'PRIMARY') 48 | 49 | # assert False 50 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0032_utf_latin_columns.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0032_utf_latin_columns import \ 6 | check_latin_columns, is_text_column_latin 7 | from indexdigest.schema import Column 8 | from indexdigest.test import Database, DatabaseTestMixin 9 | 10 | 11 | class TestIsTextColumnLatin(TestCase): 12 | 13 | def test_is_text_column_non_latin(self): 14 | for character_set in ['utf8', 'ucs2', 'utf8mb4', 'utf16', 'utf16le', 'utf32', 'binary']: 15 | column = Column(name='foo', column_type='varchar', character_set=character_set) 16 | 17 | assert is_text_column_latin(column) is False, character_set 18 | 19 | def test_is_text_column_latin(self): 20 | # @see https://dev.mysql.com/doc/refman/5.7/en/charset-mysql.html 21 | for character_set in ['big5', 'latin1', 'latin2']: 22 | column = Column(name='foo', column_type='varchar', character_set=character_set) 23 | 24 | assert is_text_column_latin(column) is True, character_set 25 | 26 | def test_blob_column(self): 27 | assert is_text_column_latin(Column(name='foo', column_type='blob')) is False 28 | 29 | 30 | class LimitedViewDatabase(Database, DatabaseTestMixin): 31 | """ 32 | Limit test to tables from sql/0032-utf-latin-columns.sql 33 | """ 34 | def get_tables(self): 35 | return ['0032_utf8_table', '0032_latin1_table'] 36 | 37 | 38 | class TestFullTableScan(TestCase): 39 | @property 40 | def connection(self): 41 | return LimitedViewDatabase.connect_dsn(DatabaseTestMixin.DSN) 42 | 43 | def test_latin1_columns(self): 44 | reports = list(check_latin_columns(self.connection)) 45 | 46 | print(list(map(str, reports))) 47 | 48 | self.assertEqual(len(reports), 3) 49 | 50 | self.assertEqual(str(reports[0]), 51 | '0032_utf8_table: "latin_column" text column has "latin1" character set defined') 52 | self.assertEqual(reports[0].context['column'], 'latin_column') 53 | self.assertEqual(reports[0].context['column_character_set'], 'latin1') 54 | self.assertEqual(reports[0].context['column_collation'], 'latin1_bin') 55 | assert 'CREATE TABLE `0032_utf8_table` (' in reports[0].context['schema'] 56 | 57 | 
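# as TestIsTextColumnLatin above documents, Unicode and binary character sets (utf8, utf8mb4, ucs2, utf16, utf16le, utf32, binary) are not reported, while legacy sets such as latin1, latin2 or big5 are - hence the three reports asserted in this test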
self.assertEqual(str(reports[1]), 58 | '0032_utf8_table: "big5_column" text column has "big5" character set defined') 59 | self.assertEqual(reports[1].context['column'], 'big5_column') 60 | self.assertEqual(reports[1].context['column_character_set'], 'big5') 61 | self.assertEqual(reports[1].context['column_collation'], 'big5_chinese_ci') 62 | 63 | self.assertEqual(str(reports[2]), 64 | '0032_latin1_table: "name" text column has "latin1" character set defined') 65 | self.assertEqual(reports[2].context['column'], 'name') 66 | self.assertEqual(reports[2].context['column_character_set'], 'latin1') 67 | self.assertEqual(reports[2].context['column_collation'], 'latin1_swedish_ci') 68 | 69 | # assert False 70 | -------------------------------------------------------------------------------- /indexdigest/test/core/test_indices.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from indexdigest.schema import Index 4 | 5 | 6 | class TestIndex(TestCase): 7 | 8 | def test_repr(self): 9 | self.assertEqual(' KEY foo (id, bar)', repr(Index(name='foo', columns=['id', 'bar']))) 10 | self.assertEqual(' PRIMARY KEY (id)', repr(Index(name='key', columns=['id'], primary=True))) 11 | self.assertEqual(' UNIQUE KEY idx_bar (bar)', repr(Index(name='idx_bar', columns=['bar'], unique=True))) 12 | 13 | def test_is_covered_by(self): 14 | # #1 case 15 | primary = Index(name='base', columns=['id', 'bar'], primary=True) 16 | second = Index(name='base', columns=['id', 'bar']) 17 | 18 | self.assertFalse(primary.is_covered_by(second)) 19 | self.assertTrue(second.is_covered_by(primary)) 20 | 21 | # self-check 22 | self.assertFalse(second.is_covered_by(second)) 23 | 24 | # #2 case 25 | first = Index(name='base', columns=['id', 'bar', 'foo']) 26 | second = Index(name='base', columns=['id', 'bar']) 27 | 28 | self.assertFalse(first.is_covered_by(second)) 29 | self.assertTrue(second.is_covered_by(first)) 30 | 31 | # #3 case 32 | first = Index(name='base', columns=['id', 'bar', 'foo']) 33 | second = Index(name='base', columns=['id', 'bar', 'foo']) 34 | 35 | self.assertTrue(first.is_covered_by(second)) 36 | self.assertTrue(second.is_covered_by(first)) 37 | 38 | # #4 case 39 | first = Index(name='base', columns=['id', 'bar', 'foo']) 40 | second = Index(name='base', columns=['bar', 'foo']) 41 | 42 | self.assertFalse(first.is_covered_by(second)) 43 | self.assertFalse(second.is_covered_by(first)) 44 | 45 | def test_primary_and_unique_keys_coverage(self): 46 | # @see https://github.com/macbre/index-digest/issues/49 47 | 48 | # second key adds a uniqueness constraint, keep it 49 | first = Index(name='base', columns=['bar', 'foo'], primary=True) 50 | second = Index(name='base', columns=['bar'], unique=True) 51 | 52 | self.assertFalse(first.is_covered_by(second)) 53 | self.assertFalse(second.is_covered_by(first)) 54 | 55 | # these keys are the same (primary is unique) 56 | first = Index(name='base', columns=['bar', 'foo'], primary=True) 57 | second = Index(name='base', columns=['bar', 'foo'], unique=True) 58 | 59 | self.assertFalse(first.is_covered_by(second)) 60 | self.assertTrue(second.is_covered_by(first)) 61 | 62 | # prefer unique over non-unique 63 | first = Index(name='base', columns=['bar', 'foo'], unique=True) 64 | second = Index(name='base', columns=['bar', 'foo']) 65 | 66 | self.assertFalse(first.is_covered_by(second)) 67 | self.assertTrue(second.is_covered_by(first)) 68 | 69 | # identical unique indices 70 | first = Index(name='base', columns=['bar', 
'foo'], unique=True) 71 | second = Index(name='base', columns=['bar', 'foo'], unique=True) 72 | 73 | self.assertTrue(first.is_covered_by(second)) 74 | self.assertTrue(second.is_covered_by(first)) 75 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0020_filesort_temporary_table.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks whether SELECT queries trigger a filesort or a temporary table 3 | """ 4 | from collections import OrderedDict 5 | 6 | from indexdigest.utils import explain_queries, LinterEntry, shorten_query 7 | 8 | 9 | def filter_explain_extra(database, queries, check): 10 | """ 11 | Parse "Extra" column from EXPLAIN query results, e.g. 12 | 13 | "Using where; Using temporary; Using filesort" 14 | 15 | :type database indexdigest.database.Database 16 | :type queries list[str] 17 | :type check str 18 | :rtype: list 19 | """ 20 | for (query, table_used, _, explain_row) in explain_queries(database, queries): 21 | extra_parsed = str(explain_row['Extra']).split('; ') 22 | 23 | if check in extra_parsed: 24 | context = OrderedDict() 25 | context['query'] = query 26 | 27 | context['explain_extra'] = explain_row['Extra'] 28 | context['explain_rows'] = int(explain_row['rows'])  # string when using MariaDB 10.5 29 | context['explain_filtered'] = explain_row.get('filtered')  # may not be set 30 | context['explain_key'] = explain_row['key'] 31 | 32 | yield (query, table_used, context) 33 | 34 | 35 | def check_queries_using_filesort(database, queries): 36 | """ 37 | Using filesort 38 | 39 | MySQL must do an extra pass to find out how to retrieve the rows in sorted order. The sort is 40 | done by going through all rows according to the join type and storing the sort key and pointer 41 | to the row for all rows that match the WHERE clause. The keys are then sorted and the rows are 42 | retrieved in sorted order. 43 | 44 | Percona says: The truth is, filesort is badly named. Anytime a sort can't be performed from an 45 | index, it's a filesort. It has nothing to do with files. Filesort should be called "sort." 46 | It is quicksort at heart. 47 | 48 | :type database indexdigest.database.Database 49 | :type queries list[str] 50 | :rtype: list[LinterEntry] 51 | """ 52 | filtered = filter_explain_extra(database, queries, check='Using filesort') 53 | 54 | for (query, table_used, context) in filtered: 55 | yield LinterEntry(linter_type='queries_using_filesort', table_name=table_used, 56 | message='"{}" query used filesort'.format(shorten_query(query)), 57 | context=context) 58 | 59 | 60 | def check_queries_using_temporary(database, queries): 61 | """ 62 | Using temporary 63 | 64 | To resolve the query, MySQL needs to create a temporary table to hold the result. This 65 | typically happens if the query contains GROUP BY and ORDER BY clauses that list columns 66 | differently. 
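For example, this query from sql/0020-big-table-log is reported by both checks in this module, as its EXPLAIN row carries 'Using where; Using temporary; Using filesort' in the Extra column (see the 0020 tests): SELECT val, count(*) FROM 0020_big_table WHERE item_id BETWEEN 10 AND 20 GROUP BY val ORDER BY val.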
67 | 68 | :type database indexdigest.database.Database 69 | :type queries list[str] 70 | :rtype: list[LinterEntry] 71 | """ 72 | filtered = filter_explain_extra(database, queries, check='Using temporary') 73 | 74 | for (query, table_used, context) in filtered: 75 | yield LinterEntry(linter_type='queries_using_temporary', table_name=table_used, 76 | message='"{}" query used temporary'.format(shorten_query(query)), 77 | context=context) 78 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0031_low_cardinality_index.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for low cardinality indices that may not be needed 3 | """ 4 | from collections import OrderedDict 5 | 6 | from indexdigest.utils import LinterEntry 7 | 8 | # skip small tables 9 | ROWS_COUNT_THRESHOLD = 100000 10 | 11 | # cardinality threshold 12 | INDEX_CARDINALITY_THRESHOLD = 6 13 | 14 | # the least frequent value should be used by at most x% of rows 15 | INDEX_VALUE_PERCENTAGE_THRESHOLD = 20 16 | 17 | 18 | def get_low_cardinality_indices(database): 19 | """ 20 | :type database indexdigest.database.Database 21 | :rtype: list 22 | """ 23 | for table_name in database.get_tables(): 24 | rows_count = database.get_table_rows_estimate(table_name) 25 | if rows_count < ROWS_COUNT_THRESHOLD: 26 | continue 27 | 28 | # get table indices statistics 29 | # @see https://dev.mysql.com/doc/refman/5.7/en/show-index.html 30 | # @see https://www.percona.com/blog/2007/08/28/do-you-always-need-index-on-where-column/ 31 | indices = database.query_dict_rows( 32 | "select TABLE_NAME, INDEX_NAME, COLUMN_NAME, CARDINALITY from" 33 | " INFORMATION_SCHEMA.STATISTICS where" 34 | " TABLE_NAME = '{table_name}' AND TABLE_SCHEMA = '{database_name}'".format( 35 | table_name=table_name, database_name=database.db_name) 36 | ) 37 | 38 | for index in indices: 39 | # the cardinality is too high for this check, skip the index 40 | if index['CARDINALITY'] > INDEX_CARDINALITY_THRESHOLD: 41 | continue 42 | 43 | yield table_name, rows_count, index 44 | 45 | 46 | def check_low_cardinality_index(database): 47 | """ 48 | :type database indexdigest.database.Database 49 | :rtype: list[LinterEntry] 50 | """ 51 | for table_name, rows_count, index in get_low_cardinality_indices(database): 52 | # the least frequent value should be used in up to 20% of rows 53 | # https://www.percona.com/blog/2007/08/28/do-you-always-need-index-on-where-column/ 54 | row = database.query_dict_row( 55 | 'SELECT {column} AS value, COUNT(*) AS cnt FROM `{table}` ' 56 | 'GROUP BY 1 ORDER BY 2 ASC LIMIT 1'.format( 57 | column=index['COLUMN_NAME'], table=index['TABLE_NAME'] 58 | ) 59 | ) 60 | 61 | value_usage = 100. * row['cnt'] / rows_count 62 | # print(row, value_usage) 63 | 64 | # the least frequent value is quite rare - it makes sense to have an index here 65 | if value_usage < INDEX_VALUE_PERCENTAGE_THRESHOLD: 66 | continue 67 | 68 | # print(value_usage, index, table_name) 69 | 70 | context = OrderedDict() 71 | context['column_name'] = index['COLUMN_NAME'] 72 | context['index_name'] = index['INDEX_NAME'] 73 | context['index_cardinality'] = int(index['CARDINALITY']) 74 | context['schema'] = database.get_table_schema(table_name) 75 | context['value_usage'] = value_usage 76 | 77 | yield LinterEntry(linter_type='low_cardinality_index', table_name=table_name, 78 | message='"{}" index on "{}" column has low cardinality, ' 79 | 'check if it is needed'. 
80 | format(index['INDEX_NAME'], index['COLUMN_NAME']), 81 | context=context) 82 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0028_data_too_old.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter looks for tables that have really old data 3 | """ 4 | from collections import OrderedDict 5 | from datetime import datetime 6 | from time import time 7 | 8 | from indexdigest.utils import LinterEntry, memoize 9 | 10 | 11 | def get_time_columns(database): 12 | """ 13 | :type database indexdigest.database.Database 14 | :rtype: list 15 | """ 16 | for table_name in database.get_tables(): 17 | time_columns = [ 18 | column 19 | for column in database.get_table_columns(table_name) 20 | if column.is_timestamp_type() or 'time' in column.name 21 | ] 22 | 23 | # there are no time type columns, skip 24 | if not time_columns: 25 | continue 26 | 27 | # for now just check the first time column 28 | yield (table_name, time_columns[0]) 29 | 30 | 31 | @memoize 32 | def get_boundary_times(database, table_name, column): 33 | """ 34 | :type database indexdigest.database.Database 35 | :type table_name str 36 | :type column indexdigest.schema.Column 37 | :rtype: dict 38 | """ 39 | # this may take a while when {column} is not indexed! 40 | query = 'SELECT /* index-digest */ UNIX_TIMESTAMP(MIN(`{column}`)) as `min`, ' \ 41 | 'UNIX_TIMESTAMP(MAX(`{column}`)) as `max` FROM `{table}`'.\ 42 | format( 43 | column=column.name, 44 | table=table_name 45 | ) 46 | 47 | timestamps = database.query_dict_row(query) 48 | 49 | # if there's no data in the table, return None 50 | return timestamps if timestamps.get('min') and timestamps.get('max') else None 51 | 52 | 53 | def check_data_too_old(database, env=None): 54 | """ 55 | :type database indexdigest.database.Database 56 | :type env dict 57 | :rtype: list[LinterEntry] 58 | """ 59 | now = int(time())  # I will probably never understand dates handling in Python 60 | 61 | # set up a diff threshold (in days) 62 | env = env if env else dict() 63 | diff_threshold = int(env.get('INDEX_DIGEST_DATA_TOO_OLD_THRESHOLD_DAYS', 3 * 30)) 64 | 65 | for (table_name, column) in get_time_columns(database): 66 | timestamps = get_boundary_times(database, table_name, column) 67 | 68 | if timestamps is None: 69 | continue 70 | 71 | diff = now - timestamps.get('min') 72 | # print(table_name, column, timestamps, now, diff) 73 | 74 | if diff > diff_threshold * 86400: 75 | diff_days = int(diff / 86400) 76 | 77 | metadata = database.get_table_metadata(table_name) 78 | 79 | context = OrderedDict() 80 | context['diff_days'] = diff_days 81 | context['data_since'] = str(datetime.fromtimestamp(timestamps.get('min'))) 82 | context['data_until'] = str(datetime.fromtimestamp(timestamps.get('max'))) 83 | context['date_column_name'] = str(column) 84 | context['schema'] = database.get_table_schema(table_name) 85 | context['rows'] = database.get_table_rows_estimate(table_name) 86 | context['table_size_mb'] = \ 87 | 1. * (metadata['data_size'] + metadata['index_size']) / 1024 / 1024 88 | 89 | yield LinterEntry(linter_type='data_too_old', table_name=table_name, 90 | message='"{}" has rows added {} days ago, ' 91 | 'consider changing retention policy'. 
92 | format(table_name, diff_days), 93 | context=context) 94 | -------------------------------------------------------------------------------- /indexdigest/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains utility functions and classes 3 | """ 4 | from urllib.parse import urlparse 5 | 6 | import functools 7 | import logging 8 | import re 9 | 10 | 11 | def parse_dsn(dsn): 12 | """ 13 | Parses a given Data Source Name string into a dict that can be passed to a database connector. 14 | 15 | Example: mysql://alex:pwd@localhost/test 16 | DSN: scheme://username:password@host:port/database 17 | 18 | @see https://mysqlclient.readthedocs.io/user_guide.html#mysqldb 19 | 20 | :type dsn str 21 | :rtype: dict 22 | """ 23 | parsed = urlparse(dsn) 24 | 25 | return { 26 | 'host': parsed.hostname, 27 | 'port': int(parsed.port) if parsed.port else 3306, 28 | 'user': parsed.username, 29 | 'passwd': parsed.password, 30 | 'db': str(parsed.path).lstrip('/') 31 | } 32 | 33 | 34 | def is_select_query(query): 35 | """ 36 | :type query str 37 | :rtype bool 38 | """ 39 | query = query.lstrip().lower() 40 | query = re.sub(r'^/\*[^*]+\*/', '', query)  # remove SQL comments 41 | 42 | return query.lstrip().startswith('select') 43 | 44 | 45 | def explain_queries(database, queries): 46 | """ 47 | Yields EXPLAIN result rows for given queries 48 | 49 | :type database indexdigest.database.Database 50 | :type queries list[str] 51 | :rtype: tuple[str,str,str,dict] 52 | """ 53 | # analyze only SELECT queries from the log 54 | for query in filter(is_select_query, queries): 55 | try: 56 | for row in database.explain_query(query): 57 | table_used = row['table'] 58 | index_used = row['key'] 59 | 60 | yield (query, table_used, index_used, row) 61 | except IndexDigestError: 62 | logger = logging.getLogger('explain_queries') 63 | logger.error('Cannot explain the query: %s', query) 64 | 65 | 66 | def shorten_query(query, max_len=50): 67 | """ 68 | :type query str 69 | :type max_len int 70 | :rtype: str 71 | """ 72 | query = query.rstrip('; ') 73 | return '{}...'.format(query[:max_len]) if len(query) > max_len else query 74 | 75 | 76 | def memoize(func): 77 | """ 78 | Implements the memoization pattern 79 | 80 | :type func 81 | :rtype func 82 | """ 83 | # @see https://medium.com/@nkhaja/memoization-and-decorators-with-python-32f607439f84 84 | cache = func.cache = {} 85 | 86 | @functools.wraps(func) 87 | def memoized_func(*args, **kwargs): 88 | """ 89 | :type args 90 | :type kwargs 91 | """ 92 | key = str(args) + str(kwargs) 93 | if key not in cache: 94 | cache[key] = func(*args, **kwargs) 95 | return cache[key] 96 | return memoized_func 97 | 98 | 99 | # pylint:disable=too-few-public-methods 100 | class LinterEntry: 101 | """ 102 | Wraps a single linter entry. Various formatters may display this data differently. 
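A minimal usage sketch (the values are illustrative): entry = LinterEntry(linter_type='example_check', table_name='foo', message='"bar" column was not used') - given __str__() below, str(entry) evaluates to 'foo: "bar" column was not used'.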
103 | """ 104 | def __init__(self, linter_type, table_name, message, context=None): 105 | """ 106 | :type linter_type str 107 | :type table_name str 108 | :type message str 109 | :type context dict 110 | """ 111 | self.linter_type = linter_type 112 | self.table_name = table_name 113 | self.message = message 114 | self.context = context 115 | 116 | def __str__(self): 117 | return '{table_name}: {message}'.format( 118 | table_name=self.table_name, message=self.message) 119 | 120 | 121 | class IndexDigestError(Exception): 122 | """ 123 | index-digest base exception class 124 | """ 125 | -------------------------------------------------------------------------------- /indexdigest/test/linters/test_0006_not_used_columns_and_tables.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | from indexdigest.linters.linter_0006_not_used_columns_and_tables import check_not_used_tables, check_not_used_columns, \ 6 | get_used_tables_from_queries 7 | from indexdigest.database import Database 8 | from indexdigest.test import DatabaseTestMixin, read_queries_from_log 9 | 10 | 11 | class LimitedViewDatabase(Database, DatabaseTestMixin): 12 | """ 13 | Limit test to tables from sql/0006-not-used-columns-and-tables.sql 14 | """ 15 | def get_tables(self): 16 | return ['0006_not_used_columns', '0006_not_used_tables'] 17 | 18 | 19 | class TestNotUsedTables(TestCase): 20 | 21 | @property 22 | def connection(self): 23 | return LimitedViewDatabase.connect_dsn(DatabaseTestMixin.DSN) 24 | 25 | def test_not_used_tables(self): 26 | reports = list(check_not_used_tables( 27 | database=self.connection, queries=read_queries_from_log('0006-not-used-columns-and-tables-log'))) 28 | 29 | print(reports) 30 | 31 | self.assertEqual(len(reports), 1) 32 | self.assertEqual(str(reports[0]), '0006_not_used_tables: "0006_not_used_tables" table was not used by provided queries') 33 | self.assertEqual(reports[0].table_name, '0006_not_used_tables') 34 | 35 | assert str(reports[0].context['schema']).startswith('CREATE TABLE `0006_not_used_tables` (\n') 36 | 37 | # these are estimates, can't assert a certain value 38 | assert reports[0].context['table_size_mb'] > 0.0001 39 | assert reports[0].context['rows_estimated'] > 0 40 | 41 | def test_get_used_tables_from_queries(self): 42 | queries = [ 43 | 'SELECT /* a comment */ foo FROM `0006_not_used_columns` AS r WHERE item_id = 1;', # table alias 44 | 'SELECT 1 FROM `0006_not_used_tables` WHERE item_id = 3;', 45 | ] 46 | 47 | tables = get_used_tables_from_queries(queries) 48 | 49 | print(tables) 50 | 51 | self.assertListEqual(tables, ['0006_not_used_columns', '0006_not_used_tables']) 52 | 53 | # assert False 54 | 55 | 56 | class TestNotUsedColumns(TestCase): 57 | 58 | @property 59 | def connection(self): 60 | return LimitedViewDatabase.connect_dsn(DatabaseTestMixin.DSN) 61 | 62 | def test_not_used_columns(self): 63 | queries = [ 64 | 'SELECT test, item_id FROM `0006_not_used_columns` WHERE foo = "a"' 65 | ] 66 | 67 | reports = list(check_not_used_columns(database=self.connection, queries=queries)) 68 | 69 | self.assertEqual(len(reports), 1) 70 | self.assertEqual(str(reports[0]), '0006_not_used_columns: "bar" column was not used by provided queries') 71 | self.assertEqual(reports[0].table_name, '0006_not_used_columns') 72 | self.assertEqual(reports[0].context['column_name'], 'bar') 73 | self.assertEqual(reports[0].context['column_type'], 'varchar(16)') 74 | 75 | # assert False 76 | 
77 | def test_not_used_columns_two(self): 78 | queries = [ 79 | 'SELECT test FROM `0006_not_used_columns` WHERE foo = "a"' 80 | ] 81 | 82 | reports = list(check_not_used_columns(database=self.connection, queries=queries)) 83 | 84 | # reports ordered is the same as schema columns order 85 | self.assertEqual(len(reports), 2) 86 | self.assertEqual(reports[0].context['column_name'], 'item_id') 87 | self.assertEqual(reports[0].context['column_type'], 'int') 88 | self.assertEqual(reports[1].context['column_name'], 'bar') 89 | self.assertEqual(reports[1].context['column_type'], 'varchar(16)') 90 | 91 | # assert False 92 | 93 | def test_parsing_errors_handling(self): 94 | queries = [ 95 | 'SELECT test', 96 | 'SELECT 0020_big_table t WHERE id BETWEEN 10 AND 20 GROUP BY val' 97 | ] 98 | 99 | reports = list(check_not_used_columns(database=self.connection, queries=queries)) 100 | self.assertEqual(len(reports), 0) 101 | 102 | # assert False 103 | -------------------------------------------------------------------------------- /indexdigest/linters/linter_0006_not_used_columns_and_tables.py: -------------------------------------------------------------------------------- 1 | """ 2 | This linter checks for not used columns and tables by going through SELECT queries 3 | """ 4 | import logging 5 | 6 | from collections import defaultdict, OrderedDict 7 | from sql_metadata.compat import get_query_columns, get_query_tables 8 | 9 | from indexdigest.utils import LinterEntry, is_select_query 10 | 11 | 12 | def get_used_tables_from_queries(queries): 13 | """ 14 | :type queries list[str] 15 | :rtype: list[str] 16 | """ 17 | used_tables = [] 18 | queries = filter(is_select_query, queries) 19 | 20 | for query in queries: 21 | # parse each query from the log 22 | tables = get_query_tables(query) 23 | if tables and tables[0] not in used_tables: 24 | used_tables.append(tables[0]) 25 | 26 | return used_tables 27 | 28 | 29 | def check_not_used_tables(database, queries): 30 | """ 31 | :type database indexdigest.database.Database 32 | :type queries list[str] 33 | :rtype: list[LinterEntry] 34 | """ 35 | logger = logging.getLogger(__name__) 36 | 37 | # get database meta-data 38 | tables = database.get_tables() 39 | 40 | # analyze only SELECT queries from the log 41 | used_tables = get_used_tables_from_queries(queries) 42 | logger.info("These tables were used by provided queries: %s", used_tables) 43 | 44 | # now check which tables were not used 45 | not_used_tables = [table for table in tables if table not in used_tables] 46 | 47 | # generate reports 48 | for table in not_used_tables: 49 | metadata = database.get_table_metadata(table) 50 | context = OrderedDict() 51 | 52 | context['schema'] = database.get_table_schema(table) 53 | context['table_size_mb'] = \ 54 | 1. 
* (metadata['data_size'] + metadata['index_size']) / 1024 / 1024 55 | context['rows_estimated'] = database.get_table_rows_estimate(table) 56 | 57 | yield LinterEntry(linter_type='not_used_tables', table_name=table, 58 | message='"{}" table was not used by provided queries'.format(table), 59 | context=context) 60 | 61 | 62 | def check_not_used_columns(database, queries): 63 | """ 64 | :type database indexdigest.database.Database 65 | :type queries list[str] 66 | :rtype: list[LinterEntry] 67 | :raises Exception 68 | """ 69 | logger = logging.getLogger(__name__) 70 | 71 | # analyze only SELECT queries from the log 72 | queries = list(filter(is_select_query, queries)) 73 | 74 | used_tables = get_used_tables_from_queries(queries) 75 | used_columns = defaultdict(list) 76 | 77 | logger.info("Will check these tables: %s", used_tables) 78 | 79 | # analyze given queries and collect used columns for each table 80 | for query in queries: 81 | tables = get_query_tables(query) 82 | if tables: 83 | columns = get_query_columns(query) 84 | 85 | # print(query, table, columns) 86 | 87 | # add used columns per table 88 | # FIXME: assume we're querying just a single table for now 89 | used_columns[tables[0]] += columns 90 | else: 91 | logger.error('Unable to extract tables and columns used from the query: %s', 92 | query) 93 | 94 | # analyze table schemas and report not used columns for each table 95 | for table in used_tables: 96 | logger.info("Checking %s table", table) 97 | table_columns = database.get_table_columns(table) 98 | 99 | # now get the difference and report them 100 | not_used_columns = [ 101 | column for column in table_columns 102 | if column.name not in set(used_columns[table]) 103 | ] if table_columns else [] 104 | 105 | for column in not_used_columns: 106 | yield LinterEntry(linter_type='not_used_columns', table_name=table, 107 | message='"{}" column was not used by provided queries'.format(column), 108 | context={'column_name': column.name, 'column_type': column.type}) 109 | -------------------------------------------------------------------------------- /indexdigest/test/core/test_query.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from sql_metadata.compat import get_query_columns, get_query_tables 4 | 5 | 6 | class TestUtils(TestCase): 7 | 8 | def test_get_query_columns(self): 9 | self.assertListEqual(['*'], 10 | get_query_columns('SELECT * FROM `test_table`')) 11 | 12 | self.assertListEqual(['foo'], 13 | get_query_columns('SELECT foo FROM `test_table`')) 14 | 15 | self.assertListEqual(['id', 'foo'], 16 | get_query_columns('SELECT id, foo FROM test_table WHERE id = 3')) 17 | 18 | self.assertListEqual(['foo', 'id'], 19 | get_query_columns('SELECT foo, count(*) as bar FROM `test_table` WHERE id = 3')) 20 | 21 | self.assertListEqual(['foo', 'test'], 22 | get_query_columns('SELECT foo, test as bar FROM `test_table`')) 23 | 24 | self.assertListEqual(['bar'], 25 | get_query_columns('SELECT /* a comment */ bar FROM test_table')) 26 | 27 | # assert False 28 | 29 | def test_get_query_tables(self): 30 | self.assertListEqual(['test_table'], 31 | get_query_tables('SELECT * FROM `test_table`')) 32 | 33 | self.assertListEqual(['0001_test_table'], 34 | get_query_tables('SELECT * FROM `0001_test_table`')) 35 | 36 | self.assertListEqual(['test_table'], 37 | get_query_tables('SELECT foo FROM `test_table`')) 38 | 39 | self.assertListEqual(['test_table'], 40 | get_query_tables('SELECT foo FROM test_table WHERE id = 1')) 41 | 42 
| self.assertListEqual(['test_table', 'second_table'], 43 | get_query_tables('SELECT foo FROM test_table, second_table WHERE id = 1')) 44 | 45 | self.assertListEqual(['revision', 'page', 'wikicities_user'], 46 | get_query_tables('SELECT rev_id,rev_page,rev_text_id,rev_timestamp,rev_comment,rev_user_text,rev_user,rev_minor_edit,rev_deleted,rev_len,rev_parent_id,rev_shaN,page_namespace,page_title,page_id,page_latest,user_name FROM `revision` INNER JOIN `page` ON ((page_id = rev_page)) LEFT JOIN `wikicities_user` ON ((rev_user != N) AND (user_id = rev_user)) WHERE rev_id = X LIMIT N')) 47 | 48 | self.assertListEqual(['events'], 49 | get_query_tables("SELECT COUNT( 0 ) AS cnt, date_format(event_date, '%Y-%m-%d') AS date FROM events WHERE event_date BETWEEN '2017-10-18 00:00:00' AND '2017-10-24 23:59:59' AND wiki_id = '1289985' GROUP BY date WITH ROLLUP")) 50 | 51 | # complex queries 52 | # @see https://github.com/macbre/query-digest/issues/16 53 | self.assertListEqual(['report_wiki_recent_pageviews', 'dimension_wikis'], 54 | get_query_tables("SELECT r.wiki_id AS id, pageviews_Nday AS pageviews FROM report_wiki_recent_pageviews AS r INNER JOIN dimension_wikis AS d ON r.wiki_id = d.wiki_id WHERE d.public = X AND r.lang = X AND r.hub_name = X ORDER BY pageviews DESC LIMIT N")) 55 | 56 | self.assertListEqual(['dimension_wikis', 'fact_wam_scores'], 57 | get_query_tables("SELECT DISTINCT dw.lang FROM `dimension_wikis` `dw` INNER JOIN `fact_wam_scores` `fwN` ON ((dw.wiki_id = fwN.wiki_id)) WHERE fwN.time_id = FROM_UNIXTIME(N) ORDER BY dw.lang ASC")) 58 | 59 | self.assertListEqual(['fact_wam_scores', 'dimension_wikis'], 60 | get_query_tables("SELECT count(fwN.wiki_id) as wam_results_total FROM `fact_wam_scores` `fwN` left join `fact_wam_scores` `fwN` ON ((fwN.wiki_id = fwN.wiki_id) AND (fwN.time_id = FROM_UNIXTIME(N))) left join `dimension_wikis` `dw` ON ((fwN.wiki_id = dw.wiki_id)) WHERE (fwN.time_id = FROM_UNIXTIME(N)) AND (dw.url like X OR dw.title like X) AND fwN.vertical_id IN (XYZ) AND dw.lang = X AND (fwN.wiki_id NOT IN (XYZ)) AND ((dw.url IS NOT NULL AND dw.title IS NOT NULL))")) 61 | 62 | # INSERT queries 63 | self.assertListEqual(['0070_insert_ignore_table'], 64 | get_query_tables("INSERT IGNORE INTO `0070_insert_ignore_table` VALUES (9, '123', '2017-01-01');")) 65 | 66 | self.assertListEqual(['0070_insert_ignore_table'], 67 | get_query_tables("INSERT into `0070_insert_ignore_table` VALUES (9, '123', '2017-01-01');")) 68 | 69 | # assert False 70 | -------------------------------------------------------------------------------- /indexdigest/test/cli/test_script.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from _pytest.monkeypatch import MonkeyPatch 4 | 5 | from indexdigest import VERSION 6 | from indexdigest.cli.script import filter_reports_by_type, filter_reports_by_table, get_version 7 | from indexdigest.utils import LinterEntry 8 | 9 | 10 | class FilterReportsByTypeTest(TestCase): 11 | 12 | REPORT_TYPES = [ 13 | 'foo', 14 | 'bar', 15 | 'test', 16 | 'test', 17 | 'foobar', 18 | ] 19 | 20 | @staticmethod 21 | def get_reports_mock(linter_types): 22 | """ 23 | :type linter_types list[str] 24 | :rtype: list[LinterEntry] 25 | """ 26 | return [ 27 | LinterEntry(linter_type=linter_type, table_name='foo', message='message') 28 | for linter_type in linter_types 29 | ] 30 | 31 | def test_noop(self): 32 | reports = self.get_reports_mock(self.REPORT_TYPES) 33 | 34 | filtered = filter_reports_by_type(reports) 35 | 
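# with neither checks nor skip_checks given, filter_reports_by_type() should act as a pass-through - e.g. filter_reports_by_type(reports, checks='foo') would instead keep only the entries whose linter_type equals 'foo' (compare the tests below)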
print(filtered) 36 | 37 | assert len(filtered) == len(self.REPORT_TYPES) 38 | 39 | def test_checks_switch(self): 40 | reports = self.get_reports_mock(self.REPORT_TYPES) 41 | 42 | filtered = filter_reports_by_type(reports, checks='foo,test') 43 | print(filtered) 44 | 45 | assert len(filtered) == 3 46 | assert filtered[0].linter_type == 'foo' 47 | assert filtered[1].linter_type == 'test' 48 | assert filtered[2].linter_type == 'test' 49 | 50 | def test_checks_switch_single(self): 51 | reports = self.get_reports_mock(self.REPORT_TYPES) 52 | 53 | filtered = filter_reports_by_type(reports, checks='test') 54 | print(filtered) 55 | 56 | assert len(filtered) == 2 57 | assert filtered[0].linter_type == 'test' 58 | assert filtered[1].linter_type == 'test' 59 | 60 | def test_skip_checks_switch(self): 61 | reports = self.get_reports_mock(self.REPORT_TYPES) 62 | 63 | filtered = filter_reports_by_type(reports, skip_checks='foo,test') 64 | print(filtered) 65 | 66 | assert len(filtered) == 2 67 | assert filtered[0].linter_type == 'bar' 68 | assert filtered[1].linter_type == 'foobar' 69 | 70 | 71 | class FilterReportsByTableTest(TestCase): 72 | 73 | REPORT_TABLES = [ 74 | 'foo', 75 | 'bar', 76 | 'test', 77 | 'test', 78 | 'foobar', 79 | ] 80 | 81 | @staticmethod 82 | def get_reports_mock(tables): 83 | """ 84 | :type tables list[str] 85 | :rtype: list[LinterEntry] 86 | """ 87 | return [ 88 | LinterEntry(linter_type='foo', table_name=table, message='message') 89 | for table in tables 90 | ] 91 | 92 | def test_noop(self): 93 | reports = self.get_reports_mock(self.REPORT_TABLES) 94 | 95 | filtered = filter_reports_by_table(reports) 96 | print(filtered) 97 | 98 | assert len(filtered) == len(self.REPORT_TABLES) 99 | 100 | def test_tables_switch(self): 101 | reports = self.get_reports_mock(self.REPORT_TABLES) 102 | 103 | filtered = filter_reports_by_table(reports, tables='foo,test') 104 | print(filtered) 105 | 106 | assert len(filtered) == 3 107 | assert filtered[0].table_name == 'foo' 108 | assert filtered[1].table_name == 'test' 109 | assert filtered[2].table_name == 'test' 110 | 111 | def test_tables_switch_single(self): 112 | reports = self.get_reports_mock(self.REPORT_TABLES) 113 | 114 | filtered = filter_reports_by_table(reports, tables='test') 115 | print(filtered) 116 | 117 | assert len(filtered) == 2 118 | assert filtered[0].table_name == 'test' 119 | assert filtered[1].table_name == 'test' 120 | 121 | def test_skip_tables_switch(self): 122 | reports = self.get_reports_mock(self.REPORT_TABLES) 123 | 124 | filtered = filter_reports_by_table(reports, skip_tables='foo,test') 125 | print(filtered) 126 | 127 | assert len(filtered) == 2 128 | assert filtered[0].table_name == 'bar' 129 | assert filtered[1].table_name == 'foobar' 130 | 131 | 132 | def test_get_version(monkeypatch: MonkeyPatch): 133 | monkeypatch.setenv('COMMIT_SHA', '1234567890abc') 134 | assert get_version() == f'{VERSION} (git 1234567)' 135 | -------------------------------------------------------------------------------- /indexdigest/cli/add_linter.py: -------------------------------------------------------------------------------- 1 | """ 2 | A helper script used to create files for new linter 3 | """ 4 | from __future__ import print_function 5 | 6 | import logging 7 | import re 8 | import sys 9 | 10 | logging.basicConfig( 11 | level=logging.DEBUG, 12 | format='%(levelname)-8s %(message)s', 13 | ) 14 | 15 | 16 | def add_linter(linter_id, linter_name): 17 | """ 18 | :type linter_id int 19 | :type linter_name str 20 | """ 21 | logger = 
logging.getLogger('add_linter') 22 | 23 | # normalize values 24 | linter_id_fmt = '{:04d}'.format(linter_id) 25 | linter_name = re.sub(r'[^a-z]+', '-', linter_name.strip().lower()) 26 | 27 | logger.info("Creating a new linter: %s - %s ...", linter_id_fmt, linter_name) 28 | 29 | # /sql directory 30 | sql_name = 'sql/{}-{}'.format(linter_id_fmt, linter_name.replace('_', '-')) 31 | logger.info("Add SQL schema and log files (%s) ...", sql_name) 32 | 33 | with open(sql_name + '.sql', mode='wt', encoding='utf-8') as file_name: 34 | # 0002_not_used_indices 35 | table_name = '{}_{}'.format(linter_id_fmt, linter_name.replace('-', '_')) 36 | 37 | file_name.writelines([ 38 | '-- Report ...\n', 39 | '--\n', 40 | '-- https://github.com/macbre/index-digest/issues/{}\n'.format(linter_id), 41 | 'DROP TABLE IF EXISTS `{}`;\n'.format(table_name), 42 | 'CREATE TABLE `{}` (\n'.format(table_name), 43 | '-- \n', 44 | ');\n', 45 | ]) 46 | 47 | logger.info('... %s created', file_name.name) 48 | 49 | with open(sql_name + '-log', mode='wt', encoding='utf-8') as file_name: 50 | file_name.writelines([ 51 | '-- \n', 52 | ]) 53 | 54 | logger.info('... %s created', file_name.name) 55 | 56 | # /indexdigest/linters directory 57 | linter_name = linter_name.replace('-', '_') 58 | logger.info("Add a Python code for %s linter ...", linter_name) 59 | 60 | with open('indexdigest/linters/linter_{}_{}.py'. 61 | format(linter_id_fmt, linter_name), mode='wt', encoding='utf-8') as file_name: 62 | file_name.writelines([ 63 | '"""\n', 64 | 'This linter checks for ...\n', 65 | '"""\n', 66 | 'from collections import defaultdict\n', 67 | '\n', 68 | 'from indexdigest.utils import LinterEntry, explain_queries\n', 69 | '\n', 70 | '\n', 71 | 'def check_{}(database, queries):\n'.format(linter_name), 72 | ' """\n', 73 | ' :type database indexdigest.database.Database\n', 74 | ' :type queries list[str]\n', 75 | ' :rtype: list[LinterEntry]\n', 76 | ' """\n', 77 | ' yield LinterEntry(linter_type=\'{}\', table_name=table_name,\n'. 78 | format(linter_name), 79 | ' message=\'"{}" ...\'.\n', 80 | ' format("foo"),\n', 81 | ' context={"foo": str("bar")})\n', 82 | ]) 83 | 84 | logger.info('... %s created', file_name.name) 85 | 86 | logger.info("Add a test ...") 87 | 88 | with open('indexdigest/test/linters/test_{}_{}.py'.format(linter_id_fmt, linter_name), 89 | mode='wt', encoding='utf-8') \ 90 | as file_name: 91 | file_name.writelines([ 92 | 'from __future__ import print_function\n', 93 | '\n', 94 | 'from unittest import TestCase\n', 95 | '\n', 96 | 'from indexdigest.linters.linter_{0}_{1} import check_{1}\n'. 97 | format(linter_id_fmt, linter_name), 98 | 'from indexdigest.test import DatabaseTestMixin, read_queries_from_log\n', 99 | '\n', 100 | '\n', 101 | 'class TestLinter(TestCase, DatabaseTestMixin):\n', 102 | '\n', 103 | ' def test_{}(self):\n'.format(linter_name), 104 | ' pass\n', 105 | ]) 106 | 107 | logger.info('... 
%s created', file_name.name) 108 | 109 | 110 | def main(): 111 |     """ 112 |     usage: add_linter 89 empty_tables 113 |     """ 114 |     try: 115 |         linter_id = int(sys.argv[1]) 116 |         linter_name = str(sys.argv[2]) 117 | 118 |         add_linter(linter_id, linter_name) 119 |     except (IndexError, ValueError): 120 |         print('Usage: add_linter 89 empty_tables') 121 |         sys.exit(1) 122 | -------------------------------------------------------------------------------- /indexdigest/test/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from ..database import Database 4 | 5 | from unittest import TestCase 6 | 7 | 8 | def read_queries_from_log(log_file): 9 |     """ 10 |     :type log_file str 11 |     :rtype: list[str] 12 |     """ 13 |     with open('sql/{}'.format(log_file), encoding='utf-8') as fp: 14 |         queries = fp.readlines() 15 |         queries = list(map(str.strip, queries))  # remove trailing spaces 16 | 17 |     return queries 18 | 19 | 20 | class DatabaseTestMixin: 21 |     DSN = 'mysql://index_digest:qwerty@127.0.0.1:53306/index_digest' 22 |     DBNAME = 'index_digest' 23 | 24 |     @property 25 |     def connection(self): 26 |         """ 27 |         :rtype: Database 28 |         """ 29 |         return Database.connect_dsn(self.DSN) 30 | 31 | 32 | class BigTableTest(TestCase, DatabaseTestMixin): 33 | 34 |     ROWS = 100000  # how many rows to generate 35 |     BATCH = 5000  # perform INSERT in batches 36 | 37 |     BIG_TABLE_NAME = '0020_big_table' 38 |     PREPARED = False 39 | 40 |     def setUp(self): 41 |         super(BigTableTest, self).setUp() 42 | 43 |         # prepare the big table only once 44 |         if not BigTableTest.PREPARED: 45 |             self._prepare_big_table() 46 |             BigTableTest.PREPARED = True 47 | 48 |         assert self.table_populated(), 'Table 0020_big_table should be populated with values' 49 | 50 |     def _rows(self): 51 |         """ 52 |         Iterate from 1 to self.ROWS 53 |         :rtype: list[int] 54 |         """ 55 |         r = 0 56 |         while r < self.ROWS: 57 |             r += 1 58 |             yield r 59 | 60 |     @staticmethod 61 |     def _insert_values(cursor, values): 62 |         """ 63 |         :type cursor MySQLdb.cursors.BaseCursor 64 |         :type values list[tuple] 65 |         """ 66 |         if len(values) == 0: 67 |             return 68 | 69 |         # @see https://dev.mysql.com/doc/refman/5.7/en/insert.html 70 |         cursor.executemany('INSERT INTO 0020_big_table(item_id,val,text,num) VALUES(%s,%s,%s,%s)', values) 71 |         # print(values[0], cursor.lastrowid) 72 | 73 |     def _prepare_big_table(self): 74 |         """ 75 |         Fill the table with values 76 |         """ 77 |         # @see http://www.mysqltutorial.org/python-mysql-insert/ 78 |         val = 1 79 |         values = [] 80 | 81 |         # use the same connection throughout the function 82 |         connection = self.connection 83 |         cursor = connection.connection.cursor() 84 | 85 |         # is table already populated? 86 |         if self.table_populated(): 87 |             return 88 | 89 |         # no?
populate it 90 | for row in self._rows(): 91 | # Report low cardinality indices, use only a few distinct values (#31) 92 | num = row % 2 93 | 94 | values.append((row, val, '{:05x}'.format(row)[:5], num)) 95 | 96 | if row % 5 == 0: 97 | val += 1 98 | 99 | if len(values) == self.BATCH: 100 | self._insert_values(cursor, values) 101 | values = [] 102 | 103 | # insert any remaining values 104 | self._insert_values(cursor, values) 105 | 106 | # save changes to the database 107 | connection.connection.commit() 108 | 109 | cursor.close() 110 | 111 | # update key distribution statistics (#31) 112 | self.connection.query('ANALYZE TABLE 0020_big_table') 113 | 114 | cardinality_stats = self.connection.query_dict_rows( 115 | "select TABLE_NAME, INDEX_NAME, COLUMN_NAME, CARDINALITY from" 116 | " INFORMATION_SCHEMA.STATISTICS where" 117 | " TABLE_NAME = '{table_name}' AND TABLE_SCHEMA = '{database_name}'".format( 118 | table_name=self.BIG_TABLE_NAME, database_name=self.DBNAME) 119 | ) 120 | logging.warning('Big table initialized, cardinality: %r', list(cardinality_stats)) 121 | 122 | def table_populated(self): 123 | """ 124 | :rtype: bool 125 | """ 126 | return self.connection.query_field('SELECT COUNT(*) FROM 0020_big_table') == self.ROWS 127 | 128 | 129 | class DatabaseWithMockedRow(Database): 130 | 131 | def __init__(self, mocked_row): 132 | super(DatabaseWithMockedRow, self).__init__(db='', host='', passwd='', user='') 133 | self.row = mocked_row 134 | 135 | @property 136 | def connection(self): 137 | raise Exception('Class {} needs to mock the query_* method'.format(self.__class__.__name__)) 138 | 139 | def query(self, sql, cursor=None): 140 | self._queries.append(sql) 141 | self.query_logger.info(sql) 142 | return [self.row] 143 | 144 | def query_row(self, sql): 145 | self._queries.append(sql) 146 | self.query_logger.info(sql) 147 | return self.row 148 | -------------------------------------------------------------------------------- /indexdigest/schema.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data structures for handling schema-related things like indices and columns 3 | """ 4 | 5 | 6 | class Index: 7 | """ 8 | Keeps a single index meta-data 9 | """ 10 | def __init__(self, name, columns, unique=False, primary=False): 11 | """ 12 | :type name str 13 | :type columns list[str] 14 | :type unique bool 15 | :type primary bool 16 | """ 17 | self._name = name 18 | self._columns = columns 19 | self._unique = unique 20 | self._primary = primary 21 | 22 | def is_covered_by(self, index): 23 | """ 24 | Checks if a current index is covered by a different one 25 | 26 | Examples: 27 | 28 | PRIMARY KEY (`id`,`foo`), 29 | UNIQUE KEY `idx` (`id`,`foo`) # redundant 30 | 31 | PRIMARY KEY (`id`), 32 | KEY `idx_foo` (`foo`), # redundant (covered by idx_foo_bar) 33 | KEY `idx_foo_bar` (`foo`, `bar`), 34 | KEY `idx_id_foo` (`id`, `foo`) 35 | 36 | :type index Index 37 | :rtype: bool 38 | """ 39 | # @see https://github.com/macbre/index-digest/issues/4 40 | 41 | # assume primary is never covered by other indices (plus self check) 42 | if self.is_primary or self == index: 43 | return False 44 | 45 | # equal indices - prefer unique over non unique indices 46 | # and primary keys over unique ones 47 | # @see https://github.com/macbre/index-digest/issues/49 48 | if self.columns == index.columns and self.is_unique: 49 | # we're covered by the same unique key or a primary key 50 | if index.is_unique or index.is_primary: 51 | return True 52 | 53 | return False 54 | 55 | # now 
take the subset of columns from the index we're comparing ourselves to 56 |         columns_cnt = len(self.columns) 57 | 58 |         if self.columns == index.columns[:columns_cnt]: 59 |             if self.is_unique and index.is_primary: 60 |                 # the unique key adds a uniqueness bit to the primary key - #49 61 |                 return False 62 | 63 |             return True 64 | 65 |         return False 66 | 67 |     @property 68 |     def name(self): 69 |         """ 70 |         :rtype: str 71 |         """ 72 |         return self._name 73 | 74 |     @property 75 |     def columns(self): 76 |         """ 77 |         :rtype: list[str] 78 |         """ 79 |         return self._columns 80 | 81 |     @property 82 |     def is_unique(self): 83 |         """ 84 |         :rtype: bool 85 |         """ 86 |         return self._unique is True 87 | 88 |     @property 89 |     def is_primary(self): 90 |         """ 91 |         :rtype: bool 92 |         """ 93 |         return self._primary is True 94 | 95 |     def __repr__(self): 96 |         """ 97 |         :rtype: str 98 |         """ 99 |         return '<{}> {}'.format(self.__class__.__name__, str(self)) 100 | 101 |     def __str__(self): 102 |         """ 103 |         :rtype: str 104 |         """ 105 |         return '{type}{name} ({columns})'.format( 106 |             type='PRIMARY KEY' if self.is_primary else 'UNIQUE KEY ' if self.is_unique else 'KEY ', 107 |             name=self.name if not self.is_primary else '', 108 |             columns=', '.join(self.columns) 109 |         ) 110 | 111 | 112 | class Column: 113 |     """ 114 |     Keeps a single table column meta-data 115 | 116 |     @see https://dev.mysql.com/doc/refman/5.7/en/columns-table.html 117 |     """ 118 |     def __init__(self, name, column_type, character_set=None, collation=None): 119 |         """ 120 |         :type name str 121 |         :type column_type str 122 |         :type character_set str 123 |         :type collation str 124 |         """ 125 |         self._name = name 126 |         self._type = column_type 127 |         self._character_set = character_set 128 |         self._collation = collation 129 | 130 |         # As of MySQL 8.0.17, the ZEROFILL attribute is deprecated for numeric data types 131 |         # and support for it will be removed in a future MySQL version. Consider using 132 |         # an alternative means of producing the effect of this attribute. 133 |         # 134 |         # For example, applications could use the LPAD() function to zero-pad numbers up to 135 |         # the desired width, or they could store the formatted numbers in CHAR columns.
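        #
        # An illustrative note on the normalization below (added for clarity, based on
        # the comment above): MySQL 8.0.16 and older report a column declared as INT(9)
        # with its display width, i.e. "int(9)"; newer servers report a plain "int".
        # Stripping the "(N)" part makes both forms compare equal.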
136 |         # 137 |         # https://dev.mysql.com/doc/refman/8.0/en/numeric-type-syntax.html 138 |         if 'int(' in self._type: 139 |             # normalize int(N) from MySQL 8.0.16 and older to int 140 |             self._type = self._type.split('(')[0] 141 | 142 |     @property 143 |     def name(self): 144 |         """ 145 |         :rtype: str 146 |         """ 147 |         return self._name 148 | 149 |     @property 150 |     def type(self): 151 |         """ 152 |         :rtype: str 153 |         """ 154 |         return self._type 155 | 156 |     @property 157 |     def character_set(self): 158 |         """ 159 |         :rtype: str 160 |         """ 161 |         return self._character_set 162 | 163 |     @property 164 |     def collation(self): 165 |         """ 166 |         :rtype: str 167 |         """ 168 |         return self._collation 169 | 170 |     def is_text_type(self): 171 |         """ 172 |         :rtype: bool 173 |         """ 174 |         base_type = self.type.split('(')[0].upper() 175 |         # @see https://dev.mysql.com/doc/refman/5.7/en/string-types.html 176 |         return base_type in \ 177 |             ['CHAR', 'VARCHAR', 'BINARY', 'VARBINARY', 'BLOB', 'TEXT', 'ENUM', 'SET'] 178 | 179 |     def is_timestamp_type(self): 180 |         """ 181 |         :rtype: bool 182 |         """ 183 |         base_type = self.type.upper() 184 |         # @see https://dev.mysql.com/doc/refman/5.7/en/date-and-time-types.html 185 |         return base_type in \ 186 |             ['DATE', 'TIME', 'DATETIME', 'TIMESTAMP', 'YEAR'] 187 | 188 |     def __repr__(self): 189 |         """ 190 |         :rtype: str 191 |         """ 192 |         return '<{}> {}'.format(self.__class__.__name__, str(self)) 193 | 194 |     def __str__(self): 195 |         """ 196 |         :rtype: str 197 |         """ 198 |         return self._name 199 | -------------------------------------------------------------------------------- /indexdigest/cli/script.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=line-too-long 2 | """index_digest 3 | 4 | Analyses your database queries and schema and suggests indices improvements. 5 | 6 | Usage: 7 |   index_digest DSN [--sql-log=<file>] [--format=<format>] [--analyze-data] [--check-empty-databases] [--checks=<checks> | --skip-checks=<skip-checks>] [--tables=<tables> | --skip-tables=<skip-tables>] 8 |   index_digest (-h | --help) 9 |   index_digest --version 10 | 11 | Options: 12 |   DSN                          Data Source Name of database to check 13 |   --sql-log=<file>             Text file with SQL queries to check against the database 14 |   --format=<format>            Use a given results formatter (plain, syslog, yaml) 15 |   --analyze-data               Run additional checks that will query table data (can be slow!) 16 |   --check-empty-databases      Detect empty databases on this MySQL server 17 |   --checks=<checks>            Comma-separated list of checks to report 18 |   --skip-checks=<skip-checks>  Comma-separated list of checks to skip from the report 19 |   --tables=<tables>            Comma-separated list of tables to report 20 |   --skip-tables=<skip-tables>  Comma-separated list of tables to skip from the report 21 |   -h --help                    Show this screen. 22 |   --version                    Show version.
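  DSN format (for reference, matching the Examples below): mysql://user:password@host[:port]/dbname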
23 | 24 | Examples: 25 |   index_digest mysql://username:password@localhost/dbname 26 |   index_digest mysql://index_digest:qwerty@localhost/index_digest --sql-log=sql.log 27 |   index_digest mysql://index_digest:qwerty@localhost/index_digest --skip-checks=non_utf_columns 28 |   index_digest mysql://index_digest:qwerty@localhost/index_digest --analyze-data --checks=data_too_old,data_not_updated_recently 29 |   index_digest mysql://index_digest:qwerty@localhost/index_digest --analyze-data --skip-tables=DATABASECHANGELOG,DATABASECHANGELOGLOCK 30 | 31 | Visit <https://github.com/macbre/index-digest> 32 | """ 33 | from __future__ import print_function 34 | 35 | import logging 36 | from itertools import chain 37 | from os import getenv, environ 38 | 39 | from docopt import docopt 40 | 41 | import indexdigest 42 | from indexdigest.database import Database 43 | from indexdigest.utils import IndexDigestError 44 | from indexdigest.formatters import \ 45 |     format_plain, \ 46 |     format_syslog, \ 47 |     format_yaml 48 | from indexdigest.linters import \ 49 |     check_queries_using_filesort, check_queries_using_temporary, \ 50 |     check_not_used_indices, check_queries_not_using_indices, \ 51 |     check_not_used_tables, check_not_used_columns, \ 52 |     check_redundant_indices, \ 53 |     check_full_table_scan, \ 54 |     check_latin_columns, \ 55 |     check_selects_with_like, \ 56 |     check_missing_primary_index, \ 57 |     check_test_tables, \ 58 |     check_insert_ignore_queries, \ 59 |     check_single_column, \ 60 |     check_empty_tables, \ 61 |     check_select_star, \ 62 |     check_having_clause, \ 63 |     check_data_too_old, \ 64 |     check_data_not_updated_recently, \ 65 |     check_generic_primary_key, \ 66 |     check_high_offset_selects, \ 67 |     check_use_innodb, \ 68 |     check_empty_database, \ 69 |     check_low_cardinality_index 70 | 71 | 72 | def get_version() -> str: 73 |     """ 74 |     Return version string, e.g.
75 | 1.5.0 (git 1a258db) 76 | """ 77 | return '{version} (git {commit})'.format( 78 | version=indexdigest.VERSION, 79 | commit=getenv('COMMIT_SHA', 'dev')[:7] 80 | ) 81 | 82 | 83 | def get_reports(database, sql_log=None, analyze_data=False, check_empty_databases=False): 84 | """ 85 | :type database Database 86 | :type sql_log str 87 | :type analyze_data bool 88 | :type check_empty_databases bool 89 | :rtype: list[indexdigest.utils.LinterEntry] 90 | """ 91 | logger = logging.getLogger(__name__) 92 | 93 | # read SQL log file (if provided) 94 | if sql_log: 95 | logger.debug('Trying to open SQL log file: %s', sql_log) 96 | 97 | with open(sql_log, encoding='utf-8') as log_file: 98 | queries = log_file.readlines() 99 | queries = list(map(str.strip, queries)) # remove trailing spaces 100 | logger.debug('Got %d entries in SQL log file', len(queries)) 101 | else: 102 | queries = None 103 | 104 | # run all checks 105 | reports = chain( 106 | check_redundant_indices(database), 107 | check_latin_columns(database), 108 | check_missing_primary_index(database), 109 | check_test_tables(database), 110 | check_single_column(database), 111 | check_empty_tables(database), 112 | check_generic_primary_key(database), 113 | check_use_innodb(database), 114 | check_low_cardinality_index(database), 115 | ) 116 | 117 | # checks that use SQL log 118 | if queries: 119 | reports = chain( 120 | reports, 121 | check_not_used_indices(database, queries=queries), 122 | check_not_used_tables(database, queries=queries), 123 | check_not_used_columns(database, queries=queries), 124 | check_queries_not_using_indices(database, queries=queries), 125 | check_queries_using_filesort(database, queries=queries), 126 | check_queries_using_temporary(database, queries=queries), 127 | check_full_table_scan(database, queries=queries), 128 | check_selects_with_like(database, queries=queries), 129 | check_insert_ignore_queries(database, queries=queries), 130 | check_select_star(database, queries=queries), 131 | check_having_clause(database, queries=queries), 132 | check_high_offset_selects(database, queries=queries), 133 | ) 134 | 135 | # checks that require --analyze-data switch to be on (see #28) 136 | if analyze_data is True: 137 | logger.info("Will run data analyzing checks, can take a while...") 138 | 139 | reports = chain( 140 | reports, 141 | check_data_too_old(database, env=environ), 142 | check_data_not_updated_recently(database, env=environ), 143 | ) 144 | 145 | # --check-empty-databases switch to be on to run "empty_database" (see #146) 146 | if check_empty_databases is True: 147 | logger.info("Will analyze databases on this MySQL server, can take a while...") 148 | 149 | reports = chain( 150 | reports, 151 | check_empty_database(database), 152 | ) 153 | 154 | return reports 155 | 156 | 157 | def filter_reports_by_type(reports, checks=None, skip_checks=None): 158 | """ 159 | :type reports list[indexdigest.utils.LinterEntry] 160 | :type checks str 161 | :type skip_checks str 162 | :rtype: list[indexdigest.utils.LinterEntry] 163 | """ 164 | if checks: 165 | return [ 166 | report for report in reports 167 | if report.linter_type in checks.split(',') 168 | ] 169 | 170 | if skip_checks: 171 | return [ 172 | report for report in reports 173 | if report.linter_type not in skip_checks.split(',') 174 | ] 175 | 176 | return reports 177 | 178 | 179 | def filter_reports_by_table(reports, tables=None, skip_tables=None): 180 | """ 181 | :type reports list[indexdigest.utils.LinterEntry] 182 | :type tables str 183 | :type skip_tables str 184 | 
:rtype: list[indexdigest.utils.LinterEntry] 185 | """ 186 | if tables: 187 | return [ 188 | report for report in reports 189 | if report.table_name in tables.split(',') 190 | ] 191 | 192 | if skip_tables: 193 | return [ 194 | report for report in reports 195 | if report.table_name not in skip_tables.split(',') 196 | ] 197 | 198 | return reports 199 | 200 | 201 | def main(): # pragma: no cover 202 | """ Main entry point for CLI""" 203 | logger = logging.getLogger(__name__) 204 | 205 | arguments = docopt(__doc__, version=f'index_digest {get_version()}') 206 | logger.debug('Options: %s', arguments) 207 | 208 | if 'DSN' not in arguments: 209 | return 210 | 211 | # connect to the database 212 | database = Database.connect_dsn(arguments['DSN']) 213 | logger.debug('Connected to MySQL server v%s', database.get_server_version()) 214 | 215 | reports = get_reports( 216 | database, 217 | sql_log=arguments.get('--sql-log'), 218 | analyze_data=arguments.get('--analyze-data'), 219 | check_empty_databases=arguments.get('--check-empty-databases') 220 | ) 221 | 222 | # handle --checks / --skip-checks 223 | reports = filter_reports_by_type( 224 | reports, 225 | checks=arguments.get('--checks'), 226 | skip_checks=arguments.get('--skip-checks') 227 | ) 228 | 229 | # handle --tables / --skip-tables 230 | reports = filter_reports_by_table( 231 | reports, 232 | tables=arguments.get('--tables'), 233 | skip_tables=arguments.get('--skip-tables') 234 | ) 235 | 236 | # handle --format 237 | formatter = arguments.get('--format') or 'plain' 238 | logger.info("Using formatter: %s", formatter) 239 | 240 | if formatter == 'plain': 241 | print(format_plain(database, reports)) 242 | elif formatter == 'syslog': 243 | ident = getenv('SYSLOG_IDENT', 'index-digest') 244 | logger.info('Using syslog ident: %s', ident) 245 | print(format_syslog(database, reports, ident)) 246 | elif formatter == 'yaml': 247 | print(format_yaml(database, reports)) 248 | else: 249 | raise IndexDigestError('Unknown formatter provided: {}'.format(formatter)) 250 | -------------------------------------------------------------------------------- /indexdigest/database.py: -------------------------------------------------------------------------------- 1 | """ 2 | Database connector wrapper 3 | """ 4 | import logging 5 | import re 6 | from collections import OrderedDict, defaultdict 7 | from warnings import filterwarnings 8 | 9 | import MySQLdb 10 | from MySQLdb.cursors import DictCursor 11 | from MySQLdb._exceptions import OperationalError, ProgrammingError 12 | 13 | from indexdigest.schema import Column, Index 14 | from indexdigest.utils import parse_dsn, memoize, IndexDigestError 15 | 16 | 17 | class IndexDigestQueryError(IndexDigestError): 18 | """ 19 | A wrapper for _mysql_exceptions.OperationalError: 20 | """ 21 | 22 | 23 | class DatabaseBase: 24 | """ 25 | A generic wrapper of database connection with basic querying functionality. 
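    Illustrative usage sketch (the DSN below is a placeholder, not a repository default):

        db = DatabaseBase.connect_dsn('mysql://user:password@127.0.0.1:3306/dbname')
        for variable, value in db.query_key_value('SHOW VARIABLES').items():
            print(variable, value)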
26 | 27 | Sub-class this to mock database connection 28 | """ 29 | 30 | # pylint:disable=too-many-positional-arguments,too-many-arguments,invalid-name 31 | def __init__(self, host: str, user: str, passwd: str, db: str, port: int=3306): 32 | """ 33 | Connects to a given database 34 | """ 35 | self.logger = logging.getLogger(__name__) 36 | self.query_logger = logging.getLogger(__name__ + '.query') 37 | 38 | # lazy connect 39 | self._connection_params = dict(host=host, port=port, user=user, passwd=passwd, db=db) 40 | self._connection = None 41 | self.db_name = db 42 | 43 | # Suppress MySQL warnings when EXPLAIN is run (#63) 44 | filterwarnings('ignore', category=MySQLdb.Warning) 45 | 46 | # register queries 47 | self._queries = [] 48 | 49 | @classmethod 50 | def connect_dsn(cls, dsn): 51 | """ 52 | :type dsn str 53 | :rtype DatabaseBase 54 | """ 55 | parsed = parse_dsn(dsn) 56 | return cls(**parsed) 57 | 58 | @property 59 | def connection(self): 60 | """ 61 | Lazy connection 62 | 63 | :rtype: Connection 64 | """ 65 | if self._connection is None: 66 | self.logger.info('Lazy connecting to %s:%i and using %s database', 67 | self._connection_params['host'], self._connection_params['port'], 68 | self._connection_params['db']) 69 | 70 | self._connection = MySQLdb.connect(**self._connection_params) 71 | 72 | return self._connection 73 | 74 | def get_queries(self): 75 | """ 76 | :rtype: list[str] 77 | """ 78 | return self._queries 79 | 80 | def query(self, sql, cursor_class=None): 81 | """ 82 | :type sql str 83 | :type cursor_class MySQLdb.cursors.BaseCursor 84 | :rtype: MySQLdb.cursors.Cursor 85 | :raises IndexDigestQueryError 86 | """ 87 | self.query_logger.info('%s', sql) 88 | 89 | cursor = self.connection.cursor(cursorclass=cursor_class) 90 | 91 | try: 92 | # Python 3: query should be str (unicode) when executing % 93 | try: 94 | sql = sql.encode('utf8') 95 | except UnicodeDecodeError: 96 | pass 97 | 98 | cursor.execute(sql) 99 | except (OperationalError, ProgrammingError) as ex: 100 | # e.g. (1054, "Unknown column 'test' in 'field list'") - OperationalError 101 | # e.g. (1146, "Table 'index_digest.t' doesn't exist") - ProgrammingError 102 | (code, message) = ex.args 103 | self.query_logger.error('Database error #%d: %s', code, message) 104 | raise IndexDigestQueryError(message) from ex 105 | 106 | # register the query 107 | self._queries.append(sql) 108 | 109 | return cursor 110 | 111 | def query_row(self, sql): 112 | """ 113 | :type sql str 114 | :rtype: list 115 | """ 116 | return self.query(sql).fetchone() 117 | 118 | def query_dict_row(self, sql): 119 | """ 120 | Return a single row as a dictionary 121 | 122 | :type sql str 123 | :rtype: dict 124 | """ 125 | # DictCursor is a Cursor class that returns rows as dictionaries 126 | return self.query(sql, cursor_class=DictCursor).fetchone() 127 | 128 | def query_dict_rows(self, sql): 129 | """ 130 | Return all rows as dictionaries 131 | 132 | :type sql str 133 | :rtype: dict[] 134 | """ 135 | # DictCursor is a Cursor class that returns rows as dictionaries 136 | yield from self.query(sql, cursor_class=DictCursor) 137 | 138 | def query_field(self, sql): 139 | """ 140 | :type sql str 141 | :rtype: str 142 | """ 143 | return self.query_row(sql)[0] 144 | 145 | def query_list(self, sql): 146 | """ 147 | Returns an iterator with the first field on each row. 148 | 149 | e.g. 
SHOW TABLES 150 | 151 |         :type sql str 152 |         :rtype: list[str] 153 |         """ 154 |         for row in self.query(sql): 155 |             yield str(row[0]) 156 | 157 |     def query_key_value(self, sql): 158 |         """ 159 |         Returns an ordered dictionary with key / value taken from the first two fields of each row. 160 | 161 |         e.g. SHOW VARIABLES 162 | 163 |         :type sql str 164 |         :rtype: OrderedDict 165 |         """ 166 |         res = OrderedDict() 167 | 168 |         for row in self.query(sql): 169 |             res[row[0]] = row[1] 170 | 171 |         return res 172 | 173 | 174 | class Database(DatabaseBase): 175 |     """ 176 |     Database wrapper extended with some stats-related queries 177 |     """ 178 | 179 |     @memoize 180 |     def get_server_version(self): 181 |         """ 182 |         Returns server version (e.g. "5.5.57-0+deb8u1") 183 | 184 |         :rtype: str 185 |         """ 186 |         return self.query_field('SELECT VERSION()') 187 | 188 |     def get_server_hostname(self): 189 |         """ 190 |         Return hostname of the server 191 |         :rtype: str 192 |         """ 193 |         return self.get_variables(like='hostname').get('hostname') 194 | 195 |     @memoize 196 |     def get_tables(self): 197 |         """ 198 |         Returns the alphabetically sorted list of tables (ignore views) 199 | 200 |         :rtype: list[str] 201 |         """ 202 |         return sorted(self.query_list( 203 |             'SELECT TABLE_NAME FROM information_schema.tables ' 204 |             'WHERE table_schema = "{}" and TABLE_TYPE = "BASE TABLE"'. 205 |             format(self.db_name) 206 |         )) 207 | 208 |     def get_variables(self, like=None): 209 |         """ 210 |         Returns the key / value dictionary with server variables 211 | 212 |         :type like str 213 |         :rtype: OrderedDict 214 |         """ 215 |         sql = 'SHOW VARIABLES' 216 |         if like is not None: 217 |             sql += ' LIKE "{}%"'.format(like) 218 | 219 |         return self.query_key_value(sql) 220 | 221 |     @memoize 222 |     def explain_query(self, sql): 223 |         """ 224 |         Runs EXPLAIN query for a given SQL 225 | 226 |         :type sql str 227 |         :rtype: list 228 |         """ 229 |         # @see https://dev.mysql.com/doc/refman/5.7/en/explain-output.html 230 |         return list(self.query_dict_rows('EXPLAIN {}'.format(sql))) 231 | 232 |     @memoize 233 |     def get_table_schema(self, table_name): 234 |         """ 235 |         Run SHOW CREATE TABLE query for a given table 236 |         :type table_name str 237 |         :rtype: str 238 |         """ 239 |         # @see https://dev.mysql.com/doc/refman/5.7/en/show-create-table.html 240 |         schema = str(self.query_row('SHOW CREATE TABLE `{}`'.format(table_name))[1]) 241 | 242 |         # remove partitions definition (#107) 243 |         schema = re.sub(r'/\*!50100[^*]+\*/', '', schema) 244 | 245 |         return schema.rstrip() 246 | 247 |     def _get_information_schema_where(self, table_name): 248 |         """ 249 |         :type table_name str 250 |         :rtype: str 251 |         """ 252 |         # @see https://dev.mysql.com/doc/refman/5.7/en/information-schema.html 253 |         return "WHERE TABLE_SCHEMA='{db}' AND TABLE_NAME='{table_name}'".format( 254 |             db=self._connection_params['db'], table_name=table_name) 255 | 256 |     @memoize 257 |     def get_table_metadata(self, table_name): 258 |         """ 259 |         Return table's metadata 260 | 261 |         :type table_name str 262 |         :rtype: dict 263 |         """ 264 |         # https://dev.mysql.com/doc/refman/5.7/en/tables-table.html 265 |         # https://mariadb.com/kb/en/information-schema-tables-table/ 266 |         stats = self.query_dict_row( 267 |             "SELECT ENGINE, TABLE_ROWS, DATA_LENGTH, INDEX_LENGTH " 268 |             "FROM information_schema.TABLES " + self._get_information_schema_where(table_name)) 269 | 270 |         # TODO: introduce dataclass 271 |         return { 272 |             'engine': stats['ENGINE'], 273 |             'rows': stats['TABLE_ROWS'],  # For InnoDB the row count is only a rough estimate 274 |             'data_size': stats['DATA_LENGTH'], 275 |
'index_size': stats['INDEX_LENGTH'], 276 |         } 277 | 278 |     @memoize 279 |     def get_table_columns(self, table_name): 280 |         """ 281 |         Return the list of columns for a given table 282 | 283 |         :type table_name str 284 |         :rtype: list[Column] 285 |         """ 286 |         # @see https://dev.mysql.com/doc/refman/8.0/en/show-columns.html 287 |         try: 288 |             columns = [ 289 |                 row['Field'] 290 |                 for row in self.query_dict_rows("SHOW COLUMNS FROM `{}`".format(table_name)) 291 |             ] 292 |         except IndexDigestQueryError: 293 |             logger = logging.getLogger('get_table_columns') 294 |             logger.error('Cannot get columns list for table: %s', table_name) 295 |             return None 296 | 297 |         # @see https://dev.mysql.com/doc/refman/8.0/en/information-schema-columns-table.html 298 |         rows = self.query_dict_rows( 299 |             "SELECT COLUMN_NAME as NAME, COLUMN_TYPE as TYPE, CHARACTER_SET_NAME, COLLATION_NAME " 300 |             "FROM information_schema.COLUMNS " + self._get_information_schema_where(table_name)) 301 | 302 |         meta = dict() 303 | 304 |         for row in rows: 305 |             # TYPE: e.g. int(9), varchar(32) 306 |             meta[row['NAME']] = Column(name=row['NAME'], column_type=row['TYPE'], 307 |                                        character_set=row['CHARACTER_SET_NAME'], 308 |                                        collation=row['COLLATION_NAME']) 309 | 310 |         # keep the order taken from SHOW COLUMNS 311 |         return [ 312 |             meta[column] 313 |             for column in columns 314 |         ] 315 | 316 |     @memoize 317 |     def get_table_indices(self, table_name): 318 |         """ 319 |         Return the list of indices for a given table 320 | 321 |         :type table_name str 322 |         :rtype: list[Index] 323 |         """ 324 |         # @see https://dev.mysql.com/doc/refman/5.7/en/statistics-table.html 325 |         # @see https://dev.mysql.com/doc/refman/5.7/en/show-index.html 326 |         res = self.query_dict_rows( 327 |             "SELECT INDEX_NAME, NON_UNIQUE, SEQ_IN_INDEX, COLUMN_NAME, CARDINALITY " + 328 |             "FROM information_schema.STATISTICS " + self._get_information_schema_where(table_name) + 329 |             " ORDER BY INDEX_NAME, SEQ_IN_INDEX") 330 | 331 |         index_columns = defaultdict(list) 332 |         index_meta = OrderedDict() 333 | 334 |         for row in res: 335 |             index_name = row['INDEX_NAME'] 336 |             index_columns[index_name].append(row['COLUMN_NAME']) 337 | 338 |             if index_name not in index_meta: 339 |                 index_meta[index_name] = { 340 |                     'unique': int(row['NON_UNIQUE']) == 0, 341 |                     'primary': row['INDEX_NAME'] == 'PRIMARY', 342 |                 } 343 | 344 |         ret = [] 345 | 346 |         for index_name, meta in index_meta.items(): 347 |             columns = index_columns[index_name] 348 |             ret.append(Index( 349 |                 name=index_name, columns=columns, primary=meta['primary'], unique=meta['unique'])) 350 | 351 |         return ret 352 | 353 |     @memoize 354 |     def get_table_rows_estimate(self, table_name): 355 |         """ 356 |         Estimate table's rows count by running EXPLAIN SELECT COUNT(*) FROM foo 357 | 358 |         #96 - For MySQL 8.0 we fall back to a "raw" query: SELECT COUNT(*) FROM foo 359 | 360 |         :type table_name str 361 |         :rtype: int 362 |         """ 363 |         sql = "SELECT COUNT(*) FROM `{}`".format(table_name) 364 |         explain_row = self.explain_query(sql)[0] 365 | 366 |         # EXPLAIN query returned rows count 367 |         if explain_row['rows'] is not None: 368 |             return int(explain_row['rows']) 369 | 370 |         # "Select tables optimized away" was returned by the query (see #96) 371 |         self.logger.info("'EXPLAIN %s' query returned '%s' in Extra field", 372 |                          sql, explain_row['Extra']) 373 | 374 |         return self.query_field(sql) 375 | -------------------------------------------------------------------------------- /indexdigest/test/core/test_database.py: -------------------------------------------------------------------------------- 1 | # -*-
coding: utf8 -*- 2 | from __future__ import print_function 3 | 4 | from unittest import TestCase 5 | 6 | from indexdigest.test import DatabaseTestMixin, DatabaseWithMockedRow 7 | from indexdigest.database import DatabaseBase 8 | 9 | 10 | class TestDatabaseBase(TestCase, DatabaseTestMixin): 11 | 12 | def test_database_connect(self): 13 | conn = DatabaseBase(host='127.0.0.1', user='index_digest', passwd='qwerty', db='index_digest') 14 | self.assertIsInstance(conn, DatabaseBase) 15 | 16 | def test_database_connect_dsn(self): 17 | self.assertIsInstance(self.connection, DatabaseBase) 18 | 19 | def test_query_list(self): 20 | res = list(self.connection.query_list('SHOW DATABASES')) 21 | 22 | self.assertTrue('information_schema' in res, res) 23 | self.assertTrue('index_digest' in res, res) 24 | 25 | def test_query_field(self): 26 | cnt = self.connection.query_field('SELECT count(*) FROM 0000_the_table') 27 | 28 | self.assertEqual(cnt, 3) 29 | 30 | def test_query_row(self): 31 | row = self.connection.query_row('SELECT * FROM 0000_the_table WHERE item_id = 1') 32 | 33 | self.assertEqual(row[0], 1) 34 | self.assertEqual(row[1], 'test') 35 | 36 | def test_query_dict_row(self): 37 | row = self.connection.query_dict_row('SELECT * FROM 0000_the_table ORDER BY 1') 38 | print(row) 39 | 40 | self.assertEqual(row['item_id'], 1) 41 | self.assertEqual(row['foo'], 'test') 42 | 43 | def test_query_dict_rows(self): 44 | rows = list(self.connection.query_dict_rows('SELECT * FROM 0000_the_table ORDER BY 1')) 45 | row = rows[0] 46 | print(row) 47 | 48 | self.assertEqual(len(rows), 3) 49 | 50 | self.assertEqual(row['item_id'], 1) 51 | self.assertEqual(row['foo'], 'test') 52 | 53 | 54 | class TestDatabase(TestCase, DatabaseTestMixin): 55 | 56 | TABLE_NAME = '0000_the_table' 57 | 58 | def test_database_version(self): 59 | # 5.5.57-0+deb8u1 / 8.0.3-rc-log / 9.4.0 MySQL Community Server - GPL 60 | # 10.2.10-MariaDB-10.2.10+maria~jessie / 12.0.2-MariaDB-ubu2404 mariadb.org binary distribution 61 | version = self.connection.get_server_version() 62 | 63 | self.assertTrue( 64 | version.startswith('5.') or version.startswith('8.') or version.startswith('9.') or 'MariaDB' in version, 65 | 'MySQL server should be from 5.x/8.x/9.x line or have MariaDB part') 66 | 67 | def test_get_tables(self): 68 | tables = list(self.connection.get_tables()) 69 | print(tables) 70 | 71 | assert self.TABLE_NAME in tables 72 | assert '0000_the_table-metadata' in tables 73 | assert '0000_the_view' not in tables 74 | 75 | def test_get_variables(self): 76 | variables = self.connection.get_variables() 77 | print(variables) 78 | 79 | self.assertTrue('version_compile_os' in variables) 80 | self.assertTrue('innodb_data_home_dir' in variables) 81 | 82 | def test_get_variables_like(self): 83 | variables = self.connection.get_variables(like='innodb') 84 | print(variables) 85 | 86 | self.assertFalse('version_compile_os' in variables) # this variable does not match given like 87 | self.assertTrue('innodb_data_home_dir' in variables) 88 | 89 | def test_explain_and_utf_query(self): 90 | """ 91 | mysql> explain SELECT * FROM 0000_the_table WHERE foo = "foo ąęź"; 92 | +----+-------------+----------------+------+---------------+---------+---------+-------+------+--------------------------+ 93 | | id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra | 94 | +----+-------------+----------------+------+---------------+---------+---------+-------+------+--------------------------+ 95 | | 1 | SIMPLE | 0000_the_table | ref | idx_foo | 
idx_foo | 50 | const | 1 | Using where; Using index | 96 | +----+-------------+----------------+------+---------------+---------+---------+-------+------+--------------------------+ 97 | 1 row in set (0.00 sec) 98 | """ 99 | res = list(self.connection.explain_query('SELECT * FROM {} WHERE foo = "foo ąęź"'.format(self.TABLE_NAME))) 100 | row = res[0] 101 | print(row) 102 | 103 | self.assertEqual(len(res), 1) 104 | self.assertEqual(row['key'], 'idx_foo') 105 | self.assertEqual(row['table'], self.TABLE_NAME) 106 | self.assertTrue('Using index' in row['Extra']) 107 | 108 | def test_get_table_indices(self): 109 | """ 110 | mysql> SELECT INDEX_NAME, NON_UNIQUE, SEQ_IN_INDEX, COLUMN_NAME, CARDINALITY 111 | FROM INFORMATION_SCHEMA.STATISTICS WHERE table_name = '0000_the_table' 112 | ORDER BY INDEX_NAME, SEQ_IN_INDEX; 113 | +------------+------------+--------------+-------------+-------------+ 114 | | INDEX_NAME | NON_UNIQUE | SEQ_IN_INDEX | COLUMN_NAME | CARDINALITY | 115 | +------------+------------+--------------+-------------+-------------+ 116 | | idx_foo | 1 | 1 | foo | 3 | 117 | | PRIMARY | 0 | 1 | id | 3 | 118 | | PRIMARY | 0 | 2 | foo | 3 | 119 | +------------+------------+--------------+-------------+-------------+ 120 | 3 rows in set (0.00 sec) 121 | """ 122 | (idx, primary) = self.connection.get_table_indices(self.TABLE_NAME) 123 | print(idx, primary) 124 | 125 | self.assertEqual(idx.name, 'idx_foo') 126 | self.assertEqual(primary.name, 'PRIMARY') 127 | 128 | self.assertListEqual(idx.columns, ['foo']) 129 | self.assertListEqual(primary.columns, ['item_id', 'foo']) 130 | 131 | self.assertFalse(idx.is_primary) 132 | self.assertFalse(idx.is_unique) 133 | self.assertTrue(primary.is_primary, 'Primary key is correctly detected') 134 | self.assertTrue(primary.is_unique, 'Primary key should be treated as a unique one') 135 | 136 | # assert False 137 | 138 | def test_get_table_schema(self): 139 | schema = self.connection.get_table_schema(self.TABLE_NAME) 140 | print(schema) 141 | 142 | self.assertTrue('CREATE TABLE `0000_the_table` (' in schema) 143 | self.assertTrue('PRIMARY KEY (`item_id`,`foo`),' in schema) 144 | self.assertTrue('ENGINE=InnoDB' in schema) 145 | 146 | # assert False 147 | 148 | def test_get_table_metadata(self): 149 | meta = self.connection.get_table_metadata(self.TABLE_NAME) 150 | print(meta) 151 | 152 | # stats 153 | self.assertEqual(meta['engine'], 'InnoDB') 154 | self.assertAlmostEqual(meta['rows'], 3, delta=1) 155 | self.assertTrue(meta['index_size'] > 0) 156 | self.assertTrue(meta['data_size'] > 0) 157 | 158 | # assert False 159 | 160 | def test_get_table_columns(self): 161 | columns = self.connection.get_table_columns(self.TABLE_NAME) 162 | print(columns) 163 | 164 | # the columns order is maintained 165 | column_names = [column.name for column in columns] 166 | 167 | # columns 168 | self.assertTrue('item_id' in column_names) 169 | self.assertTrue('foo' in column_names) 170 | 171 | self.assertEqual(columns[0].name, 'item_id') 172 | self.assertEqual(columns[0].type, 'int') 173 | self.assertIsNone(columns[0].character_set) # numeric column 174 | 175 | self.assertEqual(columns[1].name, 'foo') 176 | self.assertEqual(columns[1].type, 'varchar(16)') 177 | self.assertIn(columns[1].character_set, ['utf8', 'utf8mb3']) 178 | 179 | self.assertEqual(len(columns), 2) 180 | 181 | # assert False 182 | 183 | def test_get_table_rows_estimate(self): 184 | self.assertAlmostEqual(self.connection.get_table_rows_estimate(self.TABLE_NAME), 3, delta=1) 185 | 186 | 187 | class 
TestsWithDatabaseMocked(TestCase): 188 | 189 |     def test_database_hostname(self): 190 |         db = DatabaseWithMockedRow(mocked_row=['hostname', 'kopytko.foo.net']) 191 |         self.assertEqual(db.get_server_hostname(), 'kopytko.foo.net') 192 | 193 |     def test_database_version(self): 194 |         db = DatabaseWithMockedRow(mocked_row=['5.5.58-0+deb8u1']) 195 |         self.assertEqual(db.get_server_version(), '5.5.58-0+deb8u1') 196 | 197 | 198 | class TestMemoization(TestCase, DatabaseTestMixin): 199 | 200 |     def test_get_queries(self): 201 |         db = DatabaseWithMockedRow(mocked_row=['foo']) 202 | 203 |         # query method is not memoized, so let's count all queries (even the same ones) 204 |         for _ in range(5): 205 |             self.assertEqual(db.query_row('SELECT FOO'), ['foo']) 206 | 207 |         self.assertEqual(len(db.get_queries()), 5) 208 |         self.assertEqual(db.get_queries()[0], 'SELECT FOO') 209 | 210 |     def test_cached_get_tables(self): 211 |         tables_list = ['foo'] 212 |         db = DatabaseWithMockedRow(mocked_row=tables_list) 213 | 214 |         # this would have made five queries to the database if not for the memoization in get_tables 215 |         for _ in range(5): 216 |             self.assertEqual(db.get_tables(), tables_list) 217 | 218 |         # however, only one is made :) 219 |         self.assertEqual(len(db.get_queries()), 1) 220 | 221 |     def test_cached_explain_query(self): 222 |         db = self.connection 223 | 224 |         # this would have made ten queries to the database if not for the memoization in explain_query 225 |         # also test that @memoize decorator correctly handles different arguments 226 |         for _ in range(5): 227 |             (row,) = db.explain_query('SELECT * FROM 0000_the_table') 228 |             self.assertEqual(row['table'], '0000_the_table') 229 | 230 |             (row,) = db.explain_query('SELECT * FROM 0002_not_used_indices') 231 |             self.assertEqual(row['table'], '0002_not_used_indices') 232 | 233 |         queries = db.get_queries() 234 |         print(queries) 235 | 236 |         # however, only two are made :) 237 |         self.assertEqual(len(queries), 2) 238 | 239 |         self.assertTrue('EXPLAIN SELECT * FROM 0000_the_table' in str(queries[0])) 240 |         self.assertTrue('EXPLAIN SELECT * FROM 0002_not_used_indices' in str(queries[1])) 241 | 242 |     def test_cached_get_indices(self): 243 |         db = self.connection 244 | 245 |         # this would have made ten queries to the database if not for the memoization in get_table_indices 246 |         # also test that @memoize decorator correctly handles different arguments 247 |         for _ in range(5): 248 |             (_, primary) = db.get_table_indices(table_name='0000_the_table') 249 |             self.assertTrue(primary.is_primary) 250 | 251 |             (idx, _, _) = db.get_table_indices(table_name='0002_not_used_indices') 252 |             self.assertEqual(idx.name, 'foo_id_idx') 253 | 254 |         queries = db.get_queries() 255 |         print(queries) 256 | 257 |         # however, only two are made :) 258 |         self.assertEqual(len(queries), 2) 259 | 260 |         self.assertTrue('0000_the_table' in str(queries[0])) 261 |         self.assertTrue('0002_not_used_indices' in str(queries[1])) 262 | 263 |     def test_cached_get_columns(self): 264 |         db = self.connection 265 | 266 |         # this would have made ten queries to the database if not for the memoization in get_table_columns 267 |         # also test that @memoize decorator correctly handles different arguments 268 |         for _ in range(5): 269 |             (col, _) = db.get_table_columns(table_name='0000_the_table') 270 |             self.assertEqual(col.name, 'item_id') 271 | 272 |             (_, col, _, _) = db.get_table_columns(table_name='0002_not_used_indices') 273 |             self.assertEqual(col.name, 'foo') 274 | 275 |         queries = db.get_queries() 276 |         print(queries) 277 | 278 |         # however, only four are made :) 279 |         self.assertEqual(len(queries), 4) 280 | 281 |
self.assertTrue("SHOW COLUMNS FROM `0000_the_table`" in str(queries[0])) 282 | self.assertTrue("information_schema.COLUMNS WHERE TABLE_SCHEMA='index_digest' AND TABLE_NAME='0000_the_table'" in str(queries[1])) 283 | self.assertTrue("SHOW COLUMNS FROM `0002_not_used_indices`" in str(queries[2])) 284 | self.assertTrue("information_schema.COLUMNS WHERE TABLE_SCHEMA='index_digest' AND TABLE_NAME='0002_not_used_indices'" in str(queries[3])) 285 | 286 | def test_cached_get_table_schema(self): 287 | db = DatabaseWithMockedRow(mocked_row=[None, 'CREATE TABLE foo;']) 288 | 289 | # this would made ten queries to database if not memoization in get_table_schema 290 | # also test that @memoize decorator correctly handles different arguments 291 | for _ in range(5): 292 | schema = db.get_table_schema('0000_the_table') 293 | self.assertEqual(schema, 'CREATE TABLE foo;') 294 | 295 | schema = db.get_table_schema('0002_not_used_indices') 296 | self.assertEqual(schema, 'CREATE TABLE foo;') 297 | 298 | queries = db.get_queries() 299 | print(queries) 300 | 301 | # however, only two are made :) 302 | self.assertEqual(len(queries), 2) 303 | 304 | self.assertEqual('SHOW CREATE TABLE `0000_the_table`', str(queries[0])) 305 | self.assertEqual('SHOW CREATE TABLE `0002_not_used_indices`', str(queries[1])) 306 | --------------------------------------------------------------------------------