├── DB2 ├── DB2 Table Inventory └── DB2 Table-Column Inventory ├── Hive ├── Hive Table Inventory.sql ├── Hive Table Rowcounts.py ├── Hive Table and Column Metadata.py └── Hive Table-Column Inventory.sql ├── MySQL ├── Columns └── TableRowCount ├── Netezza ├── Netezza Table Inventory └── Netezza Table-Column inventory ├── README.md ├── Redshift ├── Redshift Column Inventory ├── Redshift Column Stats ├── Redshift Table Inventory └── Redshift Table-Column sizes ├── SQLServer ├── SQLServer Column List of values ├── SQLServer Table Inventory ├── SQLServer Table Row Counts ├── SQLServer Table Rowcounts ├── SQLServer Table-Column Inventory ├── SQLServer Table-Column Stats └── Table Rowcounts and Space ├── Sybase └── Sybase Table-Column Stats ├── oracle ├── Oracle Column Value List Distribution ├── Oracle Table Inventory ├── Oracle Table Rowcounts ├── Oracle Table-Column Inventory └── Oracle Table-Column Stats ├── snowflake └── table column stats └── synapse └── synapse table size.sql /DB2/DB2 Table Inventory: -------------------------------------------------------------------------------- 1 | select name, creator, type from sysibm.systables 2 | -------------------------------------------------------------------------------- /DB2/DB2 Table-Column Inventory: -------------------------------------------------------------------------------- 1 | select creator,tbname,name,colno,coltype,length,scale 2 | from sysibm.syscolumns 3 | where TBname in ('D2VCM','IWCVECUS') 4 | -------------------------------------------------------------------------------- /Hive/Hive Table Inventory.sql: -------------------------------------------------------------------------------- 1 | -- Hive SQL -- show tables 2 | -- connect to hive metastore mySQL database -- this contains the data dictionary for hive 3 | -- https://issues.apache.org/jira/secure/attachment/12471108/HiveMetaStore.pdf 4 | -- mysql -h sandbox.hortonworks.com -P 3306 -u root 5 | use hive; 6 | select 7 | T.TBL_ID, T.OWNER, T.TBL_NAME, 
T.TBL_TYPE 8 | from 9 | TBLS as T; -- terminate statement before the next select 10 |   11 | select 12 | T.TBL_ID, T.OWNER, T.TBL_NAME, T.TBL_TYPE, 13 | TP.numFiles, TP.numRows, TP.totalSize, TP.rawDataSize, TP.STATS, 14 | S.SD_ID,S.CD_ID,SUBSTR(S.OUTPUT_FORMAT,30,20) OFORMAT, S.LOCATION 15 | from 16 | TBLS as T 17 | join SDS S on T.SD_ID=S.SD_ID 18 | join -- pivot table_params from rows to columns 19 | (select TBL_ID, 20 | max(case when PARAM_KEY='numFiles' then PARAM_VALUE else null end) numFiles, 21 | max(case when PARAM_KEY='numRows' then PARAM_VALUE else null end) numRows, 22 | max(case when PARAM_KEY='totalSize' then PARAM_VALUE else null end) totalSize, 23 | max(case when PARAM_KEY='rawDataSize' then PARAM_VALUE else null end) rawDataSize, -- alias must not contain a space 24 | max(case when PARAM_KEY='COLUMN_STATS_ACCURATE' then PARAM_VALUE else null end) STATS 25 | from TABLE_PARAMS 26 | group by TBL_ID) as TP 27 | on T.TBL_ID=TP.TBL_ID; 28 | 29 | #partitioned tables 30 | select 31 | T.TBL_ID, T.OWNER, T.TBL_NAME, T.TBL_TYPE, P.PART_ID, 32 | TP.numFiles, TP.numRows, TP.totalSize, TP.rawDataSize, 33 | # TP.STATS, S.SD_ID,S.CD_ID, 34 | SUBSTR(S.OUTPUT_FORMAT,30,20) OFORMAT, S.LOCATION 35 | from 36 | TBLS as T 37 | join PARTITIONS P on T.TBL_ID=P.TBL_ID 38 | join SDS S on P.SD_ID=S.SD_ID 39 | join -- pivot table_params from rows to columns 40 | (select PART_ID, 41 | max(case when PARAM_KEY='numFiles' then PARAM_VALUE else null end) numFiles, 42 | max(case when PARAM_KEY='numRows' then PARAM_VALUE else null end) numRows, 43 | max(case when PARAM_KEY='totalSize' then PARAM_VALUE else null end) totalSize, 44 | max(case when PARAM_KEY='rawDataSize' then PARAM_VALUE else null end) rawDataSize, 45 | max(case when PARAM_KEY='COLUMN_STATS_ACCURATE' then PARAM_VALUE else null end) STATS 46 | from PARTITION_PARAMS 47 | group by PART_ID) as TP 48 | on P.PART_ID=TP.PART_ID; 49 | -------------------------------------------------------------------------------- /Hive/Hive Table Rowcounts.py:
-------------------------------------------------------------------------------- 1 | %pyspark 2 | print sc.version 3 | ### get count of records for each table 4 | 5 | dftables = sqlContext.tables('default') 6 | #print dftables.show() 7 | print "Database Tables and Row Counts" 8 | df1 = sqlContext.sql("Use default") 9 | for t in dftables.collect(): 10 | #print t.tableName 11 | query = "select " + '"' +t.tableName + '"' + " as tableName, count(*) as num_rows from " + t.tableName 12 | #print query 13 | df2 = sqlContext.sql(query) 14 | for r in df2.collect(): 15 | print r.tableName + '|' , r.num_rows 16 | -------------------------------------------------------------------------------- /Hive/Hive Table and Column Metadata.py: -------------------------------------------------------------------------------- 1 | %pyspark 2 | print sc.version 3 | ## collect and display Hive Metastore data. Written for Zeppelin 4 | 5 | from pyspark.sql.types import * 6 | from pyspark.sql import functions as func 7 | from pyspark.sql.functions import split 8 | from pyspark.sql.functions import lit 9 | from pyspark.sql.functions import trim 10 | from pyspark.sql.functions import first 11 | from pyspark.sql.functions import instr 12 | from pyspark.sql.functions import substring 13 | from pyspark.sql.functions import length 14 | from pyspark.sql.functions import ltrim 15 | from pyspark.sql.functions import rtrim 16 | from pyspark.sql.functions import explode 17 | from pyspark.sql.functions import col 18 | 19 | #Create an empty RDD to eventually hold ALL the hive metastore tables and properties 20 | schema=StructType([\ 21 | StructField("Name",StringType(),False),\ 22 | StructField("Value",StringType(),False),\ 23 | StructField("Tname",StringType(),False),\ 24 | StructField("DBname",StringType(),False)\ 25 | ]) 26 | rdd4 = sqlContext.createDataFrame(sc.emptyRDD(),schema).rdd 27 | 28 | #Create an empty RDD to eventually hold ALL the hive tables and columns 29 | TCschema=StructType([\ 30 | 
StructField("col_name",StringType(),False),\ 31 | StructField("data_type",StringType(),False),\ 32 | StructField("comment",StringType(),False),\ 33 | StructField("Tname",StringType(),False),\ 34 | StructField("DBname",StringType(),False)\ 35 | ]) 36 | rddTC1 = sqlContext.createDataFrame(sc.emptyRDD(),TCschema).rdd 37 | 38 | d_databases = sqlContext.sql("show databases") 39 | for dbrow in d_databases.collect(): ### loop over each database 40 | #print "Database: " + dbrow.result 41 | dfdd = sqlContext.sql("Use " + dbrow.result) 42 | d_tables = sqlContext.sql("show tables") 43 | for tblrow in d_tables.collect(): ### loop over table each table 44 | table_name = tblrow.tableName 45 | if tblrow.isTemporary : 46 | print table_name + " Temporary" ## temp tables don't have any properties 47 | else : 48 | #print table_name 49 | 50 | df1 = sqlContext.sql("describe formatted " + tblrow.tableName ) 51 | 52 | # get list of table properties, filter out columns and split into name value pairs and put in RDD filter out columns 53 | df2 = df1.filter("result like '%:%'").select( trim(split("result",'\t')[0]).alias("Name") ,split("result",'\t')[1].alias("Value"))\ 54 | .withColumn("Name",func.regexp_replace('Name',":",""))\ 55 | .withColumn('Tname', lit(tblrow.tableName)).withColumn('DBname', lit(dbrow.result)) 56 | 57 | # get table stats and append to table properties 58 | df4 = sqlContext.sql("show tblproperties " + tblrow.tableName ) 59 | df5 = df4.select( split("result",'\t')[0].alias("Name") ,split("result",'\t')[1].alias("Value"))\ 60 | .withColumn('Tname', lit(tblrow.tableName)).withColumn('DBname', lit(dbrow.result)) 61 | 62 | df6 = df2.unionAll(df5) 63 | 64 | rdd4 = rdd4.union(df6.rdd) #append each tables properties to the whole list of tables and properties 65 | 66 | # get list of table columns 67 | dfc1 = sqlContext.sql("describe " + tblrow.tableName ) 68 | dfc2 = dfc1.filter("instr(col_name,'#')=0").withColumn('Tname', lit(tblrow.tableName)).withColumn('DBname', 
lit(dbrow.result)) 69 | rddTC1 = rddTC1.union(dfc2.rdd) 70 | 71 | # get partition information 72 | if df1.filter("result like '# Partition%'").count()>0: 73 | dfp1 = sqlContext.sql("show table extended like "+ tblrow.tableName) 74 | dfp2 = dfp1.filter("result like 'partition%'").select( trim(split("result",':')[0]).alias("Name") ,split("result",':')[1].alias("Value") ) 75 | dfp3 = dfp2.withColumn('Tname', lit(tblrow.tableName)).withColumn('DBname', lit(dbrow.result)) 76 | rdd4 = rdd4.union(dfp3.rdd) # append this to the list of table properties 77 | #print dfp3.show(20,False) 78 | 79 | #dfpc = dfp1.filter("result like 'partitionColumns%'").select(substring("result",45,100).alias('s')).select(explode(split("s",",")).alias('pc')) 80 | #dfpc = dfpc.withColumn("pc",func.regexp_replace('pc',"}","")) 81 | #dfpc = dfpc.withColumn("pc",func.regexp_replace('pc',"^[\s]","")) 82 | #dfpc1 = dfpc.select(split("pc"," ")[1].alias("PartitionColumn")) 83 | #dfpc1 = dfpc1.withColumn('Tname', lit(tblrow.tableName)).withColumn('DBname', lit(dbrow.result)).withColumn('isPartitioned',lit(1)) 84 | 85 | #xxx = dfc2.alias('a').join(dfpc1.alias('b'),(col('a.col_name')==col('b.PartitionColumn')) ,'left_outer' ).select('a.*') 86 | #select (["dfc2."+xx for xx in dfc2.columns] + ['dfpc1.isPartitioned']) 87 | 88 | #print xxx.show(50,False) 89 | 90 | #display database and table properties 91 | df7 = sqlContext.createDataFrame(rdd4) 92 | ### pivot the list of properties so that the properties are displayed as columns 93 | df8 = df7.groupBy("DBname","Tname").pivot("Name").agg(first("Value")) 94 | print "Possible fields to select for each table" 95 | df8.printSchema() 96 | #print df8.show() 97 | #print df8.collect() 98 | #print df8.select("*").show() 99 | print df8.select("DBname","Tname","Owner","Table Type","Location","numRows","numFiles","rawDataSize").sort("DBname","Tname").show(50,False) 100 | #for row in df8.collect(): 101 | # print row.Tname + ' | ' +row.Location 102 | 103 | #display table 
columns 104 | dfc2 = sqlContext.createDataFrame(rddTC1).distinct().orderBy(["DBname","Tname"]) 105 | print dfc2.show(1000) 106 | -------------------------------------------------------------------------------- /Hive/Hive Table-Column Inventory.sql: -------------------------------------------------------------------------------- 1 | --Table - Column meata stored in mySQL hive metastore 2 | use hive; 3 | 4 | select 5 | T.TBL_ID, T.TBL_NAME, 6 | C.COLUMN_NAME, C.CD_ID, C.INTEGER_IDX, C.TYPE_NAME, C.COMMENT 7 | from 8 | TBLS as T 9 | join SDS S on T.SD_ID=S.SD_ID 10 | join COLUMNS_V2 C on S.CD_ID=C.CD_ID 11 | 12 | select 13 | CS.DB_NAME, CS.TBL_ID, CS.TABLE_NAME, CS.COLUMN_NAME, CS.COLUMN_TYPE, 14 | CS.NUM_NULLS, CS.NUM_DISTINCTS, CS.AVG_COL_LEN, CS.MAX_COL_LEN, 15 | CONCAT (IFNULL (LONG_LOW_VALUE,''), IFNULL (DOUBLE_LOW_VALUE,''), IFNULL (BIG_DECIMAL_LOW_VALUE,'')) as LOW_VALUE, 16 | CONCAT (IFNULL (LONG_HIGH_VALUE,''), IFNULL (DOUBLE_HIGH_VALUE,''), IFNULL (BIG_DECIMAL_HIGH_VALUE,'')) as HIGH_VALUE, 17 | CS.LAST_ANALYZED 18 | From TAB_COL_STATS CS 19 | union all 20 | select 21 | CS.DB_NAME, T.TBL_ID, T.TBL_NAME, P.PART_ID, CS.COLUMN_NAME, CS.COLUMN_TYPE , 22 | CS.NUM_NULLS ,CS.NUM_DISTINCTS ,CS.AVG_COL_LEN ,CS.MAX_COL_LEN , 23 | CONCAT (IFNULL(LONG_LOW_VALUE,''),IFNULL(DOUBLE_LOW_VALUE,''),IFNULL(BIG_DECIMAL_LOW_VALUE,'')) as LOW_VALUE, 24 | CONCAT (IFNULL(LONG_HIGH_VALUE,''),IFNULL(DOUBLE_HIGH_VALUE,''),IFNULL(BIG_DECIMAL_HIGH_VALUE,'')) as HIGH_VALUE 25 | #CS.LONG_LOW_VALUE , CS.LONG_HIGH_VALUE , CS.DOUBLE_HIGH_VALUE , CS.DOUBLE_LOW_VALUE , 26 | #CS.BIG_DECIMAL_LOW_VALUE ,CS.BIG_DECIMAL_HIGH_VALUE , 27 | #CS.NUM_TRUES ,CS.NUM_FALSES ,CS.LAST_ANALYZED 28 | from 29 | PART_COL_STATS CS join PARTITIONS P on P.PART_ID=CS.PART_ID JOIN TBLS T ON P.TBL_ID=T.TBL_ID; 30 | 31 | -------------------------------------------------------------------------------- /MySQL/Columns: -------------------------------------------------------------------------------- 1 | Use 
Information_schema; 2 | SELECT * FROM COLUMNS WHERE TABLE_SCHEMA = '<--DATABASE_NAME-->' AND TABLE_NAME='<--TABLENAME-->'; 3 | -------------------------------------------------------------------------------- /MySQL/TableRowCount: -------------------------------------------------------------------------------- 1 | 14 2 | -- for INNODB engine, these are estimates and not actual row counts 3 | 4 | show table status; 5 | 6 | 7 | SELECT TABLE_NAME,SUM(TABLE_ROWS) 8 | FROM INFORMATION_SCHEMA.TABLES 9 | WHERE TABLE_SCHEMA = 'your_db' 10 | GROUP BY TABLE_NAME; 11 | -------------------------------------------------------------------------------- /Netezza/Netezza Table Inventory: -------------------------------------------------------------------------------- 1 | select database, tablename, reltuples,* from _v_table where database='PROD_LANDING' ;--and createdate > '2014-05-16 10:00:02'; 2 | -------------------------------------------------------------------------------- /Netezza/Netezza Table-Column inventory: -------------------------------------------------------------------------------- 1 | Select T.Owner,T.Database, T.TableName Table_Name, T.reltuples Num_Rows, 2 | C.AttName Column_Name, C.Attnum, C.Format_type, C.attnotnull, C.ZMAPPED, 3 | decode (ATTDISPERSION,0,0,round(1.0/C.ATTDISPERSION)) num_distinct, 4 | S.NULLFRAC,-- S.COMMONFRAC, S.RECENT, S.COMMONVAL, 5 | S.LOVAL, S.HIVAL,d.distattnum as distributed_attnum 6 | from _v_relation_column C 7 | JOIN _V_STATISTIC S ON C.OBJID=S.OBJID AND C.ATTNUM=S.ATTNUM 8 | left join _v_table_dist_map d on c.objid=d.objid and c.attnum=d.attnum 9 | join _v_table T on C.Name=T.tablename 10 | where C.type='TABLE' and C.database = 'PROD_LANDING' 11 | order by 1,2,3,6; 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # data-profiling 2 | a set of scripts to pull meta data and data profiling metrics from 
relational database systems 3 | 4 | The collection of scripts and SQL-code which can be tailored to collect specific information about tables and columns within databases. 5 | It facilitates the bulk and rapid collection of high level and common metadata and provides a great starting point for identifying an inventory of your database objects. 6 | 7 | The scripts are intended to be used within your own particular client user interface and assume you are able to connect to the database from which you want to collect data. It is also up to you to dump the results out to whatever output medium you want. Generally by saving the results as a spreadsheet, csv, or cutting and pasting the results from your user interface. This allows the greatest degree of flexibility. 8 | 9 | This repository is not meant to provide very deep data profiling capabilities, other commercial and open source analytic and data management tools can do that much better. 10 | 11 | We get most of the meta data from the internal data dictionary of the database system. Some meta data can only be obtained by querying the tables themselves. Since performance can often be a problem when profiling large tables, we don't try to do this in bulk for an entire database. This is where the more expensive and sophisticated tools are most useful. 12 | 13 | Data profiling is a process 14 | --------------------------- 15 | Data Profiling generally consists of a series of steps that dig deeper and deeper into the details of the data sets. 16 | The high level steps addressed by this repository include: 17 | 18 | 1. Inventory of databases (schemas) 19 | 2. Inventory and metadata of tables within the databases 20 | 3. Inventory and metadata of columns within the tables ( count distinct values, num nulls, min, max) 21 | 4. For each column we can then dig into the details ( i.e.
frequency distribution or list of values for columns) and relationship to other tables/columns 22 | 23 | Database Platforms 24 | ------------------- 25 | Since each database platform has its own specific data dictionary, each platform has its own set of scripts. 26 | The following databases have scripts in this repository: 27 | 28 | * Oracle 29 | * SQLServer 30 | * MySQL 31 | * Sybase IQ 32 | * Netezza 33 | * Hive 34 | * AWS Redshift 35 | * Snowflake 36 | * AWS Synapse 37 | 38 | 39 | -------------------------------------------------------------------------------- /Redshift/Redshift Column Inventory: -------------------------------------------------------------------------------- 1 | select * from svv_columns where table_schema not in ('pg_catalog','pg_internal','information_schema') order by table_name,ordinal_position; 2 | 3 | select * from pg_table_def where schemaname not in ('pg_catalog','pg_internal','information_schema'); 4 | -------------------------------------------------------------------------------- /Redshift/Redshift Column Stats: -------------------------------------------------------------------------------- 1 | select t.database,t.schema, t.table, a.attname,s.staattnum,s.stadistinct,s.stawidth,stanullfrac,stakind1,staop1 2 | from pg_statistic s 3 | join svv_table_info t on (s.starelid=t.table_id) 4 | left join pg_attribute a on (s.staattnum=a.attnum and s.starelid=attrelid) 5 | where t.schema not in ('pg_catalog','pg_internal','information_schema') 6 | order by 1,2,3,5,6; 7 | -------------------------------------------------------------------------------- /Redshift/Redshift Table Inventory: -------------------------------------------------------------------------------- 1 | select datname,datdba,usename as owner from pg_database d join pg_user u on d.datdba=u.usesysid; 2 | select * from svv_tables where table_schema not in ('pg_catalog','pg_internal','information_schema') ; 3 | 4 | select
"database","schema",table_id,"table",tbl_rows,size,"diststyle",sortkey1,sortkey_num 5 | from svv_table_info where schema not in ('pg_catalog','pg_internal','information_schema'); 6 | -------------------------------------------------------------------------------- /Redshift/Redshift Table-Column sizes: -------------------------------------------------------------------------------- 1 | select t.database,t.schema, b.tbl,t.table, b.col, a.attname,count(*) num_blocks 2 | from stv_blocklist b 3 | join svv_table_info t on (b.tbl=t.table_id) 4 | left join pg_attribute a on (b.tbl=a.attrelid and b.col=a.attnum-1) 5 | where t.schema not in ('pg_catalog','pg_internal','information_schema') 6 | group by t.database,t.schema, b.tbl,t.table, b.col, a.attname 7 | order by 1,2,3,4,5; 8 | -------------------------------------------------------------------------------- /SQLServer/SQLServer Column List of values: -------------------------------------------------------------------------------- 1 | -- generate SQL for getting list of values or column distribution 2 | -- note use of temp table #colstats generated earlier 3 | 4 | declare @DynamicSQL2 NVARCHAR(MAX) 5 | set @DynamicSQL2 = '' 6 | create table #coldistribution( tab NVARCHAR(MAX), col NVARCHAR(MAX), colval NVARCHAR(MAX), num_rows int); 7 | 8 | 9 | SELECT @DynamicSQL2 = @DynamicSQL2 + 10 | 'insert into #coldistribution select ' + 11 | '''' + tab + ''' tab,' + 12 | '''' + col + ''' col,' + 13 | + col + ',' + 14 | 'count(*) as num_rows from ' + tab + ' group by ' + col 15 | + CHAR(13) + ';' + CHAR(13) 16 | 17 | from #colstats where num_distinct between 2 and 20; -- can define maximum cardinality here 18 | 19 | SELECT CAST(@DynamicSQL2 AS XML); 20 | execute(@DynamicSQL2); 21 | select * from #coldistribution; 22 | 23 | --drop table #colstats; 24 | --drop table #coldistribution; 25 | go 26 | 27 | drop table #coldistribution; 28 | -------------------------------------------------------------------------------- /SQLServer/SQLServer 
Table Inventory: -------------------------------------------------------------------------------- 1 | SELECT @@servername as servername, name as database_name, database_id, create_date 2 | FROM sys.databases ; 3 | 4 | select * from information_schema.tables; 5 | 6 | 7 | 8 | CREATE TABLE AllTables ([DB Name] sysname, [Schema Name] sysname, [Table Name] sysname) 9 | DECLARE @SQL NVARCHAR(MAX) 10 | SELECT @SQL = COALESCE(@SQL,'') + ' 11 | insert into AllTables 12 | select ' + QUOTENAME(name,'''') + ' as [DB Name], [Table_Schema] as [Table Schema], [Table_Name] as [Table Name] from ' + 13 | QUOTENAME(Name) + '.INFORMATION_SCHEMA.Tables;' FROM sys.databases where name not in ('model','master','msdb') 14 | ORDER BY name 15 | print @SQL 16 | EXECUTE(@SQL) 17 | 18 | with c as 19 | ( 20 | select db_name() as database_name, s.name schema_name,t.name table_name,count(*) as num_columns 21 | from sys.all_columns c 22 | join sys.all_objects t on (c.object_id=t.object_id and t.type_desc='USER_TABLE') 23 | join sys.types tt on (c.system_type_id=tt.user_type_id) 24 | JOIN sys.schemas s ON t.schema_id = s.schema_id 25 | group by s.name,t.name 26 | ) 27 | SELECT 28 | @@servername as servername, 29 | db_name() as database_name, 30 | s.Name AS Schema_Name, 31 | t.Name AS Table_Name, 32 | c.num_columns as column_count, 33 | p.rows AS Row_Count, 34 | CAST(ROUND((SUM(a.used_pages) / 128.00), 2) AS NUMERIC(36, 2)) AS Used_MB, 35 | CAST(ROUND((SUM(a.total_pages) - SUM(a.used_pages)) / 128.00, 2) AS NUMERIC(36, 2)) AS Unused_MB, 36 | CAST(ROUND((SUM(a.total_pages) / 128.00), 2) AS NUMERIC(36, 2)) AS Total_MB 37 | FROM sys.tables t 38 | INNER JOIN sys.indexes i ON t.OBJECT_ID = i.object_id 39 | INNER JOIN sys.partitions p ON i.object_id = p.OBJECT_ID AND i.index_id = p.index_id 40 | INNER JOIN sys.allocation_units a ON p.partition_id = a.container_id 41 | INNER JOIN sys.schemas s ON t.schema_id = s.schema_id 42 | inner join c on c.schema_name=s.name and c.table_name=t.name 43 | where 
p.index_id IN ( 0, 1 ) 44 | GROUP BY t.Name, s.Name, p.Rows,c.num_columns 45 | ORDER BY s.Name, t.Name; 46 | 47 | -------------------------------------------------------------------------------- /SQLServer/SQLServer Table Row Counts: -------------------------------------------------------------------------------- 1 | DECLARE @DynamicSQL NVARCHAR(MAX) 2 | set @DynamicSQL='' 3 | -- create a sql script that contains a select count(*) from each table 4 | 5 | SELECT @DynamicSQL = @DynamicSQL + 6 | 'SELECT ' + quotename(table_schema,'''') + ' as [Schema Name], ' + 7 | QUOTENAME(TABLE_NAME,'''') + 8 | ' as [Table Name], COUNT(*) AS [Records Count] FROM ' + 9 | quotename(Table_schema) + '.' + QUOTENAME(TABLE_NAME) 10 | + CHAR(13) + ' UNION ALL ' + CHAR(13) 11 | FROM INFORMATION_SCHEMA.TABLES 12 | where table_name like 'P%' -- ASCII quotes required; adjust pattern to select tables 13 | ORDER BY TABLE_NAME 14 | 15 | set @DynamicSQL = @DynamicSQL + Char(13) + 'Select ''1'',''1'',''1'' ' -- need to account for that last union all 16 | 17 | --print (@DynamicSQL) -- we may want to use PRINT to debug/edit the SQL 18 | SELECT CAST(@DynamicSQL AS XML) 19 | EXEC( @DynamicSQL) 20 | go 21 | -------------------------------------------------------------------------------- /SQLServer/SQLServer Table Rowcounts: -------------------------------------------------------------------------------- 1 | SELECT 2 | s.Name AS Schema_Name, 3 | t.Name AS Table_Name, 4 | t.object_id, 5 | t.type_desc, 6 | p.rows AS Row_Count, 7 | CAST(ROUND((SUM(a.used_pages) / 128.00), 2) AS NUMERIC(36, 2)) AS Used_MB, 8 | CAST(ROUND((SUM(a.total_pages) - SUM(a.used_pages)) / 128.00, 2) AS NUMERIC(36, 2)) AS Unused_MB, 9 | CAST(ROUND((SUM(a.total_pages) / 128.00), 2) AS NUMERIC(36, 2)) AS Total_MB 10 | FROM sys.tables t 11 | INNER JOIN sys.indexes i ON t.OBJECT_ID = i.object_id 12 | INNER JOIN sys.partitions p ON i.object_id = p.OBJECT_ID AND i.index_id = p.index_id 13 | INNER JOIN sys.allocation_units a ON p.partition_id = a.container_id 14 | INNER JOIN sys.schemas s ON
t.schema_id = s.schema_id 15 | where p.index_id IN ( 0, 1 ) 16 | GROUP BY t.Name, t.object_id,t.type_desc,s.Name, p.Rows 17 | -------------------------------------------------------------------------------- /SQLServer/SQLServer Table-Column Inventory: -------------------------------------------------------------------------------- 1 | select t.name,c.name as column_name,column_id,tt.name as datatype, c.max_length 2 | from sys.all_columns c 3 | join sys.all_objects t on (c.object_id=t.object_id and t.type_desc='USER_TABLE') 4 | join sys.types tt on (c.system_type_id=tt.user_type_id) 5 | order by t.name,c.column_id 6 | -------------------------------------------------------------------------------- /SQLServer/SQLServer Table-Column Stats: -------------------------------------------------------------------------------- 1 | -- generate SQL for getting column stats 2 | 3 | declare @tabcolcursor CURSOR; 4 | declare @tab nvarchar(max); 5 | declare @col nvarchar(max); 6 | declare @datatype nvarchar(max); 7 | declare @dynamicsql nvarchar(max); 8 | 9 | create table #colstats( tab NVARCHAR(MAX), col NVARCHAR(MAX), datatype NVARCHAR(MAX), num_distinct int, num_null int, min_val NVARCHAR(MAX), max_val NVARCHAR(MAX)); 10 | 11 | set @tabcolcursor = cursor for 12 | with T as 13 | ( 14 | SELECT 15 | s.Name AS Schema_Name, 16 | t.Name AS Table_Name, 17 | t.object_id, 18 | t.type_desc, 19 | p.rows AS Row_Count, 20 | CAST(ROUND((SUM(a.used_pages) / 128.00), 2) AS NUMERIC(36, 2)) AS Used_MB, 21 | CAST(ROUND((SUM(a.total_pages) - SUM(a.used_pages)) / 128.00, 2) AS NUMERIC(36, 2)) AS Unused_MB, 22 | CAST(ROUND((SUM(a.total_pages) / 128.00), 2) AS NUMERIC(36, 2)) AS Total_MB 23 | FROM sys.tables t 24 | INNER JOIN sys.indexes i ON t.OBJECT_ID = i.object_id 25 | INNER JOIN sys.partitions p ON i.object_id = p.OBJECT_ID AND i.index_id = p.index_id 26 | INNER JOIN sys.allocation_units a ON p.partition_id = a.container_id 27 | INNER JOIN sys.schemas s ON t.schema_id = s.schema_id 28 | where 
p.index_id IN ( 0, 1 ) 29 | GROUP BY t.Name, t.object_id,t.type_desc,s.Name, p.Rows 30 | ) 31 | SELECT --top 10 32 | T.schema_name, 33 | T.Table_name , 34 | c.name , 35 | tt.name 36 | from sys.all_columns c 37 | join T on (c.object_id=T.object_id and T.type_desc='USER_TABLE') 38 | join sys.types tt on (c.system_type_id=tt.user_type_id) 39 | where 40 | --s.name in ('Staging','Integration') 41 | T.row_count>0 and 42 | T.table_name = 'Office' 43 | --('Item','AppSch_AuditLog','Inscarrier','ItemExam')*/ 44 | order by 1,2,3 45 | ; 46 | 47 | open @tabcolcursor 48 | fetch next from @tabcolcursor into @tab, @col, @datatype 49 | while @@FETCH_STATUS = 0 50 | begin 51 | set @dynamicsql = 'insert into #colstats select ' + 52 | '''' + @tab + ''' tab,' + 53 | '''' + @col + ''' col,' + 54 | '''' + @datatype + ''' datatype,' + 55 | 'count(distinct ' + @col + ') num_distict, ' + 56 | 'sum(case when ' + @col + ' is null then 1 else 0 end) num_null, ' + 57 | --'sum(case when cast(' + @col + ' as varchar(255)) =''?'' then 1 else 0 end) num_q, ' + 58 | --'sum(case when cast(' + @col + ' as varchar(255)) ='''' then 1 else 0 end) num_e, ' + 59 | --'sum(case when cast(' + @col + ' as varchar(255)) =''NOKEY'' then 1 else 0 end) num_nk, ' + 60 | --'max(len(' + @col + ')) max_len, ' + 61 | 'cast(min(' + @col + ') as varchar(255)) min_val, ' + 62 | 'cast(max(' + @col + ') as varchar(255)) max_val' + 63 | ' from ' + @tab + ';' 64 | 65 | print @dynamicsql 66 | exec (@dynamicsql); 67 | fetch next from @tabcolcursor into @tab, @col, @datatype; 68 | end 69 | CLOSE @tabcolcursor; 70 | DEALLOCATE @tabcolcursor; 71 | select * from #colstats; 72 | 73 | --drop table #colstats; 74 | -------------------------------------------------------------------------------- /SQLServer/Table Rowcounts and Space: -------------------------------------------------------------------------------- 1 | CREATE TABLE #SpaceUsed ( 2 | TableName sysname 3 | ,NumRows BIGINT 4 | ,ReservedSpace VARCHAR(50) 5 | ,DataSpace 
VARCHAR(50) 6 | ,IndexSize VARCHAR(50) 7 | ,UnusedSpace VARCHAR(50) 8 | ) ; 9 | 10 | DECLARE @str VARCHAR(500) 11 | SET @str = 'exec sp_spaceused ''?''' 12 | INSERT INTO #SpaceUsed 13 | EXEC sp_msforeachtable @command1=@str 14 | 15 | SELECT distinct * FROM #SpaceUsed ORDER BY TableName; 16 | drop table #SpaceUsed; 17 | 18 | 19 | 20 | SELECT 21 | t.NAME AS TableName, 22 | s.Name AS SchemaName, 23 | p.rows AS RowCounts, 24 | CAST(((SUM(a.total_pages) * 8) / 1024.00) AS NUMERIC(36, 2)) AS TotalSpaceMB, 25 | CAST(((SUM(a.used_pages) * 8) / 1024.00) AS NUMERIC(36, 2)) AS UsedSpaceMB 26 | FROM 27 | sys.tables t 28 | INNER JOIN sys.indexes i ON t.OBJECT_ID = i.object_id 29 | INNER JOIN sys.partitions p ON i.object_id = p.OBJECT_ID AND i.index_id = p.index_id 30 | INNER JOIN sys.allocation_units a ON p.partition_id = a.container_id 31 | LEFT OUTER JOIN sys.schemas s ON t.schema_id = s.schema_id 32 | WHERE 33 | t.NAME NOT LIKE 'dt%' 34 | AND t.is_ms_shipped = 0 35 | AND i.OBJECT_ID > 255 36 | GROUP BY t.Name, s.Name, p.Rows 37 | ORDER BY t.Name 38 | -------------------------------------------------------------------------------- /Sybase/Sybase Table-Column Stats: -------------------------------------------------------------------------------- 1 | 2 | Select st.table_name, 3 | st.count as row_count, 4 | col_count = (SELECT count(*) FROM sys.syscolumn where table_id = st.table_id) 5 | from SYS.SYSTABLE st where st.creator <> 0 and st.count > 0 6 | order by st.table_name 7 | 8 | --script to generate sql script that selects min, max, num_distinct for each table-column 9 | 10 | select 11 | 'select ' , 12 | ''''+t.table_name+'.'+c.column_name+'''' +',' , 13 | '''|'',' , 14 | 'count(distinct '+c.column_name+'),' , 15 | '''|'',' , 16 | 'min('+c.column_name+'),' , Select st.table_name, 17 | st.count as row_count, 18 | col_count = (SELECT count(*) FROM sys.syscolumn where table_id = st.table_id) 19 | from SYS.SYSTABLE st where st.creator <> 0 and st.count > 0 20 | order by 
st.table_name 21 | '''|'',' , 22 | 'max( '+c.column_name+')' , 23 | ' from ' +'com.'+ t.table_name+';' 24 | from syscolumn c join systable t on (t.table_id=c.table_id) 25 | where 26 | t.table_name ='COM_CUSTOMER_DIM' 27 | -- t.table_name like 'BHTV%DIM' 28 | --or t.table_name like 'BHTV%FACT' 29 | -------------------------------------------------------------------------------- /oracle/Oracle Column Value List Distribution: -------------------------------------------------------------------------------- 1 | -- generat a sql script to obtain the list of values and num of distinct values for low-cardinality columns 2 | select 3 | 'SELECT '''||OWNER||'.'||TABLE_NAME||'.'||COLUMN_NAME||'|'','|| COLUMN_NAME ||',', 4 | '''|''', 5 | ', COUNT(*) FROM ' ||OWNER||'.'||TABLE_NAME|| ' GROUP BY ' || COLUMN_NAME|| ' ORDER BY 3 desc;' 6 | FROM ALL_TAB_COLUMNS tc 7 | WHERE owner='CIA' and NUM_DISTINCT between 1 and 25 8 | order by tc.owner,table_name,column_name; 9 | -------------------------------------------------------------------------------- /oracle/Oracle Table Inventory: -------------------------------------------------------------------------------- 1 | select owner, tablespace_name, count(*) 2 | from all_tables 3 | group by owner, tablespace_name 4 | order by owner; 5 | 6 | select owner,table_name,tablespace_name ,last_analyzed, 7 | NUM_ROWS,AVG_ROW_LEN,BLOCKS,CACHE,PARTITIONED,COMPRESSION 8 | from all_tables 9 | order by 1,2 10 | 11 | SELECT * FROM ALL_TAB_COMMENTS 12 | WHERE OWNER IN ('STBMGR','MAMGR','STBRPTMGR') 13 | ORDER BY 1,2,3; 14 | -------------------------------------------------------------------------------- /oracle/Oracle Table Rowcounts: -------------------------------------------------------------------------------- 1 | --generate a SQL script that select count(*) from each table 2 | 3 | select 4 | 'select ' SS 5 | , ''''||table_name||''' tablename,' as Table_name 6 | ,'''|'',' D0 7 | , 'count(*) as Num_Rows' 8 | ,' from '||owner||'.'||table_name||';' FF 9 
| from all_tables 10 | where owner='CIA'; 11 | --prompt from &&tab 12 | --prompt where rownum <=&&row_limit 13 | -------------------------------------------------------------------------------- /oracle/Oracle Table-Column Inventory: -------------------------------------------------------------------------------- 1 | SELECT * FROM ALL_COL_COMMENTS 2 | WHERE OWNER IN ('STBMGR','STBRPTMGR','MAMGR') 3 | ORDER BY 1,2,3; 4 | 5 | 6 | Select OWNER,table_name, column_id, column_name, 7 | --data_type, 8 | data_type || ' ' || decode (data_type, 9 | 'DATE',' ', 10 | 'TIMESTAMP(6)',' ', 11 | 'CHAR',TO_CHAR(DATA_LENGTH), 12 | 'VARCHAR2',TO_CHAR(DATA_LENGTH), 13 | 'RAW',' ', 14 | TO_CHAR(DATA_PRECISION)||','||TO_CHAR(DATA_SCALE)) DATATYPE, 15 | NULLABLE, NUM_DISTINCT, NUM_NULLS 16 | --data_length,data_precision,data_scale 17 | from all_tab_columns 18 | where 19 | owner in ('STBMGR','MAMGR','STBRPTMGR') 20 | and table_name not like '%ERR' 21 | order by 1,2 22 | -------------------------------------------------------------------------------- /oracle/Oracle Table-Column Stats: -------------------------------------------------------------------------------- 1 | 2 | --script to generate the collection of min,max,num_distinct,num_null for each column in each table. 
3 | select 4 | 'select ' SS 5 | , ''''||table_name||''' tab,' as Table_name 6 | ,'''|'',' D0 7 | , ''''||column_name||''' col,' as Column_name 8 | ,'''|'',' D1 9 | ,'count(distinct '||column_name||') num_distinct, ' C2 10 | ,'''|'',' D2 11 | ,'sum(decode('||column_name||',null,1,0)) num_nulls, ' C3 12 | ,'''|'',' D3 13 | ,'max(length('||column_name||')) max_len, ' C4 14 | ,'''|'',' D4 15 | ,'min('||column_name||') min_val, ' C5 16 | ,'''|'',' D5 17 | ,'max('||column_name||') max_val' C6 18 | ,' from '||owner||'.'||table_name||';' FF 19 | from all_tab_columns 20 | where owner='CIA'; 21 | --prompt from &&tab 22 | --prompt where rownum <=&&row_limit 23 | -------------------------------------------------------------------------------- /snowflake/table column stats: -------------------------------------------------------------------------------- 1 | use role LAB_DEVELOPER; 2 | --create database if not exists NYC_Taxi; 3 | Use database lab_db; 4 | use schema information_schema; 5 | create schema if not exists NYC_Taxi; 6 | use schema Admin; 7 | 8 | create or replace view all_tables 9 | as 10 | /* returns all tables , some attributes and storage metrics for all tables in the account */ 11 | with T as 12 | ( 13 | select table_catalog as database_name,table_schema as schema_name,table_name,table_id,table_type,is_transient,is_iceberg,row_count,cast(deleted as date) as deleted_dt 14 | from snowflake.account_usage.tables 15 | where table_type <> 'VIEW' 16 | ) 17 | , 18 | S as 19 | ( 20 | select 21 | id,table_catalog,table_schema,table_name,active_bytes,failsafe_bytes,time_travel_bytes,deleted 22 | from snowflake.account_usage.table_storage_metrics 23 | where active_bytes+failsafe_bytes+time_travel_bytes >0 or deleted = 'FALSE' 24 | ) 25 | select T.*,S.active_bytes,S.failsafe_bytes,time_travel_bytes,deleted 26 | from T inner join S on T.table_id=S.id 27 | order by 1,2,3; 28 | 29 | Create or replace view database_tables 30 | AS 31 | with T as 32 | ( 33 | select table_catalog as
database_name,table_schema as schema_name,table_name,table_type,is_transient,is_iceberg,row_count 34 | from information_schema.tables 35 | where table_type <> 'VIEW' 36 | ) 37 | , 38 | S as 39 | ( 40 | select 41 | table_catalog,table_schema,table_name,sum(active_bytes) as active_bytes,sum(failsafe_bytes) as failsafe_bytes,sum(time_travel_bytes) as time_travel_bytes 42 | from information_schema.table_storage_metrics 43 | group by 1,2,3 44 | ) 45 | select T.*,S.active_bytes,S.failsafe_bytes,S.time_travel_bytes 46 | from T inner join S on T.database_name=S.table_catalog and T.schema_name=S.table_schema and T.Table_name=S.table_name 47 | order by 1,2,3; 48 | 49 | select * from all_tables order by 1,2,3; 50 | select * from database_tables order by 1,2,3; 51 | 52 | grant select on view all_tables to role sysadmin; 53 | grant usage on schema lab_db.admin to role sysadmin; 54 | grant usage on database lab_db to role sysadmin; 55 | 56 | CREATE OR REPLACE PROCEDURE SP_Data_Profile (DATABASE_NAME varchar, SCHEMA_NAME varchar, TABLE_NAME varchar) 57 | COPY GRANTS -- ensures grants are re-applied if the proc is replaced/modified 58 | RETURNS object 59 | LANGUAGE SQL 60 | --RETURNS NULL ON NULL INPUT -- will not call stored proc if any input is null and null return 61 | CALLED on NULL INPUT -- will always call stored proc with null inputs 62 | COMMENT = 'Obtains column metrics for a given table. Creates a temporary table (data_profile)containing some a data profile of the selected tables. min,max,distinct count and null count for each column of the tables ' 63 | EXECUTE AS CALLER -- CALLER/OWNER execution rights 64 | /* 65 | Description: Obtains column metrics for a given table. Creates a temporary table (current schema.data_profile) containing a data profile of the specified table.
66 | min,max,distinct count and null count for each column of the tables 67 | 68 | Usage : Call SP_Data_Profile(Database_Name , Schema_name , Table_name ); 69 | Parameters: 70 | Database_Name 71 | Schema_Name 72 | Table_Name 73 | Returns: a JSON object of the table containing the Database_name,Schema_name,Table_name and the columns and metrics 74 | 75 | Change Log: 76 | 2024-02-06:Paul S initial version 77 | */ 78 | AS 79 | DECLARE 80 | c1 cursor for -- cursor to hold list of tables/columns to profile 81 | select 82 | --table_catalog, table_schema,table_name 83 | column_name,data_type 84 | from information_schema.columns 85 | where table_schema ilike ? and table_name ilike ?; 86 | sql_stmt text; 87 | insertcount integer :=0; 88 | column_name varchar; 89 | data_type varchar; 90 | row_count integer; 91 | distinct_count integer; 92 | null_count integer; 93 | max_value varchar; 94 | min_value varchar; 95 | error_msg varchar; 96 | profile_object object; 97 | BEGIN 98 | SYSTEM$LOG('debug', 'creating data_profile table'); 99 | 100 | CREATE OR REPLACE TEMPORARY TABLE Data_Profile 101 | ( database_name varchar,schema_name varchar,table_name varchar ,column_name varchar ,data_type varchar, row_count integer ,distinct_count integer ,null_count integer ,min_value varchar ,max_value varchar); 102 | 103 | SYSTEM$LOG('debug', 'getting schema'); 104 | OPEN C1 USING (schema_name,table_name); 105 | FOR record in C1 DO 106 | column_name := record.column_name; 107 | data_type := record.data_type; 108 | 109 | sql_stmt := 'insert into data_profile 110 | select ''' || :database_name || ''', '''|| :schema_name || ''', ''' || :table_name || ''', ''' || :column_name || ''', ''' || :data_type 111 | || ''' ,count(*)' || ', count(distinct ' || :column_name || '), count(*)-count(' || :column_name || '), ' 112 | || ' min( ' || :column_name || '), max(' || :column_name || ') ' 113 | || ' from ' || database_name || '.' || schema_name || '.'
|| table_name ; 114 | 115 | --SYSTEM$LOG('debug', sql_stmt); 116 | EXECUTE IMMEDIATE sql_stmt; 117 | insertcount := insertcount + 1; 118 | END FOR; 119 | 120 | SYSTEM$LOG('debug', 'inserted ' || insertcount::varchar || ' rows into data_profile'); 121 | /* lets return the profile as a JSON object. we could easily return this as a table ... not sure which is best */ 122 | select object_construct ( 123 | 'Database_name',database_name, 124 | 'Schema_name',schema_name, 125 | 'Table_name',table_name, 126 | 'Table_row_count', max(row_count), 127 | 'column_count', count(*), 128 | 'columns',array_agg(object_construct ( 129 | 'column_name',column_name, 130 | 'data_type', data_type, 131 | 'distinct_count', distinct_count, 132 | 'null_count',null_count, 133 | 'min_value', min_value, 134 | 'max_value', max_value 135 | )) 136 | ) into :profile_object 137 | from data_profile 138 | group by database_name,schema_name,table_name; 139 | RETURN profile_object; 140 | EXCEPTION 141 | WHEN OTHER /*OR STATEMENT_ERROR OR EXPRESSION_ERROR*/ THEN 142 | error_msg := SQLSTATE||':'||SQLCODE||':'||SQLERRM; 143 | SYSTEM$LOG('error', error_msg); 144 | RAISE; 145 | END; 146 | 147 | Call SP_Data_Profile('Lab_DB','NYC_TAXI','GREEN_TRIPDATA'); 148 | Call SP_Data_Profile('Lab_DB','NYC_TAXI','yellow_TRIPDATA'); 149 | Call SP_Data_Profile('Lab_DB','NYC_TAXI','NYC_Weather'); 150 | Call SP_Data_Profile('Lab_DB','CITIBIKE','TRIPS'); 151 | 152 | alter session set log_level='debug'; 153 | 154 | select * from data_profile; 155 | 156 | 157 | With T as 158 | ( 159 | select object_construct ( 160 | 'Database_name',database_name, 161 | 'Schema_name',schema_name, 162 | 'Table_name',table_name, 163 | 'Table_row_count', max(row_count), 164 | 'column_count', count(*), 165 | 'columns',array_agg(object_construct ( 166 | 'column_name',column_name, 167 | 'data_type', data_type, 168 | 'distinct_count', distinct_count, 169 | 'null_count',null_count, 170 | 'min_value', min_value, 171 | 'max_value', max_value 172 | )) 173
| ) as v 174 | from data_profile 175 | group by database_name,schema_name,table_name 176 | ) 177 | select 178 | v:Database_name::varchar, v:Schema_name::varchar, v:Table_name::varchar, v:Table_row_count::integer, 179 | C.value:column_name,C.value:data_type,C.value:distinct_count 180 | from T 181 | ,lateral flatten(input=>v:columns) C 182 | 183 | -------------------------------------------------------------------------------- /synapse/synapse table size.sql: -------------------------------------------------------------------------------- 1 | --select * from sys.databases; 2 | select 3 | SERVERPROPERTY('ServerName') Servername, 4 | db_name() DB_name, 5 | SERVERPROPERTY('InstanceName') AS [Instance], 6 | SERVERPROPERTY('Edition') AS [Edition], 7 | SERVERPROPERTY('ProductVersion') AS [ProductVersion]; 8 | 9 | select * from sys.databases; 10 | 11 | sp_spaceused -- 515,931,352 KB 12 | DBCC PDW_SHOWSPACEUSED ( "ssplbdp01.E_IL1.benchmark_cc_detail" ); 13 | 14 | select * from sys.pdw_nodes_tables 15 | 16 | drop view Admin.vtablesize; 17 | 18 | 19 | --synapse 20 | drop view admin.vtabledistribution; 21 | 22 | CREATE VIEW Admin.vTableDistribution as 23 | SELECT 24 | SERVERPROPERTY('ServerName') Server_Name, 25 | db_name() as Database_Name 26 | ,s.name as Schema_Name 27 | ,t.name as Table_Name 28 | ,t.object_id as Table_Object_Id 29 | ,tp.distribution_policy_desc as Dist_Policy_Name 30 | ,nt.object_id as node_table_ojbect_id,nt.pdw_node_id,nt.distribution_id,nt.modify_date,nt.type_desc 31 | ,tm.physical_name 32 | ,nps.partition_number 33 | ,nps.index_id -- ID of the heap or index the partition is part of.0 = Heap 1 = Clustered index.
> 1 = Nonclustered index 34 | ,case when row_number() over (partition by nps.object_id,nps.partition_number,nps.distribution_id order by nps.index_id)=1 then nps.row_count else 0 end as row_count 35 | --,nps.row_count 36 | ,nps.reserved_page_count/128.0 as reserved_space_mb 37 | ,nps.used_page_count/128.0 as used_space_mb 38 | from 39 | sys.schemas s 40 | JOIN sys.tables t ON s.[schema_id] = t.[schema_id] 41 | JOIN sys.pdw_table_distribution_properties tp ON t.[object_id] = tp.[object_id] 42 | JOIN sys.pdw_table_mappings tm ON t.[object_id] = tm.[object_id] 43 | JOIN sys.pdw_nodes_tables nt ON tm.[physical_name] = nt.[name] 44 | --JOIN sys.dm_pdw_nodes pn ON nt.[pdw_node_id] = pn.[pdw_node_id] 45 | --JOIN sys.pdw_distributions di ON nt.[distribution_id] = di.[distribution_id] 46 | JOIN sys.dm_pdw_nodes_db_partition_stats nps ON nt.[object_id] = nps.[object_id] AND nt.[pdw_node_id] = nps.[pdw_node_id] AND nt.[distribution_id] = nps.[distribution_id] 47 | 48 | select * from admin.vtabledistribution where table_name = 'XXXX' /* placeholder: substitute the table to inspect */ ; 49 | 50 | select table_name,count(*),sum(reserved_space_mb) from admin.vtabledistribution where table_name='XXXX' group by table_name; 51 | 52 | 53 | drop view admin.vtablesize; 54 | Create view Admin.vTableSize 55 | as 56 | Select 57 | Server_Name, 58 | Database_Name, 59 | Schema_Name, 60 | Table_Name, 61 | Table_Object_Id, 62 | Dist_Policy_Name, 63 | count(*) as Distribution_Cnt,sum(row_count) as Row_cnt, sum(reserved_space_mb) Reserved_Space_MB , sum(used_space_mb) Used_Space_MB 64 | from Admin.vTableDistribution 65 | group by 66 | Server_name,Database_Name,schema_name,table_name,table_object_id,dist_policy_name; 67 | 68 | grant view database state to lbdp_sql_developer; --- needed for developers to query sys.pdw tables and tablesize 69 | 70 | select * from Admin.vtablesize 71 | --where row_cnt>0 72 | order by 1,2 73 | 74 | 75 | /* hash columns */ 76 | 77 | SELECT SERVERPROPERTY('ServerName') Servername, 78 | OBJECT_SCHEMA_NAME(tdp.object_id)
schemaName, 79 | OBJECT_NAME(tdp.object_id) tableName, 80 | tdp.distribution_policy_desc, 81 | c.name AS hashDistributionColumnName, 82 | cdp.distribution_ordinal 83 | FROM sys.pdw_table_distribution_properties tdp 84 | INNER JOIN sys.pdw_column_distribution_properties cdp ON tdp.object_id = cdp.object_id 85 | INNER JOIN sys.columns c ON cdp.object_id = c.object_id 86 | AND cdp.column_id = c.column_id 87 | WHERE tdp.distribution_policy_desc = 'HASH' 88 | AND cdp.distribution_ordinal > 0 89 | order by 1,2,3 90 | 91 | --------------------------------------------------------------------------------