├── DB2 ├── DB2 Table Inventory └── DB2 Table-Column Inventory ├── Hive ├── Hive Table Inventory.sql ├── Hive Table Rowcounts.py ├── Hive Table and Column Metadata.py └── Hive Table-Column Inventory.sql ├── MySQL ├── Columns └── TableRowCount ├── Netezza ├── Netezza Table Inventory └── Netezza Table-Column inventory ├── README.md ├── Redshift ├── Redshift Column Inventory ├── Redshift Column Stats ├── Redshift Table Inventory └── Redshift Table-Column sizes ├── SQLServer ├── SQLServer Column List of values ├── SQLServer Table Inventory ├── SQLServer Table Row Counts ├── SQLServer Table Rowcounts ├── SQLServer Table-Column Inventory ├── SQLServer Table-Column Stats └── Table Rowcounts and Space ├── Sybase └── Sybase Table-Column Stats ├── oracle ├── Oracle Column Value List Distribution ├── Oracle Table Inventory ├── Oracle Table Rowcounts ├── Oracle Table-Column Inventory └── Oracle Table-Column Stats ├── snowflake └── table column stats └── synapse └── synapse table size.sql /DB2/DB2 Table Inventory: -------------------------------------------------------------------------------- 1 | select name, creator, type from sysibm.systables 2 | -------------------------------------------------------------------------------- /DB2/DB2 Table-Column Inventory: -------------------------------------------------------------------------------- 1 | select creator,tbname,name,colno,coltype,length,scale 2 | from sysibm.syscolumns 3 | where TBname in ('D2VCM','IWCVECUS') 4 | -------------------------------------------------------------------------------- /Hive/Hive Table Inventory.sql: -------------------------------------------------------------------------------- 1 | -- Hive SQL -- show tables 2 | -- connect to hive metastore mySQL database -- this contains the data dictionary for hive 3 | -- https://issues.apache.org/jira/secure/attachment/12471108/HiveMetaStore.pdf 4 | -- mysql -h sandbox.hortonworks.com -P 3306 -u root 5 | use hive; 6 | select 7 | T.TBL_ID, T.OWNER, T.TBL_NAME, 
T.TBL_TYPE 8 | from 9 | TBLS as T; -- terminate statement before the next select 10 |   11 | select 12 | T.TBL_ID, T.OWNER, T.TBL_NAME, T.TBL_TYPE, 13 | TP.numFiles, TP.numRows, TP.totalSize, TP.rawDataSize, TP.STATS, 14 | S.SD_ID,S.CD_ID,SUBSTR(S.OUTPUT_FORMAT,30,20) OFORMAT, S.LOCATION 15 | from 16 | TBLS as T 17 | join SDS S on T.SD_ID=S.SD_ID 18 | join -- pivot table_params from rows to columns 19 | (select TBL_ID, 20 | max(case when PARAM_KEY='numFiles' then PARAM_VALUE else null end) numFiles, 21 | max(case when PARAM_KEY='numRows' then PARAM_VALUE else null end) numRows, 22 | max(case when PARAM_KEY='totalSize' then PARAM_VALUE else null end) totalSize, 23 | max(case when PARAM_KEY='rawDataSize' then PARAM_VALUE else null end) rawDataSize, -- alias must not contain a space 24 | max(case when PARAM_KEY='COLUMN_STATS_ACCURATE' then PARAM_VALUE else null end) STATS 25 | from TABLE_PARAMS 26 | group by TBL_ID) as TP 27 | on T.TBL_ID=TP.TBL_ID; 28 | 29 | #partitioned tables 30 | select 31 | T.TBL_ID, T.OWNER, T.TBL_NAME, T.TBL_TYPE, P.PART_ID, 32 | TP.numFiles, TP.numRows, TP.totalSize, TP.rawDataSize, 33 | # TP.STATS, S.SD_ID,S.CD_ID, 34 | SUBSTR(S.OUTPUT_FORMAT,30,20) OFORMAT, S.LOCATION 35 | from 36 | TBLS as T 37 | join PARTITIONS P on T.TBL_ID=P.TBL_ID 38 | join SDS S on P.SD_ID=S.SD_ID 39 | join -- pivot table_params from rows to columns 40 | (select PART_ID, 41 | max(case when PARAM_KEY='numFiles' then PARAM_VALUE else null end) numFiles, 42 | max(case when PARAM_KEY='numRows' then PARAM_VALUE else null end) numRows, 43 | max(case when PARAM_KEY='totalSize' then PARAM_VALUE else null end) totalSize, 44 | max(case when PARAM_KEY='rawDataSize' then PARAM_VALUE else null end) rawDataSize, 45 | max(case when PARAM_KEY='COLUMN_STATS_ACCURATE' then PARAM_VALUE else null end) STATS 46 | from PARTITION_PARAMS 47 | group by PART_ID) as TP 48 | on P.PART_ID=TP.PART_ID; 49 | -------------------------------------------------------------------------------- /Hive/Hive Table Rowcounts.py:
-------------------------------------------------------------------------------- 1 | %pyspark 2 | print sc.version 3 | ### get count of records for each table 4 | 5 | dftables = sqlContext.tables('default') 6 | #print dftables.show() 7 | print "Database Tables and Row Counts" 8 | df1 = sqlContext.sql("Use default") 9 | for t in dftables.collect(): 10 | #print t.tableName 11 | query = "select " + '"' +t.tableName + '"' + " as tableName, count(*) as num_rows from " + t.tableName 12 | #print query 13 | df2 = sqlContext.sql(query) 14 | for r in df2.collect(): 15 | print r.tableName + '|' , r.num_rows 16 | -------------------------------------------------------------------------------- /Hive/Hive Table and Column Metadata.py: -------------------------------------------------------------------------------- 1 | %pyspark 2 | print sc.version 3 | ## collect and display Hive Metastore data. Written for Zeppelin 4 | 5 | from pyspark.sql.types import * 6 | from pyspark.sql import functions as func 7 | from pyspark.sql.functions import split 8 | from pyspark.sql.functions import lit 9 | from pyspark.sql.functions import trim 10 | from pyspark.sql.functions import first 11 | from pyspark.sql.functions import instr 12 | from pyspark.sql.functions import substring 13 | from pyspark.sql.functions import length 14 | from pyspark.sql.functions import ltrim 15 | from pyspark.sql.functions import rtrim 16 | from pyspark.sql.functions import explode 17 | from pyspark.sql.functions import col 18 | 19 | #Create an empty RDD to eventually hold ALL the hive metastore tables and properties 20 | schema=StructType([\ 21 | StructField("Name",StringType(),False),\ 22 | StructField("Value",StringType(),False),\ 23 | StructField("Tname",StringType(),False),\ 24 | StructField("DBname",StringType(),False)\ 25 | ]) 26 | rdd4 = sqlContext.createDataFrame(sc.emptyRDD(),schema).rdd 27 | 28 | #Create an empty RDD to eventually hold ALL the hive tables and columns 29 | TCschema=StructType([\ 30 | 
StructField("col_name",StringType(),False),\ 31 | StructField("data_type",StringType(),False),\ 32 | StructField("comment",StringType(),False),\ 33 | StructField("Tname",StringType(),False),\ 34 | StructField("DBname",StringType(),False)\ 35 | ]) 36 | rddTC1 = sqlContext.createDataFrame(sc.emptyRDD(),TCschema).rdd 37 | 38 | d_databases = sqlContext.sql("show databases") 39 | for dbrow in d_databases.collect(): ### loop over each database 40 | #print "Database: " + dbrow.result 41 | dfdd = sqlContext.sql("Use " + dbrow.result) 42 | d_tables = sqlContext.sql("show tables") 43 | for tblrow in d_tables.collect(): ### loop over table each table 44 | table_name = tblrow.tableName 45 | if tblrow.isTemporary : 46 | print table_name + " Temporary" ## temp tables don't have any properties 47 | else : 48 | #print table_name 49 | 50 | df1 = sqlContext.sql("describe formatted " + tblrow.tableName ) 51 | 52 | # get list of table properties, filter out columns and split into name value pairs and put in RDD filter out columns 53 | df2 = df1.filter("result like '%:%'").select( trim(split("result",'\t')[0]).alias("Name") ,split("result",'\t')[1].alias("Value"))\ 54 | .withColumn("Name",func.regexp_replace('Name',":",""))\ 55 | .withColumn('Tname', lit(tblrow.tableName)).withColumn('DBname', lit(dbrow.result)) 56 | 57 | # get table stats and append to table properties 58 | df4 = sqlContext.sql("show tblproperties " + tblrow.tableName ) 59 | df5 = df4.select( split("result",'\t')[0].alias("Name") ,split("result",'\t')[1].alias("Value"))\ 60 | .withColumn('Tname', lit(tblrow.tableName)).withColumn('DBname', lit(dbrow.result)) 61 | 62 | df6 = df2.unionAll(df5) 63 | 64 | rdd4 = rdd4.union(df6.rdd) #append each tables properties to the whole list of tables and properties 65 | 66 | # get list of table columns 67 | dfc1 = sqlContext.sql("describe " + tblrow.tableName ) 68 | dfc2 = dfc1.filter("instr(col_name,'#')=0").withColumn('Tname', lit(tblrow.tableName)).withColumn('DBname', 
lit(dbrow.result)) 69 | rddTC1 = rddTC1.union(dfc2.rdd) 70 | 71 | # get partition information 72 | if df1.filter("result like '# Partition%'").count()>0: 73 | dfp1 = sqlContext.sql("show table extended like "+ tblrow.tableName) 74 | dfp2 = dfp1.filter("result like 'partition%'").select( trim(split("result",':')[0]).alias("Name") ,split("result",':')[1].alias("Value") ) 75 | dfp3 = dfp2.withColumn('Tname', lit(tblrow.tableName)).withColumn('DBname', lit(dbrow.result)) 76 | rdd4 = rdd4.union(dfp3.rdd) # append this to the list of table properties 77 | #print dfp3.show(20,False) 78 | 79 | #dfpc = dfp1.filter("result like 'partitionColumns%'").select(substring("result",45,100).alias('s')).select(explode(split("s",",")).alias('pc')) 80 | #dfpc = dfpc.withColumn("pc",func.regexp_replace('pc',"}","")) 81 | #dfpc = dfpc.withColumn("pc",func.regexp_replace('pc',"^[\s]","")) 82 | #dfpc1 = dfpc.select(split("pc"," ")[1].alias("PartitionColumn")) 83 | #dfpc1 = dfpc1.withColumn('Tname', lit(tblrow.tableName)).withColumn('DBname', lit(dbrow.result)).withColumn('isPartitioned',lit(1)) 84 | 85 | #xxx = dfc2.alias('a').join(dfpc1.alias('b'),(col('a.col_name')==col('b.PartitionColumn')) ,'left_outer' ).select('a.*') 86 | #select (["dfc2."+xx for xx in dfc2.columns] + ['dfpc1.isPartitioned']) 87 | 88 | #print xxx.show(50,False) 89 | 90 | #display database and table properties 91 | df7 = sqlContext.createDataFrame(rdd4) 92 | ### pivot the list of properties so that the properties are displayed as columns 93 | df8 = df7.groupBy("DBname","Tname").pivot("Name").agg(first("Value")) 94 | print "Possible fields to select for each table" 95 | df8.printSchema() 96 | #print df8.show() 97 | #print df8.collect() 98 | #print df8.select("*").show() 99 | print df8.select("DBname","Tname","Owner","Table Type","Location","numRows","numFiles","rawDataSize").sort("DBname","Tname").show(50,False) 100 | #for row in df8.collect(): 101 | # print row.Tname + ' | ' +row.Location 102 | 103 | #display table 
columns 104 | dfc2 = sqlContext.createDataFrame(rddTC1).distinct().orderBy(["DBname","Tname"]) 105 | print dfc2.show(1000) 106 | -------------------------------------------------------------------------------- /Hive/Hive Table-Column Inventory.sql: -------------------------------------------------------------------------------- 1 | --Table - Column meata stored in mySQL hive metastore 2 | use hive; 3 | 4 | select 5 | T.TBL_ID, T.TBL_NAME, 6 | C.COLUMN_NAME, C.CD_ID, C.INTEGER_IDX, C.TYPE_NAME, C.COMMENT 7 | from 8 | TBLS as T 9 | join SDS S on T.SD_ID=S.SD_ID 10 | join COLUMNS_V2 C on S.CD_ID=C.CD_ID 11 | 12 | select 13 | CS.DB_NAME, CS.TBL_ID, CS.TABLE_NAME, CS.COLUMN_NAME, CS.COLUMN_TYPE, 14 | CS.NUM_NULLS, CS.NUM_DISTINCTS, CS.AVG_COL_LEN, CS.MAX_COL_LEN, 15 | CONCAT (IFNULL (LONG_LOW_VALUE,''), IFNULL (DOUBLE_LOW_VALUE,''), IFNULL (BIG_DECIMAL_LOW_VALUE,'')) as LOW_VALUE, 16 | CONCAT (IFNULL (LONG_HIGH_VALUE,''), IFNULL (DOUBLE_HIGH_VALUE,''), IFNULL (BIG_DECIMAL_HIGH_VALUE,'')) as HIGH_VALUE, 17 | CS.LAST_ANALYZED 18 | From TAB_COL_STATS CS 19 | union all 20 | select 21 | CS.DB_NAME, T.TBL_ID, T.TBL_NAME, P.PART_ID, CS.COLUMN_NAME, CS.COLUMN_TYPE , 22 | CS.NUM_NULLS ,CS.NUM_DISTINCTS ,CS.AVG_COL_LEN ,CS.MAX_COL_LEN , 23 | CONCAT (IFNULL(LONG_LOW_VALUE,''),IFNULL(DOUBLE_LOW_VALUE,''),IFNULL(BIG_DECIMAL_LOW_VALUE,'')) as LOW_VALUE, 24 | CONCAT (IFNULL(LONG_HIGH_VALUE,''),IFNULL(DOUBLE_HIGH_VALUE,''),IFNULL(BIG_DECIMAL_HIGH_VALUE,'')) as HIGH_VALUE 25 | #CS.LONG_LOW_VALUE , CS.LONG_HIGH_VALUE , CS.DOUBLE_HIGH_VALUE , CS.DOUBLE_LOW_VALUE , 26 | #CS.BIG_DECIMAL_LOW_VALUE ,CS.BIG_DECIMAL_HIGH_VALUE , 27 | #CS.NUM_TRUES ,CS.NUM_FALSES ,CS.LAST_ANALYZED 28 | from 29 | PART_COL_STATS CS join PARTITIONS P on P.PART_ID=CS.PART_ID JOIN TBLS T ON P.TBL_ID=T.TBL_ID; 30 | 31 | -------------------------------------------------------------------------------- /MySQL/Columns: -------------------------------------------------------------------------------- 1 | Use 
Information_schema; 2 | SELECT * FROM COLUMNS WHERE TABLE_SCHEMA = '<--DATABASE_NAME-->' AND TABLE_NAME='<--TABLENAME-->'; 3 | -------------------------------------------------------------------------------- /MySQL/TableRowCount: -------------------------------------------------------------------------------- 1 | 14 2 | -- for INNODB engine, these are estimates and not actual row counts 3 | 4 | show table status; 5 | 6 | 7 | SELECT TABLE_NAME,SUM(TABLE_ROWS) 8 | FROM INFORMATION_SCHEMA.TABLES 9 | WHERE TABLE_SCHEMA = 'your_db' 10 | GROUP BY TABLE_NAME; 11 | -------------------------------------------------------------------------------- /Netezza/Netezza Table Inventory: -------------------------------------------------------------------------------- 1 | select database, tablename, reltuples,* from _v_table where database='PROD_LANDING' ;--and createdate > '2014-05-16 10:00:02'; 2 | -------------------------------------------------------------------------------- /Netezza/Netezza Table-Column inventory: -------------------------------------------------------------------------------- 1 | Select T.Owner,T.Database, T.TableName Table_Name, T.reltuples Num_Rows, 2 | C.AttName Column_Name, C.Attnum, C.Format_type, C.attnotnull, C.ZMAPPED, 3 | decode (ATTDISPERSION,0,0,round(1.0/C.ATTDISPERSION)) num_distinct, 4 | S.NULLFRAC,-- S.COMMONFRAC, S.RECENT, S.COMMONVAL, 5 | S.LOVAL, S.HIVAL,d.distattnum as distributed_attnum 6 | from _v_relation_column C 7 | JOIN _V_STATISTIC S ON C.OBJID=S.OBJID AND C.ATTNUM=S.ATTNUM 8 | left join _v_table_dist_map d on c.objid=d.objid and c.attnum=d.attnum 9 | join _v_table T on C.Name=T.tablename 10 | where C.type='TABLE' and C.database = 'PROD_LANDING' 11 | order by 1,2,3,6; 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # data-profiling 2 | a set of scripts to pull meta data and data profiling metrics from 
relational database systems 3 | 4 | The collection of scripts and SQL-code which can be tailored to collect specific information about tables and columns within databases. 5 | It facilitates the bulk and rapid collection of high level and common metadata and provides a great starting point for identifying an inventory of your database objects. 6 | 7 | The scripts are intended to be used within your own particular client user interface and assume you are able to connect to the database from which you want to collect data. It is also up to you to dump the results out to whatever output medium you want. Generally by saving the results as a spreadsheet, csv, or cutting and pasting the results from your user interface. This allows the greatest degree of flexibility. 8 | 9 | This repository is not meant to provide very deep data profiling capabilities, other commercial and open source analytic and data management tools can do that much better. 10 | 11 | We get most of the meta data from the internal data dictionary of the database system. Some meta data can only be obtained by querying the tables themselves. Since performance can often be a problem when profiling large tables, we don't try to do this in bulk for an entire database. This is where the more expensive and sophisticated tools are most useful. 12 | 13 | Data profiling is a process 14 | --------------------------- 15 | Data Profiling generally consists of a series of steps that dig deeper and deeper into the details of the data sets. 16 | The high level steps addressed by this repository include: 17 | 18 | 1. Inventory of databases (schemas) 19 | 2. Inventory and metadata of tables within the databases 20 | 3. Inventory and metadata of columns within the tables ( count distinct values, num nulls, min, max) 21 | 4. For each column we can then dig into the details ( i.e.
frequency distribution or list of values for columns) and relationship to other tables/columns 22 | 23 | Database Platforms 24 | ------------------- 25 | Since each database platform has its own specific data dictionary, each platform has its own set of scripts. 26 | The following databases have scripts in this repository: 27 | 28 | * Oracle 29 | * SQLServer 30 | * MySQL 31 | * Sybase IQ 32 | * Netezza 33 | * Hive 34 | * AWS Redshift 35 | * Snowflake 36 | * AWS Synapse 37 | 38 | 39 | -------------------------------------------------------------------------------- /Redshift/Redshift Column Inventory: -------------------------------------------------------------------------------- 1 | select * from svv_columns where table_schema not in ('pg_catalog','pg_internal','information_schema') order by table_name,ordinal_position; 2 | 3 | select * from pg_table_def where schemaname not in ('pg_catalog','pg_internal','information_schema'); 4 | -------------------------------------------------------------------------------- /Redshift/Redshift Column Stats: -------------------------------------------------------------------------------- 1 | select t.database,t.schema, t.table, a.attname,s.staattnum,s.stadistinct,s.stawidth,stanullfrac,stakind1,staop1 2 | from pg_statistic s 3 | join svv_table_info t on (s.starelid=t.table_id) 4 | left join pg_attribute a on (s.staattnum=a.attnum and s.starelid=attrelid) 5 | where t.schema not in ('pg_catalog','pg_internal','information_schema') 6 | order by 1,2,3,5,6; 7 | -------------------------------------------------------------------------------- /Redshift/Redshift Table Inventory: -------------------------------------------------------------------------------- 1 | select datname,datdba,usename as owner from pg_database d join pg_user u on d.datdba=u.usesysid; 2 | select * from svv_tables where table_schema not in ('pg_catalog','pg_internal','information_schema') ; 3 | 4 | select
"database","schema",table_id,"table",tbl_rows,size,"diststyle",sortkey1,sortkey_num 5 | from svv_table_info where schema not in ('pg_catalog','pg_internal','information_schema'); 6 | -------------------------------------------------------------------------------- /Redshift/Redshift Table-Column sizes: -------------------------------------------------------------------------------- 1 | select t.database,t.schema, b.tbl,t.table, b.col, a.attname,count(*) num_blocks 2 | from stv_blocklist b 3 | join svv_table_info t on (b.tbl=t.table_id) 4 | left join pg_attribute a on (b.tbl=a.attrelid and b.col=a.attnum-1) 5 | where t.schema not in ('pg_catalog','pg_internal','information_schema') 6 | group by t.database,t.schema, b.tbl,t.table, b.col, a.attname 7 | order by 1,2,3,4,5; 8 | -------------------------------------------------------------------------------- /SQLServer/SQLServer Column List of values: -------------------------------------------------------------------------------- 1 | -- generate SQL for getting list of values or column distribution 2 | -- note use of temp table #colstats generated earlier 3 | 4 | declare @DynamicSQL2 NVARCHAR(MAX) 5 | set @DynamicSQL2 = '' 6 | create table #coldistribution( tab NVARCHAR(MAX), col NVARCHAR(MAX), colval NVARCHAR(MAX), num_rows int); 7 | 8 | 9 | SELECT @DynamicSQL2 = @DynamicSQL2 + 10 | 'insert into #coldistribution select ' + 11 | '''' + tab + ''' tab,' + 12 | '''' + col + ''' col,' + 13 | + col + ',' + 14 | 'count(*) as num_rows from ' + tab + ' group by ' + col 15 | + CHAR(13) + ';' + CHAR(13) 16 | 17 | from #colstats where num_distinct between 2 and 20; -- can define maximum cardinality here 18 | 19 | SELECT CAST(@DynamicSQL2 AS XML); 20 | execute(@DynamicSQL2); 21 | select * from #coldistribution; 22 | 23 | --drop table #colstats; 24 | --drop table #coldistribution; 25 | go 26 | 27 | drop table #coldistribution; 28 | -------------------------------------------------------------------------------- /SQLServer/SQLServer 
Table Inventory: -------------------------------------------------------------------------------- 1 | SELECT @@servername as servername, name as database_name, database_id, create_date 2 | FROM sys.databases ; 3 | 4 | select * from information_schema.tables; 5 | 6 | 7 | 8 | CREATE TABLE AllTables ([DB Name] sysname, [Schema Name] sysname, [Table Name] sysname) 9 | DECLARE @SQL NVARCHAR(MAX) 10 | SELECT @SQL = COALESCE(@SQL,'') + ' 11 | insert into AllTables 12 | select ' + QUOTENAME(name,'''') + ' as [DB Name], [Table_Schema] as [Table Schema], [Table_Name] as [Table Name] from ' + 13 | QUOTENAME(Name) + '.INFORMATION_SCHEMA.Tables;' FROM sys.databases where name not in ('model','master','msdb') 14 | ORDER BY name 15 | print @SQL 16 | EXECUTE(@SQL) 17 | 18 | with c as 19 | ( 20 | select db_name() as database_name, s.name schema_name,t.name table_name,count(*) as num_columns 21 | from sys.all_columns c 22 | join sys.all_objects t on (c.object_id=t.object_id and t.type_desc='USER_TABLE') 23 | join sys.types tt on (c.system_type_id=tt.user_type_id) 24 | JOIN sys.schemas s ON t.schema_id = s.schema_id 25 | group by s.name,t.name 26 | ) 27 | SELECT 28 | @@servername as servername, 29 | db_name() as database_name, 30 | s.Name AS Schema_Name, 31 | t.Name AS Table_Name, 32 | c.num_columns as column_count, 33 | p.rows AS Row_Count, 34 | CAST(ROUND((SUM(a.used_pages) / 128.00), 2) AS NUMERIC(36, 2)) AS Used_MB, 35 | CAST(ROUND((SUM(a.total_pages) - SUM(a.used_pages)) / 128.00, 2) AS NUMERIC(36, 2)) AS Unused_MB, 36 | CAST(ROUND((SUM(a.total_pages) / 128.00), 2) AS NUMERIC(36, 2)) AS Total_MB 37 | FROM sys.tables t 38 | INNER JOIN sys.indexes i ON t.OBJECT_ID = i.object_id 39 | INNER JOIN sys.partitions p ON i.object_id = p.OBJECT_ID AND i.index_id = p.index_id 40 | INNER JOIN sys.allocation_units a ON p.partition_id = a.container_id 41 | INNER JOIN sys.schemas s ON t.schema_id = s.schema_id 42 | inner join c on c.schema_name=s.name and c.table_name=t.name 43 | where 
p.index_id IN ( 0, 1 ) 44 | GROUP BY t.Name, s.Name, p.Rows,c.num_columns 45 | ORDER BY s.Name, t.Name; 46 | 47 | -------------------------------------------------------------------------------- /SQLServer/SQLServer Table Row Counts: -------------------------------------------------------------------------------- 1 | DECLARE @DynamicSQL NVARCHAR(MAX) 2 | set @DynamicSQL='' 3 | -- create a sql script that contains a select count(*) from each table 4 | 5 | SELECT @DynamicSQL = @DynamicSQL + 6 | 'SELECT ' + quotename(table_schema,'''') + ' as [Schema Name], ' + 7 | QUOTENAME(TABLE_NAME,'''') + 8 | ' as [Table Name], COUNT(*) AS [Records Count] FROM ' + 9 | quotename(Table_schema) + '.' + QUOTENAME(TABLE_NAME) 10 | + CHAR(13) + ' UNION ALL ' + CHAR(13) 11 | FROM INFORMATION_SCHEMA.TABLES 12 | where table_name like 'P%' -- ASCII quotes required; adjust pattern to select tables 13 | ORDER BY TABLE_NAME 14 | 15 | set @DynamicSQL = @DynamicSQL + Char(13) + 'Select ''1'',''1'',''1'' ' -- need to account for that last union all 16 | 17 | --print (@DynamicSQL) -- we may want to use PRINT to debug/edit the SQL 18 | SELECT CAST(@DynamicSQL AS XML) 19 | EXEC( @DynamicSQL) 20 | go 21 | -------------------------------------------------------------------------------- /SQLServer/SQLServer Table Rowcounts: -------------------------------------------------------------------------------- 1 | SELECT 2 | s.Name AS Schema_Name, 3 | t.Name AS Table_Name, 4 | t.object_id, 5 | t.type_desc, 6 | p.rows AS Row_Count, 7 | CAST(ROUND((SUM(a.used_pages) / 128.00), 2) AS NUMERIC(36, 2)) AS Used_MB, 8 | CAST(ROUND((SUM(a.total_pages) - SUM(a.used_pages)) / 128.00, 2) AS NUMERIC(36, 2)) AS Unused_MB, 9 | CAST(ROUND((SUM(a.total_pages) / 128.00), 2) AS NUMERIC(36, 2)) AS Total_MB 10 | FROM sys.tables t 11 | INNER JOIN sys.indexes i ON t.OBJECT_ID = i.object_id 12 | INNER JOIN sys.partitions p ON i.object_id = p.OBJECT_ID AND i.index_id = p.index_id 13 | INNER JOIN sys.allocation_units a ON p.partition_id = a.container_id 14 | INNER JOIN sys.schemas s ON
t.schema_id = s.schema_id 15 | where p.index_id IN ( 0, 1 ) 16 | GROUP BY t.Name, t.object_id,t.type_desc,s.Name, p.Rows 17 | -------------------------------------------------------------------------------- /SQLServer/SQLServer Table-Column Inventory: -------------------------------------------------------------------------------- 1 | select t.name,c.name as column_name,column_id,tt.name as datatype, c.max_length 2 | from sys.all_columns c 3 | join sys.all_objects t on (c.object_id=t.object_id and t.type_desc='USER_TABLE') 4 | join sys.types tt on (c.system_type_id=tt.user_type_id) 5 | order by t.name,c.column_id 6 | -------------------------------------------------------------------------------- /SQLServer/SQLServer Table-Column Stats: -------------------------------------------------------------------------------- 1 | -- generate SQL for getting column stats 2 | 3 | declare @tabcolcursor CURSOR; 4 | declare @tab nvarchar(max); 5 | declare @col nvarchar(max); 6 | declare @datatype nvarchar(max); 7 | declare @dynamicsql nvarchar(max); 8 | 9 | create table #colstats( tab NVARCHAR(MAX), col NVARCHAR(MAX), datatype NVARCHAR(MAX), num_distinct int, num_null int, min_val NVARCHAR(MAX), max_val NVARCHAR(MAX)); 10 | 11 | set @tabcolcursor = cursor for 12 | with T as 13 | ( 14 | SELECT 15 | s.Name AS Schema_Name, 16 | t.Name AS Table_Name, 17 | t.object_id, 18 | t.type_desc, 19 | p.rows AS Row_Count, 20 | CAST(ROUND((SUM(a.used_pages) / 128.00), 2) AS NUMERIC(36, 2)) AS Used_MB, 21 | CAST(ROUND((SUM(a.total_pages) - SUM(a.used_pages)) / 128.00, 2) AS NUMERIC(36, 2)) AS Unused_MB, 22 | CAST(ROUND((SUM(a.total_pages) / 128.00), 2) AS NUMERIC(36, 2)) AS Total_MB 23 | FROM sys.tables t 24 | INNER JOIN sys.indexes i ON t.OBJECT_ID = i.object_id 25 | INNER JOIN sys.partitions p ON i.object_id = p.OBJECT_ID AND i.index_id = p.index_id 26 | INNER JOIN sys.allocation_units a ON p.partition_id = a.container_id 27 | INNER JOIN sys.schemas s ON t.schema_id = s.schema_id 28 | where 
p.index_id IN ( 0, 1 ) 29 | GROUP BY t.Name, t.object_id,t.type_desc,s.Name, p.Rows 30 | ) 31 | SELECT --top 10 32 | T.schema_name, 33 | T.Table_name , 34 | c.name , 35 | tt.name 36 | from sys.all_columns c 37 | join T on (c.object_id=T.object_id and T.type_desc='USER_TABLE') 38 | join sys.types tt on (c.system_type_id=tt.user_type_id) 39 | where 40 | --s.name in ('Staging','Integration') 41 | T.row_count>0 and 42 | T.table_name = 'Office' 43 | --('Item','AppSch_AuditLog','Inscarrier','ItemExam')*/ 44 | order by 1,2,3 45 | ; 46 | 47 | open @tabcolcursor 48 | fetch next from @tabcolcursor into @tab, @col, @datatype 49 | while @@FETCH_STATUS = 0 50 | begin 51 | set @dynamicsql = 'insert into #colstats select ' + 52 | '''' + @tab + ''' tab,' + 53 | '''' + @col + ''' col,' + 54 | '''' + @datatype + ''' datatype,' + 55 | 'count(distinct ' + @col + ') num_distict, ' + 56 | 'sum(case when ' + @col + ' is null then 1 else 0 end) num_null, ' + 57 | --'sum(case when cast(' + @col + ' as varchar(255)) =''?'' then 1 else 0 end) num_q, ' + 58 | --'sum(case when cast(' + @col + ' as varchar(255)) ='''' then 1 else 0 end) num_e, ' + 59 | --'sum(case when cast(' + @col + ' as varchar(255)) =''NOKEY'' then 1 else 0 end) num_nk, ' + 60 | --'max(len(' + @col + ')) max_len, ' + 61 | 'cast(min(' + @col + ') as varchar(255)) min_val, ' + 62 | 'cast(max(' + @col + ') as varchar(255)) max_val' + 63 | ' from ' + @tab + ';' 64 | 65 | print @dynamicsql 66 | exec (@dynamicsql); 67 | fetch next from @tabcolcursor into @tab, @col, @datatype; 68 | end 69 | CLOSE @tabcolcursor; 70 | DEALLOCATE @tabcolcursor; 71 | select * from #colstats; 72 | 73 | --drop table #colstats; 74 | -------------------------------------------------------------------------------- /SQLServer/Table Rowcounts and Space: -------------------------------------------------------------------------------- 1 | CREATE TABLE #SpaceUsed ( 2 | TableName sysname 3 | ,NumRows BIGINT 4 | ,ReservedSpace VARCHAR(50) 5 | ,DataSpace 
VARCHAR(50) 6 | ,IndexSize VARCHAR(50) 7 | ,UnusedSpace VARCHAR(50) 8 | ) ; 9 | 10 | DECLARE @str VARCHAR(500) 11 | SET @str = 'exec sp_spaceused ''?''' 12 | INSERT INTO #SpaceUsed 13 | EXEC sp_msforeachtable @command1=@str 14 | 15 | SELECT distinct * FROM #SpaceUsed ORDER BY TableName; 16 | drop table #SpaceUsed; 17 | 18 | 19 | 20 | SELECT 21 | t.NAME AS TableName, 22 | s.Name AS SchemaName, 23 | p.rows AS RowCounts, 24 | CAST(((SUM(a.total_pages) * 8) / 1024.00) AS NUMERIC(36, 2)) AS TotalSpaceMB, 25 | CAST(((SUM(a.used_pages) * 8) / 1024.00) AS NUMERIC(36, 2)) AS UsedSpaceMB 26 | FROM 27 | sys.tables t 28 | INNER JOIN sys.indexes i ON t.OBJECT_ID = i.object_id 29 | INNER JOIN sys.partitions p ON i.object_id = p.OBJECT_ID AND i.index_id = p.index_id 30 | INNER JOIN sys.allocation_units a ON p.partition_id = a.container_id 31 | LEFT OUTER JOIN sys.schemas s ON t.schema_id = s.schema_id 32 | WHERE 33 | t.NAME NOT LIKE 'dt%' 34 | AND t.is_ms_shipped = 0 35 | AND i.OBJECT_ID > 255 36 | GROUP BY t.Name, s.Name, p.Rows 37 | ORDER BY t.Name 38 | -------------------------------------------------------------------------------- /Sybase/Sybase Table-Column Stats: -------------------------------------------------------------------------------- 1 | 2 | Select st.table_name, 3 | st.count as row_count, 4 | col_count = (SELECT count(*) FROM sys.syscolumn where table_id = st.table_id) 5 | from SYS.SYSTABLE st where st.creator <> 0 and st.count > 0 6 | order by st.table_name 7 | 8 | --script to generate sql script that selects min, max, num_distinct for each table-column 9 | 10 | select 11 | 'select ' , 12 | ''''+t.table_name+'.'+c.column_name+'''' +',' , 13 | '''|'',' , 14 | 'count(distinct '+c.column_name+'),' , 15 | '''|'',' , 16 | 'min('+c.column_name+'),' , Select st.table_name, 17 | st.count as row_count, 18 | col_count = (SELECT count(*) FROM sys.syscolumn where table_id = st.table_id) 19 | from SYS.SYSTABLE st where st.creator <> 0 and st.count > 0 20 | order by 
st.table_name 21 | '''|'',' , 22 | 'max( '+c.column_name+')' , 23 | ' from ' +'com.'+ t.table_name+';' 24 | from syscolumn c join systable t on (t.table_id=c.table_id) 25 | where 26 | t.table_name ='COM_CUSTOMER_DIM' 27 | -- t.table_name like 'BHTV%DIM' 28 | --or t.table_name like 'BHTV%FACT' 29 | -------------------------------------------------------------------------------- /oracle/Oracle Column Value List Distribution: -------------------------------------------------------------------------------- 1 | -- generat a sql script to obtain the list of values and num of distinct values for low-cardinality columns 2 | select 3 | 'SELECT '''||OWNER||'.'||TABLE_NAME||'.'||COLUMN_NAME||'|'','|| COLUMN_NAME ||',', 4 | '''|''', 5 | ', COUNT(*) FROM ' ||OWNER||'.'||TABLE_NAME|| ' GROUP BY ' || COLUMN_NAME|| ' ORDER BY 3 desc;' 6 | FROM ALL_TAB_COLUMNS tc 7 | WHERE owner='CIA' and NUM_DISTINCT between 1 and 25 8 | order by tc.owner,table_name,column_name; 9 | -------------------------------------------------------------------------------- /oracle/Oracle Table Inventory: -------------------------------------------------------------------------------- 1 | select owner, tablespace_name, count(*) 2 | from all_tables 3 | group by owner, tablespace_name 4 | order by owner; 5 | 6 | select owner,table_name,tablespace_name ,last_analyzed, 7 | NUM_ROWS,AVG_ROW_LEN,BLOCKS,CACHE,PARTITIONED,COMPRESSION 8 | from all_tables 9 | order by 1,2 10 | 11 | SELECT * FROM ALL_TAB_COMMENTS 12 | WHERE OWNER IN ('STBMGR','MAMGR','STBRPTMGR') 13 | ORDER BY 1,2,3; 14 | -------------------------------------------------------------------------------- /oracle/Oracle Table Rowcounts: -------------------------------------------------------------------------------- 1 | --generate a SQL script that select count(*) from each table 2 | 3 | select 4 | 'select ' SS 5 | , ''''||table_name||''' tablename,' as Table_name 6 | ,'''|'',' D0 7 | , 'count(*) as Num_Rows' 8 | ,' from '||owner||'.'||table_name||';' FF 9 
| from all_tables 10 | where owner='CIA'; 11 | --prompt from &&tab 12 | --prompt where rownum <=&&row_limit 13 | -------------------------------------------------------------------------------- /oracle/Oracle Table-Column Inventory: -------------------------------------------------------------------------------- 1 | SELECT * FROM ALL_COL_COMMENTS 2 | WHERE OWNER IN ('STBMGR','STBRPTMGR','MAMGR') 3 | ORDER BY 1,2,3; 4 | 5 | 6 | Select OWNER,table_name, column_id, column_name, 7 | --data_type, 8 | data_type || ' ' || decode (data_type, 9 | 'DATE',' ', 10 | 'TIMESTAMP(6)',' ', 11 | 'CHAR',TO_CHAR(DATA_LENGTH), 12 | 'VARCHAR2',TO_CHAR(DATA_LENGTH), 13 | 'RAW',' ', 14 | TO_CHAR(DATA_PRECISION)||','||TO_CHAR(DATA_SCALE)) DATATYPE, 15 | NULLABLE, NUM_DISTINCT, NUM_NULLS 16 | --data_length,data_precision,data_scale 17 | from all_tab_columns 18 | where 19 | owner in ('STBMGR','MAMGR','STBRPTMGR') 20 | and table_name not like '%ERR' 21 | order by 1,2 22 | -------------------------------------------------------------------------------- /oracle/Oracle Table-Column Stats: -------------------------------------------------------------------------------- 1 | 2 | --script to generate the collection of min,max,num_distinct,num_null for each column in each table. 
3 | select 4 | 'select ' SS 5 | , ''''||table_name||''' tab,' as Table_name 6 | ,'''|'',' D0 7 | , ''''||column_name||''' col,' as Column_name 8 | ,'''|'',' D1 9 | ,'count(distinct '||column_name||') num_distinct, ' C2 10 | ,'''|'',' D2 11 | ,'sum(decode('||column_name||',null,1,0)) num_nulls, ' C3 12 | ,'''|'',' D3 13 | ,'max(length('||column_name||')) max_len, ' C4 14 | ,'''|'',' D4 15 | ,'min('||column_name||') min_val, ' C5 16 | ,'''|'',' D5 17 | ,'max('||column_name||') max_val' C6 18 | ,' from '||owner||'.'||table_name||';' FF 19 | from all_tab_columns 20 | where owner='CIA'; 21 | --prompt from &&tab 22 | --prompt where rownum <=&&row_limit 23 | -------------------------------------------------------------------------------- /snowflake/table column stats: -------------------------------------------------------------------------------- 1 | use role LAB_DEVELOPER; 2 | --create database if not exists NYC_Taxi; 3 | Use database lab_db; 4 | use schema information_schema; 5 | create schema if not exists NYC_Taxi; 6 | use schema Admin; 7 | 8 | create or replace view all_tables 9 | as 10 | /* returns all tables , some attributes and storage metrics for all tables in the account */ 11 | with T as 12 | ( 13 | select table_catalog as database_name,table_schema as schema_name,table_name,table_id,table_type,is_transient,is_iceberg,row_count,cast(deleted as date) as deleted_dt 14 | from snowflake.account_usage.tables 15 | where table_type <> 'VIEW' 16 | ) 17 | , 18 | S as 19 | ( 20 | select 21 | id,table_catalog,table_schema,table_name,active_bytes,failsafe_bytes,time_travel_bytes,deleted 22 | from snowflake.account_usage.table_storage_metrics 23 | where active_bytes+failsafe_bytes+time_travel_bytes >0 or deleted = 'FALSE' 24 | ) 25 | select T.*,S.active_bytes,S.failsafe_bytes,time_travel_bytes,deleted 26 | from T inner join S on T.table_id=S.id 27 | order by 1,2,3; 28 | 29 | Create or replace view database_tables 30 | AS 31 | with T as 32 | ( 33 | select table_catalog as
database_name,table_schema as schema_name,table_name,table_type,is_transient,is_iceberg,row_count 34 | from information_schema.tables 35 | where table_type <> 'VIEW' 36 | ) 37 | , 38 | S as 39 | ( 40 | select 41 | table_catalog,table_schema,table_name,sum(active_bytes) as active_bytes,sum(failsafe_bytes) as failsafe_bytes,sum(time_travel_bytes) as time_travel_bytes 42 | from information_schema.table_storage_metrics 43 | group by 1,2,3 44 | ) 45 | select T.*,S.active_bytes,S.failsafe_bytes,S.time_travel_bytes 46 | from T inner join S on T.database_name=S.table_catalog and T.schema_name=S.table_schema and T.Table_name=S.table_name 47 | order by 1,2,3; 48 | 49 | select * from all_tables order by 1,2,3; 50 | select * from database_tables order by 1,2,3; 51 | 52 | grant select on view all_tables to role sysadmin; 53 | grant usage on schema lab_db.admin to role sysadmin; 54 | grant usage on database lab_db to role sysadmin; 55 | 56 | CREATE OR REPLACE PROCEDURE SP_Data_Profile (DATABASE_NAME varchar, SCHEMA_NAME varchar, TABLE_NAME varchar) 57 | COPY GRANTS -- ensures grants are re-applied if the proc is replaced/modified 58 | RETURNS object 59 | LANGUAGE SQL 60 | --RETURNS NULL ON NULL INPUT -- will not call stored proc if any input is null and null return 61 | CALLED on NULL INPUT -- will always call stored proc with null inputs 62 | COMMENT = 'Obtains column metrics for a given table. Creates a temporary table (data_profile)containing some a data profile of the selected tables. min,max,distinct count and null count for each column of the tables ' 63 | EXECUTE AS CALLER -- CALLER/OWNER execution rights 64 | /* 65 | Description: Obtains column metrics for a given table. Creates a temporary table (current schema.data_profile) containing a data profile of the specified table.
66 | min,max,distinct count and null count for each column of the tables 67 | 68 | Usage : Call SP_Data_Profile(Database_Name , Schema_name , Table_name ); 69 | Parameters: 70 | Database_Name 71 | Schema_Name 72 | Table_Name 73 | Returns: a JSON object of the table containing the Database_name,Schema_name,Table_name and the columns and metrics 74 | 75 | Change Log: 76 | 2024-02-06:Paul S initial version 77 | */ 78 | AS 79 | DECLARE 80 | c1 cursor for -- cursor to hold list of tables/columns to profile 81 | select 82 | --table_catalog, table_schema,table_name 83 | column_name,data_type 84 | from information_schema.columns 85 | where table_schema ilike ? and table_name ilike ?; 86 | sql_stmt text; 87 | insertcount integer :=0; 88 | column_name varchar; 89 | data_type varchar; 90 | row_count integer; 91 | distinct_count integer; 92 | null_count integer; 93 | max_value varchar; 94 | min_value varchar; 95 | error_msg varchar; 96 | profile_object object; 97 | BEGIN 98 | SYSTEM$LOG('debug', 'creating data_profile table'); 99 | 100 | CREATE OR REPLACE TEMPORARY TABLE Data_Profile 101 | ( database_name varchar,schema_name varchar,table_name varchar ,column_name varchar ,data_type varchar, row_count integer ,distinct_count integer ,null_count integer ,min_value varchar ,max_value varchar); 102 | 103 | SYSTEM$LOG('debug', 'getting schema'); 104 | OPEN C1 USING (schema_name,table_name); 105 | FOR record in C1 DO 106 | column_name := record.column_name; 107 | data_type := record.data_type; 108 | 109 | sql_stmt := 'insert into data_profile 110 | select ''' || :database_name || ''', '''|| :schema_name || ''', ''' || :table_name || ''', ''' || :column_name || ''', ''' || :data_type 111 | || ''' ,count(*)' || ', count(distinct ' || :column_name || '), count(*)-count(' || :column_name || '), ' 112 | || ' min( ' || :column_name || '), max(' || :column_name || ') ' 113 | || ' from ' || database_name || '.' || schema_name || '.'
|| table_name ; 114 | 115 | --SYSTEM$LOG('debug', sql_stmt); 116 | EXECUTE IMMEDIATE sql_stmt; 117 | insertcount := insertcount + 1; 118 | END FOR; 119 | 120 | SYSTEM$LOG('debug', 'inserted ' || insertcount::varchar || ' rows into data_profile'); 121 | /* lets return the profile as a JSON object. we could easily return this as a table ... not sure which is best */ 122 | select object_construct ( 123 | 'Database_name',database_name, 124 | 'Schema_name',schema_name, 125 | 'Table_name',table_name, 126 | 'Table_row_count', max(row_count), 127 | 'column_count', count(*), 128 | 'columns',array_agg(object_construct ( 129 | 'column_name',column_name, 130 | 'data_type', data_type, 131 | 'distinct_count', distinct_count, 132 | 'null_count',null_count, 133 | 'min_value', min_value, 134 | 'max_value', max_value 135 | )) 136 | ) into :profile_object 137 | from data_profile 138 | group by database_name,schema_name,table_name; 139 | RETURN profile_object; 140 | EXCEPTION 141 | WHEN OTHER /*OR STATEMENT_ERROR OR EXPRESSION_ERROR*/ THEN 142 | error_msg := SQLSTATE||':'||SQLCODE||':'||SQLERRM; 143 | SYSTEM$LOG('error', error_msg); 144 | RAISE; 145 | END; 146 | 147 | Call SP_Data_Profile('Lab_DB','NYC_TAXI','GREEN_TRIPDATA'); 148 | Call SP_Data_Profile('Lab_DB','NYC_TAXI','yellow_TRIPDATA'); 149 | Call SP_Data_Profile('Lab_DB','NYC_TAXI','NYC_Weather'); 150 | Call SP_Data_Profile('Lab_DB','CITIBIKE','TRIPS'); 151 | 152 | alter session set log_level='debug'; 153 | 154 | select * from data_profile; 155 | 156 | 157 | With T as 158 | ( 159 | select object_construct ( 160 | 'Database_name',database_name, 161 | 'Schema_name',schema_name, 162 | 'Table_name',table_name, 163 | 'Table_row_count', max(row_count), 164 | 'column_count', count(*), 165 | 'columns',array_agg(object_construct ( 166 | 'column_name',column_name, 167 | 'data_type', data_type, 168 | 'distinct_count', distinct_count, 169 | 'null_count',null_count, 170 | 'min_value', min_value, 171 | 'max_value', max_value 172 | )) 173
| ) as v 174 | from data_profile 175 | group by database_name,schema_name,table_name 176 | ) 177 | select 178 | v:Database_name::varchar, v:Schema_name::varchar, v:Table_name::varchar, v:Table_row_count::integer, 179 | C.value:column_name,C.value:data_type,C.value:distinct_count 180 | from T 181 | ,lateral flatten(input=>v:columns) C 182 | 183 | -------------------------------------------------------------------------------- /synapse/synapse table size.sql: -------------------------------------------------------------------------------- 1 | --select * from sys.databases; 2 | select 3 | SERVERPROPERTY('ServerName') Servername, 4 | db_name() DB_name, 5 | SERVERPROPERTY('InstanceName') AS [Instance], 6 | SERVERPROPERTY('Edition') AS [Edition], 7 | SERVERPROPERTY('ProductVersion') AS [ProductVersion]; 8 | 9 | select * from sys.databases; 10 | 11 | sp_spaceused -- 515,931,352 KB 12 | DBCC PDW_SHOWSPACEUSED ( "ssplbdp01.E_IL1.benchmark_cc_detail" ); 13 | 14 | select * from sys.pdw_nodes_tables 15 | 16 | drop view Admin.vtablesize; 17 | 18 | 19 | --synapse 20 | drop view admin.vtabledistribution; 21 | 22 | CREATE VIEW Admin.vTableDistribution as 23 | SELECT 24 | SERVERPROPERTY('ServerName') Server_Name, 25 | db_name() as Database_Name 26 | ,s.name as Schema_Name 27 | ,t.name as Table_Name 28 | ,t.object_id as Table_Object_Id 29 | ,tp.distribution_policy_desc as Dist_Policy_Name 30 | ,nt.object_id as node_table_ojbect_id,nt.pdw_node_id,nt.distribution_id,nt.modify_date,nt.type_desc 31 | ,tm.physical_name 32 | ,nps.partition_number 33 | ,nps.index_id -- ID of the heap or index the partition is part of.0 = Heap 1 = Clustered index.
> 1 = Nonclustered index 34 | ,case when row_number() over (partition by nps.object_id,nps.partition_number,nps.distribution_id order by nps.index_id)=1 then nps.row_count else 0 end as row_count 35 | --,nps.row_count 36 | ,nps.reserved_page_count/128.0 as reserved_space_mb 37 | ,nps.used_page_count/128.0 as used_space_mb 38 | from 39 | sys.schemas s 40 | JOIN sys.tables t ON s.[schema_id] = t.[schema_id] 41 | JOIN sys.pdw_table_distribution_properties tp ON t.[object_id] = tp.[object_id] 42 | JOIN sys.pdw_table_mappings tm ON t.[object_id] = tm.[object_id] 43 | JOIN sys.pdw_nodes_tables nt ON tm.[physical_name] = nt.[name] 44 | --JOIN sys.dm_pdw_nodes pn ON nt.[pdw_node_id] = pn.[pdw_node_id] 45 | --JOIN sys.pdw_distributions di ON nt.[distribution_id] = di.[distribution_id] 46 | JOIN sys.dm_pdw_nodes_db_partition_stats nps ON nt.[object_id] = nps.[object_id] AND nt.[pdw_node_id] = nps.[pdw_node_id] AND nt.[distribution_id] = nps.[distribution_id] 47 | 48 | select * from admin.vtabledistribution where table_name = 'XXXX' /* placeholder: substitute the table to inspect */ ; 49 | 50 | select table_name,count(*),sum(reserved_space_mb) from admin.vtabledistribution where table_name='XXXX' group by table_name; 51 | 52 | 53 | drop view admin.vtablesize; 54 | Create view Admin.vTableSize 55 | as 56 | Select 57 | Server_Name, 58 | Database_Name, 59 | Schema_Name, 60 | Table_Name, 61 | Table_Object_Id, 62 | Dist_Policy_Name, 63 | count(*) as Distribution_Cnt,sum(row_count) as Row_cnt, sum(reserved_space_mb) Reserved_Space_MB , sum(used_space_mb) Used_Space_MB 64 | from Admin.vTableDistribution 65 | group by 66 | Server_name,Database_Name,schema_name,table_name,table_object_id,dist_policy_name; 67 | 68 | grant view database state to lbdp_sql_developer; --- needed for developers to query sys.pdw tables and tablesize 69 | 70 | select * from Admin.vtablesize 71 | --where row_cnt>0 72 | order by 1,2 73 | 74 | 75 | /* hash columns */ 76 | 77 | SELECT SERVERPROPERTY('ServerName') Servername, 78 | OBJECT_SCHEMA_NAME(tdp.object_id)
schemaName, 79 | OBJECT_NAME(tdp.object_id) tableName, 80 | tdp.distribution_policy_desc, 81 | c.name AS hashDistributionColumnName, 82 | cdp.distribution_ordinal 83 | FROM sys.pdw_table_distribution_properties tdp 84 | INNER JOIN sys.pdw_column_distribution_properties cdp ON tdp.object_id = cdp.object_id 85 | INNER JOIN sys.columns c ON cdp.object_id = c.object_id 86 | AND cdp.column_id = c.column_id 87 | WHERE tdp.distribution_policy_desc = 'HASH' 88 | AND cdp.distribution_ordinal > 0 89 | order by 1,2,3 90 | 91 | --------------------------------------------------------------------------------