├── tips ├── foreach │ ├── README.md │ ├── IngestTable.dbc │ ├── ListaTables.dbc │ ├── Ambiente Job Foreach.dbc │ └── Job Foreach - Ingestion Tables.json ├── regex │ ├── README.md │ ├── regexteste.py │ └── regex.py ├── feliznatal │ ├── README.md │ └── feliznatal.sql ├── VNET │ ├── Databricks.jpg │ └── README.md ├── count │ ├── README.md │ ├── 00000000000000000001.json │ ├── Count(_) vs Count(1).sql │ ├── 00000000000000000002.json │ └── 00000000000000000000.json ├── markdown │ ├── README.md │ ├── Magic Commands.sql │ └── OptimizeAndVacuum_Doc.py ├── parallel │ ├── README.md │ └── Paralelismo.py ├── run │ ├── README.md │ ├── notebook2.py │ ├── notebook3.py │ └── notebook1.py ├── widgets │ ├── README.md │ └── Widgets.py ├── dbutils │ ├── README.md │ └── Dbutils-Dataframe.py ├── DatabricksSDKPython │ ├── README.md │ └── Python-SDK.py ├── input_file_name │ ├── README.md │ ├── generate json.py │ └── bronze_demo.py ├── DatabricksServicePrincipal │ ├── README.md │ └── Generate ServicePrincipal Token.py ├── SHOW │ ├── README.md │ └── SHOW COMMANDs.sql ├── logicapp │ ├── README.md │ ├── TableLogicApps.sql │ └── logicapp.json ├── deltaTable │ ├── README.md │ └── Protocols.sql ├── parameters │ ├── README.md │ └── RunDevProd.py ├── sparkconfs │ ├── README.md │ └── Spark Confs.py ├── EXPLODE_STRING │ ├── README.md │ └── Explode usando SQL.py ├── VacuumInventory │ ├── README.md │ └── Vacuum Inventory.py ├── deletionVector │ ├── README.md │ └── DeletionVectors.py ├── parquetvsdelta │ ├── README.md │ └── Delta vs Parquet.sql ├── particionamento │ ├── README.md │ └── Particionar ou Nao_.sql ├── timeTravelVsCDF │ ├── README.md │ └── Time Travel vs Change Data Feed.sql ├── Table lineage │ ├── README.md │ ├── Usabilidade por usuario.sql │ ├── Usabilidade por dia.sql │ └── Usabilidade das tabelas.sql ├── System Tables │ └── ScriptSQL.sql ├── UpgradeMethods │ ├── README.md │ └── UpgradeUC_Examples.sql └── DatabricksAcademy │ └── Learning.md ├── routines ├── tablesSize&Vacuum │ ├── TablesSize&Vacuum.dbc │ └── README.md └── OptimizeAndVacuum │ ├── README.md │ ├── Demo.sql │ └── OptimizeAndVacuum.py ├── API └── databricks │ ├── README.md │ ├── API SQL Statement.py │ ├── Databrick Jobs List - API.py │ └── Databricks API - Clusters.py └── README.md /tips/foreach/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tips/regex/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tips/feliznatal/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tips/VNET/Databricks.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reginaldosilva27/Databricks/HEAD/tips/VNET/Databricks.jpg -------------------------------------------------------------------------------- /tips/foreach/IngestTable.dbc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reginaldosilva27/Databricks/HEAD/tips/foreach/IngestTable.dbc -------------------------------------------------------------------------------- /tips/foreach/ListaTables.dbc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/reginaldosilva27/Databricks/HEAD/tips/foreach/ListaTables.dbc -------------------------------------------------------------------------------- /tips/foreach/Ambiente Job Foreach.dbc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reginaldosilva27/Databricks/HEAD/tips/foreach/Ambiente Job Foreach.dbc -------------------------------------------------------------------------------- /routines/tablesSize&Vacuum/TablesSize&Vacuum.dbc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reginaldosilva27/Databricks/HEAD/routines/tablesSize&Vacuum/TablesSize&Vacuum.dbc -------------------------------------------------------------------------------- /tips/count/README.md: -------------------------------------------------------------------------------- 1 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 2 |
3 | Selecionar a opção Import 4 | image 5 |
6 | Selecionar o script e importar: 7 | image 8 | -------------------------------------------------------------------------------- /tips/markdown/README.md: -------------------------------------------------------------------------------- 1 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 2 |
3 | Selecionar a opção Import 4 | image 5 |
6 | Selecionar o script e importar: 7 | image 8 | -------------------------------------------------------------------------------- /tips/parallel/README.md: -------------------------------------------------------------------------------- 1 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 2 |
3 | Selecionar a opção Import 4 | image 5 |
6 | Selecionar o script e importar: 7 | image 8 | -------------------------------------------------------------------------------- /tips/run/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 3 |
4 | Selecionar a opção Import 5 | image 6 |
7 | Selecionar o script e importar: 8 | image 9 | -------------------------------------------------------------------------------- /tips/widgets/README.md: -------------------------------------------------------------------------------- 1 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 2 |
3 | Selecionar a opção Import 4 | image 5 |
6 | Selecionar o script e importar: 7 | image 8 | -------------------------------------------------------------------------------- /tips/dbutils/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 3 |
4 | Selecionar a opção Import 5 | image 6 |
7 | Selecionar o script e importar: 8 | image 9 | -------------------------------------------------------------------------------- /tips/DatabricksSDKPython/README.md: -------------------------------------------------------------------------------- 1 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 2 |
3 | Selecionar a opção Import 4 | image 5 |
6 | Selecionar o script e importar: 7 | image 8 | -------------------------------------------------------------------------------- /tips/input_file_name/README.md: -------------------------------------------------------------------------------- 1 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 2 |
3 | Selecionar a opção Import 4 | image 5 |
6 | Selecionar o script e importar: 7 | image 8 | -------------------------------------------------------------------------------- /tips/DatabricksServicePrincipal/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 3 |
4 | Selecionar a opção Import 5 | image 6 |
7 | Selecionar o script e importar: 8 | image 9 | -------------------------------------------------------------------------------- /API/databricks/README.md: -------------------------------------------------------------------------------- 1 | Scripts sobre Databricks API. 2 | 3 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 4 |
5 | Selecionar a opção Import 6 | image 7 |
8 | Selecionar o script e importar: 9 | image 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Databricks 2 | Notebooks e dicas sobre Databricks 3 |
4 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 5 |
6 | Selecionar a opção Import 7 | image 8 |
9 | Selecionar o script e importar: 10 | image 11 | -------------------------------------------------------------------------------- /tips/run/notebook2.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # DBTITLE 1,Sessao do Spark 3 | # MAGIC %scala 4 | # MAGIC spark 5 | 6 | # COMMAND ---------- 7 | 8 | # DBTITLE 1,Contexto do Spark 9 | # MAGIC %scala 10 | # MAGIC spark.sparkContext 11 | 12 | # COMMAND ---------- 13 | 14 | print('ola mund0 - notebook2 aqui') 15 | 16 | # COMMAND ---------- 17 | 18 | # DBTITLE 1,Mostrando variaveis do Notebook1 19 | #Usando Argument 20 | print(getArgument("dataini")) 21 | 22 | #Usando Widgets 23 | print(dbutils.widgets.get("datafim")) 24 | -------------------------------------------------------------------------------- /tips/run/notebook3.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | print('ola mund0 - notebook3 aqui') 3 | 4 | # COMMAND ---------- 5 | 6 | # DBTITLE 1,Definindo variáveis com retorno de uma query 7 | dataini = spark.sql("select dataini from tb_parameters").collect()[0][0] 8 | datafim = spark.sql("select datafim from tb_parameters").collect()[0][0] 9 | 10 | # COMMAND ---------- 11 | 12 | # DBTITLE 1,Mostrando variaveis do Notebook1 13 | #Usando Argument 14 | print(getArgument("dataini")) 15 | 16 | #Usando Widgets 17 | print(dbutils.widgets.get("datafim")) 18 | -------------------------------------------------------------------------------- /tips/SHOW/README.md: -------------------------------------------------------------------------------- 1 | # Databricks 2 | Notebooks e dicas sobre Databricks 3 |
4 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 5 |
6 | Selecionar a opção Import 7 | image 8 |
9 | Selecionar o script e importar: 10 | image 11 | -------------------------------------------------------------------------------- /tips/logicapp/README.md: -------------------------------------------------------------------------------- 1 | # Databricks 2 | Notebooks e dicas sobre Databricks 3 |
4 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 5 |
6 | Selecionar a opção Import 7 | image 8 |
9 | Selecionar o script e importar: 10 | image 11 | -------------------------------------------------------------------------------- /tips/deltaTable/README.md: -------------------------------------------------------------------------------- 1 | # Databricks 2 | Notebooks e dicas sobre Databricks 3 |
4 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 5 |
6 | Selecionar a opção Import 7 | image 8 |
9 | Selecionar o script e importar: 10 | image 11 | -------------------------------------------------------------------------------- /tips/parameters/README.md: -------------------------------------------------------------------------------- 1 | # Databricks 2 | Notebooks e dicas sobre Databricks 3 |
4 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 5 |
6 | Selecionar a opção Import 7 | image 8 |
9 | Selecionar o script e importar: 10 | image 11 | -------------------------------------------------------------------------------- /tips/sparkconfs/README.md: -------------------------------------------------------------------------------- 1 | # Databricks 2 | Notebooks e dicas sobre Databricks 3 |
4 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 5 |
6 | Selecionar a opção Import 7 | image 8 |
9 | Selecionar o script e importar: 10 | image 11 | -------------------------------------------------------------------------------- /tips/EXPLODE_STRING/README.md: -------------------------------------------------------------------------------- 1 | # Databricks 2 | Notebooks e dicas sobre Databricks 3 |
4 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 5 |
6 | Selecionar a opção Import 7 | image 8 |
9 | Selecionar o script e importar: 10 | image 11 | -------------------------------------------------------------------------------- /tips/VacuumInventory/README.md: -------------------------------------------------------------------------------- 1 | # Databricks 2 | Notebooks e dicas sobre Databricks 3 |
4 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 5 |
6 | Selecionar a opção Import 7 | image 8 |
9 | Selecionar o script e importar: 10 | image 11 | -------------------------------------------------------------------------------- /tips/deletionVector/README.md: -------------------------------------------------------------------------------- 1 | # Databricks 2 | Notebooks e dicas sobre Databricks 3 |
4 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 5 |
6 | Selecionar a opção Import 7 | image 8 |
9 | Selecionar o script e importar: 10 | image 11 | -------------------------------------------------------------------------------- /tips/parquetvsdelta/README.md: -------------------------------------------------------------------------------- 1 | # Databricks 2 | Notebooks e dicas sobre Databricks 3 |
4 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 5 |
6 | Selecionar a opção Import 7 | image 8 |
9 | Selecionar o script e importar: 10 | image 11 | -------------------------------------------------------------------------------- /tips/particionamento/README.md: -------------------------------------------------------------------------------- 1 | # Databricks 2 | Notebooks e dicas sobre Databricks 3 |
4 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 5 |
6 | Selecionar a opção Import 7 | image 8 |
9 | Selecionar o script e importar: 10 | image 11 | -------------------------------------------------------------------------------- /tips/timeTravelVsCDF/README.md: -------------------------------------------------------------------------------- 1 | # Databricks 2 | Notebooks e dicas sobre Databricks 3 |
4 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 5 |
6 | Selecionar a opção Import 7 | image 8 |
9 | Selecionar o script e importar: 10 | image 11 | -------------------------------------------------------------------------------- /tips/Table lineage/README.md: -------------------------------------------------------------------------------- 1 | # Monitoramento de usabilidade das tabelas no Unity Catalog 2 | image 3 | 4 | ## Monitore tabelas mais usadas no ambiente 5 | image 6 | 7 | ## Mapeamento das tabelas não utilizadas, tabelas com muitas escritas e poucas leituras 8 | image 9 | -------------------------------------------------------------------------------- /tips/input_file_name/generate json.py: -------------------------------------------------------------------------------- 1 | import json 2 | from faker import Faker 3 | import random 4 | 5 | fake = Faker() 6 | 7 | for i in range(1, 11): 8 | data = [{ 9 | 'name': fake.name(), 10 | 'address': fake.address(), 11 | 'email': fake.email(), 12 | 'phone_number': fake.phone_number(), 13 | 'job': fake.job(), 14 | 'age': random.randint(18, 65), 15 | 'company': fake.company(), 16 | 'credit_card_number': fake.credit_card_number(), 17 | 'date_joined': str(fake.date_this_decade()) 18 | }] 19 | 20 | with open(f'/Users/reginaldosilva/Documents/Jsons/data{i}.json', 'w') as f: 21 | json.dump(data, f, indent=4) 22 | -------------------------------------------------------------------------------- /tips/logicapp/TableLogicApps.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Table to log events 3 | drop table tb_OrchestratorEvents; 4 | CREATE TABLE tb_OrchestratorEvents 5 | ( 6 | id int IDENTITY PRIMARY KEY, 7 | jobName VARCHAR(200), 8 | jobId VARCHAR(200), 9 | databricksWorkspace VARCHAR(200), 10 | emailList VARCHAR(MAX), 11 | subject VARCHAR(MAX), 12 | customBody VARCHAR(MAX), 13 | dateLog datetime 14 | ) 15 | 16 | -- Generate Event 17 | INSERT INTO tb_OrchestratorEvents VALUES ( 18 | 'Job1-Teste', 19 | '981175440018532', 20 | 'https://adb-4013955633331914.14.azuredatabricks.net/api/2.1/jobs/run-now', 21 | 'reginaldo.silva@dataside.com.br', 22 | 'LogicApp - Item criado na tabela tb_OrchestratorEvents', 23 | 'Event Information: Job Run created ', 24 | GETDATE() 25 | ) 26 | 27 | --Events 28 | select * from tb_OrchestratorEvents 29 | -------------------------------------------------------------------------------- /tips/markdown/Magic Commands.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- DBTITLE 1,SQL - Default desse notebook 3 | -- MAGIC %sql 4 | -- MAGIC -- Não preciso especificar, mas se quiser voce pode 5 | -- MAGIC SELECT 'usando linguagem SQL' 6 | 7 | -- COMMAND ---------- 8 | 9 | -- DBTITLE 1,Python 10 | -- MAGIC %python 11 | -- MAGIC var = "Opa, agora to no python!" 12 | -- MAGIC print(var) 13 | 14 | -- COMMAND ---------- 15 | 16 | -- DBTITLE 1,Shell script 17 | -- MAGIC %sh 18 | -- MAGIC ls -l 19 | 20 | -- COMMAND ---------- 21 | 22 | -- DBTITLE 1,Scala 23 | -- MAGIC %scala 24 | -- MAGIC val var = "Vai de scala?" 25 | -- MAGIC println(var) 26 | 27 | -- COMMAND ---------- 28 | 29 | -- DBTITLE 1,R 30 | -- MAGIC %r 31 | -- MAGIC var <- "R é para os bruxos!" 
32 | -- MAGIC print(var) 33 | 34 | -- COMMAND ---------- 35 | 36 | -- DBTITLE 1,Markdown 37 | -- MAGIC %md 38 | -- MAGIC ## Esse é o tema do post 39 | -- MAGIC Vamos falar mais sobre Markdown 40 | 41 | -- COMMAND ---------- 42 | 43 | -- DBTITLE 1,FS 44 | -- MAGIC %fs 45 | -- MAGIC ls / 46 | 47 | -- COMMAND ---------- 48 | 49 | -- DBTITLE 1,run - chamando notebooks 50 | -- MAGIC %run /maintenanceDeltalake 51 | -------------------------------------------------------------------------------- /tips/parquetvsdelta/Delta vs Parquet.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- DBTITLE 1,Criando a tabela DEMO 3 | -- MAGIC %py 4 | -- MAGIC df = spark.read.option("header", "True").format('csv').load('/databricks-datasets/COVID/coronavirusdataset/PatientInfo.csv') 5 | -- MAGIC df.write.format('delta').mode('overwrite').saveAsTable("db_demo.PatientInfoDelta",path='abfss://xxx@xxx.dfs.core.windows.net/bronze/PatientInfoDelta') 6 | -- MAGIC df.display() 7 | 8 | -- COMMAND ---------- 9 | 10 | SET spark.databricks.delta.formatCheck.enabled=false 11 | 12 | -- COMMAND ---------- 13 | 14 | -- DBTITLE 1,Lendo Parquets 15 | select * from parquet.`abfss://xxx@xxx.dfs.core.windows.net/bronze/PatientInfoDelta/*.parquet` 16 | 17 | -- COMMAND ---------- 18 | 19 | -- DBTITLE 1,Executando 1 Update na tabela Delta 20 | -- Atualizando 1 registro 21 | update db_demo.PatientInfoDelta set age = '33s' where patient_id = '1000000001'; 22 | select * from delta.`abfss://xxx@xxx.dfs.core.windows.net/bronze/PatientInfoDelta/`; 23 | 24 | -- COMMAND ---------- 25 | 26 | -- DBTITLE 1,Lendo Parquets das tabelas Delta 27 | select * from parquet.`abfss://xxx@xxx.dfs.core.windows.net/bronze/PatientInfoDelta/*.parquet` 28 | 29 | -- COMMAND ---------- 30 | 31 | -- DBTITLE 1,Lendo parquets - spark.databricks.delta.formatCheck.enabled 32 | SET spark.databricks.delta.formatCheck.enabled=true; 33 | select * from parquet.`abfss://xxx@xxx.dfs.core.windows.net/bronze/PatientInfoDelta/*.parquet` 34 | -------------------------------------------------------------------------------- /tips/regex/regexteste.py: -------------------------------------------------------------------------------- 1 | import re 2 | log_string = """ 3 | 04:06:58 3 of 68 OK created sql incremental model bronze.vendors [INSERT 0 2 in 6.70s] 4 | """ 5 | pattern1 = r"(\d{2}:\d{2}:\d{2})\s" # hora 6 | pattern2 = r"(\d+)\s+of\s" # id 7 | pattern3 = r"OK\s+created\s+sql\s+(\w+)\s+model\s" # tipo 8 | pattern4 = r"(\d+\.\d+)s\]" # tabela 9 | pattern5 = r"(\d{2}:\d{2}:\d{2})\s+(\d+)\s+of\s+\d+\s+OK\s+created\s+sql\s+(\w+)\s+model\s+([\w\.]+)\s+.*?\[.*?in\s+(\d+\.\d+)s\]" # todas colunas 10 | 11 | print("--------------------------------------") 12 | print(re.search(pattern1, log_string)) 13 | print(re.search(pattern1, log_string).group(1)) 14 | print("--------------------------------------") 15 | print(re.search(pattern2, log_string)) 16 | print(re.search(pattern2, log_string).group(1)) 17 | print("--------------------------------------") 18 | print(re.search(pattern3, log_string)) 19 | print(re.search(pattern3, log_string).group(1)) 20 | print("--------------------------------------") 21 | print(re.search(pattern4, log_string)) 22 | print(re.search(pattern4, log_string).group(1)) 23 | print("--------------------------------------") 24 | print(re.search(pattern5, log_string)) 25 | print(re.search(pattern5, log_string).group(1)) 26 | print(re.search(pattern5, log_string).group(2)) 27 | 
print(re.search(pattern5, log_string).group(3)) 28 | print(re.search(pattern5, log_string).group(4)) 29 | print(re.search(pattern5, log_string).group(5)) -------------------------------------------------------------------------------- /tips/feliznatal/feliznatal.sql: -------------------------------------------------------------------------------- 1 | -- Letra F 2 | SELECT CAST ('POLYGON((1 10, 1 11, 2 11, 2 10.8, 1.25 10.8, 1.25 10.6, 1.75 10.6, 1.75 10.4, 1.25 10.4, 1.25 10, 1 10))' as geometry) 3 | UNION ALL 4 | -- Letra E 5 | SELECT CAST ('POLYGON((2 10, 2 11, 3 11, 3 10.8, 2.25 10.8, 2.25 10.6, 2.75 10.6, 2.75 10.4, 2.25 10.4, 2.25 10.2, 3 10.2, 3 10, 2 10))' as geometry) 6 | UNION ALL 7 | -- Letra L 8 | SELECT CAST ('POLYGON((3.15 11, 3.15 10, 3.85 10, 3.85 10.2, 3.35 10.2, 3.35 11, 3.15 11))' as geometry) 9 | UNION ALL 10 | -- Letra I 11 | SELECT CAST ('POLYGON((4.2 11, 4.8 11, 4.8 10.8, 4.6 10.8, 4.6 10.2, 4.8 10.2, 4.8 10, 4.2 10, 4.2 10.2, 4.4 10.2, 4.4 10.8, 4.2 10.8, 4.2 11))' as geometry) 12 | UNION ALL 13 | -- Letra Z 14 | SELECT CAST ('POLYGON((5 11, 6 11, 5.4 10.2, 6 10.2, 6 10, 5 10, 5.6 10.8, 5 10.8, 5 11))' as geometry) 15 | UNION ALL 16 | -- Letra N 17 | SELECT CAST ('POLYGON((1 10, 1 9, 1.2 9, 1.2 9.8, 1.8 9, 2 9, 2 10, 1.8 10, 1.8 9.3, 1.3 10, 1 10))' as geometry) 18 | UNION ALL 19 | -- Letra A 20 | SELECT CAST ('POLYGON((2 9, 2 10, 3 10, 3 9, 2.75 9, 2.75 9.3, 2.25 9.3, 2.25 9, 2 9),(2.25 9.5, 2.25 9.8, 2.75 9.8, 2.75 9.5, 2.25 9.5))' as geometry) 21 | UNION ALL 22 | -- Letra T 23 | SELECT CAST ('POLYGON((3 9.8, 3 10,4 10, 4 9.8, 3.6 9.8, 3.6 9, 3.4 9, 3.4 9.8, 3 9.8))' as geometry) 24 | UNION ALL 25 | -- Letra A 26 | SELECT CAST ('POLYGON((4 9, 4 10, 5 10, 5 9, 4.75 9, 4.75 9.3, 4.25 9.3, 4.25 9, 4 9),(4.25 9.5, 4.25 9.8, 4.75 9.8, 4.75 9.5, 4.25 9.5))' as geometry) 27 | UNION ALL 28 | -- Letra L 29 | SELECT CAST ('POLYGON((5.15 10, 5.15 9, 5.85 9, 5.85 9.2, 5.35 9.2, 5.35 10, 5.15 10))' as geometry) -------------------------------------------------------------------------------- /tips/input_file_name/bronze_demo.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # DBTITLE 1,Listando arquivos na camada Landing 3 | # MAGIC %fs 4 | # MAGIC ls abfss://datalake@storageunitycatalogdemo.dfs.core.windows.net/landing 5 | 6 | # COMMAND ---------- 7 | 8 | # DBTITLE 1,Lendo arquivos JSON 9 | # MAGIC %py 10 | # MAGIC df = spark.read.option("multiLine", "True").json('abfss://datalake@storageunitycatalogdemo.dfs.core.windows.net/landing/*.json') 11 | # MAGIC df.display() 12 | 13 | # COMMAND ---------- 14 | 15 | # DBTITLE 1,Criando uma tabela Delta 16 | # MAGIC %py 17 | # MAGIC df.write.format('delta') \ 18 | # MAGIC .mode('overwrite') \ 19 | # MAGIC .saveAsTable("db_demo.person",path='abfss://datalake@storageunitycatalogdemo.dfs.core.windows.net/bronze/person') 20 | 21 | # COMMAND ---------- 22 | 23 | # DBTITLE 1,Como saber de qual arquivo veio cada pessoa? 24 | # MAGIC %sql 25 | # MAGIC select name,* from db_demo.person 26 | 27 | # COMMAND ---------- 28 | 29 | # DBTITLE 1,Usar na tabela Delta? 
30 | # MAGIC %sql 31 | # MAGIC select input_file_name(),name,* from db_demo.person 32 | 33 | # COMMAND ---------- 34 | 35 | # DBTITLE 1,Adicionando uma nova coluna nomeArquivo 36 | # MAGIC %py 37 | # MAGIC from pyspark.sql.functions import input_file_name 38 | # MAGIC df.withColumn("nomeArquivo",input_file_name()) \ 39 | # MAGIC .write.format('delta') \ 40 | # MAGIC .mode('overwrite') \ 41 | # MAGIC .option("overwriteSchema", True) \ 42 | # MAGIC .saveAsTable("db_demo.person",path='abfss://datalake@storageunitycatalogdemo.dfs.core.windows.net/bronze/person') 43 | 44 | # COMMAND ---------- 45 | 46 | # DBTITLE 1,Agora sim! 47 | # MAGIC %sql 48 | # MAGIC select nomeArquivo,name,* from db_demo.person 49 | -------------------------------------------------------------------------------- /tips/count/00000000000000000001.json: -------------------------------------------------------------------------------- 1 | {"commitInfo":{"timestamp":1683375419086,"userId":"8675301566931963","userName":"reginaldo.silva@dataside.com.br","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"2263512646416784"},"clusterId":"0213-212148-y5jr9wle","readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"4064"},"engineInfo":"Databricks-Runtime/12.1.x-scala2.12","txnId":"826be0c3-6b85-4652-85b7-cd6fa83da78f"}} 2 | {"add":{"path":"part-00000-c83f8e97-cbf8-4034-8775-27e5b3f0466c-c000.snappy.parquet","partitionValues":{},"size":4064,"modificationTime":1683375418000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"patient_id\":\"1000003211\",\"sex\":\"male\",\"age\":\"31s\",\"country\":\"Brazil\",\"province\":\"Sao Paulo\",\"city\":\"Boituva\",\"infection_case\":\"Dataholic\",\"contact_number\":\"12\",\"symptom_onset_date\":\"2023-05-06\",\"confirmed_date\":\"2023-05-06\",\"released_date\":\"2023-05-06\",\"state\":\"released\"},\"maxValues\":{\"patient_id\":\"1000003211\",\"sex\":\"male\",\"age\":\"31s\",\"country\":\"Brazil\",\"province\":\"Sao Paulo\",\"city\":\"Boituva\",\"infection_case\":\"Dataholic\",\"contact_number\":\"12\",\"symptom_onset_date\":\"2023-05-06\",\"confirmed_date\":\"2023-05-06\",\"released_date\":\"2023-05-06\",\"state\":\"released\"},\"nullCount\":{\"patient_id\":0,\"sex\":0,\"age\":0,\"country\":0,\"province\":0,\"city\":0,\"infection_case\":0,\"infected_by\":1,\"contact_number\":0,\"symptom_onset_date\":0,\"confirmed_date\":0,\"released_date\":0,\"deceased_date\":1,\"state\":0}}","tags":{"INSERTION_TIME":"1683375418000000","MIN_INSERTION_TIME":"1683375418000000","MAX_INSERTION_TIME":"1683375418000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} 3 | -------------------------------------------------------------------------------- /tips/widgets/Widgets.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # DBTITLE 1,Definindo manualmente um Widget do tipo texto 3 | #Define Widgets 4 | dbutils.widgets.text('path', '') 5 | dbutils.widgets.text('dataini', '') 6 | dbutils.widgets.text('datafim', '') 7 | dbutils.widgets.dropdown('debug', 'False', ['True','False']) 8 | 9 | # Define variaveis 10 | path = dbutils.widgets.get('path') 11 | dataini = dbutils.widgets.get('dataini') 12 | datafim = dbutils.widgets.get('datafim') 13 | debug = dbutils.widgets.get('debug') == 'True' # Retorna um boolean 14 | 15 | # Se for modo Debug apenas mostra o valor das variaveis, senao executa um comando, nesse 
caso o dbutils 16 | if(debug): 17 | print('path : ',path) 18 | print('dataini: ',dataini) 19 | print('datafim: ',datafim) 20 | else: 21 | dbutils.fs.ls(path) 22 | 23 | # COMMAND ---------- 24 | 25 | # DBTITLE 1,Chamando uma função usando uma váriavel 26 | def getDirContent(ls_path): 27 | path_list = dbutils.fs.ls(ls_path) 28 | for dir_path in dbutils.fs.ls(ls_path): 29 | if dir_path.isDir() and ls_path != dir_path.path and '_delta_log' not in dir_path.path: 30 | path_list += getDirContent(dir_path.path) 31 | 32 | getDirContent(path) 33 | 34 | # COMMAND ---------- 35 | 36 | # DBTITLE 1,Chamando uma função com valor fixo 37 | def getDirContent(ls_path): 38 | path_list = dbutils.fs.ls(ls_path) 39 | for dir_path in dbutils.fs.ls(ls_path): 40 | if dir_path.isDir() and ls_path != dir_path.path and '_delta_log' not in dir_path.path: 41 | path_list += getDirContent(dir_path.path) 42 | 43 | getDirContent('/databricks-datasets/COVID/USAFacts/') 44 | 45 | # COMMAND ---------- 46 | 47 | dbutils.widgets.removeAll() 48 | 49 | # COMMAND ---------- 50 | 51 | # DBTITLE 1,Veja as opções de Widgets 52 | dbutils.widgets.help() 53 | -------------------------------------------------------------------------------- /API/databricks/API SQL Statement.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import pandas as pd 4 | from tabulate import tabulate 5 | import matplotlib.pyplot as plt 6 | 7 | url = f"https://adb-xxxx.13.azuredatabricks.net/api/2.0/sql/statements/" 8 | 9 | headers = { 10 | 'Authorization': "Bearer xxxx-3", 11 | "Content-Type": "application/json" 12 | } 13 | 14 | data = { 15 | "warehouse_id": "xxxxxx", 16 | "statement": "select date_format(usage_end_time,'yyyy-MM') as Mes, \ 17 | sum(usage_quantity) as DBUs, \ 18 | (sum(usage_quantity) * max(c.pricing.default)) as TotalUSD \ 19 | from system.billing.usage a \ 20 | inner join system.billing.list_prices c on c.sku_name = a.sku_name \ 21 | group by all order by 1 desc limit 10", 22 | "wait_timeout": "5s" 23 | } 24 | 25 | response = requests.post( 26 | url = url, 27 | headers=headers, 28 | data=json.dumps(data) 29 | ) 30 | 31 | result = json.loads(response.content) 32 | 33 | print("Status Code:", response.status_code) 34 | print(json.dumps(result,indent=4)) 35 | 36 | # Extrair colunas e dados 37 | columns = [col["name"] for col in result["manifest"]["schema"]["columns"]] 38 | data = result["result"]["data_array"] 39 | 40 | print(columns) 41 | print(data) 42 | 43 | # Criar DataFrame 44 | df = pd.DataFrame(data, columns=columns) 45 | 46 | # Print tabulado 47 | print(tabulate(df, headers='keys', tablefmt='pretty', showindex=False)) 48 | 49 | df["TotalUSD"] = pd.to_numeric(df["TotalUSD"]) 50 | 51 | # Plotar o gráfico 52 | plt.plot(df['Mes'], df['TotalUSD'],marker='o', linestyle='-', color='b') 53 | 54 | # Adicionar rótulos e título 55 | plt.xlabel('Mes') 56 | plt.ylabel('TotalUSD') 57 | plt.title('Consumo DBUS') 58 | 59 | # Mostrar o gráfico 60 | plt.grid(True) 61 | plt.show() 62 | -------------------------------------------------------------------------------- /tips/Table lineage/Usabilidade por usuario.sql: -------------------------------------------------------------------------------- 1 | -- tabelas mais lidas por usuário 2 | -- Aplique filtros de datas se achar necessário 3 | -- Customize conforme sua necessidade 4 | select 5 | loginName, 6 | catalogName, 7 | schemaName, 8 | tableName, 9 | sum(READS) as READS, 10 | sum(WRITES) as WRITES 11 | from 12 | ( 13 | select 14 | 
read.created_By as loginName, 15 | t.table_catalog as catalogName, 16 | t.table_schema as schemaName, 17 | t.table_name as tableName, 18 | sum( 19 | case 20 | when read.source_table_name is not null then 1 21 | else 0 22 | end 23 | ) READS, 24 | 0 WRITES 25 | from 26 | system.information_schema.tables t 27 | left join system.access.table_lineage read on t.table_name = read.source_table_name 28 | and t.table_schema = read.source_table_schema 29 | and t.table_catalog = read.source_table_catalog 30 | where 31 | t.table_catalog not in('system') 32 | and t.table_schema not in('information_schema') 33 | and ( 34 | read.target_type in('TABLE') 35 | ) 36 | group by 37 | all 38 | union all 39 | select 40 | write.created_By as loginName, 41 | t.table_catalog as catalogName, 42 | t.table_schema as schemaName, 43 | t.table_name as tableName, 44 | 0 READS, 45 | sum( 46 | case 47 | when write.target_table_name is not null then 1 48 | else 0 49 | end 50 | ) WRITES 51 | from 52 | system.information_schema.tables t 53 | left join system.access.table_lineage write on t.table_name = write.target_table_name 54 | and t.table_schema = write.target_table_schema 55 | and t.table_catalog = write.target_table_catalog 56 | where 57 | t.table_catalog not in('system') 58 | and t.table_schema not in('information_schema') 59 | and ( 60 | write.target_type in('TABLE') 61 | ) 62 | group by 63 | all 64 | ) Tabs 65 | group by 66 | all 67 | order by 68 | 1 desc 69 | -------------------------------------------------------------------------------- /tips/Table lineage/Usabilidade por dia.sql: -------------------------------------------------------------------------------- 1 | -- tabelas mais lidas por dia 2 | -- Aplique filtros de datas se achar necessário 3 | -- Customize conforme sua necessidade 4 | select 5 | event_date, 6 | catalogName, 7 | schemaName, 8 | tableName, 9 | sum(READS) as READS, 10 | sum(WRITES) as WRITES 11 | from 12 | ( 13 | select 14 | event_date, 15 | t.table_catalog as catalogName, 16 | t.table_schema as schemaName, 17 | t.table_name as tableName, 18 | sum( 19 | case 20 | when read.source_table_name is not null then 1 21 | else 0 22 | end 23 | ) READS, 24 | 0 WRITES 25 | from 26 | system.information_schema.tables t 27 | left join system.access.table_lineage read on t.table_name = read.source_table_name 28 | and t.table_schema = read.source_table_schema 29 | and t.table_catalog = read.source_table_catalog 30 | where 31 | t.table_catalog not in('system') 32 | and t.table_schema not in('information_schema') 33 | and ( 34 | read.target_type in('TABLE') 35 | or read.target_type is null 36 | ) 37 | group by 38 | all 39 | union all 40 | select 41 | event_date, 42 | t.table_catalog as catalogName, 43 | t.table_schema as schemaName, 44 | t.table_name as tableName, 45 | 0 READS, 46 | sum( 47 | case 48 | when write.target_table_name is not null then 1 49 | else 0 50 | end 51 | ) WRITES 52 | from 53 | system.information_schema.tables t 54 | left join system.access.table_lineage write on t.table_name = write.target_table_name 55 | and t.table_schema = write.target_table_schema 56 | and t.table_catalog = write.target_table_catalog 57 | where 58 | t.table_catalog not in('system') 59 | and t.table_schema not in('information_schema') 60 | and ( 61 | write.target_type in('TABLE') 62 | or write.target_type is null 63 | ) 64 | group by 65 | all 66 | ) Tabs 67 | group by 68 | all 69 | order by 70 | 1 desc 71 | -------------------------------------------------------------------------------- /tips/foreach/Job Foreach - Ingestion 
Tables.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Job Foreach - Ingestion Tables", 3 | "email_notifications": { 4 | "no_alert_for_skipped_runs": false 5 | }, 6 | "webhook_notifications": {}, 7 | "timeout_seconds": 0, 8 | "max_concurrent_runs": 1, 9 | "tasks": [ 10 | { 11 | "task_key": "ListTables", 12 | "run_if": "ALL_SUCCESS", 13 | "notebook_task": { 14 | "notebook_path": "/Workspace/Scripts/ListaTables", 15 | "source": "WORKSPACE" 16 | }, 17 | "timeout_seconds": 0, 18 | "email_notifications": {}, 19 | "notification_settings": { 20 | "no_alert_for_skipped_runs": false, 21 | "no_alert_for_canceled_runs": false, 22 | "alert_on_last_attempt": false 23 | }, 24 | "webhook_notifications": {} 25 | }, 26 | { 27 | "task_key": "ForeachTable", 28 | "depends_on": [ 29 | { 30 | "task_key": "ListTables" 31 | } 32 | ], 33 | "run_if": "ALL_SUCCESS", 34 | "for_each_task": { 35 | "inputs": "{{tasks.ListTables.values.tableList}}", 36 | "concurrency": 5, 37 | "task": { 38 | "task_key": "IngestTable", 39 | "run_if": "ALL_SUCCESS", 40 | "notebook_task": { 41 | "notebook_path": "/Workspace/Scripts/IngestTable", 42 | "base_parameters": { 43 | "tableConfig": "{{input}}" 44 | }, 45 | "source": "WORKSPACE" 46 | }, 47 | "timeout_seconds": 0, 48 | "email_notifications": {}, 49 | "notification_settings": { 50 | "no_alert_for_skipped_runs": false, 51 | "no_alert_for_canceled_runs": false, 52 | "alert_on_last_attempt": false 53 | }, 54 | "webhook_notifications": {} 55 | } 56 | }, 57 | "timeout_seconds": 0, 58 | "email_notifications": {}, 59 | "notification_settings": { 60 | "no_alert_for_skipped_runs": false, 61 | "no_alert_for_canceled_runs": false, 62 | "alert_on_last_attempt": false 63 | }, 64 | "webhook_notifications": {} 65 | } 66 | ], 67 | "queue": { 68 | "enabled": true 69 | }, 70 | "run_as": { 71 | "user_name": "reginaldo.silva27@hotmail.com" 72 | } 73 | } -------------------------------------------------------------------------------- /tips/count/Count(_) vs Count(1).sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- DBTITLE 1,Cria tabela de teste e COUNT de linhas 3 | -- MAGIC %py 4 | -- MAGIC df = spark.read.option("header", "True").format('csv').load('/databricks-datasets/COVID/coronavirusdataset/PatientInfo.csv') 5 | -- MAGIC df.write.format('delta').mode('overwrite').saveAsTable("db_demo.PatientInfoDelta",path='abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/PatientInfoDelta') 6 | -- MAGIC df.count() 7 | 8 | -- COMMAND ---------- 9 | 10 | -- DBTITLE 1,Visualizar os dados 11 | select * from db_demo.PatientInfoDelta 12 | 13 | -- COMMAND ---------- 14 | 15 | -- DBTITLE 1,Ver plano de execução 16 | explain extended select count(*) from db_demo.monitoramento 17 | 18 | -- COMMAND ---------- 19 | 20 | -- DBTITLE 1,Insere novo registro para gerar uma nova versão 21 | insert into db_demo.PatientInfoDelta values('1000003211','male','31s','Brazil','Sao Paulo','Boituva','Dataholic',null,12,current_date(),current_date(),current_date(),null,'released') 22 | 23 | -- COMMAND ---------- 24 | 25 | -- DBTITLE 1,Deleta 11 registros para gerar uma nova versão 26 | delete from db_demo.PatientInfoDelta where patient_id between '1000000001' and '1000000011' 27 | 28 | -- COMMAND ---------- 29 | 30 | -- DBTITLE 1,Visualizar tamanho da tabela 31 | describe detail db_demo.PatientInfoDelta 32 | 33 | -- COMMAND ---------- 34 | 35 | -- DBTITLE 1,Visualizar versões da tabela 36 | describe 
history db_demo.PatientInfoDelta 37 | 38 | -- COMMAND ---------- 39 | 40 | -- DBTITLE 1,Count de linha atual 41 | select count(*) from db_demo.PatientInfoDelta 42 | 43 | -- COMMAND ---------- 44 | 45 | -- DBTITLE 1,COUNT usando o Delta Log 46 | select 47 | sum(from_json( 48 | add.stats,'numRecords DOUBLE' 49 | ).numRecords) as numRecordsAdd 50 | from 51 | json.`abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/PatientInfoDelta/_delta_log/0000000000000000*.json` 52 | where add is not null 53 | and add.path NOT IN ( 54 | select remove.path from json.`abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/PatientInfoDelta/_delta_log/0000000000000000*.json` 55 | where remove is not null 56 | ) 57 | 58 | -- COMMAND ---------- 59 | 60 | -- DBTITLE 1,Visualizando os metados do _delta_log 61 | select 62 | from_json( 63 | add.stats,'numRecords DOUBLE' 64 | ).numRecords as numRecordsAdd, 65 | * 66 | from 67 | json.`abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/PatientInfoDelta/_delta_log/0000000000000000*.json` 68 | -------------------------------------------------------------------------------- /tips/Table lineage/Usabilidade das tabelas.sql: -------------------------------------------------------------------------------- 1 | -- tabelas mais lidas 2 | -- Aplique filtros de datas se achar necessário 3 | -- Customize conforme sua necessidade 4 | select 5 | catalogName, 6 | schemaName, 7 | tableName, 8 | min(first_read) as first_read, 9 | max(last_read) as last_read, 10 | min(first_write) as first_write, 11 | max(last_write) as last_write, 12 | sum(READS) as READS, 13 | sum(WRITES) as WRITES 14 | from 15 | ( 16 | select 17 | t.table_catalog as catalogName, 18 | t.table_schema as schemaName, 19 | t.table_name as tableName, 20 | MIN(read.event_date) first_read, 21 | MAX(read.event_date) last_read, 22 | null first_write, 23 | null last_write, 24 | sum( 25 | case 26 | when read.source_table_name is not null then 1 27 | else 0 28 | end 29 | ) READS, 30 | 0 WRITES 31 | from 32 | system.information_schema.tables t 33 | left join system.access.table_lineage read on t.table_name = read.source_table_name 34 | and t.table_schema = read.source_table_schema 35 | and t.table_catalog = read.source_table_catalog 36 | where 37 | t.table_catalog not in('system') 38 | and t.table_schema not in('information_schema') 39 | and ( 40 | read.target_type in('TABLE') 41 | or read.target_type is null 42 | ) 43 | group by 44 | all 45 | union all 46 | select 47 | t.table_catalog as catalogName, 48 | t.table_schema as schemaName, 49 | t.table_name as tableName, 50 | null first_read, 51 | null last_read, 52 | MIN(write.event_date) first_write, 53 | MAX(write.event_date) last_write, 54 | 0 READS, 55 | sum( 56 | case 57 | when write.target_table_name is not null then 1 58 | else 0 59 | end 60 | ) WRITES 61 | from 62 | system.information_schema.tables t 63 | left join system.access.table_lineage write on t.table_name = write.target_table_name 64 | and t.table_schema = write.target_table_schema 65 | and t.table_catalog = write.target_table_catalog 66 | where 67 | t.table_catalog not in('system') 68 | and t.table_schema not in('information_schema') 69 | and ( 70 | write.target_type in('TABLE') 71 | or write.target_type is null 72 | ) 73 | group by 74 | all 75 | ) Tabs 76 | group by 77 | all 78 | order by 79 | READS DESC, 80 | WRITES DESC 81 | -------------------------------------------------------------------------------- /routines/OptimizeAndVacuum/README.md: 
-------------------------------------------------------------------------------- 1 | 2 |

## Descrição dos parâmetros

3 | 4 | | Parâmetro | Descrição | Tipo | 5 | | ------------- | ------------- | ------------- | 6 | | nomeSchema | Nome do Database onde a tabela está criada | string | 7 | | nomeTabela | Nome da tabela na qual a manutenção será aplicada | string | 8 | | vacuum | True: Vacuum será executado, False: Pula vacuum | bool | 9 | | optimize | True: OPTIMIZE será executado, False: Pula OPTIMIZE | bool | 10 | | colunasZorder | Se informado e optimize for True, aplica ZORDER na lista de colunas separadas por vírgula (,) | string | 11 | | vacuumRetention | Quantidade de horas de retenção aplicada na execução do Vacuum | integer | 12 | | debug | True: apenas imprime os comandos na tela, sem executar | bool | 13 | 14 |
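A implementação de referência está em `OptimizeAndVacuum.py`. Apenas para ilustrar como esses parâmetros se combinam, segue um esboço mínimo e hipotético (não é o código do repositório), assumindo um notebook Databricks onde o objeto `spark` já está disponível:

```python
# Esboço hipotético: monta e executa os comandos de manutenção de uma tabela Delta
# conforme os parâmetros documentados acima (não é o OptimizeAndVacuum.py real).
def maintenanceDeltalake(nomeSchema, nomeTabela, colunasZorder='none',
                         vacuumRetention=168, vacuum=True, optimize=True, debug=False):
    tabela = f"{nomeSchema}.{nomeTabela}"
    comandos = []
    if optimize:
        # ZORDER entra apenas quando colunasZorder foi informado (diferente de 'none')
        zorder = f" ZORDER BY ({colunasZorder})" if colunasZorder.lower() != 'none' else ""
        comandos.append(f"OPTIMIZE {tabela}{zorder}")
    if vacuum:
        comandos.append(f"VACUUM {tabela} RETAIN {vacuumRetention} HOURS")
    for comando in comandos:
        if debug:
            print(comando)      # modo debug: apenas imprime o comando
        else:
            spark.sql(comando)  # assume o `spark` disponível no notebook Databricks
```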

## Exemplos:

15 | 16 | #### --> Primeiro instanciar a Function <-- 17 | `` %run /Users/reginaldo.silva@dataside.com.br/OptimizeAndVacuum `` 18 | 19 | #### --> Executando VACUUM com retenção de 72 horas e OPTMIZE SEM ZORDER <-- 20 | ``maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='funcionario', colunasZorder='none', vacuumRetention=72, vacuum=True, optimize=True, debug=False)`` 21 | 22 | #### --> Executando VACUUM retenção padrão e OPTMIZE COM ZORDER <-- 23 | ``maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='PatientInfoDelta', colunasZorder='patient_id', vacuumRetention=168, vacuum=True, optimize=True, debug=False)`` 24 | 25 | #### --> Executando somente VACUUM <-- 26 | ``maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='PatientInfoDelta', colunasZorder='none', vacuumRetention=168, vacuum=True, optimize=False, debug=False)`` 27 | 28 | #### --> Executando somente OPTMIZE <-- 29 | ``maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='PatientInfoDelta', colunasZorder='none', vacuumRetention=168, vacuum=False, optimize=True, debug=False)`` 30 | 31 | #### --> Modo Debug - Apenas print <-- 32 | ``maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='PatientInfoDelta', colunasZorder='none', vacuumRetention=168, vacuum=True, optimize=True, debug=True)`` 33 | 34 | ``Criado por: Reginaldo Silva`` 35 | - [Blog Data In Action](https://datainaction.dev/) 36 | - [Github](https://github.com/reginaldosilva27) 37 | 38 | ``Referencias:`` 39 | - 40 | - 41 | - 42 | -------------------------------------------------------------------------------- /tips/deletionVector/DeletionVectors.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # DBTITLE 1,Cria uma nova tabela 3 | df = spark.read.option("header", "True").format('csv').load('/databricks-datasets/COVID/coronavirusdataset/PatientInfo.csv') 4 | df.write.format('delta').mode('overwrite').saveAsTable("db_demo.PatientInfoDelta",path='abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/PatientInfoDelta') 5 | df.count() 6 | 7 | # COMMAND ---------- 8 | 9 | # DBTITLE 1,Ver detalhes da tabela 10 | # MAGIC %sql 11 | # MAGIC describe extended db_demo.PatientInfoDelta 12 | 13 | # COMMAND ---------- 14 | 15 | # DBTITLE 1,Delete sem Deletion Vector 16 | # MAGIC %sql 17 | # MAGIC delete from db_demo.PatientInfoDelta where patient_id = 1000000002 18 | 19 | # COMMAND ---------- 20 | 21 | # DBTITLE 1,Habilitar Deletion Vector - Irá realizar Upgrade do protocolo Delta 22 | # MAGIC %sql 23 | # MAGIC ALTER TABLE db_demo.PatientInfoDelta SET TBLPROPERTIES ('delta.enableDeletionVectors' = true); 24 | 25 | # COMMAND ---------- 26 | 27 | # DBTITLE 1,Delete com Deletion Vector 28 | # MAGIC %sql 29 | # MAGIC delete from db_demo.PatientInfoDelta where patient_id = 1000000001 30 | 31 | # COMMAND ---------- 32 | 33 | # DBTITLE 1,COUNT para validar 34 | # MAGIC %sql 35 | # MAGIC select count(*) from db_demo.PatientInfoDelta 36 | 37 | # COMMAND ---------- 38 | 39 | # DBTITLE 1,Update com Deletion Vector? 
Somente com Photon 40 | # MAGIC %sql 41 | # MAGIC update db_demo.PatientInfoDelta set sex = 'male' where patient_id = '1000000033' 42 | 43 | # COMMAND ---------- 44 | 45 | # DBTITLE 1,Limpando versões e deletion vectors 46 | # MAGIC %sql 47 | # MAGIC set spark.databricks.delta.retentionDurationCheck.enabled = false; 48 | # MAGIC VACUUM db_demo.PatientInfoDelta RETAIN 0 HOURS 49 | 50 | # COMMAND ---------- 51 | 52 | # DBTITLE 1,Deletes com Deletion Vector - Testando performance 53 | id = 1000000001 54 | while 1 == 1: 55 | spark.sql(f"delete from db_demo.PatientInfoDelta where patient_id = {id}") 56 | print(id) 57 | id=id+1 58 | 59 | # COMMAND ---------- 60 | 61 | df = spark.read.option("header", "True").format('csv').load('/databricks-datasets/COVID/coronavirusdataset/PatientInfo.csv') 62 | df.write.format('delta').mode('overwrite').saveAsTable("db_demo.PatientInfoDeltaSemDeletion",path='abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/PatientInfoDeltaSemDeletion') 63 | 64 | # COMMAND ---------- 65 | 66 | # DBTITLE 1,Deletes SEM Deletion Vector - Testando performance 67 | id = 1000000001 68 | while 1 == 1: 69 | spark.sql(f"delete from db_demo.PatientInfoDeltaSemDeletion where patient_id = {id}") 70 | print(id) 71 | id=id+1 72 | -------------------------------------------------------------------------------- /tips/dbutils/Dbutils-Dataframe.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # DBTITLE 1,Usando o Dbutils.fs.ls para listar uma pasta 3 | dbutils.fs.ls('/databricks-datasets/COVID/') 4 | 5 | # COMMAND ---------- 6 | 7 | # DBTITLE 1,Transformar Dbutils em Dataframe 8 | # MAGIC %py 9 | # MAGIC from pyspark.sql.types import StructType, StructField, IntegerType, StringType 10 | # MAGIC 11 | # MAGIC ddlSchema = StructType([ 12 | # MAGIC StructField('path',StringType()), 13 | # MAGIC StructField('name',StringType()), 14 | # MAGIC StructField('size',IntegerType()), 15 | # MAGIC StructField('modificationTime',StringType()) 16 | # MAGIC ]) 17 | # MAGIC 18 | # MAGIC ls = dbutils.fs.ls('/databricks-datasets/COVID/') 19 | # MAGIC dfPath = spark.createDataFrame(ls,ddlSchema) 20 | # MAGIC dfPath.createOrReplaceTempView('vw_Files') 21 | 22 | # COMMAND ---------- 23 | 24 | # DBTITLE 1,Consultando com SQL 25 | # MAGIC %sql 26 | # MAGIC -- Note que temos apenas 2 arquivos os demais sao pastas 27 | # MAGIC select count(*) qtdFiles,sum(size) / 1024 / 1024 as size_Mb from vw_Files where size > 0 28 | 29 | # COMMAND ---------- 30 | 31 | # DBTITLE 1,Visualizando estrutura 32 | # MAGIC %sql 33 | # MAGIC -- As pastas sempre ficam com Size 0 mesmo tendo arquivos dentro 34 | # MAGIC select * from vw_Files 35 | 36 | # COMMAND ---------- 37 | 38 | # DBTITLE 1,Função recursiva para listar todos niveis de pasta 39 | # Basicamente é uma função que chama ela mesma durante a execução 40 | def get_dir_content(ls_path): 41 | path_list = dbutils.fs.ls(ls_path) 42 | for dir_path in dbutils.fs.ls(ls_path): 43 | if dir_path.isDir() and ls_path != dir_path.path and '_delta_log' not in dir_path.path: 44 | path_list += get_dir_content(dir_path.path) 45 | return path_list 46 | 47 | # COMMAND ---------- 48 | 49 | # DBTITLE 1,Agora vamos usar nossa função para gerar o DataFrame 50 | # MAGIC %py 51 | # MAGIC from pyspark.sql.types import StructType, StructField, IntegerType, StringType 52 | # MAGIC 53 | # MAGIC ddlSchema = StructType([ 54 | # MAGIC StructField('path',StringType()), 55 | # MAGIC StructField('name',StringType()), 56 | # MAGIC 
StructField('size',IntegerType()), 57 | # MAGIC StructField('modificationTime',StringType()) 58 | # MAGIC ]) 59 | # MAGIC 60 | # MAGIC dfPath = spark.createDataFrame(get_dir_content('/databricks-datasets/COVID/covid-19-data'),ddlSchema) 61 | # MAGIC dfPath.createOrReplaceTempView('vw_Files') 62 | 63 | # COMMAND ---------- 64 | 65 | # DBTITLE 1,Agora temos todos os arquivos daquela pasta e subpastas 66 | # MAGIC %sql 67 | # MAGIC select count(*) qtdFiles,sum(size) / 1024 / 1024 as size_Mb from vw_Files where size > 0 68 | -------------------------------------------------------------------------------- /tips/count/00000000000000000002.json: -------------------------------------------------------------------------------- 1 | {"commitInfo":{"timestamp":1683375447311,"userId":"8675301566931963","userName":"reginaldo.silva@dataside.com.br","operation":"DELETE","operationParameters":{"predicate":"[\"((spark_catalog.db_demo.PatientInfoDelta.patient_id >= '1000000001') AND (spark_catalog.db_demo.PatientInfoDelta.patient_id <= '1000000011'))\"]"},"notebook":{"notebookId":"2263512646416784"},"clusterId":"0213-212148-y5jr9wle","readVersion":1,"isolationLevel":"WriteSerializable","isBlindAppend":false,"operationMetrics":{"numRemovedFiles":"1","numCopiedRows":"5154","numDeletionVectorsAdded":"0","numDeletionVectorsRemoved":"0","numAddedChangeFiles":"0","executionTimeMs":"1260","numDeletedRows":"11","scanTimeMs":"674","numAddedFiles":"1","rewriteTimeMs":"586"},"engineInfo":"Databricks-Runtime/12.1.x-scala2.12","txnId":"7da2bc04-7b91-4796-805c-aa05261e4c71"}} 2 | {"remove":{"path":"part-00000-dd7b4b44-cbd0-40ac-9549-e9fa424e2888-c000.snappy.parquet","deletionTimestamp":1683375447309,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":53856,"tags":{"INSERTION_TIME":"1683375359000000","MIN_INSERTION_TIME":"1683375359000000","MAX_INSERTION_TIME":"1683375359000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} 3 | {"add":{"path":"part-00000-4024a4e5-5f88-4091-980f-2e9c49b1ef79-c000.snappy.parquet","partitionValues":{},"size":53373,"modificationTime":1683375447000,"dataChange":true,"stats":"{\"numRecords\":5154,\"minValues\":{\"patient_id\":\"1000000012\",\"sex\":\"female\",\"age\":\"0s\",\"country\":\"Bangladesh\",\"province\":\"Busan\",\"city\":\"Andong-si\",\"infection_case\":\"Anyang Gunpo Pastors Group\",\"infected_by\":\"1000000003\",\"contact_number\":\"-\",\"symptom_onset_date\":\" \",\"confirmed_date\":\"2020-01-20\",\"released_date\":\"2020-02-06\",\"deceased_date\":\"2020-02-19\",\"state\":\"deceased\"},\"maxValues\":{\"patient_id\":\"7000000019\",\"sex\":\"male\",\"age\":\"90s\",\"country\":\"Vietnam\",\"province\":\"Ulsan\",\"city\":\"sankyeock-dong\",\"infection_case\":\"overseas inflow\",\"infected_by\":\"7000000009\",\"contact_number\":\"95\",\"symptom_onset_date\":\"2020-06-28\",\"confirmed_date\":\"2020-06-30\",\"released_date\":\"2020-06-28\",\"deceased_date\":\"2020-05-25\",\"state\":\"released\"},\"nullCount\":{\"patient_id\":0,\"sex\":1122,\"age\":1380,\"country\":0,\"province\":0,\"city\":94,\"infection_case\":919,\"infected_by\":3813,\"contact_number\":4374,\"symptom_onset_date\":4466,\"confirmed_date\":3,\"released_date\":3578,\"deceased_date\":5088,\"state\":0}}","tags":{"MAX_INSERTION_TIME":"1683375359000000","INSERTION_TIME":"1683375359000000","MIN_INSERTION_TIME":"1683375359000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} 4 | -------------------------------------------------------------------------------- /tips/deltaTable/Protocols.sql: 
-------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- DBTITLE 1,Cria tabela básica sem nenhuma feature nova 3 | create table tb_teste (campo1 int); 4 | describe extended tb_teste; 5 | 6 | -- COMMAND ---------- 7 | 8 | -- DBTITLE 1,Realizando upgrade para utilizar o CDC 9 | -- Upgrades the reader protocol version to 1 and the writer protocol version to 4. 10 | ALTER TABLE tb_teste SET TBLPROPERTIES('delta.minReaderVersion' = '1', 'delta.minWriterVersion' = '4'); 11 | describe extended tb_teste; 12 | 13 | -- COMMAND ---------- 14 | 15 | -- DBTITLE 1,Criando uma tabela com CDC habilitado 16 | create table tb_teste2 (campo1 int) TBLPROPERTIES (delta.enableChangeDataFeed = true); 17 | describe extended tb_teste2; 18 | 19 | -- COMMAND ---------- 20 | 21 | -- DBTITLE 1,Criando uma tabela com a ultima versão para usar Deletion Vector 22 | drop table tb_teste3; 23 | create table tb_teste3 (campo1 int) TBLPROPERTIES('delta.minReaderVersion' = '3', 'delta.minWriterVersion' = '7'); 24 | describe extended tb_teste3; 25 | 26 | -- COMMAND ---------- 27 | 28 | -- DBTITLE 1,Habilitando Deletion Vector 29 | alter table tb_teste3 SET TBLPROPERTIES ('delta.enableDeletionVectors' = true); 30 | describe extended tb_teste3; 31 | 32 | -- COMMAND ---------- 33 | 34 | -- DBTITLE 1,Tentando usar uma feature não suportada pelo Databricks Runtime 35 | alter table tb_teste3 SET TBLPROPERTIES ('delta.feature.timestampNtz' = 'supported') 36 | 37 | -- COMMAND ---------- 38 | 39 | -- DBTITLE 1,Downgrade 40 | ALTER TABLE tb_teste3 SET TBLPROPERTIES('delta.minReaderVersion' = '1', 'delta.minWriterVersion' = '4') 41 | 42 | -- COMMAND ---------- 43 | 44 | create table tb_teste4 (campo1 int) TBLPROPERTIES (delta.enableChangeDataFeed = true); 45 | describe extended tb_teste4; 46 | 47 | -- COMMAND ---------- 48 | 49 | -- DBTITLE 1,Tentando ler a tabela com Runtime 11.3 50 | select * from tb_teste3 51 | 52 | -- COMMAND ---------- 53 | 54 | -- DBTITLE 1,Habilitando timestampNtz 55 | create table tb_teste5 (campo1 int) TBLPROPERTIES (delta.feature.timestampNtz = 'supported'); 56 | describe extended tb_teste5; 57 | 58 | -- COMMAND ---------- 59 | 60 | create table tb_teste6 (campo1 int) TBLPROPERTIES (delta.feature.enableDeletionVectors = 'supported'); 61 | describe extended tb_teste6; 62 | 63 | -- COMMAND ---------- 64 | 65 | -- DBTITLE 1,Resumo do Table Features 66 | CREATE TABLE db_demo.teste7 ( 67 | patient_id STRING) 68 | USING delta 69 | LOCATION 'abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/teste7' 70 | TBLPROPERTIES ( 71 | 'delta.enableDeletionVectors' = 'true', 72 | 'delta.feature.appendOnly' = 'supported', 73 | 'delta.feature.deletionVectors' = 'supported', 74 | 'delta.feature.invariants' = 'supported', 75 | 'delta.minReaderVersion' = '3', 76 | 'delta.minWriterVersion' = '7') 77 | -------------------------------------------------------------------------------- /tips/run/notebook1.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # DBTITLE 1,Defini variaveis notebook 1 3 | dataini = '2023-01-01' 4 | datafim = '2023-03-31' 5 | 6 | # COMMAND ---------- 7 | 8 | # DBTITLE 1,Chama notebook 2 com notebook.run() 9 | dbutils.notebook.run('/Users/reginaldo.silva@dataside.com.br/DemoRun/notebook2', 10 | -30, 11 | {"dataini": dataini, "datafim": datafim} 12 | ) 13 | 14 | # COMMAND ---------- 15 | 16 | # DBTITLE 1,Utilizando %Run - Passando valores fixos funciona 17 | # MAGIC %run 
/Users/reginaldo.silva@dataside.com.br/DemoRun/notebook2 $dataini=2023-01-01 $datafim=2023-03-31 18 | 19 | # COMMAND ---------- 20 | 21 | # DBTITLE 1,Passando varaveis não funciona 22 | # MAGIC %run /Users/reginaldo.silva@dataside.com.br/DemoRun/notebook2 $dataini=dataini $datafim=datafim 23 | 24 | # COMMAND ---------- 25 | 26 | # DBTITLE 1,Definindo Widgets 27 | dbutils.widgets.text("dataini", "2023-01-01") 28 | dbutils.widgets.text("datafim", "2023-03-31") 29 | 30 | # COMMAND ---------- 31 | 32 | # DBTITLE 1,utilizando widgets parece que funciona ne? 33 | # MAGIC %run /Users/reginaldo.silva@dataside.com.br/DemoRun/notebook2 34 | 35 | # COMMAND ---------- 36 | 37 | # DBTITLE 1,Atualizando valores 38 | dbutils.widgets.text("dataini", "2023-02-01") 39 | dbutils.widgets.text("datafim", "2023-02-28") 40 | 41 | # COMMAND ---------- 42 | 43 | # DBTITLE 1,Com widgets - Valores não atualizaram 44 | # MAGIC %run /Users/reginaldo.silva@dataside.com.br/DemoRun/notebook2 45 | 46 | # COMMAND ---------- 47 | 48 | # DBTITLE 1,Limpando Widgets 49 | dbutils.widgets.removeAll() 50 | 51 | # COMMAND ---------- 52 | 53 | # DBTITLE 1,Criando tabela de parametros 54 | # MAGIC %sql 55 | # MAGIC drop table if exists tb_parameters; 56 | # MAGIC create table if not exists tb_parameters (dataini date, datafim date); 57 | # MAGIC insert into tb_parameters values('2023-01-01','2023-03-31'); 58 | 59 | # COMMAND ---------- 60 | 61 | # MAGIC %sql 62 | # MAGIC select * from tb_parameters 63 | 64 | # COMMAND ---------- 65 | 66 | # DBTITLE 1,Agora sim! 67 | # MAGIC %run /Users/reginaldo.silva@dataside.com.br/DemoRun/notebook3 68 | 69 | # COMMAND ---------- 70 | 71 | # DBTITLE 1,Sessao do Spark 72 | # MAGIC %scala 73 | # MAGIC spark 74 | 75 | # COMMAND ---------- 76 | 77 | # DBTITLE 1,Contexto 78 | # MAGIC %scala 79 | # MAGIC spark.sparkContext 80 | 81 | # COMMAND ---------- 82 | 83 | # DBTITLE 1,Testando com notebook.run() 84 | dbutils.notebook.run('/Users/reginaldo.silva@dataside.com.br/DemoRun/notebook2', 85 | -30, 86 | {"dataini": dataini, "datafim": datafim} 87 | ) 88 | 89 | # COMMAND ---------- 90 | 91 | # DBTITLE 1,Testando sessao do Spark 92 | # MAGIC %run /Users/reginaldo.silva@dataside.com.br/DemoRun/notebook2 93 | -------------------------------------------------------------------------------- /API/databricks/Databrick Jobs List - API.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | ##prod 3 | from requests import request 4 | import requests 5 | import json 6 | 7 | instance_id = 'xxxxxx.azuredatabricks.net' 8 | 9 | api_version = '/api/2.1' 10 | api_command = '/jobs/list' 11 | url = f"https://{instance_id}{api_version}{api_command}" 12 | #Adicionar secret 13 | headers = { 14 | 'Authorization': "Bearer xxxxxxx" 15 | } 16 | 17 | has_more = True 18 | count = 0 19 | offset = 0 20 | jsonDataList = [] 21 | while has_more: 22 | params = { 23 | 'expand_tasks': 'true', 24 | 'offset': offset 25 | } 26 | 27 | response = requests.get( 28 | url = url, 29 | params = params, 30 | headers= headers 31 | ) 32 | 33 | jsonDataList.append(json.dumps(json.loads(response.text), indent = 2)) 34 | jsonRDD = sc.parallelize(jsonDataList) 35 | dfProd = spark.read.option('multiline', 'true').option('inferSchema', 'true').json(jsonRDD) 36 | try: 37 | has_more = json.loads(response.text)['has_more'] 38 | except: 39 | has_more = False 40 | 41 | count = count + 1 42 | offset = offset + 20 43 | print(count) 44 | print(json.dumps(json.loads(response.text), indent = 2)) 45 | 46 
| # COMMAND ---------- 47 | 48 | from pyspark.sql.functions import * 49 | dfJobsProd = dfProd.select(explode("jobs").alias("jobs")).withColumn("environment", lit("PROD")) 50 | dfJobsProd = dfJobsProd.withColumn('jobname',col('jobs.settings.name').cast('string')) 51 | dfJobsProd.count() 52 | 53 | # COMMAND ---------- 54 | 55 | dfJobsProd.select( 56 | dfJobsProd.environment.cast('string').alias("environment"), 57 | dfJobsProd.jobs.job_id.cast('string').alias("job_id"), 58 | dfJobsProd.jobs.creator_user_name.cast('string').alias("creator_user_name"), 59 | dfJobsProd.jobname, 60 | dfJobsProd.jobs.settings.schedule.cast('string').alias("schedule"), 61 | dfJobsProd.jobs.settings.schedule.quartz_cron_expression.cast('string').alias("quartz_cron_expression"), 62 | dfJobsProd.jobs.settings.email_notifications.cast('string').alias("email_notifications"), 63 | dfJobsProd.jobs.settings.timeout_seconds.cast('string').alias("timeout_seconds"), 64 | dfJobsProd.jobs.settings.max_concurrent_runs.cast('string').alias("max_concurrent_runs"), 65 | dfJobsProd.jobs.settings.tasks.cast('string').alias("tasks"), 66 | dfJobsProd.jobs.settings.format.cast('string').alias("format"), 67 | dfJobsProd.jobs.settings.tasks[0].existing_cluster_id.cast('string').alias("existing_cluster_id"), 68 | dfJobsProd.jobs.settings.tasks[1].existing_cluster_id.cast('string').alias("existing_cluster_id2"), 69 | dfJobsProd.jobs.settings.tasks[2].existing_cluster_id.cast('string').alias("existing_cluster_id3"), 70 | to_timestamp(dfJobsProd.jobs.created_time / 1000).alias('created_time') 71 | ).createOrReplaceTempView('vwJobs') 72 | 73 | # COMMAND ---------- 74 | 75 | # MAGIC %sql 76 | # MAGIC select * from vwJobs 77 | -------------------------------------------------------------------------------- /tips/DatabricksSDKPython/Python-SDK.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # DBTITLE 1,Instalando SDK 3 | # MAGIC %pip install databricks-sdk --upgrade 4 | 5 | # COMMAND ---------- 6 | 7 | # DBTITLE 1,Reiniciando Kernel 8 | dbutils.library.restartPython() 9 | 10 | # COMMAND ---------- 11 | 12 | # DBTITLE 1,Listando todos os clusters All Purpose 13 | from databricks.sdk import WorkspaceClient 14 | 15 | w = WorkspaceClient(host='adb-4013955633331914.14.azuredatabricks.net', token='xxxx') 16 | 17 | for c in w.clusters.list(): 18 | print(c.cluster_name) 19 | 20 | # COMMAND ---------- 21 | 22 | # DBTITLE 1,Ligando Clusters 23 | from databricks.sdk import WorkspaceClient 24 | 25 | w = WorkspaceClient(host='adb-4013955633331914.14.azuredatabricks.net', token='xxxx') 26 | 27 | for c in w.clusters.list(): 28 | try: 29 | print('Ligando Cluster: ', c.cluster_name) 30 | w.clusters.start(cluster_id=c.cluster_id).result() 31 | except: 32 | print('Cluster já está ligado: ', c.cluster_name) 33 | 34 | # COMMAND ---------- 35 | 36 | # DBTITLE 1,Listando todos os Jobs e quantidade de clusters e tasks 37 | from databricks.sdk import WorkspaceClient 38 | 39 | w = WorkspaceClient(host='adb-4013955633331914.14.azuredatabricks.net', token='xxxx') 40 | 41 | job_list = w.jobs.list(expand_tasks=True) 42 | for j in job_list: 43 | #print(j) 44 | print('job_id: ',j.job_id, ' - name:', j.settings.name, ' - job_clusters:', len(j.settings.job_clusters) if j.settings.job_clusters else 'None', ' - tasks:', len(j.settings.tasks), ' - tags:', j.settings.tags) 45 | 46 | # COMMAND ---------- 47 | 48 | # DBTITLE 1,Listando todos os Notebooks e subpastas de usuário corrente 49 | from databricks.sdk 
import WorkspaceClient 50 | 51 | w = WorkspaceClient(host='adb-4013955633331914.14.azuredatabricks.net', token='xxxx') 52 | 53 | names = [] 54 | for i in w.workspace.list(f'/Users/{w.current_user.me().user_name}', recursive=True): 55 | names.append(i.path) 56 | print(i.path) 57 | assert len(names) > 0 58 | 59 | # COMMAND ---------- 60 | 61 | # DBTITLE 1,Listando todos os Notebooks e subpastas do Workspace 62 | from databricks.sdk import WorkspaceClient 63 | 64 | w = WorkspaceClient(host='adb-4013955633331914.14.azuredatabricks.net', token='xxxx') 65 | 66 | names = [] 67 | for i in w.workspace.list(f'/', recursive=True): 68 | names.append(i.path) 69 | print(i.path) 70 | assert len(names) > 0 71 | 72 | # COMMAND ---------- 73 | 74 | # DBTITLE 1,Listando Users do Workspace 75 | from databricks.sdk import WorkspaceClient 76 | from databricks.sdk.service import iam 77 | 78 | w = WorkspaceClient(host='adb-4013955633331914.14.azuredatabricks.net', token='xxxx') 79 | 80 | all_users = w.users.list(attributes="id,userName", 81 | sort_by="userName", 82 | sort_order=iam.ListSortOrder.DESCENDING) 83 | 84 | for u in all_users: 85 | print(u.user_name) 86 | -------------------------------------------------------------------------------- /tips/parallel/Paralelismo.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import time 3 | from datetime import datetime 4 | from concurrent.futures import ThreadPoolExecutor 5 | 6 | # COMMAND ---------- 7 | 8 | # DBTITLE 1,Serial Way 9 | # Função que recebe um numero e printa ele na tela junto com a data e hora 10 | def printNumber(number): 11 | try: 12 | print(f"{number} - {datetime.today()}") 13 | time.sleep(1) 14 | except: 15 | print(number + ' - ' + str(datetime.today())) 16 | 17 | # Gerando uma lista de numeros e passando cada um para a função 18 | numbers = range(1,11) 19 | [printNumber(i) for i in numbers] 20 | 21 | # COMMAND ---------- 22 | 23 | # DBTITLE 1,Parallel Way 24 | # Essa é a mesma função de printar o numero usada no serial 25 | def printNumber(number): 26 | try: 27 | print(f"{number} - {datetime.today()}") 28 | time.sleep(1) 29 | except: 30 | print(number + ' - ' + str(datetime.today())) 31 | 32 | # Criamos uma função que irá receber uma lista de numeros e printar ele de forma paralela 33 | # Note que especificamos tambem a quantidade maxima de paralelismo que pode ser usada 34 | def parallelInt(numbers, numInParallel): 35 | with ThreadPoolExecutor(max_workers=numInParallel) as ec: 36 | return [ec.submit(printNumber, number) for number in numbers] 37 | 38 | # Definindo a lista de numeros e quantidade de threads em paralelo 39 | numbers = range(1,11) 40 | parallelThreads = 10 41 | print(numbers) 42 | result = parallelInt(numbers,parallelThreads) 43 | 44 | # COMMAND ---------- 45 | 46 | # MAGIC %sql 47 | # MAGIC describe history db_festivaldemo.PatientInfoDelta 48 | 49 | # COMMAND ---------- 50 | 51 | # MAGIC %sql 52 | # MAGIC -- Gerando Delete de exemplo 53 | # MAGIC delete from db_festivaldemo.PatientInfoDelta where patient_id = 1000000002 54 | 55 | # COMMAND ---------- 56 | 57 | # Executando um COUNT(*) em cada versão da tabela 58 | listVersions = spark.sql("describe history db_festivaldemo.PatientInfoDelta").collect() 59 | for row in listVersions: 60 | print(f'Version -> {row.version} - Count: {spark.sql(f"select count(*) as qtd from db_festivaldemo.PatientInfoDelta VERSION AS OF {row.version}").collect()[0][0]} - {datetime.today()}') 61 | 62 | # COMMAND ---------- 63 | 64 | # 
Função para executar um count em cada versão da tabela 65 | def getversion(version): 66 | try: 67 | print(f'Version -> {version} - Count: {spark.sql(f"select count(*) as qtd from db_festivaldemo.PatientInfoDelta VERSION AS OF {version}").collect()[0][0]} - {datetime.today()}') 68 | except: 69 | print(version + ' - ' + str(datetime.today())) 70 | 71 | def parallelInt2(numbers, numInParallel): 72 | with ThreadPoolExecutor(max_workers=numInParallel) as ec: 73 | return [ec.submit(getversion, item.version) for item in listVersions] 74 | 75 | listVersions = spark.sql("describe history db_festivaldemo.PatientInfoDelta").collect() 76 | parallelThreads = 25 77 | result = parallelInt2(numbers,parallelThreads) 78 | -------------------------------------------------------------------------------- /tips/parameters/RunDevProd.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # DBTITLE 1,1 - Variáveis de ambiente configuradas por Cluster 3 | import os 4 | environment = os.getenv("environment") 5 | database = os.getenv("database") 6 | storageroot = os.getenv("storageroot") 7 | 8 | if environment == 'dev': 9 | print(environment) 10 | print(database) 11 | print(storageroot) 12 | 13 | # Exemplo de utilização: 14 | tbName = 'person' 15 | path = f"{storageroot}\{database}\{tbName}" 16 | print(path) 17 | df.write.option("mergeSchema", "true") \ 18 | .mode(f"append") \ 19 | .format("delta") \ 20 | .saveAsTable(f"{database}.{tbName}",path=path) 21 | 22 | # COMMAND ---------- 23 | 24 | # DBTITLE 1,Recuperando Tag Default 25 | spark.conf.get('spark.databricks.clusterUsageTags.clusterId') 26 | 27 | # COMMAND ---------- 28 | 29 | # DBTITLE 1,Tag clusterAllTags 30 | spark.conf.get('spark.databricks.clusterUsageTags.clusterAllTags') 31 | 32 | # COMMAND ---------- 33 | 34 | # DBTITLE 1,2 - Azure Tags - Automaticamente adicionadas ao cluster 35 | import json 36 | ## Essas tags só podem ser acessadas via clusterAllTags, diferente das Custom e Default 37 | tags = json.loads(spark.conf.get('spark.databricks.clusterUsageTags.clusterAllTags')) 38 | for tag in tags: 39 | if tag["key"] == 'storageroot': 40 | storageroot = tag["value"] 41 | if tag["key"] == 'databricks-environment': 42 | environment = tag["value"] 43 | if tag["key"] == 'department': 44 | department = tag["value"] 45 | if tag["key"] == 'company': 46 | company = tag["value"] 47 | 48 | print(environment) 49 | print(storageroot) 50 | print(department) 51 | print(company) 52 | 53 | # COMMAND ---------- 54 | 55 | # DBTITLE 1,3 - Spark Conf - Fixando valor no notebook 56 | workspace = spark.conf.get("spark.databricks.clusterUsageTags.clusterOwnerOrgId") 57 | 58 | if workspace == '5800865833021444': ##dev 59 | instance_id = f'adb-5800865833021444.4.azuredatabricks.net' 60 | storageroot='abfss://lakedev@storageaccountlake.dfs.core.windows.net' 61 | database='db_catalog_dev' 62 | environment='dev' 63 | if workspace == '5800865833021442': ##prod 64 | instance_id = 'adb-5800865833021442.4.azuredatabricks.net' 65 | storageroot='abfss://lakeprod@storageaccountlake.dfs.core.windows.net' 66 | database='db_catalog_prod' 67 | environment='prod' 68 | 69 | print(environment) 70 | print(storageroot) 71 | print(database) 72 | print(instance_id) 73 | 74 | # COMMAND ---------- 75 | 76 | # DBTITLE 1,4 - Widgets 77 | # https://www.datainaction.dev/post/databricks-parametrizando-seus-notebooks-like-a-boss-usando-widgets 78 | 79 | # Definindo Widgets manualmente - Não é obrigatório, se você enviar via Job direto 
funciona 80 | dbutils.widgets.text('environment', '') 81 | dbutils.widgets.text('storageroot', '') 82 | dbutils.widgets.text('database', '') 83 | 84 | # Pegando valor dos Widgets 85 | environment = dbutils.widgets.get('environment') 86 | storageroot = dbutils.widgets.get('storageroot') 87 | database = dbutils.widgets.get('database') 88 | 89 | print(environment) 90 | print(storageroot) 91 | print(database) 92 | -------------------------------------------------------------------------------- /tips/VNET/README.md: -------------------------------------------------------------------------------- 1 | ### VNET (Virtual Network): 2 | A VNET é uma rede privada no Azure que permite o isolamento de recursos e a comunicação segura entre eles. No contexto de Databricks, a VNET Injection permite que o cluster seja implantado dentro de uma VNET do cliente, oferecendo maior controle sobre o tráfego de rede e a conectividade com recursos externos. 3 | 4 | ### Subnets: 5 | As sub-redes (subnets) dividem uma VNET em segmentos menores. Elas são utilizadas para isolar recursos, controlar o tráfego e aplicar regras de segurança específicas. No Databricks, diferentes subnets podem ser usadas para isolar a comunicação entre clusters e outros serviços. 6 | 7 | ### Network Security Groups (NSG): 8 | Um NSG contém regras de segurança que controlam o tráfego de rede de entrada e saída para os recursos de uma subnet ou interface de rede. Essas regras ajudam a proteger os recursos dentro da VNET, permitindo ou bloqueando o tráfego com base em endereços IP, portas e protocolos. 9 | 10 | ### Private Endpoint: 11 | Um Private Endpoint cria uma interface de rede privada dentro de uma subnet para conectar-se a serviços do Azure (como Databricks ou o Azure Storage) sem expor o tráfego à internet pública. Isso melhora a segurança ao garantir que toda a comunicação aconteça dentro da rede privada. 12 | 13 | ### Private DNS Servers: 14 | Para que os recursos dentro de uma VNET resolvam corretamente nomes de domínio associados a Private Endpoints, é necessário configurar servidores DNS privados. Esses servidores permitem a resolução de endereços IP internos e externos, garantindo a comunicação adequada dentro da infraestrutura isolada. 15 | 16 | ### Peering: 17 | O VNET Peering conecta duas redes virtuais no Azure, permitindo que elas se comuniquem diretamente sem a necessidade de gateways ou roteamento através da internet. No cenário de Databricks, o peering pode ser utilizado para conectar a VNET onde o workspace está injetado com outras VNETs que hospedam recursos críticos. 18 | 19 | ### VPN Gateway: 20 | Um gateway VPN oferece conectividade segura entre a rede on-premises e a VNET do Azure através de uma conexão encriptada (IPsec). Isso permite que os recursos no Azure se conectem à infraestrutura local de forma segura e privada, útil para cenários híbridos. 21 | 22 | ### ExpressRoute: 23 | O ExpressRoute é uma solução de conectividade privada que permite conexões dedicadas e de baixa latência entre a rede on-premises e o Azure, sem passar pela internet pública. É geralmente usada para cargas de trabalho sensíveis e de alta performance, garantindo maior confiabilidade e segurança. 24 | 25 | ### Route Tables: 26 | As tabelas de rota controlam como o tráfego é direcionado dentro de uma VNET. Elas definem rotas personalizadas, permitindo que o tráfego seja direcionado para diferentes subnets, gateways ou outros destinos. 
No Databricks, as route tables podem ser usadas para garantir que o tráfego siga por caminhos seguros e otimizados. 27 | 28 | ### NAT Gateway: 29 | O NAT Gateway permite que recursos dentro de uma subnet privada acessem a internet de forma segura, mascarando seus endereços IP com um único endereço público de saída. Isso é útil para controlar o tráfego de saída e limitar a exposição direta dos recursos internos à internet pública. 30 | -------------------------------------------------------------------------------- /routines/OptimizeAndVacuum/Demo.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %py 3 | -- MAGIC dbutils.fs.rm('/mnt/raw/database=covid/table=PatientInfoDelta',True) 4 | 5 | -- COMMAND ---------- 6 | 7 | -- DBTITLE 1,Preparando o ambiente 8 | -- MAGIC %py 9 | -- MAGIC #Ambiente lendo CSV de exemplos e salvando como tabela DELTA 10 | -- MAGIC df = spark.read.option("header", "True").format('csv').load('/databricks-datasets/COVID/coronavirusdataset/PatientInfo.csv') 11 | -- MAGIC #Salve onde quiser, estou usando um Mount para facilitar 12 | -- MAGIC df.write.format('delta').mode('overwrite').saveAsTable("db_festivaldemo.PatientInfoDelta",path='/mnt/raw/database=covid/table=PatientInfoDelta') 13 | -- MAGIC count = 0 14 | -- MAGIC #Alguns Updates para gerar alguns logs e tudo pronto 15 | -- MAGIC while count < 12: 16 | -- MAGIC spark.sql(f"update db_festivaldemo.PatientInfoDelta set age={count} where patient_id = 1000000001") 17 | -- MAGIC print(count) 18 | -- MAGIC count=count+1 19 | 20 | -- COMMAND ---------- 21 | 22 | -- DBTITLE 1,Instanciar o notebook utilizando o RUN 23 | -- MAGIC %run /Users/reginaldo.silva@dataside.com.br/OptimizeAndVacuum 24 | 25 | -- COMMAND ---------- 26 | 27 | -- DBTITLE 1,Detalhes da tabela 28 | -- Olhe o campo numFiles 29 | describe detail db_festivaldemo.PatientInfoDelta 30 | 31 | -- COMMAND ---------- 32 | 33 | -- DBTITLE 1,Quantidade de arquivos no Storage 34 | -- MAGIC %py 35 | -- MAGIC #note que temos 13 arquivos de dados, contudo no numFiles temos apenas 1, ou seja, esses 12 sao historico e podem ser limpos se na forem ser usados para Time Travel 36 | -- MAGIC len(dbutils.fs.ls('dbfs:/mnt/raw/database=covid/table=PatientInfoDelta')) 37 | 38 | -- COMMAND ---------- 39 | 40 | -- DBTITLE 1,Historico 41 | -- Todas as alterações que fizemos com o Loop 42 | describe history db_festivaldemo.PatientInfoDelta 43 | 44 | -- COMMAND ---------- 45 | 46 | -- DBTITLE 1,Usando a função: Chamando com Debug habilitado 47 | -- MAGIC %py 48 | -- MAGIC #Chamando a função instanciada no notebook 49 | -- MAGIC #Usando 2 colunas no ZORDER, apenas para exemplo 50 | -- MAGIC maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='PatientInfoDelta', colunasZorder='sex,patient_id', vacuumRetention=144, vacuum=True, optimize=True, debug=True) 51 | 52 | -- COMMAND ---------- 53 | 54 | -- DBTITLE 1,Usando a função: Executando 55 | -- MAGIC %py 56 | -- MAGIC #Chamando a função instanciada no notebook 57 | -- MAGIC #Usando 0 horas apenas para exemplo, o recomendado é 7 dias 58 | -- MAGIC maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='PatientInfoDelta', colunasZorder='sex,patient_id', vacuumRetention=0, vacuum=True, optimize=True, debug=False) 59 | 60 | -- COMMAND ---------- 61 | 62 | -- DBTITLE 1,Rodando sem HIVE 63 | -- MAGIC %py 64 | -- MAGIC #Execuando passando o caminho direto no Lake, defina o schema como delta e coloca o caminho entre `caminho` 65 | -- MAGIC 
maintenanceDeltalake(nomeSchema='delta', nomeTabela='`/mnt/raw/database=covid/table=PatientInfoDelta`', colunasZorder='sex,patient_id', vacuumRetention=144, vacuum=True, optimize=True, debug=False) 66 | 67 | -- COMMAND ---------- 68 | 69 | -- MAGIC %py 70 | -- MAGIC #recontagem 71 | -- MAGIC len(dbutils.fs.ls('dbfs:/mnt/raw/database=covid/table=PatientInfoDelta')) 72 | -------------------------------------------------------------------------------- /tips/regex/regex.py: -------------------------------------------------------------------------------- 1 | import re 2 | import csv 3 | from tabulate import tabulate 4 | 5 | log_string = """ 6 | 04:06:51 1 of 25 START sql incremental model bronze.users [RUN] 7 | 04:06:51 2 of 25 START sql incremental model bronze.prices [RUN] 8 | 04:06:51 3 of 25 START sql incremental model bronze.vendors [RUN] 9 | 04:06:51 4 of 25 START sql table model bronze.customers [RUN] 10 | 04:06:58 3 of 25 OK created sql incremental model bronze.vendors [INSERT 0 2 in 6.70s] 11 | 04:06:58 5 of 25 START sql incremental model bronze.orders [RUN] 12 | 04:06:58 4 of 25 OK created sql table model bronze.customers [SELECT in 6.94s] 13 | 04:06:58 6 of 25 START sql incremental model bronze.teste [RUN] 14 | 04:07:00 2 of 25 OK created sql incremental model bronze.prices [INSERT 0 133 in 8.31s] 15 | 04:07:00 7 of 25 START sql table model bronze.email .............. [RUN] 16 | 04:07:06 1 of 25 OK created sql incremental model bronze.users [INSERT 0 178089 in 14.30s] 17 | 04:07:06 8 of 25 START sql view model bronze.sales [RUN] 18 | 04:07:10 5 of 25 OK created sql incremental model bronze.orders [INSERT 0 5 in 1200.90s] 19 | 04:07:10 9 of 25 START sql view model bronze.people [RUN] 20 | 04:07:13 8 of 25 OK created sql view model bronze.sales [CREATE VIEW in 74.74s] 21 | 04:07:13 10 of 25 START sql view model bronze.transfers ... [RUN] 22 | 04:07:18 9 of 25 OK created sql view model bronze.people [CREATE VIEW in 8.04s] 23 | 04:07:18 11 of 25 START sql view model bronze.employees [RUN] 24 | 04:07:21 10 of 25 OK created sql view model bronze.transfers [CREATE VIEW in 700.72s] 25 | 04:07:21 12 of 25 START sql incremental model bronze.undefined .. 
[RUN] 26 | 04:07:23 11 of 25 OK created sql view model bronze.employees [CREATE VIEW in 80.90s] 27 | """ 28 | 29 | # Criando lista, quebrando por quebra de linha 30 | logs = log_string.split("\n") 31 | 32 | # Filtrando apenas eventos de finalização 33 | logs = list(filter(lambda x: "OK created" in x, logs)) 34 | 35 | # Regra Regex para extrair informações necessárias 36 | pattern = r"(\d{2}:\d{2}:\d{2})\s+(\d+)\s+of\s+\d+\s+OK\s+created\s+sql\s+(\w+)\s+model\s+([\w\.]+)\s+.*?\[.*?in\s+(\d+\.\d+)s\]" 37 | 38 | # Criando um loop para processar cada log 39 | log_data = [] 40 | for log in logs: 41 | match = re.search(pattern, log) 42 | if match: 43 | start_time = match.group(1) 44 | task_number = int(match.group(2)) 45 | model_type = match.group(3) 46 | model_name = match.group(4) 47 | duration_seconds = float(match.group(5)) 48 | duration_minutes = round(duration_seconds / 60,2) 49 | 50 | # Adicionando os dados à lista 51 | log_data.append([start_time, task_number, model_type, model_name, duration_seconds, duration_minutes]) 52 | else: 53 | print("Log não corresponde ao padrão esperado.") 54 | 55 | # Ordenando pelo mais demorado 56 | log_data.sort(key=lambda x: x[4], reverse=True) 57 | 58 | # Printando na tela em formato tabular 59 | print(tabulate(log_data, headers=["start", "task", "type", "model", "duration_sec","duration_min"])) 60 | 61 | # Gerando CSV 62 | csv_file = "/Users/reginaldosilva/Downloads/log_data.csv" 63 | with open(csv_file, mode='w', newline='') as file: 64 | writer = csv.writer(file) 65 | writer.writerow(["start", "task", "type", "model", "duration_sec","duration_min"]) 66 | writer.writerows(log_data) -------------------------------------------------------------------------------- /tips/count/00000000000000000000.json: -------------------------------------------------------------------------------- 1 | {"commitInfo":{"timestamp":1683375359766,"userId":"8675301566931963","userName":"reginaldo.silva@dataside.com.br","operation":"CREATE OR REPLACE TABLE AS SELECT","operationParameters":{"isManaged":"false","description":null,"partitionBy":"[]","properties":"{}"},"notebook":{"notebookId":"2263512646416784"},"clusterId":"0213-212148-y5jr9wle","isolationLevel":"WriteSerializable","isBlindAppend":false,"operationMetrics":{"numFiles":"1","numOutputRows":"5165","numOutputBytes":"53856"},"engineInfo":"Databricks-Runtime/12.1.x-scala2.12","txnId":"705577b8-7af9-4bff-ba50-745fb31a9e10"}} 2 | {"protocol":{"minReaderVersion":1,"minWriterVersion":2}} 3 | 
{"metaData":{"id":"8806a8ae-e6cb-41dc-8f63-b137f76f944e","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"patient_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"sex\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"age\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"country\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"province\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"city\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"infection_case\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"infected_by\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"contact_number\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"symptom_onset_date\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"confirmed_date\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"released_date\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"deceased_date\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"state\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1683375358932}} 4 | {"add":{"path":"part-00000-dd7b4b44-cbd0-40ac-9549-e9fa424e2888-c000.snappy.parquet","partitionValues":{},"size":53856,"modificationTime":1683375359000,"dataChange":true,"stats":"{\"numRecords\":5165,\"minValues\":{\"patient_id\":\"1000000001\",\"sex\":\"female\",\"age\":\"0s\",\"country\":\"Bangladesh\",\"province\":\"Busan\",\"city\":\"Andong-si\",\"infection_case\":\"Anyang Gunpo Pastors Group\",\"infected_by\":\"1000000002\",\"contact_number\":\"-\",\"symptom_onset_date\":\" \",\"confirmed_date\":\"2020-01-20\",\"released_date\":\"2020-02-05\",\"deceased_date\":\"2020-02-19\",\"state\":\"deceased\"},\"maxValues\":{\"patient_id\":\"7000000019\",\"sex\":\"male\",\"age\":\"90s\",\"country\":\"Vietnam\",\"province\":\"Ulsan\",\"city\":\"sankyeock-dong\",\"infection_case\":\"overseas inflow\",\"infected_by\":\"7000000009\",\"contact_number\":\"95\",\"symptom_onset_date\":\"2020-06-28\",\"confirmed_date\":\"2020-06-30\",\"released_date\":\"2020-06-28\",\"deceased_date\":\"2020-05-25\",\"state\":\"released\"},\"nullCount\":{\"patient_id\":0,\"sex\":1122,\"age\":1380,\"country\":0,\"province\":0,\"city\":94,\"infection_case\":919,\"infected_by\":3819,\"contact_number\":4374,\"symptom_onset_date\":4475,\"confirmed_date\":3,\"released_date\":3578,\"deceased_date\":5099,\"state\":0}}","tags":{"INSERTION_TIME":"1683375359000000","MIN_INSERTION_TIME":"1683375359000000","MAX_INSERTION_TIME":"1683375359000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} 5 | -------------------------------------------------------------------------------- /tips/particionamento/Particionar ou Nao_.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- DBTITLE 1,Criando ambiente 3 | -- MAGIC %py 4 | -- MAGIC df = spark.read.option("header", "True").format('csv').load('/databricks-datasets/COVID/coronavirusdataset/PatientInfo.csv') 5 | -- MAGIC df.count() 6 | 7 | -- COMMAND ---------- 8 | 9 | -- DBTITLE 1,Exemplo dos dados 10 | -- MAGIC %py 11 | -- MAGIC df.display() 12 | 13 | -- COMMAND ---------- 14 | 15 | -- DBTITLE 1,Gravando tabela particionada por pais 16 | -- MAGIC %py 17 | -- MAGIC 
df.write.format('parquet').mode('overwrite').partitionBy('Country').saveAsTable("db_demo.PatientInfoParquet_Country",path='abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/table=PatientInfoParquet_Country') 18 | 19 | -- COMMAND ---------- 20 | 21 | -- DBTITLE 1,Gravando tabela sem particionamento 22 | -- MAGIC %py 23 | -- MAGIC df.write.format('parquet').mode('overwrite').saveAsTable("db_demo.PatientInfoParquet_SemParticao",path='abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/table=PatientInfoParquet_SemParticao') 24 | 25 | -- COMMAND ---------- 26 | 27 | -- DBTITLE 1,Leitura com usando particionamento 28 | select * from db_demo.PatientInfoParquet_Country where country = 'Canada' 29 | 30 | -- COMMAND ---------- 31 | 32 | -- DBTITLE 1,Leitura sem particionamento 33 | select * from db_demo.PatientInfoParquet_SemParticao where country = 'Canada' 34 | 35 | -- COMMAND ---------- 36 | 37 | -- MAGIC %py 38 | -- MAGIC df.write.format('delta').mode('overwrite').saveAsTable("db_demo.PatientInfoDeltaSemParticao",path='abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/table=PatientInfoDeltaSemParticao') 39 | 40 | -- COMMAND ---------- 41 | 42 | -- MAGIC %py 43 | -- MAGIC df.write.format('delta').mode('overwrite').partitionBy('Country').saveAsTable("db_demo.PatientInfoDeltaCountry",path='abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/table=PatientInfoDeltaCountry') 44 | 45 | -- COMMAND ---------- 46 | 47 | select * from db_demo.PatientInfoDeltaSemParticao where country = 'Canada' 48 | 49 | -- COMMAND ---------- 50 | 51 | select * from db_demo.PatientInfoDeltaCountry where country = 'Canada' 52 | 53 | -- COMMAND ---------- 54 | 55 | OPTIMIZE db_demo.PatientInfoDelta ZORDER BY (country) 56 | 57 | -- COMMAND ---------- 58 | 59 | -- DBTITLE 1,Exemplo de particionamento por várias colunas 60 | -- MAGIC %py 61 | -- MAGIC df.write.format("delta").mode("overwrite").partitionBy( 62 | -- MAGIC "country", "province", "city", "sex" 63 | -- MAGIC ).saveAsTable( 64 | -- MAGIC "db_demo.PatientInfoDeltaParticionada", 65 | -- MAGIC path="abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/table=PatientInfoDeltaParticionada", 66 | -- MAGIC ) 67 | 68 | -- COMMAND ---------- 69 | 70 | select * from parquet.`abfss://reginaldo@stdts360.dfs.core.windows.net/part-00000-acd72083-0f7c-4f3e-85d2-07fc39aa714c.c000.snappy.parquet` 71 | 72 | -- COMMAND ---------- 73 | 74 | select * from ( 75 | select from_json(add.stats,'numRecords bigint').numRecords as numRecords, 76 | from_json(add.stats,'minValues struct').minValues.tickets_id as minValues, 77 | from_json(add.stats,'maxValues struct').maxValues.tickets_id as maxValues, 78 | add.path 79 | from json.`abfss://xxxx@xxxx.dfs.core.windows.net/xxxx/logs2/_delta_log/00000000000000000002.json` 80 | where add is not null 81 | ) tab where 22334863 between minValues and maxValues 82 | order by maxValues,minValues desc 83 | -------------------------------------------------------------------------------- /tips/System Tables/ScriptSQL.sql: -------------------------------------------------------------------------------- 1 | -- Todas as tabelas do seu ambiente 2 | select * from system.information_schema.tables where table_owner <> 'System user'; 3 | 4 | -- Todas as colunas de cada tabela 5 | select c.table_name,array_join(collect_set(column_name), ',') as columns from system.information_schema.columns c 6 | inner join system.information_schema.tables t on c.table_name = t.table_name and c.table_catalog = t.table_catalog 7 | where t.table_owner <> 'System user' 8 | 
group by all; 9 | 10 | -- Quantidade de tabelas por schema e catalog 11 | select table_catalog,table_schema,count(*) as qtdTables 12 | from system.information_schema.tables where table_owner <> 'System user' 13 | group by all; 14 | 15 | -- Auditoria do seu ambiente 16 | select * from system.access.audit order by event_time desc; 17 | 18 | -- Ultimo acesso nas suas tabelas 19 | select LastAccess.event_time,LastAccess.entity_type,LastAccess.created_by,* from system.information_schema.tables a 20 | LEFT JOIN 21 | LATERAL (select max(b.event_time) as event_time, LAST(b.entity_type) as entity_type, LAST(b.created_by) as created_by 22 | from system.access.table_lineage b where b.target_table_name = a.table_name) as LastAccess 23 | where a.table_owner <> 'System user'; 24 | 25 | -- Quem acessou sua tabela e quando? 26 | select * from system.access.table_lineage where target_table_name = 'tbordersliquid' 27 | order by event_time desc; 28 | 29 | -- Todos os clusters do ambiente 30 | select cluster_source,count(*) as qtd from system.compute.clusters 31 | group by all; 32 | 33 | -- Clusters All Purpose 34 | select * from system.compute.clusters where cluster_source = 'UI'; 35 | 36 | -- Job Clusters mais custosos 37 | SELECT usage_metadata.job_id as `Job ID`, sum(usage_quantity) as `DBUs` 38 | FROM system.billing.usage 39 | WHERE usage_metadata.job_id IS NOT NULL 40 | GROUP BY `Job ID` 41 | ORDER BY `DBUs` DESC; 42 | 43 | -- Cluster mais custoso 44 | select b.cluster_name, sum(usage_quantity) as `DBUs Consumed` from system.billing.usage a 45 | inner join system.compute.clusters b on a.usage_metadata.cluster_id = b.cluster_id 46 | where usage_metadata.cluster_id is not null 47 | group by all 48 | order by 2 desc; 49 | 50 | -- Cluster All purpose mais custoso 51 | select usage_date as `Date`, sum(usage_quantity) as `DBUs Consumed` from system.billing.usage a 52 | inner join system.compute.clusters b on a.usage_metadata.cluster_id = b.cluster_id 53 | where usage_metadata.cluster_id is not null 54 | group by all 55 | order by 1 desc; 56 | 57 | 58 | -- Cluster mais custoso em USD 59 | select b.cluster_name, sum(usage_quantity) as `DBUs Consumed`, (sum(usage_quantity) * max(c.pricing.default)) as TotalUSD 60 | from system.billing.usage a 61 | inner join system.compute.clusters b on a.usage_metadata.cluster_id = b.cluster_id 62 | inner join system.billing.list_prices c on c.sku_name = a.sku_name 63 | where usage_metadata.cluster_id is not null 64 | and usage_start_time between '2023-11-01' and '2023-11-30' 65 | group by all 66 | order by 3 desc; 67 | 68 | 69 | -- total em USD por mês 70 | select month(usage_end_time) as mes,sum(usage_quantity) as `DBUs Consumed`, (sum(usage_quantity) * max(c.pricing.default)) as TotalUSD 71 | from system.billing.usage a 72 | inner join system.billing.list_prices c on c.sku_name = a.sku_name 73 | group by all 74 | order by 1 desc 75 | 76 | -- Execuções do PREDICTIVE OPTIMIZATION 77 | select * from system.storage.predictive_optimization_operations_history; 78 | -------------------------------------------------------------------------------- /tips/VacuumInventory/Vacuum Inventory.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # DBTITLE 1,Variables 3 | from datetime import datetime, timedelta 4 | ## Script for Azure Databricks 5 | ## For AWS and GCP you need customize some code blocks 6 | ## If you are not using Unity Catalog, use as catalog name: hive_metastore 7 | ## Author: Reginaldo Silva 8 | 9 | 
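## Added note (sketch, not part of the original script): this notebook assumes an Azure Storage
## blob inventory rule writing Parquet inventories to the 'inventory' container, partitioned as
## <year>/<month>/<day>/ - the same layout used to build inventoryStoragePath below.
## A quick way to check that the inventory files exist before running the load
## (container and storage account names are taken from the variables below, adjust to your environment):
## display(dbutils.fs.ls('abfss://inventory@sahierarchicaldatalake.dfs.core.windows.net/'))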
########################## 10 | ## Set Variables 11 | ########################## 12 | storageName = 'sahierarchicaldatalake' 13 | dataBucket = 'datalake' 14 | inventoryBucket = 'inventory' 15 | 16 | inventoryCatalogName = 'dev' 17 | inventoryDabaseName = 'datainaction' 18 | inventoryTableName = 'vacuumInventory' 19 | ########################## 20 | 21 | current_day = datetime.now().strftime('%d') 22 | current_month = datetime.now().strftime('%m') 23 | current_year = datetime.now().year 24 | dataStoragePath = f'abfss://{dataBucket}@{storageName}.dfs.core.windows.net/' 25 | 26 | try: 27 | dbutils.fs.ls(f'abfss://{inventoryBucket}@{storageName}.dfs.core.windows.net/{current_year}/{current_month}/{current_day}/') 28 | except: 29 | print('No files found using current day, trying D-1...') 30 | try: 31 | current_day = (datetime.today() + timedelta(days=-1)).strftime('%d') 32 | dbutils.fs.ls(f'abfss://{inventoryBucket}@{storageName}.dfs.core.windows.net/{current_year}/{current_month}/{current_day}/') 33 | print('Using D-1!') 34 | except: 35 | print('No files found!') 36 | dbutils.notebook.exit('No files found in inventory folder!') 37 | 38 | inventoryStoragePath = f'abfss://{inventoryBucket}@{storageName}.dfs.core.windows.net/{current_year}/{current_month}/{current_day}/*/*/*.parquet' 39 | 40 | print('Inventory Storage path: ', inventoryStoragePath) 41 | print('Data Storage path: ', dataStoragePath) 42 | print(f'Inventory Table: {inventoryCatalogName}.{inventoryDabaseName}.{inventoryTableName}') 43 | 44 | # COMMAND ---------- 45 | 46 | # DBTITLE 1,Create Inventory Table 47 | spark.sql(f""" 48 | CREATE TABLE IF NOT EXISTS {inventoryCatalogName}.{inventoryDabaseName}.{inventoryTableName} 49 | ( 50 | path string, 51 | `creationTime` long, 52 | `modificationTime` long, 53 | `length` long, 54 | `isDir` boolean, 55 | `LastAccessTime` string, 56 | SourceFileName string not null, 57 | datetimeLoad timestamp 58 | ); 59 | """ 60 | ) 61 | 62 | # COMMAND ---------- 63 | 64 | # DBTITLE 1,Clean data 65 | # Clean inventory table, we will load new updated data 66 | spark.sql(f""" 67 | Truncate table {inventoryCatalogName}.{inventoryDabaseName}.{inventoryTableName} 68 | """ 69 | ) 70 | 71 | # COMMAND ---------- 72 | 73 | # DBTITLE 1,INSERT INTO inventory table 74 | # Insert new inventory Data to delta table 75 | # Get just the necessary fields 76 | # Field hdi_isfolder is another Option to achive isDir field 77 | spark.sql(f""" 78 | INSERT INTO {inventoryCatalogName}.{inventoryDabaseName}.{inventoryTableName} 79 | select 80 | concat('{dataStoragePath}',replace(name,'{dataBucket}/','')) as path, 81 | `Creation-Time` creationTime, 82 | `Last-Modified` modificationTime, 83 | `Content-Length` as length, 84 | case when `Content-Length` > 0 then false else true end isDir, 85 | cast(from_unixtime(`LastAccessTime` / 1000) as string) LastAccessTime, 86 | _metadata.file_name as SourceFileName, 87 | current_timestamp as datetimeLoad 88 | from 89 | parquet.`{inventoryStoragePath}` 90 | """ 91 | ).display() 92 | 93 | # COMMAND ---------- 94 | 95 | # MAGIC %md 96 | # MAGIC > Example 97 | # MAGIC >> **vacuum catalog.database.tableName using inventory (select path, length, isDir, modificationTime from dev.datainaction.vacuumInventory) RETAIN 48 HOURS** 98 | -------------------------------------------------------------------------------- /tips/SHOW/SHOW COMMANDs.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | SHOW CATALOGS 3 | 4 | -- COMMAND ---------- 5 | 
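-- DBTITLE 1,Exemplo adicional (esboço): filtrando SHOW com LIKE
-- Esboço opcional, não faz parte do notebook original: os comandos SHOW aceitam um padrão LIKE,
-- útil em workspaces com muitos catálogos e schemas. O padrão 'd*' abaixo é apenas ilustrativo.
SHOW CATALOGS LIKE 'd*'

-- COMMAND ----------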
6 | -- Databases = Schemas 7 | SHOW SCHEMAS FROM DEV 8 | 9 | -- COMMAND ---------- 10 | 11 | SHOW TABLES FROM DEV.db_demo 12 | 13 | -- COMMAND ---------- 14 | 15 | USE CATALOG DEV; 16 | USE SCHEMA db_demo; 17 | SHOW TABLE EXTENDED LIKE 'tb*'; 18 | 19 | -- COMMAND ---------- 20 | 21 | DROP TABLE testeuc2 22 | 23 | -- COMMAND ---------- 24 | 25 | SHOW TABLES DROPPED IN db_demo 26 | 27 | -- COMMAND ---------- 28 | 29 | UNDROP table DEV.db_demo.testeuc2 30 | 31 | -- COMMAND ---------- 32 | 33 | ALTER TABLE DEV.db_demo.tbordersliquid SET TBLPROPERTIES ('delta.deletedFileRetentionDuration' = '1 days'); 34 | 35 | -- COMMAND ---------- 36 | 37 | SHOW TBLPROPERTIES DEV.db_demo.tbordersliquid 38 | 39 | -- COMMAND ---------- 40 | 41 | SHOW COLUMNS FROM DEV.db_demo.tbordersliquid 42 | 43 | -- COMMAND ---------- 44 | 45 | -- MAGIC %py 46 | -- MAGIC listcolunas = '' 47 | -- MAGIC list = spark.sql('SHOW COLUMNS FROM DEV.db_demo.tbordersliquid').collect() 48 | -- MAGIC 49 | -- MAGIC print(listcolunas) 50 | 51 | -- COMMAND ---------- 52 | 53 | -- MAGIC %py 54 | -- MAGIC listcolunas = ','.join(str(col.col_name) for col in spark.sql('SHOW COLUMNS FROM DEV.db_demo.tbordersliquid').collect()) 55 | -- MAGIC print(listcolunas) 56 | 57 | -- COMMAND ---------- 58 | 59 | SHOW CREATE TABLE DEV.db_demo.tbordersliquid 60 | 61 | -- COMMAND ---------- 62 | 63 | SHOW PARTITIONS DEV.db_demo.tborderspartition 64 | 65 | -- COMMAND ---------- 66 | 67 | SHOW USERS 68 | 69 | -- COMMAND ---------- 70 | 71 | SHOW USERS LIKE '*dataside*' 72 | 73 | -- COMMAND ---------- 74 | 75 | SHOW GROUPS 76 | 77 | -- COMMAND ---------- 78 | 79 | SHOW GROUPS WITH USER `reginaldo.silva@dataside.com.br`; 80 | 81 | -- COMMAND ---------- 82 | 83 | SHOW GROUPS WITH GROUP `read_write_prod`; 84 | 85 | -- COMMAND ---------- 86 | 87 | USE CATALOG DEV 88 | 89 | -- COMMAND ---------- 90 | 91 | -- MAGIC %py 92 | -- MAGIC from pyspark.sql.functions import lit 93 | -- MAGIC from datetime import datetime 94 | -- MAGIC countTBOk = 1 95 | -- MAGIC countError = 0 96 | -- MAGIC countTotal = 1 97 | -- MAGIC for db in spark.sql("show databases").collect(): 98 | -- MAGIC print('>>>>>>>> iniciando DB: ',db.databaseName) 99 | -- MAGIC for tb in spark.sql(f"show tables from {db.databaseName}").collect(): 100 | -- MAGIC try: 101 | -- MAGIC countTotal = countTotal + 1 102 | -- MAGIC print(countTotal,' - ',db.databaseName,'.',tb.tableName) 103 | -- MAGIC spark.sql(f"select * from {db.databaseName}.{tb.tableName} limit 1") 104 | -- MAGIC countTBOk = countTBOk + 1 105 | -- MAGIC except Exception as error: 106 | -- MAGIC print("#######error ocurred on: ", db.databaseName,'.',tb.tableName, error) 107 | -- MAGIC countError = countError + 1 108 | -- MAGIC print ('------Quantidade de erros:', countError) 109 | -- MAGIC 110 | -- MAGIC print('Tabelas OK: ', countTBOk) 111 | -- MAGIC print('Tabelas com Erro: ', countError) 112 | -- MAGIC print('Total tabelas: ', countTotal) 113 | 114 | -- COMMAND ---------- 115 | 116 | -- MAGIC %py 117 | -- MAGIC from pyspark.sql.functions import lit 118 | -- MAGIC from datetime import datetime 119 | -- MAGIC countTotal = 0 120 | -- MAGIC for db in spark.sql("show databases").collect(): 121 | -- MAGIC print('>>>>>>>> iniciando DB: ',db.databaseName) 122 | -- MAGIC for tb in spark.sql(f"show tables from {db.databaseName}").collect(): 123 | -- MAGIC try: 124 | -- MAGIC countTotal = countTotal + 1 125 | -- MAGIC print(countTotal,' - ',str(db.databaseName).replace(' ',''),'.',str(tb.tableName).replace(' ','')) 126 | -- MAGIC listcolunas = 
','.join(str(col.col_name) for col in spark.sql(f"""SHOW COLUMNS FROM {db.databaseName.replace(' ','')}.{tb.tableName} """).collect()) 127 | -- MAGIC print('->>> TableName: ',db.databaseName,'.',tb.tableName, ' ->>> List Cols: ',listcolunas) 128 | -- MAGIC except Exception as error: 129 | -- MAGIC print("#######error ocurred on: ", db.databaseName,'.',tb.tableName, error) 130 | -------------------------------------------------------------------------------- /tips/DatabricksServicePrincipal/Generate ServicePrincipal Token.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # DBTITLE 1,Criando uma ServicePrincipal e colocando em um grupo especifico 3 | # MAGIC %sh 4 | # MAGIC curl --netrc -X POST \ 5 | # MAGIC https://adb-4013955633331914.14.azuredatabricks.net/api/2.0/preview/scim/v2/ServicePrincipals \ 6 | # MAGIC --header 'Content-type: application/scim+json' \ 7 | # MAGIC --header 'Authorization: Bearer xxx-3' \ 8 | # MAGIC --data '{"schemas": ["urn:ietf:params:scim:schemas:core:2.0:ServicePrincipal"],"applicationId": "96a9c13a-bd04-459f-a186-e36fe24b6c9a","displayName": "databricks-serviceprincipal","groups": [{"value": "612835559850353"}],"entitlements": [{ "value": "allow-cluster-create"}], "active": true}' 9 | 10 | # COMMAND ---------- 11 | 12 | # DBTITLE 1,Resgatando o GroupID 13 | ##prod 14 | from requests import request 15 | from pyspark.sql.functions import * 16 | import requests 17 | import json 18 | 19 | instance_id = 'adb-4013955633331914.14.azuredatabricks.net' 20 | 21 | api_version = '/api/2.0' 22 | api_command = '/preview/scim/v2/Groups' 23 | url = f"https://{instance_id}{api_version}{api_command}" 24 | 25 | #Adicionar secret 26 | headers = { 27 | 'Authorization': "Bearer xxxx-3" 28 | } 29 | 30 | response = requests.get( 31 | url = url, 32 | headers=headers 33 | ) 34 | 35 | jsonDataList = [] 36 | jsonDataList.append(json.dumps(json.loads(response.text), indent = 2)) 37 | jsonRDD = sc.parallelize(jsonDataList) 38 | dfGroups = spark.read.option('multiline', 'true').option('inferSchema', 'true').json(jsonRDD) 39 | dfExplode = dfGroups.withColumn("Groups",explode(dfGroups.Resources)) 40 | dfExplode.select(dfExplode.Groups.id,dfExplode.Groups.displayName).display() 41 | 42 | # COMMAND ---------- 43 | 44 | # DBTITLE 1,List ServicePrincipal 45 | # MAGIC %sh 46 | # MAGIC curl -X GET \ 47 | # MAGIC https://adb-4013955633331914.14.azuredatabricks.net/api/2.0/preview/scim/v2/ServicePrincipals \ 48 | # MAGIC --header "Authorization: Bearer xxxx-3" 49 | 50 | # COMMAND ---------- 51 | 52 | # DBTITLE 1,Genarate a short-live token - Use this token to generate a Databricks PAT for an app 53 | # MAGIC %sh 54 | # MAGIC curl -X POST -H 'Content-Type: application/x-www-form-urlencoded' \ 55 | # MAGIC https://login.microsoftonline.com/[TenantID]/oauth2/v2.0/token \ 56 | # MAGIC -d 'client_id=96a9c13a-bd04-459f-a186-e36fe24b6c9a' \ 57 | # MAGIC -d 'grant_type=client_credentials' \ 58 | # MAGIC -d 'scope=2ff814a6-3304-4ab8-85cb-cd0e6f879c1d%2F.default' \ 59 | # MAGIC -d 'client_secret=[App Secret]' 60 | 61 | # COMMAND ---------- 62 | 63 | # DBTITLE 1,Create Token for ServicePrincipal - Use short-live token to authenticate 64 | # MAGIC %sh 65 | # MAGIC curl -X POST \ 66 | # MAGIC https://adb-4013955633331914.14.azuredatabricks.net/api/2.0/token/create \ 67 | # MAGIC --header "Content-type: application/json" \ 68 | # MAGIC --header "Authorization: Bearer [token]" \ 69 | # MAGIC --data '{"application_id": 
"96a9c13a-bd04-459f-a186-e36fe24b6c9a","comment": "Token para acesso no PowerBI, token não expira","lifetime_seconds": -1}' 70 | 71 | # COMMAND ---------- 72 | 73 | # DBTITLE 1,Token List - Use App Token 74 | ##prod 75 | from pyspark.sql.functions import * 76 | from requests import request 77 | import requests 78 | import json 79 | 80 | instance_id = 'adb-4013955633331914.14.azuredatabricks.net' 81 | 82 | api_version = '/api/2.0' 83 | api_command = '/token/list' 84 | url = f"https://{instance_id}{api_version}{api_command}" 85 | 86 | #Adicionar secret 87 | headers = { 88 | 'Authorization': "Bearer xxx-3" 89 | } 90 | 91 | response = requests.get( 92 | url = url, 93 | headers=headers 94 | ) 95 | 96 | jsonDataList = [] 97 | jsonDataList.append(json.dumps(json.loads(response.text), indent = 2)) 98 | jsonRDD = sc.parallelize(jsonDataList) 99 | dfGroups = spark.read.option('multiline', 'true').option('inferSchema', 'true').json(jsonRDD) 100 | print(json.dumps(json.loads(response.text), indent = 2)) 101 | 102 | # COMMAND ---------- 103 | 104 | # DBTITLE 1,Test Resquest API with ServicePrincipal Token 105 | # MAGIC %sh 106 | # MAGIC curl -X GET \ 107 | # MAGIC -H 'Authorization: Bearer xxx-3' \ 108 | # MAGIC https://adb-4013955633331914.14.azuredatabricks.net/api/2.0/clusters/list 109 | -------------------------------------------------------------------------------- /tips/logicapp/logicapp.json: -------------------------------------------------------------------------------- 1 | { 2 | "definition": { 3 | "$schema": "https://schema.management.azure.com/providers/Microsoft.Logic/schemas/2016-06-01/workflowdefinition.json#", 4 | "actions": { 5 | "HTTP_-_Run_Databricks_Job": { 6 | "inputs": { 7 | "body": { 8 | "job_id": "@triggerBody()?['jobId']" 9 | }, 10 | "headers": { 11 | "Authorization": "Bearer xxxxx" 12 | }, 13 | "method": "POST", 14 | "uri": "@triggerBody()?['databricksWorkspace']" 15 | }, 16 | "runAfter": {}, 17 | "type": "Http" 18 | }, 19 | "Send_Email_Notification": { 20 | "inputs": { 21 | "body": { 22 | "Body": "

Abaixo detalhes da execução: \n \n@{triggerBody()?['customBody']} \n \nJobName:@{triggerBody()?['jobName']} \nDatabricksAPI:@{triggerBody()?['databricksWorkspace']} \nDate: @{triggerBody()?['dateLog']} \nResult:@{body('HTTP_-_Run_Databricks_Job')}
", 23 | "Importance": "High", 24 | "Subject": "@triggerBody()?['subject']", 25 | "To": "@triggerBody()?['emailList']" 26 | }, 27 | "host": { 28 | "connection": { 29 | "name": "@parameters('$connections')['office365']['connectionId']" 30 | } 31 | }, 32 | "method": "post", 33 | "path": "/v2/Mail" 34 | }, 35 | "runAfter": { 36 | "HTTP_-_Run_Databricks_Job": [ 37 | "Succeeded" 38 | ] 39 | }, 40 | "type": "ApiConnection" 41 | } 42 | }, 43 | "contentVersion": "1.0.0.0", 44 | "outputs": {}, 45 | "parameters": { 46 | "$connections": { 47 | "defaultValue": {}, 48 | "type": "Object" 49 | } 50 | }, 51 | "triggers": { 52 | "Events_Monitor_": { 53 | "evaluatedRecurrence": { 54 | "frequency": "Minute", 55 | "interval": 1 56 | }, 57 | "inputs": { 58 | "host": { 59 | "connection": { 60 | "name": "@parameters('$connections')['sql_3']['connectionId']" 61 | } 62 | }, 63 | "method": "get", 64 | "path": "/v2/datasets/@{encodeURIComponent(encodeURIComponent('default'))},@{encodeURIComponent(encodeURIComponent('default'))}/tables/@{encodeURIComponent(encodeURIComponent('tb_OrchestratorEvents'))}/onnewitems" 65 | }, 66 | "recurrence": { 67 | "frequency": "Minute", 68 | "interval": 1 69 | }, 70 | "splitOn": "@triggerBody()?['value']", 71 | "type": "ApiConnection" 72 | } 73 | } 74 | }, 75 | "parameters": { 76 | "$connections": { 77 | "value": { 78 | "office365": { 79 | "connectionId": "/subscriptions/b71883c3-c463-4eb2-b54a-d7eece44d276/resourceGroups/rgDatabricks/providers/Microsoft.Web/connections/office365-2", 80 | "connectionName": "office365-2", 81 | "id": "/subscriptions/b71883c3-c463-4eb2-b54a-d7eece44d276/providers/Microsoft.Web/locations/eastus/managedApis/office365" 82 | }, 83 | "sql_3": { 84 | "connectionId": "/subscriptions/b71883c3-c463-4eb2-b54a-d7eece44d276/resourceGroups/rgDatabricks/providers/Microsoft.Web/connections/sql-10", 85 | "connectionName": "sql-10", 86 | "id": "/subscriptions/b71883c3-c463-4eb2-b54a-d7eece44d276/providers/Microsoft.Web/locations/eastus/managedApis/sql" 87 | } 88 | } 89 | } 90 | } 91 | } -------------------------------------------------------------------------------- /tips/UpgradeMethods/README.md: -------------------------------------------------------------------------------- 1 | ##Tabela de migração: Estrategias por tipo de tabela 2 | 3 | | Id | Tipo HMS | Location | Tipo UC | Método | 4 | |----|----------|----------------|--------------------|--------------------------| 5 | | 1 | Managed | DBFS Root | Managed/External | CTAS / DEEP CLONE | 6 | | 2 | Managed | DBFS Root | Managed/External | CTAS / DEEP CLONE | 7 | | 3 | Hive SerDe | DBFS Root | Managed/External | CTAS / DEEP CLONE | 8 | | 4 | Managed | Mount | External | SYNC com Convert | 9 | | 5 | Managed | Mount | Managed | CTAS / DEEP CLONE | 10 | | 6 | External | Mount | External | SYNC | 11 | | 7 | External | Mount | Managed | CTAS / DEEP CLONE | 12 | | 8 | Managed | Cloud Storage | External | SYNC com Convert | 13 | | 9 | Managed | Cloud Storage | Managed | CTAS / DEEP CLONE | 14 | | 10 | External | Cloud Storage | External | SYNC | 15 | | 11 | External | Cloud Storage | Managed | CTAS / DEEP CLONE | 16 | 17 | ## Observação importante 18 | - **set spark.databricks.sync.command.enableManagedTable=true;** 19 | - Ao usar essa opção, você não pode dropar a tabela no HMS, pois, o dados serão excluídos do Storage 20 | - Caso queira dropar, use o script Scala para trocar ela de Managed para External 21 | - Outra dica, após a migração das suas tabelas do HMS para o UC, caso você não drop elas no HMS, voce pode usar essa 
opção para evitar que alguém escreva nelas, principalmente se forem Managed. 22 | 23 | ## Tabelas Managed vs External 24 | 25 | - **Tabelas Managed**: 26 | - Dados e metadados são gerenciados pelo Unity Catalog. 27 | - Os dados são armazenados no local especificado pelo catálogo Unity (tipicamente em armazenamento cloud). 28 | - A exclusão de uma tabela managed remove também os dados. 29 | - Se for HMS os dados são removidos imediatamente 30 | - Se for no UC os dados são mantidos por mais 30 dias 31 | - Aqui voce pode usar o UNDROP até 7 dias 32 | 33 | - **Tabelas External**: 34 | - Apenas os metadados são gerenciados pelo Unity Catalog, os dados permanecem no armazenamento externo (geralmente em um bucket ou outro recurso cloud). 35 | - A exclusão de uma tabela external remove apenas os metadados; os dados permanecem no armazenamento original. 36 | - Permite que os dados sejam compartilhados entre diferentes sistemas ou aplicações. 37 | 38 | ### DBFS Root vs Mount vs Cloud Storage 39 | 40 | - **DBFS Root**: 41 | - O sistema de arquivos distribuído do Databricks (Databricks File System). 42 | - Armazenamento temporário e volátil, com possíveis limitações em operações de longa duração. 43 | - Os dados ficam fisicamente no storage da Databricks que voce não tem acesso 44 | 45 | - **Mount**: 46 | - Uma forma de acessar o armazenamento externo (como S3, ADLS) no DBFS como se fosse um diretório local. 47 | - Os dados permanecem no armazenamento externo, mas podem ser acessados dentro de Databricks via caminhos montados. 48 | 49 | - **Cloud Storage**: 50 | - Armazenamento na nuvem (ex: AWS S3, Azure Data Lake, Google Cloud Storage) onde os dados podem ser armazenados e acessados diretamente. 51 | - Mais flexível para armazenamento de grande volume e soluções a longo prazo. 52 | 53 | ### Métodos CTAS, DEEP CLONE e SYNC 54 | 55 | - **CTAS (Create Table As Select)**: 56 | - Método usado para criar uma nova tabela a partir dos resultados de uma consulta SQL. 57 | - A nova tabela pode ser criada com dados agregados ou filtrados. 58 | - Exemplo de uso: `CREATE TABLE nova_tabela AS SELECT * FROM tabela_existente WHERE condição`. 59 | 60 | - **DEEP CLONE**: 61 | - Método utilizado para clonar tabelas, incluindo seus dados, metadados e histórico de transações. 62 | - Utilizado para cópia rápida de tabelas, útil em cenários de backup ou migração. 63 | - Exemplo: `DEEP CLONE origem DESTINO` cria uma cópia completa da tabela de origem. 64 | 65 | - **SYNC**: 66 | - Sincroniza tabelas external com o Unity Catalog, garantindo que o catálogo reflita as alterações feitas diretamente no armazenamento. 67 | - Essencial para manter a consistência entre os metadados no Unity Catalog e o armazenamento externo. 68 | - Útil para cenários onde os dados podem ser alterados por fora do Databricks. 69 | 70 | 71 | Post Databricks: 72 | https://www.databricks.com/blog/migrating-tables-hive-metastore-unity-catalog-metastore#appendix 73 | 74 | Notebook oficial: 75 | https://notebooks.databricks.com/notebooks/uc-upgrade-scenario-with-examples-for-blog.dbc?_gl=1*1nrxwtq*_gcl_au*OTUxMzE5NDg3LjE2OTM0NjcxNDM. 76 | -------------------------------------------------------------------------------- /API/databricks/Databricks API - Clusters.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # DBTITLE 1,Listando todos os clusters Databricks 3 | import requests 4 | import json 5 | 6 | ## O seu databricks Instance voce encontra na sua URL. 
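## Observação adicional (esboço, não faz parte do notebook original): em muitos runtimes dá para
## obter a URL do workspace sem hardcode, via configuração do Spark - valide no seu ambiente antes de usar:
## instance_id = spark.conf.get("spark.databricks.workspaceUrl")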
7 | instance_id = 'xxxxxxx.azuredatabricks.net' 8 | 9 | # Aqui estamos usando a API na versão 2.0 que é a mais recente no momento 10 | api_version = '/api/2.0' 11 | api_command = '/clusters/list' 12 | url = f"https://{instance_id}{api_version}{api_command}" 13 | print(url) 14 | 15 | headers = { 16 | 'Authorization': "Bearer xxxxxxx" ## put your databricks token here 17 | } 18 | 19 | response = requests.get( 20 | url = url, 21 | headers=headers 22 | ) 23 | 24 | # Transformando nosso retorno em um Dataframe 25 | jsonDataList = [] 26 | jsonDataList.append(json.dumps(json.loads(response.text), indent = 2)) 27 | jsonRDD = sc.parallelize(jsonDataList) 28 | df = spark.read.option('multiline', 'true').option('inferSchema', 'true').json(jsonRDD) 29 | 30 | # COMMAND ---------- 31 | 32 | # DBTITLE 1,Print em modo Text 33 | print(response.text) 34 | 35 | # COMMAND ---------- 36 | 37 | # DBTITLE 1,Print bonito 38 | print(json.dumps(json.loads(response.text), indent = 2)) 39 | 40 | # COMMAND ---------- 41 | 42 | # DBTITLE 1,Expandindo itens 43 | from pyspark.sql.functions import * 44 | # Usando o Explode para expandir nosso Json. 45 | dfclusters = df.select(explode("clusters").alias("cl")) 46 | dfclusters.display() 47 | 48 | # COMMAND ---------- 49 | 50 | # DBTITLE 1,Selecionando os campos relevantes 51 | dfclusters.select( 52 | dfclusters.cl.cluster_id, 53 | dfclusters.cl.cluster_name, 54 | dfclusters.cl.cluster_cores, 55 | dfclusters.cl.cluster_memory_mb, 56 | dfclusters.cl.state, 57 | dfclusters.cl.spark_conf, 58 | dfclusters.cl.cluster_source.alias("cluster_source"), 59 | dfclusters.cl.creator_user_name, 60 | dfclusters.cl.autotermination_minutes, 61 | dfclusters.cl.azure_attributes, 62 | dfclusters.cl.autoscale, 63 | dfclusters.cl.custom_tags, 64 | dfclusters.cl.default_tags, 65 | dfclusters.cl.driver, 66 | dfclusters.cl.driver_instance_source, 67 | dfclusters.cl.driver_node_type_id, 68 | dfclusters.cl.node_type_id, 69 | dfclusters.cl.effective_spark_version.alias("effective_spark_version"), 70 | dfclusters.cl.enable_elastic_disk, 71 | dfclusters.cl.last_restarted_time, 72 | dfclusters.cl.last_state_loss_time, 73 | dfclusters.cl.num_workers, 74 | dfclusters.cl.runtime_engine.alias("runtime_engine"), 75 | dfclusters.cl.spark_conf, 76 | dfclusters.cl.start_time, 77 | dfclusters.cl.state, 78 | dfclusters.cl.state_message, 79 | dfclusters.cl.terminated_time, 80 | dfclusters.cl.termination_reason 81 | ).createOrReplaceTempView('vw_clusters') 82 | 83 | # COMMAND ---------- 84 | 85 | # DBTITLE 1,Consultando com SQL 86 | # MAGIC %sql 87 | # MAGIC -- Para os amantes de SQL 88 | # MAGIC select * from vw_clusters 89 | 90 | # COMMAND ---------- 91 | 92 | # DBTITLE 1,Agrupando por versão e origem 93 | # MAGIC %sql 94 | # MAGIC select cluster_source,effective_spark_version,count(*) as qtdClusters from vw_clusters 95 | # MAGIC group by cluster_source,effective_spark_version 96 | # MAGIC order by cluster_source,effective_spark_version 97 | 98 | # COMMAND ---------- 99 | 100 | # DBTITLE 1,Criando um novo cluster via API 101 | # Aqui estamos usando a API na versão 2.0 que é a mais recente no momento 102 | api_version = '/api/2.0' 103 | api_command = '/clusters/create' 104 | url = f"https://{instance_id}{api_version}{api_command}" 105 | print(url) 106 | 107 | headers = { 108 | 'Authorization': "Bearer xxxxxx-2" ## put your databricks token here 109 | } 110 | 111 | datajson = { 112 | "cluster_name": "my-cluster-api", 113 | "spark_version": "11.3.x-scala2.12", 114 | "node_type_id": "Standard_D3_v2", 115 | "spark_conf": 
{ 116 | "spark.speculation": True 117 | }, 118 | "num_workers": 1 119 | } 120 | 121 | print(json.dumps(datajson, indent = 2)) 122 | data = json.dumps(datajson, indent = 2) 123 | response = requests.post(url = url, headers = headers, data = data) 124 | print(response) 125 | 126 | # COMMAND ---------- 127 | 128 | # DBTITLE 1,Deletando um cluster via API 129 | # Aqui estamos usando a API na versão 2.0 que é a mais recente no momento 130 | api_version = '/api/2.0' 131 | api_command = '/clusters/delete' 132 | url = f"https://{instance_id}{api_version}{api_command}" 133 | print(url) 134 | 135 | headers = { 136 | 'Authorization': "Bearer xxxxxx" ## put your databricks token here 137 | } 138 | 139 | datajson = {"cluster_id": "0211-131904-kvyksq3e"} 140 | 141 | print(json.dumps(datajson, indent = 2)) 142 | data = json.dumps(datajson, indent = 2) 143 | response = requests.post(url = url, headers = headers, data = data) 144 | print(response) 145 | 146 | # COMMAND ---------- 147 | 148 | # DBTITLE 1,Salvando como tabela Delta 149 | ## Adicione o caminho do storage 150 | caminhoDatalakeLog = '[]' 151 | df = spark.sql("select * from vw_clusters") 152 | df.write.option("mergeSchema", "true").mode(f"overwrite").format("delta").save(f"{caminhoDatalakeLog}") 153 | spark.sql(f"Create Table if not exists [nome do seu banco de dados].monitoramento_clusters Using Delta Location '{caminhoDatalakeLog}'") 154 | -------------------------------------------------------------------------------- /routines/OptimizeAndVacuum/OptimizeAndVacuum.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC

Descrição dos parâmetros

4 | # MAGIC 5 | # MAGIC | Parametro | Descrição | Tipo 6 | # MAGIC | ------------- | ------------- | ------------- | 7 | # MAGIC | nomeSchema | Nome do Database onde a tabela está criada | string | 8 | # MAGIC | nomeTabela | Nome da tabela que será aplicado a manutenção | string | 9 | # MAGIC | vacuum | True: Vacuum será executado, False: Pula vacuum | bool | 10 | # MAGIC | optimize | True: OPTIMIZE será executado, False: Pula OPTIMIZE | bool | 11 | # MAGIC | colunasZorder | Se informado e optimize for igual a True, aplicada Zorder na lista de colunas separado por vírgula (,) | string | 12 | # MAGIC | vacuumRetention | Quantidade de horas que será retida após execucao do Vacuum | integer | 13 | # MAGIC | Debug | Apenas imprime o resultado na tela | bool | 14 | # MAGIC 15 | # MAGIC

Exemplos:

16 | # MAGIC 17 | # MAGIC #### --> Primeiro instanciar a Function <-- 18 | # MAGIC `` %run /Users/reginaldo.silva@dataside.com.br/OptimizeAndVacuum `` 19 | # MAGIC 20 | # MAGIC #### --> Executando VACUUM com retenção de 72 horas e OPTMIZE SEM ZORDER <-- 21 | # MAGIC ``maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='funcionario', colunasZorder='none', vacuumRetention=72, vacuum=True, optimize=True, debug=False)`` 22 | # MAGIC 23 | # MAGIC #### --> Executando VACUUM retenção padrão e OPTMIZE COM ZORDER <-- 24 | # MAGIC ``maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='PatientInfoDelta', colunasZorder='patient_id', vacuumRetention=168, vacuum=True, optimize=True, debug=False)`` 25 | # MAGIC 26 | # MAGIC #### --> Executando somente VACUUM <-- 27 | # MAGIC ``maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='PatientInfoDelta', colunasZorder='none', vacuumRetention=168, vacuum=True, optimize=False, debug=False)`` 28 | # MAGIC 29 | # MAGIC #### --> Executando somente OPTMIZE <-- 30 | # MAGIC ``maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='PatientInfoDelta', colunasZorder='none', vacuumRetention=168, vacuum=False, optimize=True, debug=False)`` 31 | # MAGIC 32 | # MAGIC #### --> Modo Debug - Apenas print <-- 33 | # MAGIC ``maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='PatientInfoDelta', colunasZorder='none', vacuumRetention=168, vacuum=True, optimize=True, debug=True)`` 34 | # MAGIC 35 | # MAGIC ``Criado por: Reginaldo Silva`` 36 | # MAGIC - [Blog Data In Action](https://datainaction.dev/) 37 | # MAGIC - [Github](https://github.com/reginaldosilva27) 38 | # MAGIC 39 | # MAGIC ``Referencias:`` 40 | # MAGIC - 41 | # MAGIC - 42 | 43 | # COMMAND ---------- 44 | 45 | from datetime import datetime 46 | def maintenanceDeltalake (nomeSchema='silver', nomeTabela='none', colunasZorder='none', vacuumRetention=168, vacuum=True, optimize=True, debug=True): 47 | if debug: 48 | print("Modo Debug habilitado!") 49 | if optimize: 50 | if colunasZorder != "none": 51 | print(f">>> Otimizando tabela {nomeSchema}.{nomeTabela} com ZORDER no grupo de colunas: {colunasZorder} <<< >>> {str(datetime.now())}") 52 | print(f"CMD: OPTIMIZE {nomeSchema}.{nomeTabela} ZORDER BY ({colunasZorder})") 53 | else: 54 | print(f">>> Otimizando tabela {nomeSchema}.{nomeTabela} sem ZORDER <<< >>> {str(datetime.now())}") 55 | print(f"CMD: OPTIMIZE {nomeSchema}.{nomeTabela}") 56 | print(f">>> Tabela {nomeSchema}.{nomeTabela} otimizada! <<< >>> {str(datetime.now())}") 57 | else: 58 | print(f"### Não executado OPTIMIZE! ###") 59 | 60 | if vacuum: 61 | print(f">>> Setando {vacuumRetention} horas para limpeza de versionamento do deltalake... <<< >>> {str(datetime.now())}") 62 | print(f"CMD: VACUUM {nomeSchema}.{nomeTabela} RETAIN {vacuumRetention} Hours") 63 | print(f">>> Limpeza da tabela {nomeSchema}.{nomeTabela} aplicada com sucesso! <<< >>> {str(datetime.now())}") 64 | else: 65 | print(f"### Não executado VACUUM! 
###") 66 | else: 67 | print("Modo Debug desabilitado!") 68 | if optimize: 69 | if colunasZorder != "none": 70 | print(f">>> Otimizando tabela {nomeSchema}.{nomeTabela} com ZORDER no grupo de colunas: {colunasZorder} <<< >>> {str(datetime.now())}") 71 | print(f"CMD: OPTIMIZE {nomeSchema}.{nomeTabela} ZORDER BY ({colunasZorder})") 72 | spark.sql(f"OPTIMIZE {nomeSchema}.{nomeTabela} ZORDER BY ({colunasZorder})") 73 | else: 74 | print(f">>> Otimizando tabela {nomeSchema}.{nomeTabela} sem ZORDER <<< >>> {str(datetime.now())}") 75 | print(f"CMD: OPTIMIZE {nomeSchema}.{nomeTabela}") 76 | spark.sql(f"OPTIMIZE {nomeSchema}.{nomeTabela}") 77 | print(f">>> Tabela {nomeSchema}.{nomeTabela} otimizada! <<< >>> {str(datetime.now())}") 78 | else: 79 | print(f"### Não executado OPTIMIZE! ###") 80 | 81 | if vacuum: 82 | print(f">>> Setando {vacuumRetention} horas para limpeza de versionamento do deltalake... <<< >>> {str(datetime.now())}") 83 | spark.sql("set spark.databricks.delta.retentionDurationCheck.enabled = false") 84 | print(f"CMD: VACUUM {nomeSchema}.{nomeTabela} RETAIN {vacuumRetention} Hours") 85 | spark.sql(f"VACUUM {nomeSchema}.{nomeTabela} RETAIN {vacuumRetention} Hours") 86 | spark.sql("set spark.databricks.delta.retentionDurationCheck.enabled = true") 87 | print(f">>> Limpeza da tabela {nomeSchema}.{nomeTabela} aplicada com sucesso! <<< >>> {str(datetime.now())}") 88 | else: 89 | print(f"### Não executado VACUUM! ###") 90 | 91 | # COMMAND ---------- 92 | 93 | # DBTITLE 1,Enviar parâmetros para execução após instanciar a função 94 | # MAGIC %py 95 | # MAGIC #Caso queira já chamar a função diretamente do Azure Data Factory, informar os parametros na chamada do notebook 96 | # MAGIC try: 97 | # MAGIC maintenanceDeltalake(nomeSchema=getArgument("NomeSchema"), nomeTabela=getArgument("NomeTabela"), colunasZorder=getArgument("ColunasZorder"), vacuumRetention=getArgument("VacuumRetention"), vacuum=eval(getArgument("Vacuum")), optimize=eval(getArgument("Optimize")), debug=eval(getArgument("Debug"))) 98 | # MAGIC except: 99 | # MAGIC print("Função maintenanceDeltalake() instanciada no contexto!") 100 | -------------------------------------------------------------------------------- /tips/timeTravelVsCDF/Time Travel vs Change Data Feed.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- DBTITLE 1,Criando tabela demo 3 | -- MAGIC %py 4 | -- MAGIC df = spark.read.option("header", "True").format('csv').load('/databricks-datasets/COVID/coronavirusdataset/PatientInfo.csv') 5 | -- MAGIC df.write.format('delta').mode('overwrite').saveAsTable("db_demo.PatientInfoDelta",path='abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/PatientInfoDelta') 6 | -- MAGIC df.display() 7 | 8 | -- COMMAND ---------- 9 | 10 | -- DBTITLE 1,Time travel exemplo 11 | -- Atualizando 1 registro 12 | update db_demo.PatientInfoDelta set age = '33s' where patient_id = '1000000001' 13 | 14 | -- COMMAND ---------- 15 | 16 | -- DBTITLE 1,Visualizando o histórico de alterações na tabela 17 | describe history db_demo.PatientInfoDelta 18 | 19 | -- COMMAND ---------- 20 | 21 | -- DBTITLE 1,Viajando no tempo usando VERSION AS OF 22 | select * from db_demo.PatientInfoDelta VERSION AS OF 0 where patient_id = '1000000001' 23 | 24 | -- COMMAND ---------- 25 | 26 | -- DBTITLE 1,Viajando no tempo usando TIMESTAMP AS OF 27 | select 'OLD', * from db_demo.PatientInfoDelta timestamp AS OF '2023-06-03T14:19:07.000+0000' 28 | where patient_id = '1000000001' union all 29 | select 
'NEW', * from db_demo.PatientInfoDelta where patient_id = '1000000001' 30 | 31 | -- COMMAND ---------- 32 | 33 | -- DBTITLE 1,DELETE Sem Where - E agora quem poderá nos defender? 34 | delete from db_demo.PatientInfoDelta; 35 | select * from db_demo.PatientInfoDelta 36 | 37 | -- COMMAND ---------- 38 | 39 | -- DBTITLE 1,Historico de alterações 40 | describe history db_demo.PatientInfoDelta 41 | 42 | -- COMMAND ---------- 43 | 44 | -- DBTITLE 1,Restaurando a tabela com historico do TIME TRAVEL 45 | RESTORE db_demo.PatientInfoDelta VERSION AS OF 1; 46 | select * from db_demo.PatientInfoDelta 47 | 48 | -- COMMAND ---------- 49 | 50 | -- DBTITLE 1,Habilitando o Change Data Feed 51 | Alter table db_demo.PatientInfoDelta SET TBLPROPERTIES (delta.enableChangeDataFeed = true) 52 | 53 | -- COMMAND ---------- 54 | 55 | -- DBTITLE 1,Criar nossa tabela Silver para simular o CDF na prática 56 | -- Essa silver só terá dados de pacientes infectados por outros pacientes filtrando pelo infected_by 57 | Create or Replace table db_demo.SilverPatientInfectedBy 58 | as 59 | select patient_id,sex,age,country,province,city,infection_case,infected_by from db_demo.PatientInfoDelta where infected_by is not null; 60 | 61 | select * from db_demo.SilverPatientInfectedBy; 62 | 63 | -- COMMAND ---------- 64 | 65 | -- DBTITLE 1,Gerando algumas modificações 66 | -- Atualizando 2 registro, deletando 1 registro e inserindo 1 registro 67 | -- Note que estou aplicando 2 updates no mesmo registro 1000000003 68 | update db_demo.PatientInfoDelta set age = '70s' where patient_id = '1000000003'; 69 | update db_demo.PatientInfoDelta set sex = 'female' where patient_id = '1000000003'; 70 | delete from db_demo.PatientInfoDelta where patient_id = '1000000005'; 71 | insert into db_demo.PatientInfoDelta values('1000003211','male','31s','Brazil','Sao Paulo','Boituva','Dataholic','1500000033',12,current_date(),current_date(),current_date(),null,'released'); 72 | 73 | -- COMMAND ---------- 74 | 75 | -- DBTITLE 1,Visualizando as versões 76 | describe history db_demo.PatientInfoDelta 77 | 78 | -- COMMAND ---------- 79 | 80 | -- DBTITLE 1,Usando table_changes() para navegar nas versões 81 | -- Pegando a partir da versão 4 tudo que aconteceu 82 | SELECT _change_type,_commit_version,_commit_timestamp,* FROM table_changes('db_demo.`PatientInfoDelta`', 4) 83 | 84 | -- COMMAND ---------- 85 | 86 | -- DBTITLE 1,Criando uma View temporaria para pegar somente a ultima versão de cada registro 87 | -- Para Updates pegamos somente o update_postimage que são os dados novos 88 | -- Estamos aplicando a função ROW_NUMBER pela chave da tabela (patient_id) ordenando pelo _commit_version 89 | -- Note que o rnb filtramos apenas o 1, então se o paciente tiver 2 Updates será aplicado o mais recente 90 | -- Estou passando a versão fixa no table_changes, mas pode ser dinamico 91 | CREATE OR REPLACE TEMPORARY VIEW vwPatientInfectedBy as 92 | SELECT * 93 | FROM 94 | (SELECT *, row_number() over (partition by patient_id order by _commit_version desc) as rnb 95 | FROM table_changes('db_demo.`PatientInfoDelta`', 4) where _change_type !='update_preimage' and infected_by is not null) 96 | WHERE rnb=1; 97 | 98 | select _change_type,_commit_version,_commit_timestamp,rnb,* from vwPatientInfectedBy; 99 | 100 | -- COMMAND ---------- 101 | 102 | -- DBTITLE 1,Visualizando alterações antes 103 | select * from db_demo.SilverPatientInfectedBy where patient_id in('1000000003','1000000005','1000003211'); 104 | 105 | -- COMMAND ---------- 106 | 107 | -- DBTITLE 1,Aplicando as 
alterações na nossa tabela Silver 108 | MERGE INTO db_demo.SilverPatientInfectedBy as t 109 | USING vwPatientInfectedBy as s 110 | ON s.patient_id = t.patient_id 111 | WHEN MATCHED AND s._change_type = 'delete' THEN DELETE 112 | WHEN MATCHED AND s._change_type = 'update_postimage' THEN UPDATE SET * 113 | WHEN NOT MATCHED AND _change_type != 'delete' THEN INSERT * 114 | 115 | -- COMMAND ---------- 116 | 117 | -- DBTITLE 1,Visualizando alterações depois 118 | select * from db_demo.SilverPatientInfectedBy where patient_id in('1000000003','1000000005','1000003211'); 119 | 120 | -- COMMAND ---------- 121 | 122 | -- DBTITLE 1,Sem CDC daria pra fazer? 123 | -- Somente para INSERT e UPDATE, Delete não é replicavel, a não ser que voce compare as tabelas inteiras, que na maioria dos casos não é viável, pois são cargas incrementais 124 | -- Nome a quantidade de escrita, praticamente a tabela Silver inteira foi reescrita 125 | MERGE INTO db_demo.SilverPatientInfectedBy as t 126 | USING db_demo.PatientInfoDelta as s 127 | ON s.patient_id = t.patient_id and s.infected_by is not null 128 | WHEN MATCHED 129 | THEN UPDATE SET * 130 | WHEN NOT MATCHED and s.infected_by is not null 131 | THEN INSERT * 132 | 133 | -- COMMAND ---------- 134 | 135 | -- DBTITLE 1,Restaurar um DELETE\UPDATE sem WHERE com CDF? 136 | UPDATE db_demo.PatientInfoDelta set age = '10s'; 137 | select * from db_demo.PatientInfoDelta 138 | 139 | -- COMMAND ---------- 140 | 141 | describe history db_demo.PatientInfoDelta 142 | 143 | -- COMMAND ---------- 144 | 145 | -- DBTITLE 1,Olhando somente a versão 9 146 | SELECT _change_type,_commit_version,_commit_timestamp,* FROM table_changes('db_demo.`PatientInfoDelta`', 9,9) 147 | where _change_type = 'update_preimage' 148 | 149 | -- COMMAND ---------- 150 | 151 | -- DBTITLE 1,Voltando um Update sem WHERE com CDF 152 | -- Voltando todos os UPDATES da versão 9 153 | MERGE INTO db_demo.PatientInfoDelta as t 154 | USING (SELECT row_number() over (partition by patient_id order by _commit_version desc) as rnb,* 155 | FROM table_changes('db_demo.`PatientInfoDelta`', 9,9) where _change_type = 'update_preimage') as s 156 | ON s.patient_id = t.patient_id and _change_type = 'update_preimage' and rnb = 1 157 | WHEN MATCHED 158 | THEN UPDATE SET * 159 | 160 | -- COMMAND ---------- 161 | 162 | -- DBTITLE 1,É, funciona também 163 | select * from db_demo.PatientInfoDelta; 164 | -------------------------------------------------------------------------------- /tips/markdown/OptimizeAndVacuum_Doc.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # DBTITLE 1,Exemplo de markdown com imagem 3 | # MAGIC %md 4 | # MAGIC | **Coluna1** | **Coluna2** | **Coluna3** | 5 | # MAGIC | --------------- | ------------------| ----------- | 6 | # MAGIC | `linha1` | Desc1. | `str` | 7 | # MAGIC | `linha2` | Desc2. | `str` | 8 | # MAGIC | `linha3` | Desc3. | `str` | 9 | # MAGIC 10 | # MAGIC ![Nome da imagem](https://static.wixstatic.com/media/a794bc_cb3926c8de254beea142cc8cb2c40e58~mv2.jpg) 11 | 12 | # COMMAND ---------- 13 | 14 | # MAGIC %md 15 | # MAGIC # `maintenanceDeltalake` 16 | # MAGIC 17 | # MAGIC A função `maintenanceDeltalake` é utilizada para executar operações de manutenção em uma tabela Delta Lake. Ela oferece a opção de otimizar a tabela e executar a limpeza de versionamento. 
18 | # MAGIC 19 | # MAGIC ## Parâmetros 20 | # MAGIC 21 | # MAGIC | **Nome** | **Descrição** | **Tipo** | 22 | # MAGIC | ------------------- | ------------------------------------------------------------------------------------- | ----------- | 23 | # MAGIC | `nomeSchema` | O nome do esquema (schema) da tabela Delta Lake. | `str` | 24 | # MAGIC | `nomeTabela` | O nome da tabela Delta Lake. | `str` | 25 | # MAGIC | `colunasZorder` | O grupo de colunas para aplicar o ZORDER. | `str` | 26 | # MAGIC | `vacuumRetention` | O tempo de retenção em horas para a limpeza de versionamento. | `int` | 27 | # MAGIC | `vacuum` | Indica se a limpeza de versionamento deve ser executada. | `bool` | 28 | # MAGIC | `optimize` | Indica se a otimização da tabela deve ser executada. | `bool` | 29 | # MAGIC | `debug` | Indica se o modo de depuração está habilitado. | `bool` | 30 | # MAGIC 31 | # MAGIC ## Exemplo de Uso 32 | # MAGIC 33 | # MAGIC Aqui estão três exemplos de chamadas da função `maintenanceDeltalake` com diferentes parâmetros: 34 | # MAGIC 35 | # MAGIC 1. Exemplo com otimização e limpeza habilitadas: 36 | # MAGIC ```python 37 | # MAGIC maintenanceDeltalake(nomeSchema='silver', nomeTabela='my_table', colunasZorder='column1, column2', vacuumRetention=72, vacuum=True, optimize=True, debug=True) 38 | # MAGIC ``` 39 | # MAGIC
2. Exemplo sem otimização e com limpeza desabilitada: 40 | # MAGIC ```python 41 | # MAGIC maintenanceDeltalake(nomeSchema='silver', nomeTabela='my_table', colunasZorder='none', vacuumRetention=168, vacuum=False, optimize=False, debug=True) 42 | # MAGIC ``` 43 | # MAGIC
3. Exemplo sem otimização e limpeza habilitadas, em modo de produção: 44 | # MAGIC ```python 45 | # MAGIC maintenanceDeltalake(nomeSchema='silver', nomeTabela='my_table', colunasZorder='none', vacuumRetention=168, vacuum=True, optimize=False, debug=False) 46 | # MAGIC ``` 47 | # MAGIC 48 | # MAGIC >Observação: Lembre-se de fornecer os valores corretos para os parâmetros, com base nas suas necessidades específicas. 49 | # MAGIC 50 | # MAGIC Referência 51 | # MAGIC Para obter mais informações sobre como otimizar seu Delta Lake e reduzir os custos de storage e computação no Databricks, confira o seguinte post: Otimize seu Delta Lake e reduza custos de storage, Databricks e computação. 52 | 53 | # COMMAND ---------- 54 | 55 | from datetime import datetime 56 | def maintenanceDeltalake (nomeSchema='silver', nomeTabela='none', colunasZorder='none', vacuumRetention=168, vacuum=True, optimize=True, debug=True): 57 | if debug: 58 | print("Modo Debug habilitado!") 59 | if optimize: 60 | if colunasZorder != "none": 61 | print(f">>> Otimizando tabela {nomeSchema}.{nomeTabela} com ZORDER no grupo de colunas: {colunasZorder} <<< >>> {str(datetime.now())}") 62 | print(f"CMD: OPTIMIZE {nomeSchema}.{nomeTabela} ZORDER BY ({colunasZorder})") 63 | else: 64 | print(f">>> Otimizando tabela {nomeSchema}.{nomeTabela} sem ZORDER <<< >>> {str(datetime.now())}") 65 | print(f"CMD: OPTIMIZE {nomeSchema}.{nomeTabela}") 66 | print(f">>> Tabela {nomeSchema}.{nomeTabela} otimizada! <<< >>> {str(datetime.now())}") 67 | else: 68 | print(f"### Não executado OPTIMIZE! ###") 69 | 70 | if vacuum: 71 | print(f">>> Setando {vacuumRetention} horas para limpeza de versionamento do deltalake... <<< >>> {str(datetime.now())}") 72 | print(f"CMD: VACUUM {nomeSchema}.{nomeTabela} RETAIN {vacuumRetention} Hours") 73 | print(f">>> Limpeza da tabela {nomeSchema}.{nomeTabela} aplicada com sucesso! <<< >>> {str(datetime.now())}") 74 | else: 75 | print(f"### Não executado VACUUM! ###") 76 | else: 77 | print("Modo Debug desabilitado!") 78 | if optimize: 79 | if colunasZorder != "none": 80 | print(f">>> Otimizando tabela {nomeSchema}.{nomeTabela} com ZORDER no grupo de colunas: {colunasZorder} <<< >>> {str(datetime.now())}") 81 | print(f"CMD: OPTIMIZE {nomeSchema}.{nomeTabela} ZORDER BY ({colunasZorder})") 82 | spark.sql(f"OPTIMIZE {nomeSchema}.{nomeTabela} ZORDER BY ({colunasZorder})") 83 | else: 84 | print(f">>> Otimizando tabela {nomeSchema}.{nomeTabela} sem ZORDER <<< >>> {str(datetime.now())}") 85 | print(f"CMD: OPTIMIZE {nomeSchema}.{nomeTabela}") 86 | spark.sql(f"OPTIMIZE {nomeSchema}.{nomeTabela}") 87 | print(f">>> Tabela {nomeSchema}.{nomeTabela} otimizada! <<< >>> {str(datetime.now())}") 88 | else: 89 | print(f"### Não executado OPTIMIZE! ###") 90 | 91 | if vacuum: 92 | print(f">>> Setando {vacuumRetention} horas para limpeza de versionamento do deltalake... <<< >>> {str(datetime.now())}") 93 | spark.sql("set spark.databricks.delta.retentionDurationCheck.enabled = false") 94 | print(f"CMD: VACUUM {nomeSchema}.{nomeTabela} RETAIN {vacuumRetention} Hours") 95 | spark.sql(f"VACUUM {nomeSchema}.{nomeTabela} RETAIN {vacuumRetention} Hours") 96 | spark.sql("set spark.databricks.delta.retentionDurationCheck.enabled = true") 97 | print(f">>> Limpeza da tabela {nomeSchema}.{nomeTabela} aplicada com sucesso! <<< >>> {str(datetime.now())}") 98 | else: 99 | print(f"### Não executado VACUUM! 
###") 100 | 101 | # COMMAND ---------- 102 | 103 | # MAGIC %md 104 | # MAGIC Enviar parâmetros para execução após instanciar a função 105 | 106 | # COMMAND ---------- 107 | 108 | # DBTITLE 1,Enviar parâmetros para execução após instanciar a função 109 | # MAGIC %py 110 | # MAGIC #Caso queira já chamar a função diretamente do Azure Data Factory, informar os parametros na chamada do notebook 111 | # MAGIC try: 112 | # MAGIC maintenanceDeltalake(nomeSchema=getArgument("NomeSchema"), nomeTabela=getArgument("NomeTabela"), colunasZorder=getArgument("ColunasZorder"), vacuumRetention=getArgument("VacuumRetention"), vacuum=eval(getArgument("Vacuum")), optimize=eval(getArgument("Optimize")), debug=eval(getArgument("Debug"))) 113 | # MAGIC except: 114 | # MAGIC print("Função maintenanceDeltalake() instanciada no contexto!") 115 | -------------------------------------------------------------------------------- /tips/EXPLODE_STRING/Explode usando SQL.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # DBTITLE 1,Listando Grupos de logins no Databricks via API 3 | import requests 4 | import json 5 | 6 | # Define a URL base da API do Databricks 7 | instance_id = 'adb-47319640954053.13.azuredatabricks.net' 8 | 9 | api_version = '/api/2.0' 10 | api_command = '/preview/scim/v2/Groups' 11 | url_list = f"https://{instance_id}{api_version}{api_command}" 12 | url_list_members = f"https://{instance_id}/api/2.0/preview/scim/v2/Groups/" 13 | 14 | print(url_list) 15 | 16 | # Define o cabeçalho com o token de autenticação do Databricks 17 | headers = { 18 | 'Authorization': "Bearer xxx-3", 19 | "Content-Type": "application/json" 20 | } 21 | 22 | has_more = True 23 | count = 0 24 | offset = 0 25 | jsonGroups = [] 26 | while has_more: 27 | params = { 28 | 'expand_tasks': 'true', 29 | 'offset': offset 30 | } 31 | try: 32 | print('Listando grupos') 33 | responseList = requests.get( 34 | url = url_list, 35 | params = params, 36 | headers= headers 37 | ) 38 | except Exception as error: 39 | print(error) 40 | 41 | jsonGroups.append(responseList.json()) 42 | try: 43 | has_more = json.loads(responseList.text)['has_more'] 44 | except: 45 | has_more = False 46 | 47 | count = count + 1 48 | offset = offset + 20 49 | 50 | print(jsonGroups) 51 | 52 | # COMMAND ---------- 53 | 54 | # DBTITLE 1,Transformando a lista em Dataframe 55 | jsonRDD = sc.parallelize(jsonGroups) 56 | df = spark.read.option('multiline', 'true').option('inferSchema', 'true').json(jsonRDD) 57 | df.createOrReplaceTempView('vw_sql_temp') 58 | 59 | # COMMAND ---------- 60 | 61 | # DBTITLE 1,Criando o mesmo Dataframe usando tudo como String 62 | jsonValues = [] 63 | jsonValues.append({'totalResults': 7, 'startIndex': 1, 'itemsPerPage': 7, 'schemas': ["urn:ietf:params:scim:api:messages: 2.0:ListResponse"], 'Resources': "[{'displayName': 'read_write_prod', 'entitlements': null, 'groups': [{'$ref': 'Groups/769409655224333', 'display': 'read_dev', 'type': 'direct', 'value': '769409655224333'}], 'id': '67674397758141', 'members': [{'$ref': 'Users/8675301566931963', 'display': 'reginaldo.silva@dataside.com.br', 'value': '8675301566931963'}, {'$ref': 'ServicePrincipals/2955041608089028', 'display': 'databricks-serviceprincipal', 'value': '2955041608089028'}, {'$ref': 'Users/547673826594172', 'display': 'Reginaldo Silva', 'value': '547673826594172'}, {'$ref': 'Users/1581289963735043', 'display': 'Regis Naldo', 'value': '1581289963735043'}, {'$ref': 'Users/1948297106640748', 'display': 'Reginaldo 
Silva', 'value': '1948297106640748'}], 'meta': {'resourceType': 'Group'}}, {'displayName': 'read_prod', 'entitlements': null, 'groups': [{'$ref': 'Groups/766964445608499', 'display': 'read_write_dev', 'type': 'direct', 'value': '766964445608499'}], 'id': '138152945819756', 'members': [{'$ref': 'Users/8675301566931963', 'display': 'reginaldo.silva@dataside.com.br', 'value': '8675301566931963'}], 'meta': {'resourceType': 'Group'}}, {'displayName': 'users', 'entitlements': [{'value': 'workspace-access'}, {'value': 'databricks-sql-access'}], 'groups': [], 'id': '371637887295750', 'members': [{'$ref': 'ServicePrincipals/2955041608089028', 'display': 'databricks-serviceprincipal', 'value': '2955041608089028'}, {'$ref': 'Users/8675301566931963', 'display': 'reginaldo.silva@dataside.com.br', 'value': '8675301566931963'}, {'$ref': 'Users/547673826594172', 'display': 'Reginaldo Silva', 'value': '547673826594172'}, {'$ref': 'Users/1581289963735043', 'display': 'Regis Naldo', 'value': '1581289963735043'}, {'$ref': 'Users/1948297106640748', 'display': 'Reginaldo Silva', 'value': '1948297106640748'}], 'meta': {'resourceType': 'WorkspaceGroup'}}, {'displayName': 'read_write_dev', 'entitlements': [{'value': 'databricks-sql-access'}], 'groups': [], 'id': '766964445608499', 'members': [{'$ref': 'Users/8675301566931963', 'display': 'reginaldo.silva@dataside.com.br', 'value': '8675301566931963'}, {'$ref': 'Users/1948297106640748', 'display': 'Reginaldo Silva', 'value': '1948297106640748'}, {'$ref': 'Groups/138152945819756', 'display': 'read_prod', 'value': '138152945819756'}], 'meta': {'resourceType': 'Group'}}, {'displayName': 'read_dev', 'entitlements': null, 'groups': [], 'id': '769409655224333', 'members': [{'$ref': 'Groups/67674397758141', 'display': 'read_write_prod', 'value': '67674397758141'}], 'meta': {'resourceType': 'Group'}}, {'displayName': 'admins', 'entitlements': [{'value': 'workspace-access'}, {'value': 'databricks-sql-access'}, {'value': 'allow-cluster-create'}, {'value': 'allow-instance-pool-create'}], 'groups': [], 'id': '868174163364744', 'members': [{'$ref': 'Users/547673826594172', 'display': 'Reginaldo Silva', 'value': '547673826594172'}, {'$ref': 'Users/1948297106640748', 'display': 'Reginaldo Silva', 'value': '1948297106640748'}], 'meta': {'resourceType': 'WorkspaceGroup'}}, {'displayName': 'demogroup', 'entitlements': null, 'groups': [], 'id': '1053327257318900', 'members': [{'$ref': 'Users/547673826594172', 'display': 'Reginaldo Silva', 'value': '547673826594172'}, {'$ref': 'Users/1581289963735043', 'display': 'Regis Naldo', 'value': '1581289963735043'}, {'$ref': 'Users/8675301566931963', 'display': 'reginaldo.silva@dataside.com.br', 'value': '8675301566931963'}, {'$ref': 'Users/1948297106640748', 'display': 'Reginaldo Silva', 'value': '1948297106640748'}], 'meta': {'resourceType': 'Group'}}]" }) 64 | 65 | from pyspark.sql.types import * 66 | schema = StructType([ 67 | StructField('Resources', StringType(), True), 68 | StructField('itemsPerPage', StringType(), True), 69 | StructField('schemas', StringType(), True), 70 | StructField('startIndex', StringType(), True), 71 | StructField('totalResults', StringType(), True) 72 | ]) 73 | 74 | dfString = spark.createDataFrame(jsonValues,schema) 75 | dfString.createOrReplaceTempView('vw_sql_temp_string') 76 | dfString.printSchema() 77 | 78 | # COMMAND ---------- 79 | 80 | # DBTITLE 1,Dataframe tipado Array e Struct 81 | # MAGIC %sql 82 | # MAGIC select * from vw_sql_temp 83 | 84 | # COMMAND ---------- 85 | 86 | # DBTITLE 1,Dataframe usando 
String 87 | # MAGIC %sql 88 | # MAGIC select * from vw_sql_temp_string 89 | 90 | # COMMAND ---------- 91 | 92 | # MAGIC %sql 93 | # MAGIC -- Acessando um item especifico do Array 94 | # MAGIC select Resources[3] from vw_sql_temp 95 | 96 | # COMMAND ---------- 97 | 98 | # DBTITLE 1,Acessando um campo Array\Struct 99 | # MAGIC %sql 100 | # MAGIC -- Voce pode navegar de forma bem simples usando ponto 101 | # MAGIC select Resources[3].displayName from vw_sql_temp 102 | 103 | # COMMAND ---------- 104 | 105 | # DBTITLE 1,Explodindo e acessando niveis de forma simples 106 | # MAGIC %sql 107 | # MAGIC -- Listando todos os usuários e grupos a nivel de linha 108 | # MAGIC -- Note que estamos acessando os campos apenas com . apos aplocar o Explode da coluna Resources 109 | # MAGIC select Resources.displayName,explode(Resources.members.display) from( 110 | # MAGIC select explode(Resources) as Resources from vw_sql_temp 111 | # MAGIC ) nivel1 112 | 113 | # COMMAND ---------- 114 | 115 | # MAGIC %sql 116 | # MAGIC select from_json(Resources,'a string'),* from vw_sql_temp 117 | 118 | # COMMAND ---------- 119 | 120 | # DBTITLE 1,Acessando campos do JSON no formato string 121 | # MAGIC %sql 122 | # MAGIC -- Ops, não é tão simples assim 123 | # MAGIC select Resources.displayName,* from vw_sql_temp_string 124 | 125 | # COMMAND ---------- 126 | 127 | # DBTITLE 1,FROM_JSON 128 | # MAGIC %sql 129 | # MAGIC select from_json(Resources,"ARRAY>") as Resources 130 | # MAGIC from vw_sql_temp_string 131 | 132 | # COMMAND ---------- 133 | 134 | # MAGIC %sql 135 | # MAGIC select from_json(Resources,"ARRAY>, groups: ARRAY>, id: STRING, members: ARRAY>, meta: STRUCT>>") as Resources 136 | # MAGIC from vw_sql_temp_string 137 | 138 | # COMMAND ---------- 139 | 140 | # MAGIC %sql 141 | # MAGIC select Resources.displayName,explode(Resources.members.display) from( 142 | # MAGIC select explode(from_json(Resources,"ARRAY>, groups: ARRAY>, id: STRING, members: ARRAY>, meta: STRUCT>>")) as Resources from vw_sql_temp_string 143 | # MAGIC ) nivel1 144 | -------------------------------------------------------------------------------- /routines/tablesSize&Vacuum/README.md: -------------------------------------------------------------------------------- 1 | ## Para usar os scripts desse Repos, basta importar para sua pasta no Databricks. 2 |
3 | Selecionar a opção Import 4 | image 5 |
6 | Selecionar o script e importar: 7 | image 8 | 9 | 10 | | Versão | data | Descrição | 11 | |-----------|-------|----------| 12 | | `v1.0` | 2022-12-01 | Executando em clientes internos | 13 | | `v1.1` | 2023-02-25 | Liberado para mais alguns clientes e engenheiros| 14 | | `v2.0` | 2023-04-24 | Liberado publicamente | 15 | 16 | Link do post: https://www.datainaction.dev/post/databricks-tablessize-vacuum-monitore-e-reduza-custos-do-seu-delta-lake 17 | 18 |

Funcionalidade e objetivo

19 | 20 | > Esse notebook tem como principal objetivo coletar informações de tamanho e realizar limpeza das tabelas no formato Delta. 21 | > 22 | > São coletadas informações do tamanho da tabela no **Storage** e cruzadas com o tamanho da **versão atual**, assim podemos estimar quanto de espaço a operação de Vacuum poderia liberar. 23 | > 24 | > **Nota**: Focado para ambientes sem Unity Catalog ainda, embora funcione, o script será adaptado para ambientes com Unity Catalog 25 | > 26 | > **Nota 2**: As primeiras execuções do Vacuum podem demorar mais se seu ambiente nunca passou por essa manutenção, as execuções posteriores serão mais rápidas, pois menos tabelas precisarão de vacuum, conforme o parâmetro vacuumThreshold. 27 | > 28 | > **Nota 3**: Reforçando: rode a primeira execução com o parâmetro **runVacuum = False**, apenas para você avaliar e ter uma noção de como está seu ambiente e quanto tempo a rotina irá levar. 29 | > 30 | > **Nota 4**: Se você sofrer algum erro, me envie pelo Github, LinkedIn ou por e-mail e posso te ajudar: reginaldo.silva27@gmail.com 31 | > 32 | > **Nota 5**: Pode aumentar o custo do seu Storage em relação às transações devido às chamadas do dbutils.fs.ls, por isso, use com cautela, monitore e rode no máximo 1 vez por semana. 33 | > 34 | > **Nota 6**: Testes realizados com Azure (usando o caminho absoluto e Mount), AWS (Mount) e GCP (Mount) para gravar as tabelas de controle. 35 | > 36 | > Abaixo estão os passos executados, em ordem de execução: 37 | 1. Listagem de todas as tabelas existentes para determinado database ou grupo de databases, usando SHOW DATABASES e SHOW TABLES 38 | 2. Executado um **describe detail** para cada tabela e armazenado o resultado em uma tabela Delta para análise e monitoramento 39 | 3. É executada uma varredura (dbutils.fs.ls) nas pastas do Storage recursivamente para calcular o espaço ocupado no Storage por cada tabela, excluindo os _delta_logs (veja um esboço ilustrativo mais abaixo) 40 | 4. Executadas queries de análise para avaliar quais tabelas podem se beneficiar do Vacuum 41 | 5. Executada a operação de Vacuum nas tabelas que atingem o threshold definido 42 | 43 | **Recomendação de Cluster:**
44 | > Comece com um cluster pequeno e monitore, cluster inicial: **Driver: Standard_DS4_v2 · 2 x Workers: Standard_DS3_v2 · Runtime >11.3** 45 | 46 | **Observações:**
47 | - **Existem outras formas de realizar essa operação, no entanto, essa foi a menos complexa e com melhor performance com exceção da operação Vacuum DRY RUN em Scala**
48 | - **A primeira versão desenvolvida utilizei a leitura dos Delta Logs, Jsons e checkpoints, contudo, não consegui reproduzir exatamente a operação de Vacuum DRY RUN e a performance ficou pior devido a quantidade de validações que precisei adicionar**
49 | - Nessa versão mapeei todos os arquivos marcados como Remove no log; embora seja mais performático, a precisão não era boa (dados não batiam) por alguns fatores, e para contornar esses fatores o script ficou mais lento 50 | - Tentei reproduzir via Scala, contudo, meu conhecimento em Scala é limitado e ficou muito complexo 51 | - Falei com alguns engenheiros da comunidade Delta, mas não tive sucesso em evoluir via Scala 52 | - **Se você rodar o Vacuum Dry Run via Scala ele printa o retorno, contudo, esse retorno vai para o Stdout e ficou muito complexo de recuperar**
53 | ``%scala 54 | vacuum tablename dry run`` 55 | - **Estou avaliando uma nova versão com delta.rs** 56 | - referencia: 57 | 58 | Caso você queira se aventurar com Scala, aqui está o código-fonte:
59 | 60 |
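Um caminho alternativo ao stdout do Scala, que pode valer o teste: em SQL/Python o `VACUUM ... DRY RUN` retorna a lista (limitada) de arquivos candidatos à exclusão como resultado da query, então dá para capturá-la em um DataFrame. Esboço meramente ilustrativo (nome de tabela hipotético); note que o retorno traz caminhos, não tamanhos, então ainda seria preciso consultar o tamanho de cada arquivo para estimar o espaço a liberar:

```python
# Esboço ilustrativo: capturando o retorno do VACUUM DRY RUN em um DataFrame (nada é apagado).
df_dry_run = spark.sql("VACUUM db_demo.minhatabela RETAIN 168 HOURS DRY RUN")
print(f"Arquivos candidatos à exclusão: {df_dry_run.count()}")
df_dry_run.display()
```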
61 | Se você conseguir levantar essa quantidade de espaço de forma mais performática, me atualiza via comentário ou no Github. 62 | 63 | **Ponto de atenção:**
64 | - **Para tabelas particionadas com muitas partições o tempo de execução pode ser mais demorado, por isso monitore as primeiras execuções com cautela, o uso é de sua responsabilidade, apesar de não ter nenhum risco mapeado até o momento, apenas pode gerar mais transações para sua storage**
65 | - **Custo das transações do Storage no Azure: Read operations (per 10,000) - R$0.0258 (Dois centavos por 10 mil operações) (Preço estimado em 21/04/2023)** 66 | 67 |
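Para ilustrar os passos 2 e 3 descritos acima (DESCRIBE DETAIL + varredura recursiva com dbutils.fs.ls) e a origem do custo de transações citado no ponto de atenção, segue um esboço simplificado — caminho e nome de tabela são hipotéticos e este **não** é o código exato da rotina:

```python
# Esboço ilustrativo: soma o tamanho dos arquivos de dados de uma tabela Delta no Storage,
# ignorando a pasta _delta_log, e compara com o tamanho da versão atual (DESCRIBE DETAIL).
def tamanho_no_storage(caminho):
    total = 0
    for item in dbutils.fs.ls(caminho):
        if item.name.startswith("_delta_log"):
            continue  # o transaction log não entra na conta
        if item.name.endswith("/"):  # diretório (ex.: partições) -> desce recursivamente
            total += tamanho_no_storage(item.path)
        else:
            total += item.size
    return total

storage_bytes = tamanho_no_storage("dbfs:/mnt/bronze/pastaRaiz/minhatabela/")  # caminho hipotético
versao_atual_bytes = spark.sql("DESCRIBE DETAIL bronze.minhatabela").select("sizeInBytes").first()[0]
print(f"Storage: {storage_bytes} | Versão atual: {versao_atual_bytes} | Razão: {storage_bytes / max(versao_atual_bytes, 1):.1f}x")
```

É justamente o volume de chamadas do dbutils.fs.ls em tabelas muito particionadas que gera as transações de leitura mencionadas acima.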

Descrição dos parâmetros

68 | 69 | ### Parametros de controle 70 | 71 | | Parametro | Valor | Descrição | 72 | |-----------|-------|----------| 73 | | `numThreadsParallel` | **15** | Número de threads paralelas, avalie o melhor valor para o seu ambiente, faça testes | 74 | | `vacuumThreadsParallel` | **5** | Número de threads paralelas para execução do **Vacuum**, utilize um valor menor, pois, pode dar problema no cluster, avalie o melhor valor para o seu ambiente, faça testes | 75 | | `runVacuum` | **False** | Se definido como **True** executa o Vacuum com os parâmetros configurados, o valor padrão é **False** execute a primeira vez no default para ter uma noção de como está o seu ambiente | 76 | | `vacuumHours` | **168** | Quantidade de horas que será mantido de versões após o Vacuum, defina o melhor para o seu ambiente, o padrão é 7 dias | 77 | | `vacuumThreshold` | **5x** | Executar o Vacuum apenas nas tabelas em que o storage for **5x** maior do que a versão atual, exemplo: **Uma tabela que tem 100GB de dados na versão atual e possuir 500GB no Storage, irá entrar na rotina de limpeza** | 78 | | `enableLogs` | **False** | Se definido como **True** irá gerar logs para facilitar algumas análises como, por exemplo, o tempo de duração para cada tabela e o erro se houver, contudo,nível eleva bastante o tempo de processamento se você tiver muitas tabelas | 79 | | `enableHistory` | **False** | Se definido como **True** mantém histórico de todas as execuções da rotina e cada execução possuirá um identificador único para poder relacionar entre as tabelas, se definido como **False** (valor padrão) as tabelas de logs (tableDetails e tableStorageFiles) sempre serão truncadas | 80 | 81 | ### Parametros para definição dos metadados 82 | 83 | | Parametro | Valor | Descrição | 84 | |-----------|-------|----------| 85 | | `databaseTarget` | 'bronze*' | Define quais bancos de dados serão analisados, aceita regex com **, exemplo: todos os databases que começam com a palavra bronze: 'bronze*' | 86 | | `tablesTarget` | '*' | Definir quais tabelas serão analisadas, aceita regex com **, por padrão todas serão analisadas | 87 | | `databaseCatalog` | 'db_controle' | Define em qual database será armazenado os logs, caso não exista será criado um novo | 88 | | `tableCatalog` | 'tbCatalog' | Define nome da tabela de controle para armazenar as tabelas que serão analisadas, caso não exista a tabela será criada | 89 | | `tbVacuumSummary` | 'tbVacuumSummary' | Define nome da tabela para armazenar o resultado agregado da execução, caso não exista a tabela será criada | 90 | | `tablesSizeMonitor` | 'tablesSizeMonitor' | Define nome da tabela para armazenar o resultado agregado da execução com detalhes no nível de tabela, caso não exista a tabela será criada | 91 | | `tableDetails` | 'bdaTablesDetails' | Define nome da tabela que irá armazenar o resultado do describe detail, caso não exista a tabela será criada | 92 | | `tableStorageFiles` | 'bdaTablesStorageSize' | Define nome da tabela que irá armazenar o resultado do dbutils.fs.ls | 93 | | `storageLocation` | 'abfss://[container]@[Storage].dfs.core.windows.net/pastaraiz/' [**Exemplo no Azure**]| Define endereço de storage principal, pode ser usado o valor absoluto ou um Mount (dbfs:/mnt/bronze/pastaRaiz/) | 94 | | `tableCatalogLocation` | f'database=db_controle/table_name={tableCatalog}' | Define storage da tabela de catálogo | 95 | | `tablesdetailsLocation` | f'database=db_controle/table_name={tableDetails}' | Define storage da tabela de detalhes do describe | 96 | | 
`tableStorageFilesLocation` | f'database=db_controle/table_name={tableStorageFiles}' | Define storage da tabela de resultado do dbutils | 97 | | `writeMode` | "overwrite" | Modo de escrita, "append" se `enableHistory` é verdadeiro, caso contrário "overwrite" | 98 | | `identifier` | str(hash(datetime.today())) | Identificador único para cada execução, usado para vincular as tabelas com suas devidas execuções | 99 | 100 | ## Objetos criados: 1 database e 5x tabelas 101 | 102 | > 1x Database nomeado através da variável databaseCatalog, por padrão o nome será **db_controle**
103 | > 1x Tabela de catálogo, irá armazenar a listagem, por padrão o nome será **tbCatalog**, se o parâmetro enableHistory estiver desabilitado ela será sobrescrita em cada execução
104 | > 1x Tabela para armazenar o resultado do describe detail, por padrão será chamada de **bdaTablesDetails**, se o parâmetro enableHistory estiver desabilitado ela será sobrescrita em cada execução
105 | > 1x Tabela para armazenar o resultado do List files, por padrão será chamada de **tableStorageFiles**, se o parâmetro enableHistory estiver desabilitado ela será sobrescrita em cada execução
106 | > 1x Tabela para armazenar o resultado agregado da execução com detalhes no nível de tabela, por padrão será chamada de **tablesSizeMonitor**, essa tabela nunca é truncada
107 | > 1x Tabela para armazenar o resultado agregado da execução, por padrão será chamada de **tbVacuumSummary**, essa tabela nunca é truncada
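Sobre os parâmetros `numThreadsParallel` e `vacuumThreadsParallel` descritos acima, o paralelismo por threads dentro de um notebook costuma seguir o padrão abaixo (esboço genérico, com nomes, database e quantidade de workers meramente ilustrativos — não é o código da rotina):

```python
# Esboço genérico de paralelismo com threads em um notebook Databricks.
from concurrent.futures import ThreadPoolExecutor

def coletar_detalhes(tabela):
    # Equivalente ao passo 2 da rotina: DESCRIBE DETAIL por tabela
    return spark.sql(f"DESCRIBE DETAIL {tabela}").collect()

# Lista hipotética de tabelas (a rotina monta a dela via SHOW DATABASES / SHOW TABLES)
tabelas = [f"bronze.{r.tableName}" for r in spark.sql("SHOW TABLES IN bronze").collect()]

# numThreadsParallel controla quantas tabelas são analisadas em paralelo;
# para o VACUUM usa-se um pool menor (vacuumThreadsParallel) para não sobrecarregar o cluster.
with ThreadPoolExecutor(max_workers=15) as pool:
    resultados = list(pool.map(coletar_detalhes, tabelas))
```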
108 | 109 | ## Monitoramento 110 | 111 | > Monitore seu ambiente através das tabelas tbVacuumSummary e tablesSizeMonitor
112 | > A tabela **tbVacuumSummary** armazena 1 linha por execução de dados sumarizados
113 | > A tabela **tablesSizeMonitor** armazena 1 linha por tabela por execução com dados sumarizados
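Um exemplo do tipo de consulta de monitoramento possível sobre essas tabelas (os nomes de colunas abaixo são **hipotéticos**, apenas para ilustrar a ideia do threshold de 5x — confira o schema real gerado no seu ambiente):

```python
# Esboço ilustrativo: tabelas cujo tamanho no Storage é 5x maior que a versão atual (candidatas ao Vacuum).
df_monitor = spark.sql("""
  SELECT *
  FROM db_controle.tablesSizeMonitor
  WHERE tamanho_storage_bytes > 5 * tamanho_versao_atual_bytes  -- colunas hipotéticas
  ORDER BY tamanho_storage_bytes DESC
""")
df_monitor.display()
```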
114 | 115 | 116 | ## Benchmark: 117 | 118 | > Ambiente com mais de 3 mil tabelas - 200 TB de Storage - 12 horas - Cluster (1xnode DS5) - 50 Threads para analysis e 10 para vacuum - **Sem logs (enableLogs=False)** - Primeira execução
119 | > Ambiente com 300 tabelas - 5 TB de Storage - 1 hora - Cluster (1xnode DS3) - 25 Threads para analysis e 10 para vacuum - **Sem logs (enableLogs=False)** - Primeira execução
120 | > Ambiente com 300 tabelas - 5 TB de Storage - 2 horas - Cluster (1xnode DS3) - 25 Threads para analysis e 10 para vacuum - **Com logs (enableLogs=True)** - Primeira execução
121 | > Ambiente com 1000 tabelas - 10 GB de Storage - **6 horas - Cluster (1xnode DS3)** - 25 Threads para analysis e 10 para vacuum - **Com logs (enableLogs=True)** - Primeira execução
122 | > Ambiente com 1000 tabelas - 10 GB de Storage - 6 horas - Cluster (1xnode DS3) - 25 Threads para analysis e 10 para vacuum - **Sem Logs (enableLogs=False)** - Primeira execução 123 | 124 | ## Cases reais: 125 | 126 | > **Case 1 - Azure:** Liberado mais de 250 TB na primeira execução em um ambiente que não havia rotina
127 | > **Case 2 - GCP:** Liberado 5 TB de logs em um ambiente pequeno, onde o total de dados eram apenas 50 GB e o storage tinha 5 TB
128 | > **Case 3 - Azure:** Liberado em média 10 TB de logs por semana, utilizando em um Databricks Job com agendamento para todos os finais de semana
129 | 130 | ## Implementações futuras: 131 | 132 | > 1. Utilizar Unity Catalog
133 | > 2. Converter código para uma Lib em python
134 | > 3. Refatorar código usando a Lib e melhorando a usabilidade 135 | > 4. Minimizar custos com dbutils.fs.ls, olhando direto para o transaction log (veja o esboço abaixo)
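Sobre o item 4, uma possível direção é ler diretamente os JSONs do `_delta_log` e somar os tamanhos das ações `add`/`remove`. O esboço abaixo é apenas conceitual: ignora checkpoints, não deduplica arquivos adicionados e depois removidos, e o campo `size` nem sempre existe nas ações `remove` — exatamente os fatores de imprecisão comentados mais acima:

```python
# Esboço conceitual: estima tamanhos lendo o transaction log, sem listar os arquivos de dados.
# Caminho meramente ilustrativo; supõe que o log contenha ações add e remove.
from pyspark.sql.functions import col, sum as soma

df_log = spark.read.json("dbfs:/mnt/bronze/pastaRaiz/minhatabela/_delta_log/*.json")

bytes_add = df_log.where(col("add").isNotNull()).agg(soma("add.size")).first()[0]
bytes_remove = df_log.where(col("remove").isNotNull()).agg(soma("remove.size")).first()[0]
print(f"Referenciado no log (add): {bytes_add} bytes | Marcado como remove (candidato a vacuum): {bytes_remove} bytes")
```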
136 | 137 | ## Referências: 138 | 139 | 140 | 141 | 142 | 143 |
144 | 145 | > ``Criado por: Reginaldo Silva`` 146 | - [Blog Data In Action](https://datainaction.dev/) 147 | - [Github](https://github.com/reginaldosilva27) 148 | -------------------------------------------------------------------------------- /tips/sparkconfs/Spark Confs.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # DBTITLE 1,Mostrar todas as configurações disponíveis 3 | df = spark.sparkContext.getConf().getAll() 4 | i=1 5 | for d in df: 6 | print(str(i),' - ',d) 7 | i = i+1 8 | 9 | # COMMAND ---------- 10 | 11 | # DBTITLE 1,Mostrar todas as ClusterTags 12 | df = spark.sparkContext.getConf().getAll() 13 | i=1 14 | for d in df: 15 | if 'clusterUsageTags' in d[0]: 16 | print(str(i),' - ',d) 17 | i=i+1 18 | 19 | # COMMAND ---------- 20 | 21 | # DBTITLE 1,Cluster tags mais comuns 22 | print( 23 | ' | Description | Value | Description |\n', 24 | ' ----------------------------------------------------------------------------------------------------------------------------------------------------\n', 25 | '| spark.databricks.clusterUsageTags.cloudProvider | ',spark.conf.get('spark.databricks.clusterUsageTags.cloudProvider'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.cloudProvider'))) * ' ','| Cloud que esta operando o Databricks', '|\n', 26 | '| spark.databricks.clusterUsageTags.azureSubscriptionId | ',spark.conf.get('spark.databricks.clusterUsageTags.azureSubscriptionId'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.azureSubscriptionId'))) * ' ','| ID da assinatura do Azure', '|\n', 27 | '| spark.databricks.clusterUsageTags.managedResourceGroup | ',spark.conf.get('spark.databricks.clusterUsageTags.managedResourceGroup'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.managedResourceGroup'))) * ' ','| Grupo de recursos no Azure que é gerenciado pelo Databricks', '|\n', 28 | '| spark.databricks.clusterUsageTags.clusterId | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterId'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterId'))) * ' ','| ID do cluster', '|\n', 29 | '| spark.databricks.clusterUsageTags.region | ',spark.conf.get('spark.databricks.clusterUsageTags.region'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.region'))) * ' ','| Região que hospeda os Clusters', '|\n', 30 | '| spark.databricks.clusterUsageTags.workerEnvironmentId | ',spark.conf.get('spark.databricks.clusterUsageTags.workerEnvironmentId'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.workerEnvironmentId'))) * ' ','| ID do Worksapce', '|\n', 31 | '| spark.databricks.clusterUsageTags.region | ',spark.conf.get('spark.databricks.clusterUsageTags.region'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.region'))) * ' ','| Região que hospeda os Clusters', '|\n', 32 | '| spark.databricks.clusterUsageTags.clusterLogDestination | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterLogDestination'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterLogDestination'))) * ' ','| Caminho onde os logs serão entregues', '|\n', 33 | '| spark.databricks.clusterUsageTags.isSingleUserCluster | ',spark.conf.get('spark.databricks.clusterUsageTags.isSingleUserCluster'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.isSingleUserCluster'))) * ' ','| É um cluster de usuario unico?', '|\n', 34 | '| spark.databricks.clusterUsageTags.clusterName | 
',spark.conf.get('spark.databricks.clusterUsageTags.clusterName'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterName'))) * ' ','| Nome do Cluster', '|\n', 35 | '| spark.databricks.clusterUsageTags.clusterScalingType | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterScalingType'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterScalingType'))) * ' ','| Tem auto scale?', '|\n', 36 | '| spark.databricks.clusterUsageTags.clusterNodeType | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterNodeType'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterNodeType'))) * ' ','| Familia da maquina para os nodes', '|\n', 37 | '| spark.databricks.clusterUsageTags.driverNodeType | ',spark.conf.get('spark.databricks.clusterUsageTags.driverNodeType'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.driverNodeType'))) * ' ','| Familia da maquina para o Driver', '|\n', 38 | '| spark.databricks.clusterUsageTags.clusterWorkers | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterWorkers'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterWorkers'))) * ' ','| Quantidade de workers Online', '|\n', 39 | '| spark.databricks.clusterUsageTags.effectiveSparkVersion | ',spark.conf.get('spark.databricks.clusterUsageTags.effectiveSparkVersion'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.effectiveSparkVersion'))) * ' ','| Versão do Spark operando no Cluster', '|\n', 40 | '| spark.databricks.clusterUsageTags.clusterSku | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterSku'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterSku'))) * ' ','| Tipo do Cluster', '|\n', 41 | '| spark.databricks.clusterUsageTags.clusterAvailability | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterAvailability'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterAvailability'))) * ' ','| Tipo de VMs em uso, SPOT ou On Demand', '|\n', 42 | '| spark.databricks.clusterUsageTags.enableElasticDisk | ',spark.conf.get('spark.databricks.clusterUsageTags.enableElasticDisk'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.enableElasticDisk'))) * ' ','| Possui discos elasticos para escalar?', '|\n', 43 | '| spark.databricks.clusterUsageTags.autoTerminationMinutes | ',spark.conf.get('spark.databricks.clusterUsageTags.autoTerminationMinutes'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.autoTerminationMinutes'))) * ' ','| Desligar cluster automaticamente após X minutos', '|\n', 44 | '| spark.databricks.clusterUsageTags.runtimeEngine | ',spark.conf.get('spark.databricks.clusterUsageTags.runtimeEngine'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.runtimeEngine'))) * ' ','| Tipo da Engine em execução', '|\n', 45 | '| spark.databricks.clusterUsageTags.clusterLastActivityTime | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterLastActivityTime'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterLastActivityTime'))) * ' ','| Data da ultima atividade executada no cluster', '|\n', 46 | '| spark.databricks.clusterUsageTags.enableCredentialPassthrough | ',spark.conf.get('spark.databricks.clusterUsageTags.enableCredentialPassthrough'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.enableCredentialPassthrough'))) * ' ','| Cluster com Passthrough habilitado?', '|\n', 47 | '| spark.databricks.clusterUsageTags.instanceWorkerEnvNetworkType| 
',spark.conf.get('spark.databricks.clusterUsageTags.instanceWorkerEnvNetworkType'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.instanceWorkerEnvNetworkType'))) * ' ','|(vnet-injection) ou Default?', '|\n', 48 | '| spark.databricks.clusterUsageTags.enableLocalDiskEncryption | ',spark.conf.get('spark.databricks.clusterUsageTags.enableLocalDiskEncryption'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.enableLocalDiskEncryption'))) * ' ','| Criptografica local?', '|\n', 49 | '| park.databricks.clusterUsageTags.clusterOwnerOrgId | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterOwnerOrgId'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterOwnerOrgId'))) * ' ','| ID da organização, faz parte da URL do Workspace', '|\n', 50 | '| spark.databricks.clusterUsageTags.clusterPythonVersion | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterPythonVersion'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterPythonVersion'))) * ' ','| Versão do Python rodando no Cluster', '|\n', 51 | '| spark.databricks.clusterUsageTags.enableDfAcls | ',spark.conf.get('spark.databricks.clusterUsageTags.enableDfAcls'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.enableDfAcls'))) * ' ','| Possui ACL habilitado?', '|\n', 52 | '| spark.databricks.clusterUsageTags.instanceWorkerEnvId | ',spark.conf.get('spark.databricks.clusterUsageTags.instanceWorkerEnvId'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.instanceWorkerEnvId'))) * ' ','| ID da Instancia', '|\n', 53 | '| spark.databricks.clusterUsageTags.clusterUnityCatalogMode | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterUnityCatalogMode'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterUnityCatalogMode'))) * ' ','| Utiliza Unity Catalog?', '|\n', 54 | '| spark.databricks.clusterUsageTags.enableSqlAclsOnly | ',spark.conf.get('spark.databricks.clusterUsageTags.enableSqlAclsOnly'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.enableSqlAclsOnly'))) * ' ','| ACL com SQL habilitado?', '|\n', 55 | '| spark.databricks.clusterUsageTags.clusterPinned | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterPinned'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterPinned'))) * ' ','| Cluster esta pinado?', '|\n', 56 | '| spark.databricks.clusterUsageTags.privateLinkEnabled | ',spark.conf.get('spark.databricks.clusterUsageTags.privateLinkEnabled'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.privateLinkEnabled'))) * ' ','| Possui Private Link habilitado?', '|\n', 57 | '| spark.databricks.clusterUsageTags.clusterCreator | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterCreator'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterCreator'))) * ' ','| Cluster criador por', '|\n', 58 | '| spark.databricks.clusterUsageTags.clusterNumCustomTags | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterNumCustomTags'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterNumCustomTags'))) * ' ','| Quantidade de tags customizadas', '|\n', 59 | '| spark.databricks.clusterUsageTags.clusterAllTags | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterAllTags'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterAllTags'))) * ' ','| Quantidade de tags customizadas', '|\n', 60 | ' ----------------------------------------------------------------------------------------------------------------------------------------------------\n', 61 | 'Links Reference:\n', 62 | 
'https://spark.apache.org/docs/latest/configuration.html \n', 63 | 'https://books.japila.pl/delta-lake-internals/' 64 | ) 65 | 66 | # COMMAND ---------- 67 | 68 | # DBTITLE 1,Spark confs comuns 69 | print( 70 | ' | Description | Value | Description |\n', 71 | ' ----------------------------------------------------------------------------------------------------------------------------------------------------\n', 72 | '| spark.databricks.cloudProvider | ',spark.conf.get('spark.databricks.cloudProvider'),(40-len(spark.conf.get('spark.databricks.cloudProvider'))) * ' ','| Cloud que esta operando o Databricks', '|\n', 73 | '| spark.databricks.workspaceUrl | ',spark.conf.get('spark.databricks.workspaceUrl'),(40-len(spark.conf.get('spark.databricks.workspaceUrl'))) * ' ','| URL para acessar o Worksapce', '|\n', 74 | '| spark.app.startTime | ',spark.conf.get('spark.app.startTime'),(40-len(spark.conf.get('spark.app.startTime'))) * ' ','| Hora de inicio da aplicação Spark', '|\n', 75 | '| spark.app.name | ',spark.conf.get('spark.app.name'),(40-len(spark.conf.get('spark.app.name'))) * ' ','| Nome da aplicação Spark', '|\n', 76 | '| spark.app.id | ',spark.conf.get('spark.app.id'),(40-len(spark.conf.get('spark.app.id'))) * ' ','| Id da aplicação criada pelo Spark ', '|\n', 77 | '| spark.databricks.clusterSource | ',spark.conf.get('spark.databricks.clusterSource'),(40-len(spark.conf.get('spark.databricks.clusterSource'))) * ' ','| Cluster criado via UI, JOB or API', '|\n', 78 | '| spark.driver.maxResultSize | ',spark.conf.get('spark.driver.maxResultSize'),(40-len(spark.conf.get('spark.driver.maxResultSize'))) * ' ','| Tamanho máximo do retorno de uma ação, exemplo Collect(), caso contrario será abortado para evitar problemas no Driver', '|\n', 79 | '| spark.sql.sources.default | ',spark.conf.get('spark.sql.sources.default'),(40-len(spark.conf.get('spark.sql.sources.default'))) * ' ','| Padrão de fonte utilizada, no Spark puro o default é Parquet', '|\n', 80 | '| spark.databricks.delta.multiClusterWrites.enabled | ',spark.conf.get('spark.databricks.delta.multiClusterWrites.enabled'),(40-len(spark.conf.get('spark.databricks.delta.multiClusterWrites.enabled'))) * ' ','| Permite escrita por mais de um cluster paralelo', '|\n', 81 | '| spark.databricks.workerNodeTypeId | ',spark.conf.get('spark.databricks.workerNodeTypeId'),(40-len(spark.conf.get('spark.databricks.workerNodeTypeId'))) * ' ','| Familia da VM utilizada nos Workers do Cluster', '|\n', 82 | '| spark.driver.host | ',spark.conf.get('spark.driver.host'),(40-len(spark.conf.get('spark.driver.host'))) * ' ','| IP da VM do Driver', '|\n', 83 | '| spark.master | ',spark.conf.get('spark.master'),(40-len(spark.conf.get('spark.master'))) * ' ','| Gerenciador do Cluster', '|\n', 84 | '| spark.databricks.driverNodeTypeId | ',spark.conf.get('spark.databricks.driverNodeTypeId'),(40-len(spark.conf.get('spark.databricks.driverNodeTypeId'))) * ' ','| Familia da VM utilizada no Driver do Cluster', '|\n', 85 | '| spark.executor.memory | ',spark.conf.get('spark.executor.memory'),(40-len(spark.conf.get('spark.executor.memory'))) * ' ','| Quantidade de memoria RAM nos Workers', '|\n', 86 | '| spark.sql.hive.metastore.version | ',spark.conf.get('spark.sql.hive.metastore.version'),(40-len(spark.conf.get('spark.sql.hive.metastore.version'))) * ' ','| Versão do Metastore', '|\n', 87 | '| spark.databricks.automl.serviceEnabled | ',spark.conf.get('spark.databricks.automl.serviceEnabled'),(40-len(spark.conf.get('spark.databricks.automl.serviceEnabled'))) * ' ','| Validar se 
o serviço de ML esta habilitado', '|\n', 88 | ' ----------------------------------------------------------------------------------------------------------------------------------------------------\n', 89 | 'Links Reference:\n', 90 | 'https://spark.apache.org/docs/latest/configuration.html \n', 91 | 'https://books.japila.pl/delta-lake-internals/' 92 | ) 93 | 94 | # COMMAND ---------- 95 | 96 | # DBTITLE 1,Environment Variables 97 | import os 98 | i=1 99 | for var in os.environ.items(): 100 | print(str(i),' - ',var) 101 | i = i+1 102 | -------------------------------------------------------------------------------- /tips/DatabricksAcademy/Learning.md: -------------------------------------------------------------------------------- 1 | # Mastering Databricks - Tudo que voce precisa saber sobre Databricks 2 | 3 | # Table of Contents 4 | 1. [Introdução](#Introdução) 5 | 2. [Fundamentals](#Fundamentals) 6 | 3. [Learning Paths](#Learning-Paths) 7 | 4. [Cursos Complementares](#Cursos-Complementares) 8 | 5. [Accreditation e Badges](#Accreditation-e-Badges) 9 | 6. [Certificações oficiais](#Certificações-oficiais) 10 | 7. [Cursos Legendados](#Cursos-Legendados) 11 | 8. [Ordem recomendada de estudo - Data Engineer](#Ordem-recomendada-de-estudo-para-Data-Engineer) 12 | 9. [Ordem recomendada para Certificação](#Ordem-recomendada-para-Certificação) 13 | 14 | 15 | ## Introdução 16 | Aqui temos um compilado de cursos dentro da plataforma **Databricks Academy** que você pode aproveitar 100% FREE (0800), **são mais de 50 mil reais em cursos**. 17 | 18 | Não está listado todos os cursos da plataforma, mas sim, os que considero essenciais. 19 | 20 | ### **ATENÇÃO**: 21 | Existem **dois portais** de estudo: 22 |
**Partner**: Aqui estão liberados praticamente todos os cursos e materiais FREE para empresas que são Partners; você precisa acessar com o e-mail da sua empresa. Para saber se sua empresa é Partner ou como se tornar uma, entre em contato com um Account Manager Databricks
**Customer**: Para quem não é Partner temos um conteúdo um pouco mais limitado, mas quase todos os essenciais são FREE, **APROVEITE**
**OBS: Deixarei uma TAG na frente dos links destacando de qual Portal é o curso: [PARTNER], [CUSTOMER]** 25 |
26 |
Para descobrir se sua empresa é Partner, busque por ela nesse portal; se ela for, sinta-se presenteado: são mais de **50 mil** reais em cursos totalmente FREE.
https://www.databricks.com/company/partners 28 |
29 | image 30 |
31 | #### Acessando o Portal: 32 | https://www.databricks.com/br/learn/training/login 33 | image 34 |
35 |
36 | ## Fundamentals 37 | 38 | > Aqui são os cursos introdutórios sobre Databricks e Generative AI, são cursos focados nos fundamentos, essencial que você não pule essa etapda. 39 | 40 | - What is Big Data?: **[PARTNER]** 41 | > https://partner-academy.databricks.com/learn/course/internal/view/elearning/100/what-is-big-data 42 | 43 | - Databricks Fundamentals Learning Plan: **[PARTNER]** 44 | > https://partner-academy.databricks.com/learn/learning_plan/view/215/databricks-fundamentals-learning-plan 45 | 46 | - Databricks Fundamentals Learning Plan: **[CUSTOMER]** 47 | > https://customer-academy.databricks.com/learn/learning_plan/view/215/databricks-fundamentals-learning-plan 48 | 49 | - Databricks Generative AI Fundamentals Learning Plan: **[PARTNER]** 50 | > https://partner-academy.databricks.com/learn/lp/275/Databricks%2520Generative%2520AI%2520Fundamentals%2520Learning%2520Plan 51 | 52 | - Databricks Generative AI Fundamentals Learning Plan: **[CUSTOMER]** 53 | > https://customer-academy.databricks.com/learn/learning_plan/view/275/databricks-generative-ai-fundamentals-learning-plan 54 | 55 | ## Learning Paths 56 | 57 | - Apache Spark Developer Learning Plan: **[PARTNER]** 58 | > https://partner-academy.databricks.com/learn/lp/160/Apache%2520Spark%2520Developer%2520Learning%2520Plan 59 | 60 | - Apache Spark Developer Learning Plan: **[CUSTOMER]** 61 | > https://customer-academy.databricks.com/learn/learning_plan/view/160/apache-spark-developer-learning-plan 62 | 63 | - Data Engineer Learning Plan: **[PARTNER]** 64 | > https://partner-academy.databricks.com/learn/lp/10/Data%2520Engineer%2520Learning%2520Plan 65 | 66 | - Data Engineer Learning Plan: **[CUSTOMER]** 67 | > https://customer-academy.databricks.com/learn/learning_plan/view/10/data-engineer-learning-plan 68 | 69 | - Data Analyst Learning Plan: **[PARTNER]** 70 | > https://partner-academy.databricks.com/learn/lp/78/Data%2520Analyst%2520Learning%2520Plan 71 | 72 | - Data Analyst Learning Plan: **[CUSTOMER]** 73 | > https://customer-academy.databricks.com/learn/learning_plan/view/78/data-analyst-learning-plan 74 | 75 | - Generative AI Engineering Pathway: **[PARTNER]** 76 | > https://partner-academy.databricks.com/learn/learning_plan/view/315/generative-ai-engineering-pathway 77 | 78 | - Generative AI Engineering Pathway: **[CUSTOMER]** 79 | > https://customer-academy.databricks.com/learn/learning_plan/view/315/generative-ai-engineering-pathway 80 | 81 | - Machine Learning Practitioner Learning Plan: **[PARTNER]** 82 | > https://partner-academy.databricks.com/learn/learning_plan/view/11/machine-learning-practitioner-learning-plan 83 | 84 | - Machine Learning Practitioner Learning Plan: **[CUSTOMER]** 85 | > https://customer-academy.databricks.com/learn/learning_plan/view/11/machine-learning-practitioner-learning-plan 86 | 87 | - Azure Databricks Platform Architect Learning Plan: **[PARTNER]** 88 | > https://partner-academy.databricks.com/learn/lp/254/Azure%2520Databricks%2520Platform%2520Architect%2520Learning%2520Plan 89 | 90 | - Azure Databricks Platform Architect Learning Plan: **[CUSTOMER]** 91 | > https://customer-academy.databricks.com/learn/learning_plan/view/254/azure-databricks-platform-architect-learning-plan 92 | 93 | - AWS Databricks Platform Architect Learning Plan: **[PARTNER]** 94 | > https://partner-academy.databricks.com/learn/lp/230/AWS%2520Databricks%2520Platform%2520Architect%2520Learning%2520Plan 95 | 96 | - AWS Databricks Platform Architect Learning Plan: **[CUSTOMER]** 97 | > 
https://customer-academy.databricks.com/learn/learning_plan/view/230/aws-databricks-platform-architect-learning-plan 98 | 99 | - GCP Databricks Platform Architect Learning Plan: **[PARTNER]** 100 | > https://partner-academy.databricks.com/learn/lp/266/GCP%2520Databricks%2520Platform%2520Architect%2520Learning%2520Plan 101 | 102 | - GCP Databricks Platform Architect Learning Plan: **[CUSTOMER]** 103 | > https://customer-academy.databricks.com/learn/learning_plan/view/266/gcp-databricks-platform-architect-learning-plan 104 | 105 | - Platform Administrator Learning Plan: **[PARTNER]** 106 | > https://partner-academy.databricks.com/learn/lp/207/Platform%2520Administrator%2520Learning%2520Plan 107 | 108 | - Platform Administrator Learning Plan: **[CUSTOMER]** 109 | > https://customer-academy.databricks.com/learn/learning_plan/view/207/platform-administrator-learning-plan 110 | 111 | ## Cursos Complementares 112 | 113 | - Databricks Specialist Sessions (Muito conteúdo top): **[PARTNER]** 114 | > https://partner-academy.databricks.com/learn/course/1456/Databricks%2520Specialist%2520Sessions 115 | 116 | - Databricks Specialist Sessions (Muito conteúdo top): **[CUSTOMER]** 117 | > https://customer-academy.databricks.com/learn/course/internal/view/elearning/1456/databricks-specialist-sessions 118 | 119 | - Advanced Data Engineering with Databricks (Focado para certificação Professional): **[PARTNER]** 120 | > https://partner-academy.databricks.com/learn/course/2268/Advanced%2520Data%2520Engineering%2520with%2520Databricks 121 | 122 | - Advanced Data Engineering with Databricks (Focado para certificação Professional): **[CUSTOMER]** 123 | > https://customer-academy.databricks.com/learn/course/internal/view/elearning/2268/advanced-data-engineering-with-databricks 124 | 125 | - Unity Catalog Essentials: **[PARTNER]** 126 | > https://partner-academy.databricks.com/learn/learning_plan/view/211/unity-catalog-essentials 127 | 128 | - Unity Catalog Essentials: **[CUSTOMER]** 129 | > https://customer-academy.databricks.com/learn/learning_plan/view/211/unity-catalog-essentials 130 | 131 | - Preparing for UC Upgrades: **[PARTNER]** 132 | > https://partner-academy.databricks.com/learn/learning_plan/view/292/preparing-for-uc-upgrades 133 | 134 | - FY 24 Tech Summit: Partner Content: **[PARTNER]** 135 | > https://partner-academy.databricks.com/learn/learning_plan/view/294/fy-24-tech-summit-partner-content 136 | 137 | - Escape to the Lakehouse: Data Engineering Edition: **[PARTNER]** 138 | > https://partner-academy.databricks.com/learn/course/internal/view/elearning/1979/escape-to-the-lakehouse-data-engineering-edition 139 | 140 | - Escape to the Lakehouse: Data Warehousing Edition: **[PARTNER]** 141 | > https://partner-academy.databricks.com/learn/course/internal/view/elearning/1978/escape-to-the-lakehouse-data-warehousing-edition 142 | 143 | - Databricks Partner Essentials: **[PARTNER]** 144 | > https://partner-academy.databricks.com/learn/course/1263/Databricks%2520Partner%2520Essentials 145 | 146 | 147 | ## Accreditation e Badges 148 | 149 | - Databricks Fundamentals Accreditation: 150 | > https://partner-academy.databricks.com/learn/course/2308/databricks-fundamentals-accreditation;lp=215 151 | 152 | - Generative AI Fundamentals Accreditation: 153 | > https://partner-academy.databricks.com/learn/course/1811/generative-ai-fundamentals-accreditation;lp=275 154 | image 155 | 156 | 157 | 158 | - Azure Databricks Platform Architect Accreditation: 159 | > 
https://partner-academy.databricks.com/learn/course/1752/azure-databricks-platform-architect-accreditation;lp=254 160 | 161 | - AWS Databricks Platform Architect Accreditation: 162 | > https://partner-academy.databricks.com/learn/lp/230/AWS%2520Databricks%2520Platform%2520Architect%2520Learning%2520Plan 163 | 164 | - GCP Databricks Platform Architect Accreditation: 165 | > https://partner-academy.databricks.com/learn/course/1756/gcp-databricks-platform-architect-accreditation;lp=266 166 | image 167 | 168 | 169 | 170 | - Databricks Accredited Platform Administrator Accreditation: 171 | > https://partner-academy.databricks.com/learn/course/1229/databricks-accredited-platform-administrator-accreditation;lp=207 172 | image 173 | 174 | ## Certificações oficiais 175 | 176 | - Databricks Certified Data Analyst Associate: 177 | > https://www.databricks.com/learn/certification/data-analyst-associate 178 | image 179 | 180 | 181 | 182 | - Databricks Certified Data Engineer Associate: 183 | > https://www.databricks.com/learn/certification/data-engineer-associate 184 | 185 | - Databricks Certified Data Engineer Professional: 186 | > https://www.databricks.com/learn/certification/data-engineer-professional 187 | image 188 | 189 | 190 | 191 | 192 | - Databricks Certified Machine Learning Associate: 193 | > https://www.databricks.com/learn/certification/machine-learning-associate 194 | 195 | - Databricks Certified Machine Learning Professional: 196 | > https://www.databricks.com/learn/certification/machine-learning-professional 197 | image 198 | 199 | 200 | 201 | 202 | - Databricks Certified Associate Developer for Apache Spark: 203 | > https://www.databricks.com/learn/certification/apache-spark-developer-associate 204 | 205 | - Databricks Certified Hadoop Migration Architect: 206 | > https://www.databricks.com/learn/certification/hadoop-migration-architect 207 | image 208 | 209 | 210 | ## Cursos Legendados 211 | ### Nota: São cursos com legenda em português. 
212 | 213 | - Databricks Fundamentals Learning Plan - Portuguese BR: **[PARTNER]** 214 | > https://partner-academy.databricks.com/learn/learning_plan/view/317/plano-de-aprendizado-dos-fundamentos-da-databricks-databricks-fundamentals-learning-plan-portuguese-br 215 | 216 | - Get Started with Databricks for Data Engineering - Portuguese BR: **[PARTNER]** 217 | > https://partner-academy.databricks.com/learn/course/internal/view/elearning/2331/get-started-with-databricks-for-data-engineering-portuguese-br 218 | 219 | - Get Started with Databricks for Data Engineering - Portuguese BR: **[CUSTOMER]** 220 | > https://customer-academy.databricks.com/learn/course/internal/view/elearning/2331/get-started-with-databricks-for-data-engineering-portuguese-br 221 | 222 | - Data Engineering with Databricks - Portuguese BR: **[PARTNER]** 223 | > https://partner-academy.databricks.com/learn/course/2263/play/16173 224 | 225 | - Data Engineering with Databricks - Portuguese BR: **[CUSTOMER]** 226 | > https://customer-academy.databricks.com/learn/course/internal/view/elearning/2263/data-engineering-with-databricks-portuguese-br 227 | 228 | - Databricks Generative AI Fundamentals Learning Plan - Portuguese BR: **[PARTNER]** 229 | > https://partner-academy.databricks.com/learn/learning_plan/view/314/plano-de-aprendizado-fundamentais-de-ia-generativa-da-databricks-databricks-generative-ai-fundamentals-learning-plan-portuguese-br 230 | 231 | - Databricks Generative AI Fundamentals Learning Plan - Portuguese BR: **[CUSTOMER]** 232 | > https://customer-academy.databricks.com/learn/learning_plan/view/314/plano-de-aprendizado-fundamentais-de-ia-generativa-da-databricks-databricks-generative-ai-fundamentals-learning-plan-portuguese-br 233 | 234 | ## Ordem recomendada de estudo para Data Engineer 235 | 236 | A ordem abaixo é baseado na MINHA opinião, não existe ordem 100% correta, mas existem algumas ordens mais lógicas. 237 | 238 | 1. What is Big Data? 239 | 2. Databricks Fundamentals Learning Plan 240 | 3. Apache Spark Developer Learning Plan 241 | 4. Data Engineer Learning Plan 242 | 5. Data Analyst Learning Plan 243 | 6. Platform Administrator Learning Plan 244 | 7. [Cloud] Databricks Platform Architect Learning Plan 245 | 8. Advanced Data Engineering with Databricks 246 | 9. Databricks Specialist Sessions 247 | 10. Unity Catalog Essentials 248 | 249 | ## Ordem recomendada para Certificação 250 | 251 | A ordem abaixo é baseado na MINHA opinião, não existe ordem 100% correta, mas existem algumas ordens mais lógicas. 252 | 253 | 1. Databricks Certified Associate Developer for Apache Spark 254 | 2. Databricks Certified Data Engineer Associate 255 | 3. Databricks Certified Data Analyst Associate 256 | 4. 
Databricks Certified Data Engineer Professional 257 | 258 | 259 | 260 | 261 | 262 | -------------------------------------------------------------------------------- /tips/UpgradeMethods/UpgradeUC_Examples.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC ##Tabela de migração: Estratégias por tipo de tabela 4 | -- MAGIC 5 | -- MAGIC | Id | Tipo HMS | Location | Tipo UC | Método | 6 | -- MAGIC |----|----------|----------------|--------------------|--------------------------| 7 | -- MAGIC | 1 | Managed | DBFS Root | Managed/External | CTAS / DEEP CLONE | 8 | -- MAGIC | 2 | External | DBFS Root | Managed/External | CTAS / DEEP CLONE | 9 | -- MAGIC | 3 | Hive SerDe | DBFS Root | Managed/External | CTAS / DEEP CLONE | 10 | -- MAGIC | 4 | Managed | Mount | External | SYNC com Convert | 11 | -- MAGIC | 5 | Managed | Mount | Managed | CTAS / DEEP CLONE | 12 | -- MAGIC | 6 | External | Mount | External | SYNC | 13 | -- MAGIC | 7 | External | Mount | Managed | CTAS / DEEP CLONE | 14 | -- MAGIC | 8 | Managed | Cloud Storage | External | SYNC com Convert | 15 | -- MAGIC | 9 | Managed | Cloud Storage | Managed | CTAS / DEEP CLONE | 16 | -- MAGIC | 10 | External | Cloud Storage | External | SYNC | 17 | -- MAGIC | 11 | External | Cloud Storage | Managed | CTAS / DEEP CLONE | 18 | -- MAGIC 19 | -- MAGIC ## Observação importante 20 | -- MAGIC - **set spark.databricks.sync.command.enableManagedTable=true;** 21 | -- MAGIC - Ao usar essa opção, você não pode dropar a tabela no HMS, pois os dados serão excluídos do Storage 22 | -- MAGIC - Caso queira dropar, use o script Scala para trocá-la de Managed para External 23 | -- MAGIC 24 | -- MAGIC ## Tabelas Managed vs External 25 | -- MAGIC 26 | -- MAGIC - **Tabelas Managed**: 27 | -- MAGIC - Dados e metadados são gerenciados pelo Unity Catalog. 28 | -- MAGIC - Os dados são armazenados no local especificado pelo Unity Catalog (tipicamente em armazenamento cloud). 29 | -- MAGIC - A exclusão de uma tabela managed remove também os dados. 30 | -- MAGIC - Se for HMS, os dados são removidos imediatamente 31 | -- MAGIC - Se for no UC, os dados são mantidos por mais 30 dias 32 | -- MAGIC - Aqui você pode usar o UNDROP em até 7 dias 33 | -- MAGIC 34 | -- MAGIC - **Tabelas External**: 35 | -- MAGIC - Apenas os metadados são gerenciados pelo Unity Catalog; os dados permanecem no armazenamento externo (geralmente em um bucket ou outro recurso cloud). 36 | -- MAGIC - A exclusão de uma tabela external remove apenas os metadados; os dados permanecem no armazenamento original. 37 | -- MAGIC - Permite que os dados sejam compartilhados entre diferentes sistemas ou aplicações. 38 | -- MAGIC 39 | -- MAGIC ### DBFS Root vs Mount vs Cloud Storage 40 | -- MAGIC 41 | -- MAGIC - **DBFS Root**: 42 | -- MAGIC - O sistema de arquivos distribuído do Databricks (Databricks File System). 43 | -- MAGIC - Armazenamento temporário e volátil, com possíveis limitações em operações de longa duração. 44 | -- MAGIC - Os dados ficam fisicamente no storage da Databricks, ao qual você não tem acesso 45 | -- MAGIC 46 | -- MAGIC - **Mount**: 47 | -- MAGIC - Uma forma de acessar o armazenamento externo (como S3, ADLS) no DBFS como se fosse um diretório local. 48 | -- MAGIC - Os dados permanecem no armazenamento externo, mas podem ser acessados dentro de Databricks via caminhos montados.
49 | -- MAGIC 50 | -- MAGIC - **Cloud Storage**: 51 | -- MAGIC - Armazenamento na nuvem (ex: AWS S3, Azure Data Lake, Google Cloud Storage) onde os dados podem ser armazenados e acessados diretamente. 52 | -- MAGIC - Mais flexível para armazenamento de grande volume e soluções a longo prazo. 53 | -- MAGIC 54 | -- MAGIC ### Métodos CTAS, DEEP CLONE e SYNC 55 | -- MAGIC 56 | -- MAGIC - **CTAS (Create Table As Select)**: 57 | -- MAGIC - Método usado para criar uma nova tabela a partir dos resultados de uma consulta SQL. 58 | -- MAGIC - A nova tabela pode ser criada com dados agregados ou filtrados. 59 | -- MAGIC - Exemplo de uso: `CREATE TABLE nova_tabela AS SELECT * FROM tabela_existente WHERE condição`. 60 | -- MAGIC 61 | -- MAGIC - **DEEP CLONE**: 62 | -- MAGIC - Método utilizado para clonar tabelas Delta, incluindo seus dados e metadados (o histórico de transações não é copiado; o clone inicia um histórico próprio). 63 | -- MAGIC - Utilizado para cópia rápida de tabelas, útil em cenários de backup ou migração. 64 | -- MAGIC - Exemplo: `CREATE TABLE tabela_destino DEEP CLONE tabela_origem` cria uma cópia completa da tabela de origem. 65 | -- MAGIC 66 | -- MAGIC - **SYNC**: 67 | -- MAGIC - Sincroniza tabelas external com o Unity Catalog, garantindo que o catálogo reflita as alterações feitas diretamente no armazenamento. 68 | -- MAGIC - Essencial para manter a consistência entre os metadados no Unity Catalog e o armazenamento externo. 69 | -- MAGIC - Útil para cenários onde os dados podem ser alterados por fora do Databricks. 70 | -- MAGIC 71 | -- MAGIC 72 | -- MAGIC Post Databricks: 73 | -- MAGIC https://www.databricks.com/blog/migrating-tables-hive-metastore-unity-catalog-metastore#appendix 74 | -- MAGIC 75 | -- MAGIC Notebook oficial: 76 | -- MAGIC https://notebooks.databricks.com/notebooks/uc-upgrade-scenario-with-examples-for-blog.dbc?_gl=1*1nrxwtq*_gcl_au*OTUxMzE5NDg3LjE2OTM0NjcxNDM. 77 | -- MAGIC 78 | 79 | -- COMMAND ---------- 80 | 81 | -- MAGIC %md 82 | -- MAGIC ##Scenario 1: Managed tables on HMS with DBFS Root location 83 | 84 | -- COMMAND ---------- 85 | 86 | drop database if exists hive_metastore.hmsdb_upgrade_db cascade; 87 | create database if not exists hive_metastore.hmsdb_upgrade_db; 88 | 89 | -- COMMAND ---------- 90 | 91 | desc schema extended hive_metastore.hmsdb_upgrade_db; 92 | 93 | -- COMMAND ---------- 94 | 95 | drop table if exists hive_metastore.hmsdb_upgrade_db.people_parquet; 96 | create table if not exists hive_metastore.hmsdb_upgrade_db.people_parquet 97 | using parquet 98 | as 99 | select * from parquet.`dbfs:/databricks-datasets/learning-spark-v2/people/people-10m.parquet/` limit 100; 100 | 101 | -- COMMAND ---------- 102 | 103 | desc extended hive_metastore.hmsdb_upgrade_db.people_parquet; 104 | 105 | -- COMMAND ---------- 106 | 107 | drop schema if exists demo_uc_demo.uc_upgrade_db cascade; 108 | 109 | -- COMMAND ---------- 110 | 111 | create schema if not exists demo_uc_demo.uc_upgrade_db managed location "abfss://bronze@datalakedatainactiondev.dfs.core.windows.net/unitycatalog/demo_uc/uc_upgrade_db"; 112 | 113 | -- COMMAND ---------- 114 | 115 | -- DBTITLE 1,Create UC managed Delta table using CTAS (Preferred) 116 | -- Por que não deep clone?
DEEP CLONE é recomendado para tabelas DELTA 117 | drop table if exists demo_uc_demo.uc_upgrade_db.people_delta; 118 | create table if not exists 119 | demo_uc_demo.uc_upgrade_db.people_delta 120 | as 121 | select * from hive_metastore.hmsdb_upgrade_db.people_parquet; 122 | 123 | -- COMMAND ---------- 124 | 125 | desc extended demo_uc_demo.uc_upgrade_db.people_delta; 126 | 127 | -- COMMAND ---------- 128 | 129 | -- DBTITLE 1,Alternatively Create UC External table (with the same HMS file format) using CTAS 130 | drop table if exists demo_uc_demo.uc_upgrade_db.people_parquet_ext; 131 | create table if not exists demo_uc_demo.uc_upgrade_db.people_parquet_ext 132 | using parquet 133 | location "abfss://bronze@datalakedatainactiondev.dfs.core.windows.net/unitycatalog/demo_uc/uc_upgrade_db/people_parquet_ext" 134 | as 135 | select * from hive_metastore.hmsdb_upgrade_db.people_parquet; 136 | 137 | -- COMMAND ---------- 138 | 139 | desc extended demo_uc_demo.uc_upgrade_db.people_parquet_ext; 140 | 141 | -- COMMAND ---------- 142 | 143 | -- MAGIC %md 144 | -- MAGIC ##Scenario 2: External tables on HMS with DBFS Root location 145 | 146 | -- COMMAND ---------- 147 | 148 | -- MAGIC %md 149 | -- MAGIC ## Scenario 3: HMS Hive SerDe table 150 | 151 | -- COMMAND ---------- 152 | 153 | -- MAGIC %md 154 | -- MAGIC ## Scenario 4: Managed table on HMS with mounted file paths to External UC Table 155 | 156 | -- COMMAND ---------- 157 | 158 | -- MAGIC %python 159 | -- MAGIC dbutils.fs.mounts() 160 | 161 | -- COMMAND ---------- 162 | 163 | drop database if exists hive_metastore.hmsdb_upgrade_db cascade; 164 | create database if not exists hive_metastore.hmsdb_upgrade_db location "dbfs:/mnt/landing/hmsdb_upgrade_db/" 165 | 166 | -- COMMAND ---------- 167 | 168 | -- DBTITLE 1,Managed Delta HMS table 169 | drop table if exists hive_metastore.hmsdb_upgrade_db.people_delta; 170 | create table if not exists hive_metastore.hmsdb_upgrade_db.people_delta 171 | as 172 | select * from delta.`dbfs:/databricks-datasets/learning-spark-v2/people/people-10m.delta` limit 100; 173 | 174 | -- COMMAND ---------- 175 | 176 | desc extended hive_metastore.hmsdb_upgrade_db.people_delta; 177 | 178 | -- COMMAND ---------- 179 | 180 | select current_version(); 181 | 182 | -- COMMAND ---------- 183 | 184 | drop schema if exists demo_uc_demo.uc_upgrade_db cascade; 185 | create schema if not exists demo_uc_demo.uc_upgrade_db managed location "abfss://bronze@datalakedatainactiondev.dfs.core.windows.net/unitycatalog/demo_uc/uc_upgrade_db"; 186 | 187 | -- COMMAND ---------- 188 | 189 | sync table demo_uc_demo.uc_upgrade_db.people_delta from hive_metastore.hmsdb_upgrade_db.people_delta DRY RUN; 190 | 191 | -- COMMAND ---------- 192 | 193 | set spark.databricks.sync.command.enableManagedTable=true; 194 | 195 | -- COMMAND ---------- 196 | 197 | describe extended hive_metastore.hmsdb_upgrade_db.people_delta; 198 | 199 | -- COMMAND ---------- 200 | 201 | sync table demo_uc_demo.uc_upgrade_db.people_delta from hive_metastore.hmsdb_upgrade_db.people_delta; 202 | 203 | -- COMMAND ---------- 204 | 205 | desc extended demo_uc_demo.uc_upgrade_db.people_delta; 206 | 207 | -- COMMAND ---------- 208 | 209 | select * from demo_uc_demo.uc_upgrade_db.people_delta; 210 | 211 | -- COMMAND ---------- 212 | 213 | -- DBTITLE 1,Convert HMS Managed Table to External Table 214 | -- MAGIC %scala 215 | -- MAGIC import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType} 216 | -- MAGIC import org.apache.spark.sql.catalyst.TableIdentifier 217 | -- MAGIC 
218 | -- MAGIC val tableName = "people_delta" 219 | -- MAGIC val dbName = "hmsdb_upgrade_db" 220 | -- MAGIC 221 | -- MAGIC val oldTable: CatalogTable = spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName, Some(dbName))) 222 | -- MAGIC val alteredTable: CatalogTable = oldTable.copy(tableType = CatalogTableType.EXTERNAL) 223 | -- MAGIC spark.sessionState.catalog.alterTable(alteredTable) 224 | 225 | -- COMMAND ---------- 226 | 227 | desc extended hive_metastore.hmsdb_upgrade_db.people_delta; 228 | 229 | -- COMMAND ---------- 230 | 231 | -- Arquivos não serão apagados 232 | drop table hive_metastore.hmsdb_upgrade_db.people_delta; 233 | 234 | -- COMMAND ---------- 235 | 236 | select * from demo_uc_demo.uc_upgrade_db.people_delta; 237 | 238 | -- COMMAND ---------- 239 | 240 | desc extended demo_uc_demo.uc_upgrade_db.people_delta; 241 | 242 | -- COMMAND ---------- 243 | 244 | sync table demo_uc_demo.uc_upgrade_db.people_delta from hive_metastore.hmsdb_upgrade_db.people_delta DRY RUN; 245 | 246 | -- COMMAND ---------- 247 | 248 | -- MAGIC %md 249 | -- MAGIC ## Scenario 5: Managed table on HMS with mounted file paths to Managed UC Table 250 | 251 | -- COMMAND ---------- 252 | 253 | drop database if exists hive_metastore.hmsdb_upgrade_db cascade; 254 | create database if not exists hive_metastore.hmsdb_upgrade_db location "dbfs:/mnt/landing/hmsdb_upgrade_db/" 255 | 256 | -- COMMAND ---------- 257 | 258 | drop table if exists hive_metastore.hmsdb_upgrade_db.people_delta; 259 | create table if not exists hive_metastore.hmsdb_upgrade_db.people_delta 260 | as 261 | select * from delta.`dbfs:/databricks-datasets/learning-spark-v2/people/people-10m.delta` limit 100; 262 | 263 | -- COMMAND ---------- 264 | 265 | desc extended hive_metastore.hmsdb_upgrade_db.people_delta; 266 | 267 | -- COMMAND ---------- 268 | 269 | drop schema if exists demo_uc_demo.uc_upgrade_db cascade; 270 | create schema if not exists demo_uc_demo.uc_upgrade_db managed location "abfss://bronze@datalakedatainactiondev.dfs.core.windows.net/unitycatalog/demo_uc/uc_upgrade_db/uc_upgrade_schema_2/"; 271 | 272 | -- COMMAND ---------- 273 | 274 | set spark.databricks.sync.command.enableManagedTable=false; 275 | 276 | -- COMMAND ---------- 277 | 278 | sync table demo_uc_demo.uc_upgrade_db.people_delta from hive_metastore.hmsdb_upgrade_db.people_delta DRY RUN; 279 | 280 | -- COMMAND ---------- 281 | 282 | -- OU DEEP CLONE 283 | drop table if exists demo_uc_demo.uc_upgrade_db.people_delta; 284 | create table if not exists demo_uc_demo.uc_upgrade_db.people_delta 285 | as 286 | select * from hive_metastore.hmsdb_upgrade_db.people_delta; 287 | 288 | -- COMMAND ---------- 289 | 290 | desc extended demo_uc_demo.uc_upgrade_db.people_delta; 291 | 292 | -- COMMAND ---------- 293 | 294 | -- MAGIC %md 295 | -- MAGIC ## Scenario 6: External table on HMS with mounted file paths to External UC Table 296 | 297 | -- COMMAND ---------- 298 | 299 | drop database if exists hive_metastore.hmsdb_upgrade_db cascade; 300 | create database if not exists hive_metastore.hmsdb_upgrade_db location "dbfs:/mnt/landing/hmsdb_upgrade_db/" 301 | 302 | -- COMMAND ---------- 303 | 304 | drop table if exists hive_metastore.hmsdb_upgrade_db.people_delta; 305 | create table if not exists hive_metastore.hmsdb_upgrade_db.people_delta 306 | location "dbfs:/mnt/landing/hmsdb_upgrade_db/people_delta" 307 | as 308 | select * from delta.`dbfs:/databricks-datasets/learning-spark-v2/people/people-10m.delta` limit 100; 309 | 310 | -- COMMAND ---------- 311 | 312 | desc 
extended hive_metastore.hmsdb_upgrade_db.people_delta; 313 | 314 | -- COMMAND ---------- 315 | 316 | drop schema if exists demo_uc_demo.uc_upgrade_db cascade; 317 | create schema if not exists demo_uc_demo.uc_upgrade_db managed location "abfss://bronze@datalakedatainactiondev.dfs.core.windows.net/unitycatalog/uc_upgrade_schema_2/"; 318 | 319 | -- COMMAND ---------- 320 | 321 | sync table demo_uc_demo.uc_upgrade_db.people_delta from hive_metastore.hmsdb_upgrade_db.people_delta DRY RUN; 322 | 323 | -- COMMAND ---------- 324 | 325 | sync table demo_uc_demo.uc_upgrade_db.people_delta from hive_metastore.hmsdb_upgrade_db.people_delta; 326 | 327 | -- COMMAND ---------- 328 | 329 | desc extended demo_uc_demo.uc_upgrade_db.people_delta; 330 | 331 | -- COMMAND ---------- 332 | 333 | -- MAGIC %md 334 | -- MAGIC ## Scenario 7: External table on HMS with mounted file paths to Managed UC Table 335 | 336 | -- COMMAND ---------- 337 | 338 | -- MAGIC %md 339 | -- MAGIC ## Scenario 8: Managed table on HMS with cloud storage file paths to External UC Table 340 | 341 | -- COMMAND ---------- 342 | 343 | drop database if exists hive_metastore.hmsdb_upgrade_db cascade; 344 | create database if not exists hive_metastore.hmsdb_upgrade_db location "abfss://bronze@datalakedatainactiondev.dfs.core.windows.net/unitycatalog/hmsdb_upgrade_db/" 345 | 346 | -- COMMAND ---------- 347 | 348 | drop table if exists hive_metastore.hmsdb_upgrade_db.people_delta; 349 | create table if not exists hive_metastore.hmsdb_upgrade_db.people_delta 350 | as 351 | select * from delta.`dbfs:/databricks-datasets/learning-spark-v2/people/people-10m.delta` limit 100; 352 | 353 | -- COMMAND ---------- 354 | 355 | desc extended hive_metastore.hmsdb_upgrade_db.people_delta; 356 | 357 | -- COMMAND ---------- 358 | 359 | set spark.databricks.sync.command.enableManagedTable=true; 360 | 361 | -- COMMAND ---------- 362 | 363 | drop schema if exists demo_uc_demo.uc_upgrade_db cascade; 364 | create schema if not exists demo_uc_demo.uc_upgrade_db managed location "abfss://bronze@datalakedatainactiondev.dfs.core.windows.net/unitycatalog/uc_upgrade_schema_10/"; 365 | 366 | -- COMMAND ---------- 367 | 368 | sync table demo_uc_demo.uc_upgrade_db.people_delta from hive_metastore.hmsdb_upgrade_db.people_delta DRY RUN; 369 | 370 | -- COMMAND ---------- 371 | 372 | sync table demo_uc_demo.uc_upgrade_db.people_delta from hive_metastore.hmsdb_upgrade_db.people_delta; 373 | 374 | -- COMMAND ---------- 375 | 376 | desc extended demo_uc_demo.uc_upgrade_db.people_delta; 377 | 378 | -- COMMAND ---------- 379 | 380 | -- MAGIC %scala 381 | -- MAGIC import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType} 382 | -- MAGIC import org.apache.spark.sql.catalyst.TableIdentifier 383 | -- MAGIC 384 | -- MAGIC val tableName = "people_delta" 385 | -- MAGIC val dbName = "hmsdb_upgrade_db" 386 | -- MAGIC 387 | -- MAGIC val oldTable: CatalogTable = spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName, Some(dbName))) 388 | -- MAGIC val alteredTable: CatalogTable = oldTable.copy(tableType = CatalogTableType.EXTERNAL) 389 | -- MAGIC spark.sessionState.catalog.alterTable(alteredTable) 390 | 391 | -- COMMAND ---------- 392 | 393 | select * from demo_uc_demo.uc_upgrade_db.people_delta ; 394 | 395 | -- COMMAND ---------- 396 | 397 | desc extended demo_uc_demo.uc_upgrade_db.people_delta ; 398 | 399 | -- COMMAND ---------- 400 | 401 | -- MAGIC %md 402 | -- MAGIC ## Scenario 9: Managed table on HMS with cloud storage file paths to Managed UC Table 403 | 
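-- Esboço mínimo para o Cenário 9, que no notebook tem apenas o header: assumindo uma tabela Managed no HMS em cloud storage
-- (como a people_delta criada no Cenário 8, antes da conversão para External), basta copiar os dados para o schema do UC.
-- O nome people_delta_uc é apenas ilustrativo; DEEP CLONE é preferível para fontes Delta por também levar os metadados.
create table if not exists demo_uc_demo.uc_upgrade_db.people_delta_uc
deep clone hive_metastore.hmsdb_upgrade_db.people_delta;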
404 | -- COMMAND ---------- 405 | 406 | -- MAGIC %md 407 | -- MAGIC ## Scenario 10: External table on HMS with cloud storage file paths to External UC Table 408 | 409 | -- COMMAND ---------- 410 | 411 | drop database if exists hive_metastore.hmsdb_upgrade_db cascade; 412 | create database if not exists hive_metastore.hmsdb_upgrade_db location "abfss://bronze@datalakedatainactiondev.dfs.core.windows.net/unitycatalog/hms/hmsdb_upgrade_db/" 413 | 414 | -- COMMAND ---------- 415 | 416 | drop table if exists hive_metastore.hmsdb_upgrade_db.people_delta; 417 | create table if not exists hive_metastore.hmsdb_upgrade_db.people_delta 418 | location "abfss://bronze@datalakedatainactiondev.dfs.core.windows.net/unitycatalog/hms/hmsdb_upgrade_db/people_delta" 419 | as 420 | select * from delta.`dbfs:/databricks-datasets/learning-spark-v2/people/people-10m.delta` limit 100; 421 | 422 | -- COMMAND ---------- 423 | 424 | desc extended hive_metastore.hmsdb_upgrade_db.people_delta; 425 | 426 | -- COMMAND ---------- 427 | 428 | create catalog if not exists demo_uc_demo 429 | 430 | -- COMMAND ---------- 431 | 432 | drop schema if exists demo_uc_demo.uc_upgrade_db cascade; 433 | create schema if not exists demo_uc_demo.uc_upgrade_db managed location "abfss://bronze@datalakedatainactiondev.dfs.core.windows.net/unitycatalog/demo_uc_demo/uc_upgrade_db/"; 434 | 435 | -- COMMAND ---------- 436 | 437 | sync table demo_uc_demo.uc_upgrade_db.people_delta from hive_metastore.hmsdb_upgrade_db.people_delta DRY RUN; 438 | 439 | -- COMMAND ---------- 440 | 441 | sync table demo_uc_demo.uc_upgrade_db.people_delta from hive_metastore.hmsdb_upgrade_db.people_delta; 442 | 443 | -- COMMAND ---------- 444 | 445 | desc extended demo_uc_demo.uc_upgrade_db.people_delta; 446 | 447 | -- COMMAND ---------- 448 | 449 | select * from demo_uc_demo.uc_upgrade_db.people_delta; 450 | 451 | -- COMMAND ---------- 452 | 453 | -- MAGIC %md 454 | -- MAGIC ## Scenario 11: External table on HMS with cloud storage file paths to Managed UC Table 455 | --------------------------------------------------------------------------------
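-- Esboço mínimo para o Cenário 11, que no notebook termina apenas com o header: assumindo a tabela External criada no
-- Cenário 10, o SYNC a manteria como External no UC; para obter uma tabela Managed é preciso copiar os dados com CTAS
-- (ou DEEP CLONE). O nome people_delta_managed é apenas ilustrativo.
create table if not exists demo_uc_demo.uc_upgrade_db.people_delta_managed
as
select * from hive_metastore.hmsdb_upgrade_db.people_delta;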